WeatherNews.jl/bin/aupay-insert.pl

#!/usr/bin/env perl
use strict;
use warnings;
use URI;
use URI::QueryParam;
use Web::Scraper;
use Encode;
use Try::Tiny;
use File::Basename;
use DBI;
use DBD::SQLite;
use Path::Tiny;

my $utils_pm = path(__FILE__)->parent->parent->path("www/Utils.pm")->absolute;
require $utils_pm;

# origin: https://www.youtube.com/@live_tv_aupaymarket/streams
# However, it's easier to scrape invidious instances.
my $channel_streams_url =
  "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
# "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw";
# "https://yewtu.be/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";

# caster_id for name written in kanji
my %cid = (
  'takayama' => 1,
  '山岸愛梨' => 2,
  '戸北美月' => 3,
  '魚住茉由' => 4,
  '小川千奈' => 5,
  '松雪彩花' => 6,
  'kawabata' => 7,
  '駒木結衣' => 8,
  '小林李衣奈' => 9,
  '大島璃音' => 10,
  '檜山沙耶' => 11,
);

# get a list of video urls the channel page
my $videos = scraper {
  process 'div.thumbnail + div.video-card-row a', "videos[]" => '@href';
  result 'videos';
};

# In the db, schedule.jst is unique, so this is for
# evading the UNIQUE constraint in the unlikely event that an
# au PAY show happens in the same hour as a
# WN show.
my %showtime = (
  5  => 1,
  8  => 1,
  11 => 11,
  14 => 1,
  17 => 1,
  20 => 1,
);

# pass 1: parse video page
my $guess = scraper {
  process '#published-date', 'published_date' => 'TEXT';
  process "#descriptionWrapper", 'datetime' => sub {
    my $el = shift;
    my $text = encode('utf-8', $el->as_text);
    my ($month) = $text =~ /(\d+)月/;
    my ($day)   = $text =~ /(\d+)日/;
    my ($hour)  = $text =~ /(\d+)時/;
    if ($hour) {
      my $extra_second = $showtime{$hour} ? "01" : "00";
      return sprintf(
        "XXXX-%02d-%02dT%02d:00:%s+09:00",
        $month, $day, $hour, $extra_second
      );
    } else {
      return "";
    }
  };
  process '#descriptionWrapper a', 'bitly' => '@href';
  process 'div.h-box h1', 'title' => 'TEXT';
};

# dig out the aupay video_id by inspecting the bitly redirect
sub aupay {
  my $bitly = shift;
  my $redirect = `curl -Is "$bitly" | grep '^location:' | sed 's/location: //'`;
  my $u1 = URI->new($redirect);
  my $u2 = URI->new($u1->query_param('link'));
  my $id = $u2->query_param('id');
  return $id;
}

# pass 2: take parsed video page results and derive more values
sub guess {
  my $uri = shift;
  my $v = $guess->scrape($uri);
  my ($year) = $v->{published_date} =~ / (\d+)$/;
  $v->{datetime} =~ s/XXXX/$year/;
  $v->{youtube_id} = $uri->query_param('v');
  $v->{video_id} = aupay($v->{bitly});
  $v->{title} = Encode::encode('utf-8', $v->{title});
  $v->{tags} = [ $v->{title} =~ /\s#(\S+)/g ];
  my $tlen = scalar @{$v->{tags}};
  $v->{caster} = $tlen > 0 ? $v->{tags}[$tlen - 1] : "";
  $v->{cid} = $v->{caster} ? $cid{$v->{caster}} : 0;
  return $v;
}

# array ref of video urls
sub vs {
  return $videos->scrape(URI->new($channel_streams_url));
}

# run guess() over the video urls and return the results
sub vgs {
  my $vs = $videos->scrape(URI->new($channel_streams_url));
  my @r;
  my @e;
  my $i = 0;
  for my $v (@$vs) {
    try {
      my $v2 = guess($v);
      $v2->{errors} = [];
      push @{$v2->{errors}}, "no video_id" if (!$v2->{video_id});
      push @{$v2->{errors}}, "no datetime" if (!$v2->{datetime});
      push @{$v2->{errors}}, "no cid"      if (!$v2->{cid});
      if (scalar @{$v2->{errors}}) {
        print "- $i $v2->{youtube_id}\n" if $ENV{DEBUG};
        push @e, $v2;
      } else {
        print "+ $i $v2->{youtube_id}\n" if $ENV{DEBUG};
        push @r, $v2;
      }
    } catch {
      # If it blows up, just put the youtube video_id in @e.
      warn "caught error: $_";
      push @e, $v->query_param('v');
    };
    $i++;
  }
  return {
    e => \@e, # problematic videos that can't be inserted
    r => \@r, # results that are usable
  };
}

# insert a video into the schedule
sub insert {
  my ($do, $vg) = @_;
  my $insert = "
    INSERT INTO schedule (segment_id, caster_id, video_id, jst)
    VALUES (8, ?, ?, ?)";
  try {
    $do->($insert, $vg->{cid}, $vg->{video_id}, $vg->{datetime});
  } catch {
    warn "$_" if ($ENV{DEBUG});
  };
}

my $default_db = path(dirname(__FILE__) . "/../sql/wn.db")->realpath->relative;
sub main {
  my $dbname = shift || $default_db;
  my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname")
    || die $DBI::errstr;
  my ($sql, $do) = sql_fn($dbh);
  my $vgs = vgs();
  for my $vg (@{$vgs->{r}}) {
    insert($do, $vg);
  }
}

unless (caller) {
  main(@ARGV);
}

__END__

=head1 NAME

aupay-insert.pl - scrape au pay channel and insert new videos

=head1 SYNOPSIS

From the command line

  bin/aupay-insert.pl

Using a specific SQLite database

  bin/aupay-insert.pl /tmp/test.db

From the REPL

  $ reply
  0> do "./bin/aupay-insert.pl"
  $res[0] = 'main'

  1> aupay "https://bit.ly/3KAe8xh"
  $res[1] = '10211'

  2> $ENV{DEBUG} = 1
  $res[2] = '1'

  3> my $vgs = vgs

=head1 DESCRIPTION

@pta is making me work hard.

=head1 JULIA

Maybe I should've done this in Julia as an exercise, but this
task kinda caught me by surprise.  Next time, look into Gumbo.jl
and Cascadia.jl.

https://github.com/JuliaWeb/Gumbo.jl

https://github.com/Algocircle/Cascadia.jl

=cut