diff --git a/bin/aupay-insert.pl b/bin/aupay-insert.pl new file mode 100755 index 0000000..556d942 --- /dev/null +++ b/bin/aupay-insert.pl @@ -0,0 +1,193 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use URI; +use URI::QueryParam; +use Web::Scraper; +use Encode; +use Try::Tiny; +use File::Basename; +use DBI; +use DBD::SQLite; +use Path::Tiny; + +require "../www/Utils.pm"; + +# origin: https://www.youtube.com/@live_tv_aupaymarket/streams +# However, it's easier to scrape invidious instances. +my $channel_streams_url = + "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams"; +# "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw"; +# "https://yewtu.be/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams"; + +# caster_id for name written in kanji +my %cid = ( + 'takayama' => 1, + '山岸愛梨' => 2, + '戸北美月' => 3, + '魚住茉由' => 4, + '小川千奈' => 5, + '松雪彩花' => 6, + 'kawabata' => 7, + '駒木結衣' => 8, + '小林李衣奈' => 9, + '大島璃音' => 10, + '檜山沙耶' => 11, +); + +# get a list of video urls the channel page +my $videos = scraper { + process 'div.thumbnail + div.video-card-row a', "videos[]" => '@href'; + result 'videos'; +}; + +# In the db, schedule.jst is unique, so this is for +# evading the UNIQUE constraint in the unlikely event that an +# au PAY show happens in the same hour as a +# WN show. +my %showtime = ( + 5 => 1, + 8 => 1, + 11 => 11, + 14 => 1, + 17 => 1, + 20 => 1, +); + +# pass 1: parse video page +my $guess = scraper { + process '#published-date', 'published_date' => 'TEXT'; + process "#descriptionWrapper", 'datetime' => sub { + my $el = shift; + my $text = encode('utf-8', $el->as_text); + my ($month) = $text =~ /(\d+)月/; + my ($day) = $text =~ /(\d+)日/; + my ($hour) = $text =~ /(\d+)時/; + if ($hour) { + my $extra_second = $showtime{$hour} ? "01" : "00"; + return sprintf( + "XXXX-%02d-%02dT%02d:00:%s+09:00", + $month, $day, $hour, $extra_second + ); + } else { + return ""; + } + }; + process '#descriptionWrapper a', 'bitly' => '@href'; + process 'div.h-box h1', 'title' => 'TEXT'; +}; + +# dig out the aupay video_id by inspecting the bitly redirect +sub aupay { + my $bitly = shift; + my $redirect = `curl -Is "$bitly" | grep '^location:' | sed 's/location: //'`; + my $u1 = URI->new($redirect); + my $u2 = URI->new($u1->query_param('link')); + my $id = $u2->query_param('id'); + return $id; +} + +# pass 2: take parsed video page results and derive more values +sub guess { + my $uri = shift; + my $v = $guess->scrape($uri); + my ($year) = $v->{published_date} =~ / (\d+)$/; + $v->{datetime} =~ s/XXXX/$year/; + $v->{youtube_id} = $uri->query_param('v'); + $v->{video_id} = aupay($v->{bitly}); + $v->{title} = Encode::encode('utf-8', $v->{title}); + $v->{tags} = [ $v->{title} =~ /\s#(\S+)/g ]; + my $tlen = scalar @{$v->{tags}}; + $v->{caster} = $tlen > 0 ? $v->{tags}[$tlen - 1] : ""; + $v->{cid} = $v->{caster} ? $cid{$v->{caster}} : 0; + return $v; +} + +# array ref of video urls +sub vs { + return $videos->scrape(URI->new($channel_streams_url)); +} + +# run guess() over the video urls and return the results +sub vgs { + my $vs = $videos->scrape(URI->new($channel_streams_url)); + my @r; + my @e; + my $i = 0; + for my $v (@$vs) { + try { + my $v2 = guess($v); + $v2->{errors} = []; + push @{$v2->{errors}}, "no video_id" if (!$v2->{video_id}); + push @{$v2->{errors}}, "no datetime" if (!$v2->{datetime}); + push @{$v2->{errors}}, "no cid" if (!$v2->{cid}); + if (scalar @{$v2->{errors}}) { + print "- $i $v2->{youtube_id}\n" if $ENV{DEBUG}; + push @e, $v2; + } else { + print "+ $i $v2->{youtube_id}\n" if $ENV{DEBUG}; + push @r, $v2; + } + } catch { + # If it blows up, just put the youtube video_id in @e. + warn "caught error: $_"; + push @e, $v->query_param('v'); + }; + $i++; + } + return { + e => \@e, # problematic videos that can't be inserted + r => \@r, # results that are usable + }; +} + +# insert a video into the schedule +sub insert { + my ($do, $vg) = @_; + my $insert = " + INSERT INTO schedule (segment_id, caster_id, video_id, jst) + VALUES (8, ?, ?, ?)"; + try { + $do->($insert, $vg->{cid}, $vg->{video_id}, $vg->{datetime}); + } catch { + warn "$_" if ($ENV{DEBUG}); + }; +} + +my $default_db = path(dirname(__FILE__) . "/../sql/wn.db")->realpath->relative; +sub main { + my $dbname = shift || $default_db; + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname") + || die $DBI::errstr; + my ($sql, $do) = sql_fn($dbh); + my $vgs = vgs(); + for my $vg (@{$vgs->{r}}) { + insert($do, $vg); + } +} + +unless (caller) { + main(@ARGV); +} + +__END__ + +=head1 NAME + +aupay-insert.pl - scrape au pay channel and insert new videos + +=head1 DESCRIPTION + +@pta is making me work hard. + +=head1 JULIA + +Maybe I should've done this in Julia as an exercise, but this +task kinda caught me by surprise. Next time, look into Gumbo.jl +and Cascadia.jl. + +https://github.com/JuliaWeb/Gumbo.jl + +https://github.com/Algocircle/Cascadia.jl + +=cut