Add a script for inserting au PAY shows #5

- It scrapes an invidious page for:
  https://www.youtube.com/@live_tv_aupaymarket/streams
- It inserts the ones it hasn't seen yet.
- If it doesn't have enough info to do an insertion,
  there are ways you can use the REPL to find the problematic
  videos and handle them manually.

Usage:

  perl bin/aupay-insert.pl
This commit is contained in:
gg 2023-09-02 21:38:26 -07:00
parent 0c9683b6a4
commit ae8187542d
1 changed files with 193 additions and 0 deletions

193
bin/aupay-insert.pl Executable file
View File

@ -0,0 +1,193 @@
#!/usr/bin/env perl
use strict;
use warnings;
use URI;
use URI::QueryParam;
use Web::Scraper;
use Encode;
use Try::Tiny;
use File::Basename;
use DBI;
use DBD::SQLite;
use Path::Tiny;
require "../www/Utils.pm";
# origin: https://www.youtube.com/@live_tv_aupaymarket/streams
# However, it's easier to scrape invidious instances.
my $channel_streams_url =
"https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
# "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw";
# "https://yewtu.be/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
# caster_id for name written in kanji
my %cid = (
'takayama' => 1,
'山岸愛梨' => 2,
'戸北美月' => 3,
'魚住茉由' => 4,
'小川千奈' => 5,
'松雪彩花' => 6,
'kawabata' => 7,
'駒木結衣' => 8,
'小林李衣奈' => 9,
'大島璃音' => 10,
'檜山沙耶' => 11,
);
# get a list of video urls the channel page
my $videos = scraper {
process 'div.thumbnail + div.video-card-row a', "videos[]" => '@href';
result 'videos';
};
# In the db, schedule.jst is unique, so this is for
# evading the UNIQUE constraint in the unlikely event that an
# au PAY show happens in the same hour as a
# WN show.
my %showtime = (
5 => 1,
8 => 1,
11 => 11,
14 => 1,
17 => 1,
20 => 1,
);
# pass 1: parse video page
my $guess = scraper {
process '#published-date', 'published_date' => 'TEXT';
process "#descriptionWrapper", 'datetime' => sub {
my $el = shift;
my $text = encode('utf-8', $el->as_text);
my ($month) = $text =~ /(\d+)月/;
my ($day) = $text =~ /(\d+)日/;
my ($hour) = $text =~ /(\d+)時/;
if ($hour) {
my $extra_second = $showtime{$hour} ? "01" : "00";
return sprintf(
"XXXX-%02d-%02dT%02d:00:%s+09:00",
$month, $day, $hour, $extra_second
);
} else {
return "";
}
};
process '#descriptionWrapper a', 'bitly' => '@href';
process 'div.h-box h1', 'title' => 'TEXT';
};
# dig out the aupay video_id by inspecting the bitly redirect
sub aupay {
my $bitly = shift;
my $redirect = `curl -Is "$bitly" | grep '^location:' | sed 's/location: //'`;
my $u1 = URI->new($redirect);
my $u2 = URI->new($u1->query_param('link'));
my $id = $u2->query_param('id');
return $id;
}
# pass 2: take parsed video page results and derive more values
sub guess {
my $uri = shift;
my $v = $guess->scrape($uri);
my ($year) = $v->{published_date} =~ / (\d+)$/;
$v->{datetime} =~ s/XXXX/$year/;
$v->{youtube_id} = $uri->query_param('v');
$v->{video_id} = aupay($v->{bitly});
$v->{title} = Encode::encode('utf-8', $v->{title});
$v->{tags} = [ $v->{title} =~ /\s#(\S+)/g ];
my $tlen = scalar @{$v->{tags}};
$v->{caster} = $tlen > 0 ? $v->{tags}[$tlen - 1] : "";
$v->{cid} = $v->{caster} ? $cid{$v->{caster}} : 0;
return $v;
}
# array ref of video urls
sub vs {
return $videos->scrape(URI->new($channel_streams_url));
}
# run guess() over the video urls and return the results
sub vgs {
my $vs = $videos->scrape(URI->new($channel_streams_url));
my @r;
my @e;
my $i = 0;
for my $v (@$vs) {
try {
my $v2 = guess($v);
$v2->{errors} = [];
push @{$v2->{errors}}, "no video_id" if (!$v2->{video_id});
push @{$v2->{errors}}, "no datetime" if (!$v2->{datetime});
push @{$v2->{errors}}, "no cid" if (!$v2->{cid});
if (scalar @{$v2->{errors}}) {
print "- $i $v2->{youtube_id}\n" if $ENV{DEBUG};
push @e, $v2;
} else {
print "+ $i $v2->{youtube_id}\n" if $ENV{DEBUG};
push @r, $v2;
}
} catch {
# If it blows up, just put the youtube video_id in @e.
warn "caught error: $_";
push @e, $v->query_param('v');
};
$i++;
}
return {
e => \@e, # problematic videos that can't be inserted
r => \@r, # results that are usable
};
}
# insert a video into the schedule
sub insert {
my ($do, $vg) = @_;
my $insert = "
INSERT INTO schedule (segment_id, caster_id, video_id, jst)
VALUES (8, ?, ?, ?)";
try {
$do->($insert, $vg->{cid}, $vg->{video_id}, $vg->{datetime});
} catch {
warn "$_" if ($ENV{DEBUG});
};
}
my $default_db = path(dirname(__FILE__) . "/../sql/wn.db")->realpath->relative;
sub main {
my $dbname = shift || $default_db;
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname")
|| die $DBI::errstr;
my ($sql, $do) = sql_fn($dbh);
my $vgs = vgs();
for my $vg (@{$vgs->{r}}) {
insert($do, $vg);
}
}
unless (caller) {
main(@ARGV);
}
__END__
=head1 NAME
aupay-insert.pl - scrape au pay channel and insert new videos
=head1 DESCRIPTION
@pta is making me work hard.
=head1 JULIA
Maybe I should've done this in Julia as an exercise, but this
task kinda caught me by surprise. Next time, look into Gumbo.jl
and Cascadia.jl.
https://github.com/JuliaWeb/Gumbo.jl
https://github.com/Algocircle/Cascadia.jl
=cut