Add a script for inserting au PAY shows #5
- It scrapes an invidious page for: https://www.youtube.com/@live_tv_aupaymarket/streams - It inserts the ones it hasn't seen yet. - If it doesn't have enough info to do an insertion, there are ways you can use the REPL to find the problematic videos and handle them manually. Usage: perl bin/aupay-insert.pl
This commit is contained in:
parent
0c9683b6a4
commit
ae8187542d
|
@ -0,0 +1,193 @@
|
|||
#!/usr/bin/env perl
|
||||
use strict;
|
||||
use warnings;
|
||||
use URI;
|
||||
use URI::QueryParam;
|
||||
use Web::Scraper;
|
||||
use Encode;
|
||||
use Try::Tiny;
|
||||
use File::Basename;
|
||||
use DBI;
|
||||
use DBD::SQLite;
|
||||
use Path::Tiny;
|
||||
|
||||
require "../www/Utils.pm";
|
||||
|
||||
# origin: https://www.youtube.com/@live_tv_aupaymarket/streams
|
||||
# However, it's easier to scrape invidious instances.
|
||||
my $channel_streams_url =
|
||||
"https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
|
||||
# "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw";
|
||||
# "https://yewtu.be/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
|
||||
|
||||
# caster_id for name written in kanji
|
||||
my %cid = (
|
||||
'takayama' => 1,
|
||||
'山岸愛梨' => 2,
|
||||
'戸北美月' => 3,
|
||||
'魚住茉由' => 4,
|
||||
'小川千奈' => 5,
|
||||
'松雪彩花' => 6,
|
||||
'kawabata' => 7,
|
||||
'駒木結衣' => 8,
|
||||
'小林李衣奈' => 9,
|
||||
'大島璃音' => 10,
|
||||
'檜山沙耶' => 11,
|
||||
);
|
||||
|
||||
# get a list of video urls the channel page
|
||||
my $videos = scraper {
|
||||
process 'div.thumbnail + div.video-card-row a', "videos[]" => '@href';
|
||||
result 'videos';
|
||||
};
|
||||
|
||||
# In the db, schedule.jst is unique, so this is for
|
||||
# evading the UNIQUE constraint in the unlikely event that an
|
||||
# au PAY show happens in the same hour as a
|
||||
# WN show.
|
||||
my %showtime = (
|
||||
5 => 1,
|
||||
8 => 1,
|
||||
11 => 11,
|
||||
14 => 1,
|
||||
17 => 1,
|
||||
20 => 1,
|
||||
);
|
||||
|
||||
# pass 1: parse video page
|
||||
my $guess = scraper {
|
||||
process '#published-date', 'published_date' => 'TEXT';
|
||||
process "#descriptionWrapper", 'datetime' => sub {
|
||||
my $el = shift;
|
||||
my $text = encode('utf-8', $el->as_text);
|
||||
my ($month) = $text =~ /(\d+)月/;
|
||||
my ($day) = $text =~ /(\d+)日/;
|
||||
my ($hour) = $text =~ /(\d+)時/;
|
||||
if ($hour) {
|
||||
my $extra_second = $showtime{$hour} ? "01" : "00";
|
||||
return sprintf(
|
||||
"XXXX-%02d-%02dT%02d:00:%s+09:00",
|
||||
$month, $day, $hour, $extra_second
|
||||
);
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
};
|
||||
process '#descriptionWrapper a', 'bitly' => '@href';
|
||||
process 'div.h-box h1', 'title' => 'TEXT';
|
||||
};
|
||||
|
||||
# dig out the aupay video_id by inspecting the bitly redirect
|
||||
sub aupay {
|
||||
my $bitly = shift;
|
||||
my $redirect = `curl -Is "$bitly" | grep '^location:' | sed 's/location: //'`;
|
||||
my $u1 = URI->new($redirect);
|
||||
my $u2 = URI->new($u1->query_param('link'));
|
||||
my $id = $u2->query_param('id');
|
||||
return $id;
|
||||
}
|
||||
|
||||
# pass 2: take parsed video page results and derive more values
|
||||
sub guess {
|
||||
my $uri = shift;
|
||||
my $v = $guess->scrape($uri);
|
||||
my ($year) = $v->{published_date} =~ / (\d+)$/;
|
||||
$v->{datetime} =~ s/XXXX/$year/;
|
||||
$v->{youtube_id} = $uri->query_param('v');
|
||||
$v->{video_id} = aupay($v->{bitly});
|
||||
$v->{title} = Encode::encode('utf-8', $v->{title});
|
||||
$v->{tags} = [ $v->{title} =~ /\s#(\S+)/g ];
|
||||
my $tlen = scalar @{$v->{tags}};
|
||||
$v->{caster} = $tlen > 0 ? $v->{tags}[$tlen - 1] : "";
|
||||
$v->{cid} = $v->{caster} ? $cid{$v->{caster}} : 0;
|
||||
return $v;
|
||||
}
|
||||
|
||||
# array ref of video urls
|
||||
sub vs {
|
||||
return $videos->scrape(URI->new($channel_streams_url));
|
||||
}
|
||||
|
||||
# run guess() over the video urls and return the results
|
||||
sub vgs {
|
||||
my $vs = $videos->scrape(URI->new($channel_streams_url));
|
||||
my @r;
|
||||
my @e;
|
||||
my $i = 0;
|
||||
for my $v (@$vs) {
|
||||
try {
|
||||
my $v2 = guess($v);
|
||||
$v2->{errors} = [];
|
||||
push @{$v2->{errors}}, "no video_id" if (!$v2->{video_id});
|
||||
push @{$v2->{errors}}, "no datetime" if (!$v2->{datetime});
|
||||
push @{$v2->{errors}}, "no cid" if (!$v2->{cid});
|
||||
if (scalar @{$v2->{errors}}) {
|
||||
print "- $i $v2->{youtube_id}\n" if $ENV{DEBUG};
|
||||
push @e, $v2;
|
||||
} else {
|
||||
print "+ $i $v2->{youtube_id}\n" if $ENV{DEBUG};
|
||||
push @r, $v2;
|
||||
}
|
||||
} catch {
|
||||
# If it blows up, just put the youtube video_id in @e.
|
||||
warn "caught error: $_";
|
||||
push @e, $v->query_param('v');
|
||||
};
|
||||
$i++;
|
||||
}
|
||||
return {
|
||||
e => \@e, # problematic videos that can't be inserted
|
||||
r => \@r, # results that are usable
|
||||
};
|
||||
}
|
||||
|
||||
# insert a video into the schedule
|
||||
sub insert {
|
||||
my ($do, $vg) = @_;
|
||||
my $insert = "
|
||||
INSERT INTO schedule (segment_id, caster_id, video_id, jst)
|
||||
VALUES (8, ?, ?, ?)";
|
||||
try {
|
||||
$do->($insert, $vg->{cid}, $vg->{video_id}, $vg->{datetime});
|
||||
} catch {
|
||||
warn "$_" if ($ENV{DEBUG});
|
||||
};
|
||||
}
|
||||
|
||||
my $default_db = path(dirname(__FILE__) . "/../sql/wn.db")->realpath->relative;
|
||||
sub main {
|
||||
my $dbname = shift || $default_db;
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname")
|
||||
|| die $DBI::errstr;
|
||||
my ($sql, $do) = sql_fn($dbh);
|
||||
my $vgs = vgs();
|
||||
for my $vg (@{$vgs->{r}}) {
|
||||
insert($do, $vg);
|
||||
}
|
||||
}
|
||||
|
||||
unless (caller) {
|
||||
main(@ARGV);
|
||||
}
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
aupay-insert.pl - scrape au pay channel and insert new videos
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
@pta is making me work hard.
|
||||
|
||||
=head1 JULIA
|
||||
|
||||
Maybe I should've done this in Julia as an exercise, but this
|
||||
task kinda caught me by surprise. Next time, look into Gumbo.jl
|
||||
and Cascadia.jl.
|
||||
|
||||
https://github.com/JuliaWeb/Gumbo.jl
|
||||
|
||||
https://github.com/Algocircle/Cascadia.jl
|
||||
|
||||
=cut
|
Loading…
Reference in New Issue