forked from gg/WeatherNews.jl
219 lines
5.2 KiB
Perl
Executable File
219 lines
5.2 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
use strict;
|
|
use warnings;
|
|
use URI;
|
|
use URI::QueryParam;
|
|
use Web::Scraper;
|
|
use Encode;
|
|
use Try::Tiny;
|
|
use File::Basename;
|
|
use DBI;
|
|
use DBD::SQLite;
|
|
use Path::Tiny;
|
|
|
|
my $utils_pm = path(__FILE__)->parent->parent->path("www/Utils.pm")->absolute;
|
|
require $utils_pm;
|
|
|
|
# origin: https://www.youtube.com/@live_tv_aupaymarket/streams
|
|
# However, it's easier to scrape invidious instances.
|
|
my $channel_streams_url =
|
|
"https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
|
|
# "https://invidious.protokolla.fi/channel/UCZ38ImrKrXIJdg3eivJhuDw";
|
|
# "https://yewtu.be/channel/UCZ38ImrKrXIJdg3eivJhuDw/streams";
|
|
|
|
# caster_id for name written in kanji
|
|
my %cid = (
|
|
'takayama' => 1,
|
|
'山岸愛梨' => 2,
|
|
'戸北美月' => 3,
|
|
'魚住茉由' => 4,
|
|
'小川千奈' => 5,
|
|
'松雪彩花' => 6,
|
|
'kawabata' => 7,
|
|
'駒木結衣' => 8,
|
|
'小林李衣奈' => 9,
|
|
'大島璃音' => 10,
|
|
'檜山沙耶' => 11,
|
|
);
|
|
|
|
# get a list of video urls the channel page
|
|
my $videos = scraper {
|
|
process 'div.thumbnail + div.video-card-row a', "videos[]" => '@href';
|
|
result 'videos';
|
|
};
|
|
|
|
# In the db, schedule.jst is unique, so this is for
|
|
# evading the UNIQUE constraint in the unlikely event that an
|
|
# au PAY show happens in the same hour as a
|
|
# WN show.
|
|
my %showtime = (
|
|
5 => 1,
|
|
8 => 1,
|
|
11 => 11,
|
|
14 => 1,
|
|
17 => 1,
|
|
20 => 1,
|
|
);
|
|
|
|
# pass 1: parse video page
|
|
my $guess = scraper {
|
|
process '#published-date', 'published_date' => 'TEXT';
|
|
process "#descriptionWrapper", 'datetime' => sub {
|
|
my $el = shift;
|
|
my $text = encode('utf-8', $el->as_text);
|
|
my ($month) = $text =~ /(\d+)月/;
|
|
my ($day) = $text =~ /(\d+)日/;
|
|
my ($hour) = $text =~ /(\d+)時/;
|
|
if ($hour) {
|
|
my $extra_second = $showtime{$hour} ? "01" : "00";
|
|
return sprintf(
|
|
"XXXX-%02d-%02dT%02d:00:%s+09:00",
|
|
$month, $day, $hour, $extra_second
|
|
);
|
|
} else {
|
|
return "";
|
|
}
|
|
};
|
|
process '#descriptionWrapper a', 'bitly' => '@href';
|
|
process 'div.h-box h1', 'title' => 'TEXT';
|
|
};
|
|
|
|
# dig out the aupay video_id by inspecting the bitly redirect
|
|
sub aupay {
|
|
my $bitly = shift;
|
|
my $redirect = `curl -Is "$bitly" | grep '^location:' | sed 's/location: //'`;
|
|
my $u1 = URI->new($redirect);
|
|
my $u2 = URI->new($u1->query_param('link'));
|
|
my $id = $u2->query_param('id');
|
|
return $id;
|
|
}
|
|
|
|
# pass 2: take parsed video page results and derive more values
|
|
sub guess {
|
|
my $uri = shift;
|
|
my $v = $guess->scrape($uri);
|
|
my ($year) = $v->{published_date} =~ / (\d+)$/;
|
|
$v->{datetime} =~ s/XXXX/$year/;
|
|
$v->{youtube_id} = $uri->query_param('v');
|
|
$v->{video_id} = aupay($v->{bitly});
|
|
$v->{title} = Encode::encode('utf-8', $v->{title});
|
|
$v->{tags} = [ $v->{title} =~ /\s#(\S+)/g ];
|
|
my $tlen = scalar @{$v->{tags}};
|
|
$v->{caster} = $tlen > 0 ? $v->{tags}[$tlen - 1] : "";
|
|
$v->{cid} = $v->{caster} ? $cid{$v->{caster}} : 0;
|
|
return $v;
|
|
}
|
|
|
|
# array ref of video urls
|
|
sub vs {
|
|
return $videos->scrape(URI->new($channel_streams_url));
|
|
}
|
|
|
|
# run guess() over the video urls and return the results
|
|
sub vgs {
|
|
my $vs = $videos->scrape(URI->new($channel_streams_url));
|
|
my @r;
|
|
my @e;
|
|
my $i = 0;
|
|
for my $v (@$vs) {
|
|
try {
|
|
my $v2 = guess($v);
|
|
$v2->{errors} = [];
|
|
push @{$v2->{errors}}, "no video_id" if (!$v2->{video_id});
|
|
push @{$v2->{errors}}, "no datetime" if (!$v2->{datetime});
|
|
push @{$v2->{errors}}, "no cid" if (!$v2->{cid});
|
|
if (scalar @{$v2->{errors}}) {
|
|
print "- $i $v2->{youtube_id}\n" if $ENV{DEBUG};
|
|
push @e, $v2;
|
|
} else {
|
|
print "+ $i $v2->{youtube_id}\n" if $ENV{DEBUG};
|
|
push @r, $v2;
|
|
}
|
|
} catch {
|
|
# If it blows up, just put the youtube video_id in @e.
|
|
warn "caught error: $_";
|
|
push @e, $v->query_param('v');
|
|
};
|
|
$i++;
|
|
}
|
|
return {
|
|
e => \@e, # problematic videos that can't be inserted
|
|
r => \@r, # results that are usable
|
|
};
|
|
}
|
|
|
|
# insert a video into the schedule
|
|
sub insert {
|
|
my ($do, $vg) = @_;
|
|
my $insert = "
|
|
INSERT INTO schedule (segment_id, caster_id, video_id, jst)
|
|
VALUES (8, ?, ?, ?)";
|
|
try {
|
|
$do->($insert, $vg->{cid}, $vg->{video_id}, $vg->{datetime});
|
|
} catch {
|
|
warn "$_" if ($ENV{DEBUG});
|
|
};
|
|
}
|
|
|
|
my $default_db = path(dirname(__FILE__) . "/../sql/wn.db")->realpath->relative;
|
|
sub main {
|
|
my $dbname = shift || $default_db;
|
|
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname")
|
|
|| die $DBI::errstr;
|
|
my ($sql, $do) = sql_fn($dbh);
|
|
my $vgs = vgs();
|
|
for my $vg (@{$vgs->{r}}) {
|
|
insert($do, $vg);
|
|
}
|
|
}
|
|
|
|
unless (caller) {
|
|
main(@ARGV);
|
|
}
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
aupay-insert.pl - scrape au pay channel and insert new videos
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
From the command line
|
|
|
|
bin/aupay-insert.pl
|
|
|
|
Using a specific SQLite database
|
|
|
|
bin/aupay-insert.pl /tmp/test.db
|
|
|
|
From the REPL
|
|
|
|
$ reply
|
|
0> do "./bin/aupay-insert.pl"
|
|
$res[0] = 'main'
|
|
|
|
1> aupay "https://bit.ly/3KAe8xh"
|
|
$res[1] = '10211'
|
|
|
|
2> $ENV{DEBUG} = 1
|
|
$res[2] = '1'
|
|
|
|
3> my $vgs = vgs
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
@pta is making me work hard.
|
|
|
|
=head1 JULIA
|
|
|
|
Maybe I should've done this in Julia as an exercise, but this
|
|
task kinda caught me by surprise. Next time, look into Gumbo.jl
|
|
and Cascadia.jl.
|
|
|
|
https://github.com/JuliaWeb/Gumbo.jl
|
|
|
|
https://github.com/Algocircle/Cascadia.jl
|
|
|
|
=cut
|