MeMe/meme/scraping.scm

550 lines
18 KiB
Scheme

;; Copyright (C) 2023 Skylar Widulski <cobra@vern.cc>
;;
;; This file is part of MeMe
;;
;; MeMe is free software: you can redistribute it and/or modify it under the
;; terms of the GNU Affero General Public License as published by the Free
;; Software Foundation, either version 3 of the License, or (at your option) any
;; later version.
;;
;; This program is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
;; for more details.
;;
;; You should have received a copy of the GNU Affero General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
(define-module (meme scraping)
#:use-module (meme libxml2)
#:use-module (system foreign)
#:use-module (ice-9 string-fun)
#:export (proxy
get-leaderboard
get-trending
get-newsfeed
get-meme
get-photos
get-photo
get-sidebar-gallery
get-sidebar-trending
get-pagination))
(define (proxy url)
(string-append "/proxy?url=" url))
(define (get-leaderboard body)
(list
(list
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[1]/article/@id"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[1]/article/a/@href"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[1]/article/a/img/@src"
body #t)
(get-xpath-string
(string-append
"/html/body/div[1]/div/div[2]/section[1]/"
"ul/li[1]/article/a/div/div/text()")
body #f))
(list
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[2]/article/@id"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[2]/article/a/@href"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[2]/article/a/img/@src"
body #t)
(get-xpath-string
(string-append
"/html/body/div[1]/div/div[2]/section[1]"
"/ul/li[2]/article/a/div/div/text()")
body #f))
(list
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[3]/article/@id"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[3]/article/a/@href"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[3]/article/a/img/@src"
body #t)
(get-xpath-string
(string-append
"/html/body/div[1]/div/div[2]/section[1]"
"/ul/li[3]/article/a/div/div/text()")
body #f))
(list
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[4]/article/@id"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[4]/article/a/@href"
body #t)
(get-xpath-string
"/html/body/div[1]/div/div[2]/section[1]/ul/li[4]/article/a/img/@src"
body #t)
(get-xpath-string
(string-append
"/html/body/div[1]/div/div[2]/section[1]"
"/ul/li[4]/article/a/div/div/text()")
body #f))))
(define (get-trending body)
(define trending (list ""))
(do ((i 1 (1+ i)))
((=
i
(xml-child-element-count
(get-xpath-node "/html/body/div[1]/div/div[2]/section[2]" body) 0))
trending)
(append!
trending
(list
(list
(get-xpath-string
(string-append
"/html/body/div[1]/div/div[2]/section[2]/a["
(number->string i)
"]/@href")
body #t)
(get-xpath-string
(string-append
"/html/body/div[1]/div/div[2]/section[2]/a["
(number->string i)
"]/text()")
body #f))))))
(define (get-newsfeed body)
(define newsfeed (list ""))
(do ((i 1 (1+ i)))
((= i 6) newsfeed)
(append!
newsfeed
(list
(list
(get-xpath-string
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]/@id")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]/div/section/h1/a/@href")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]/div/section/h1/a/text()")
body #f)
(get-xpath-string
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]//img/@data-src")
body #t)
(if
(equal?
(xml-child-element-count
(get-xpath-node
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]/div/section/div[2]")
body)
0)
0)
""
(get-xpath-string
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]/div/section/div[2]/p/text()")
body #f))
(get-xpath-string
(string-append
"/html/body/div[3]/div/div[3]/article["
(number->string i)
"]/div/section/p[1]/em/text()")
body #f))))))
(define (get-meme body)
(define bodycopy
(get-xpath-node
"/html/body/div[3]/div/article//section[@class=\"bodycopy\"]"
body))
(define chld (child1 bodycopy 0))
(define chld1 %null-pointer)
(define chld2 %null-pointer)
(define lst (list ""))
(define lst1 (list ""))
(define skip #f)
(while (not (null-pointer? chld))
(cond
(skip
(set! skip #f)))
(cond
((equal? (name chld) "br") (append! lst (list "<br>")))
((equal? (name chld) "h2")
(cond
((equal?
(pointer->string
(xml-node-list-get-string
(gumbo-libxml-parse (string->pointer body))
(child chld) 1))
"Search Interest")
(set! skip #t)
(set! chld (next chld 0))
(continue)))
(append!
lst
(list (dump-xml chld body))))
((equal? (name chld) "p")
(if (not (equal? (dump-xml (child chld) body) "<br/>"))
(append!
lst
(list (dump-xml chld body)))))
((equal? (name chld) "div")
(if (and (not (null-pointer? (child chld)))
(equal? (name (child1 chld 0)) "p"))
(append!
lst
(list (dump-xml chld body)))))
((equal? (name chld) "center")
(set! chld1 (child1 chld 0))
(set! lst1 (list ""))
(while (not (null-pointer? chld1))
(cond
((and (equal? (name chld1) "a")
(equal? (name (child chld1)) "img"))
(set! chld2 (props (child chld1)))
(while (not (null-pointer? chld2))
(cond
((equal? (name chld2) "data-src")
(append!
lst1
(list
(string-append
"<img src=\""
(proxy
(pointer->string
(xml-node-list-get-string
(doc chld2)
(child chld2)
1)))
"\" />")))
(break)))
(set! chld2 (next chld2 0)))))
(set! chld1 (next chld1 0)))
(append! lst (cdr lst1))))
(set! chld (next chld 0)))
(append
(list
(get-xpath-string
"/html/body/div[3]/div/article/header/section/h1/text()" body #f)
(get-xpath-string
"/html/body/div[3]/div/article/header/a/img/@src" body #t)
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/h5" body)
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/footer/p[1]" body)
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/footer/p[2]" body))
(cdr lst)))
(define (get-photos body)
(define gallery
(get-xpath-node
"//*[@id=\"photo_gallery\"]"
body))
(define lst (list ""))
(define pair '(() . ()))
(define chld (child1 gallery 0))
(define chld1 %null-pointer)
(define chld2 %null-pointer)
(define chld3 %null-pointer)
(define chld4 %null-pointer)
(while (not (null-pointer? chld))
(cond
((equal? (name chld) "div")
(set! chld1 (props chld))
(while (not (null-pointer? chld1))
(cond
((and (equal? (name chld1) "class")
(equal?
(pointer->string
(xml-node-list-get-string
(doc chld1)
(child chld1)
1))
"item"))
(set! chld2 (child1 chld 0))
(set! chld3 (props chld2))
(while (not (null-pointer? chld3))
(cond
((equal? (name chld3) "href")
(set-car!
pair
(pointer->string
(xml-node-list-get-string
(doc chld3)
(child chld3)
1)))
(break)))
(set! chld3 (next chld3 0)))
(set! chld4 (props (child1 chld2 0)))
(while (not (null-pointer? chld4))
(cond
((equal? (name chld4) "data-src")
(set-cdr!
pair
(pointer->string
(xml-node-list-get-string
(doc chld4)
(child chld4)
1)))
(break)))
(set! chld4 (next chld4 0)))
(append!
lst
(list
(cons (car pair)
(cdr pair))))))
(set! chld1 (next chld1 0)))))
(set! chld (next chld 0)))
(append
(list
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" body)
(if (null-pointer?
(caddr (parse-c-struct
(get-xpath-nodeset
"/html/body/div[3]/div/div[1]/header/hgroup/p"
body)
xml-nodeset)))
""
(get-xpath-string
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
body #f))
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" body)
(dump-xpath-xml "/html/body/div[3]/div/div[3]" body))
(cdr lst)))
(define (get-photo body)
(list
(dump-xpath-xml "/html/body/div[3]/div/div[2]/header/h1" body)
(get-xpath-string "/html/body/div[3]/div/div[2]/div[2]/a/img/@alt" body #t)
(proxy
(get-xpath-string
"/html/body/div[3]/div/div[2]/div[2]/a/img/@src"
body #t))
(dump-xpath-xml
"/html/body/div[3]/aside/div[2]/div[@class=\"row\"][4]/p/a" body)))
(define (get-sidebar-gallery body)
(define gallery (list ""))
(do ((i 1 (+ i 2)))
((>=
i
(- (xml-child-element-count
(get-xpath-node "/html/body/div[3]/aside" body) 0)
1))
gallery)
(append!
gallery
(list
(list
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/h3/text()")
body #f)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[1]/a/@href")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[1]/a/img/@alt")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[1]/a/img/@data-src")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[1]/h4/a/text()")
body #f)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[2]/a/@href")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[2]/a/img/@alt")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[2]/a/img/@data-src")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[1]/td[2]/h4/a/text()")
body #f)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[1]/a/@href")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[1]/a/img/@alt")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[1]/a/img/@data-src")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[1]/h4/a/text()")
body #f)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[2]/a/@href")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[2]/a/img/@alt")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[2]/a/img/@data-src")
body #t)
(get-xpath-string
(string-append
"/html/body/div[3]/aside/div["
(number->string i)
"]/table/tbody/tr[2]/td[2]/h4/a/text()")
body #f))))))
(define (get-sidebar-trending body)
(list
(get-xpath-string "/html/body/div[3]/aside/div[4]/h3/a/@href" body #t)
(get-xpath-string "/html/body/div[3]/aside/div[4]/h3/a/text()" body #f)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[1]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[1]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[2]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[2]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[3]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[3]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[1]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[1]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[2]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[2]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[3]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[3]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[1]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[1]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[2]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[2]/a/img/@data-src"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[3]/a/@href"
body #t)
(get-xpath-string
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[3]/a/img/@data-src"
body #t)))
(define (get-pagination body)
(if (null-pointer?
(dereference-pointer
(caddr
(parse-c-struct
(get-xpath-nodeset
"//div[@class=\"pagination\"]"
body)
xml-nodeset))))
""
(dump-xpath-xml "//div[@class=\"pagination\"]" body)))