550 lines
18 KiB
Scheme
550 lines
18 KiB
Scheme
;; Copyright (C) 2023 Skylar Widulski <cobra@vern.cc>
|
|
;;
|
|
;; This file is part of MeMe
|
|
;;
|
|
;; MeMe is free software: you can redistribute it and/or modify it under the
|
|
;; terms of the GNU Affero General Public License as published by the Free
|
|
;; Software Foundation, either version 3 of the License, or (at your option) any
|
|
;; later version.
|
|
;;
|
|
;; This program is distributed in the hope that it will be useful, but WITHOUT
|
|
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
|
|
;; for more details.
|
|
;;
|
|
;; You should have received a copy of the GNU Affero General Public License
|
|
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
(define-module (meme scraping)
|
|
#:use-module (meme libxml2)
|
|
#:use-module (system foreign)
|
|
#:use-module (ice-9 string-fun)
|
|
#:export (proxy
|
|
get-leaderboard
|
|
get-trending
|
|
get-newsfeed
|
|
get-meme
|
|
get-photos
|
|
get-photo
|
|
get-sidebar-gallery
|
|
get-sidebar-trending
|
|
get-pagination))
|
|
|
|
(define (proxy url)
|
|
(string-append "/proxy?url=" url))
|
|
|
|
(define (get-leaderboard body)
|
|
(list
|
|
(list
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[1]/article/@id"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[1]/article/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[1]/article/a/img/@src"
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[1]/div/div[2]/section[1]/"
|
|
"ul/li[1]/article/a/div/div/text()")
|
|
body #f))
|
|
(list
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[2]/article/@id"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[2]/article/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[2]/article/a/img/@src"
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[1]/div/div[2]/section[1]"
|
|
"/ul/li[2]/article/a/div/div/text()")
|
|
body #f))
|
|
(list
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[3]/article/@id"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[3]/article/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[3]/article/a/img/@src"
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[1]/div/div[2]/section[1]"
|
|
"/ul/li[3]/article/a/div/div/text()")
|
|
body #f))
|
|
(list
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[4]/article/@id"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[4]/article/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[1]/div/div[2]/section[1]/ul/li[4]/article/a/img/@src"
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[1]/div/div[2]/section[1]"
|
|
"/ul/li[4]/article/a/div/div/text()")
|
|
body #f))))
|
|
|
|
(define (get-trending body)
|
|
(define trending (list ""))
|
|
(do ((i 1 (1+ i)))
|
|
((=
|
|
i
|
|
(xml-child-element-count
|
|
(get-xpath-node "/html/body/div[1]/div/div[2]/section[2]" body) 0))
|
|
trending)
|
|
(append!
|
|
trending
|
|
(list
|
|
(list
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[1]/div/div[2]/section[2]/a["
|
|
(number->string i)
|
|
"]/@href")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[1]/div/div[2]/section[2]/a["
|
|
(number->string i)
|
|
"]/text()")
|
|
body #f))))))
|
|
|
|
(define (get-newsfeed body)
|
|
(define newsfeed (list ""))
|
|
(do ((i 1 (1+ i)))
|
|
((= i 6) newsfeed)
|
|
(append!
|
|
newsfeed
|
|
(list
|
|
(list
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]/@id")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]/div/section/h1/a/@href")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]/div/section/h1/a/text()")
|
|
body #f)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]//img/@data-src")
|
|
body #t)
|
|
(if
|
|
(equal?
|
|
(xml-child-element-count
|
|
(get-xpath-node
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]/div/section/div[2]")
|
|
body)
|
|
0)
|
|
0)
|
|
""
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]/div/section/div[2]/p/text()")
|
|
body #f))
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]/div/section/p[1]/em/text()")
|
|
body #f))))))
|
|
|
|
(define (get-meme body)
|
|
(define bodycopy
|
|
(get-xpath-node
|
|
"/html/body/div[3]/div/article//section[@class=\"bodycopy\"]"
|
|
body))
|
|
(define chld (child1 bodycopy 0))
|
|
(define chld1 %null-pointer)
|
|
(define chld2 %null-pointer)
|
|
(define lst (list ""))
|
|
(define lst1 (list ""))
|
|
(define skip #f)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
(skip
|
|
(set! skip #f)))
|
|
(cond
|
|
((equal? (name chld) "br") (append! lst (list "<br>")))
|
|
((equal? (name chld) "h2")
|
|
(cond
|
|
((equal?
|
|
(pointer->string
|
|
(xml-node-list-get-string
|
|
(gumbo-libxml-parse (string->pointer body))
|
|
(child chld) 1))
|
|
"Search Interest")
|
|
(set! skip #t)
|
|
(set! chld (next chld 0))
|
|
(continue)))
|
|
(append!
|
|
lst
|
|
(list (dump-xml chld body))))
|
|
((equal? (name chld) "p")
|
|
(if (not (equal? (dump-xml (child chld) body) "<br/>"))
|
|
(append!
|
|
lst
|
|
(list (dump-xml chld body)))))
|
|
((equal? (name chld) "div")
|
|
(if (and (not (null-pointer? (child chld)))
|
|
(equal? (name (child1 chld 0)) "p"))
|
|
(append!
|
|
lst
|
|
(list (dump-xml chld body)))))
|
|
((equal? (name chld) "center")
|
|
(set! chld1 (child1 chld 0))
|
|
(set! lst1 (list ""))
|
|
(while (not (null-pointer? chld1))
|
|
(cond
|
|
((and (equal? (name chld1) "a")
|
|
(equal? (name (child chld1)) "img"))
|
|
(set! chld2 (props (child chld1)))
|
|
(while (not (null-pointer? chld2))
|
|
(cond
|
|
((equal? (name chld2) "data-src")
|
|
(append!
|
|
lst1
|
|
(list
|
|
(string-append
|
|
"<img src=\""
|
|
(proxy
|
|
(pointer->string
|
|
(xml-node-list-get-string
|
|
(doc chld2)
|
|
(child chld2)
|
|
1)))
|
|
"\" />")))
|
|
(break)))
|
|
(set! chld2 (next chld2 0)))))
|
|
(set! chld1 (next chld1 0)))
|
|
(append! lst (cdr lst1))))
|
|
(set! chld (next chld 0)))
|
|
(append
|
|
(list
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/article/header/section/h1/text()" body #f)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/article/header/a/img/@src" body #t)
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/h5" body)
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/footer/p[1]" body)
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/footer/p[2]" body))
|
|
(cdr lst)))
|
|
|
|
(define (get-photos body)
|
|
(define gallery
|
|
(get-xpath-node
|
|
"//*[@id=\"photo_gallery\"]"
|
|
body))
|
|
(define lst (list ""))
|
|
(define pair '(() . ()))
|
|
(define chld (child1 gallery 0))
|
|
(define chld1 %null-pointer)
|
|
(define chld2 %null-pointer)
|
|
(define chld3 %null-pointer)
|
|
(define chld4 %null-pointer)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
((equal? (name chld) "div")
|
|
(set! chld1 (props chld))
|
|
(while (not (null-pointer? chld1))
|
|
(cond
|
|
((and (equal? (name chld1) "class")
|
|
(equal?
|
|
(pointer->string
|
|
(xml-node-list-get-string
|
|
(doc chld1)
|
|
(child chld1)
|
|
1))
|
|
"item"))
|
|
(set! chld2 (child1 chld 0))
|
|
(set! chld3 (props chld2))
|
|
(while (not (null-pointer? chld3))
|
|
(cond
|
|
((equal? (name chld3) "href")
|
|
(set-car!
|
|
pair
|
|
(pointer->string
|
|
(xml-node-list-get-string
|
|
(doc chld3)
|
|
(child chld3)
|
|
1)))
|
|
(break)))
|
|
(set! chld3 (next chld3 0)))
|
|
(set! chld4 (props (child1 chld2 0)))
|
|
(while (not (null-pointer? chld4))
|
|
(cond
|
|
((equal? (name chld4) "data-src")
|
|
(set-cdr!
|
|
pair
|
|
(pointer->string
|
|
(xml-node-list-get-string
|
|
(doc chld4)
|
|
(child chld4)
|
|
1)))
|
|
(break)))
|
|
(set! chld4 (next chld4 0)))
|
|
(append!
|
|
lst
|
|
(list
|
|
(cons (car pair)
|
|
(cdr pair))))))
|
|
(set! chld1 (next chld1 0)))))
|
|
(set! chld (next chld 0)))
|
|
(append
|
|
(list
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" body)
|
|
(if (null-pointer?
|
|
(caddr (parse-c-struct
|
|
(get-xpath-nodeset
|
|
"/html/body/div[3]/div/div[1]/header/hgroup/p"
|
|
body)
|
|
xml-nodeset)))
|
|
""
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
|
|
body #f))
|
|
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" body)
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[3]" body))
|
|
(cdr lst)))
|
|
|
|
(define (get-photo body)
|
|
(list
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[2]/header/h1" body)
|
|
(get-xpath-string "/html/body/div[3]/div/div[2]/div[2]/a/img/@alt" body #t)
|
|
(proxy
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/div[2]/div[2]/a/img/@src"
|
|
body #t))
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/aside/div[2]/div[@class=\"row\"][4]/p/a" body)))
|
|
|
|
(define (get-sidebar-gallery body)
|
|
(define gallery (list ""))
|
|
(do ((i 1 (+ i 2)))
|
|
((>=
|
|
i
|
|
(- (xml-child-element-count
|
|
(get-xpath-node "/html/body/div[3]/aside" body) 0)
|
|
1))
|
|
gallery)
|
|
(append!
|
|
gallery
|
|
(list
|
|
(list
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/h3/text()")
|
|
body #f)
|
|
|
|
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[1]/a/@href")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[1]/a/img/@alt")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[1]/a/img/@data-src")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[1]/h4/a/text()")
|
|
body #f)
|
|
|
|
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[2]/a/@href")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[2]/a/img/@alt")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[2]/a/img/@data-src")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[1]/td[2]/h4/a/text()")
|
|
body #f)
|
|
|
|
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[1]/a/@href")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[1]/a/img/@alt")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[1]/a/img/@data-src")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[1]/h4/a/text()")
|
|
body #f)
|
|
|
|
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[2]/a/@href")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[2]/a/img/@alt")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[2]/a/img/@data-src")
|
|
body #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"/html/body/div[3]/aside/div["
|
|
(number->string i)
|
|
"]/table/tbody/tr[2]/td[2]/h4/a/text()")
|
|
body #f))))))
|
|
|
|
(define (get-sidebar-trending body)
|
|
(list
|
|
(get-xpath-string "/html/body/div[3]/aside/div[4]/h3/a/@href" body #t)
|
|
(get-xpath-string "/html/body/div[3]/aside/div[4]/h3/a/text()" body #f)
|
|
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[1]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[1]/a/img/@data-src"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[2]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[2]/a/img/@data-src"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[3]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[1]/td[3]/a/img/@data-src"
|
|
body #t)
|
|
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[1]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[1]/a/img/@data-src"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[2]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[2]/a/img/@data-src"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[3]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[2]/td[3]/a/img/@data-src"
|
|
body #t)
|
|
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[1]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[1]/a/img/@data-src"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[2]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[2]/a/img/@data-src"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[3]/a/@href"
|
|
body #t)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/aside/div[4]/table/tbody/tr[3]/td[3]/a/img/@data-src"
|
|
body #t)))
|
|
|
|
(define (get-pagination body)
|
|
(if (null-pointer?
|
|
(dereference-pointer
|
|
(caddr
|
|
(parse-c-struct
|
|
(get-xpath-nodeset
|
|
"//div[@class=\"pagination\"]"
|
|
body)
|
|
xml-nodeset))))
|
|
""
|
|
(dump-xpath-xml "//div[@class=\"pagination\"]" body)))
|