529 lines
20 KiB
Scheme
529 lines
20 KiB
Scheme
;; Copyright (C) 2024 Skylar Widulski <cobra@vern.cc>
|
|
;;
|
|
;; This file is part of MeMe
|
|
;;
|
|
;; MeMe is free software: you can redistribute it and/or modify it under the
|
|
;; terms of the GNU Affero General Public License as published by the Free
|
|
;; Software Foundation, either version 3 of the License, or (at your option) any
|
|
;; later version.
|
|
;;
|
|
;; This program is distributed in the hope that it will be useful, but WITHOUT
|
|
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
|
|
;; for more details.
|
|
;;
|
|
;; You should have received a copy of the GNU Affero General Public License
|
|
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
(define-module (meme scraping)
|
|
#:use-module (libxml2)
|
|
#:use-module (system foreign)
|
|
#:use-module (ice-9 string-fun)
|
|
#:export (proxy
|
|
get-leaderboard
|
|
get-trending
|
|
get-articles
|
|
get-meme
|
|
get-meme-list
|
|
get-category-list
|
|
get-photos
|
|
get-photo
|
|
get-videos
|
|
get-sidebar-gallery
|
|
get-sidebar-trending
|
|
get-pagination))
|
|
|
|
(define (proxy url)
|
|
(string-append "/proxy?url=" url))
|
|
|
|
(define (get-leaderboard d)
|
|
(define lst (list ""))
|
|
(define pref "")
|
|
(do ((i 1 (1+ i)))
|
|
((> i 5))
|
|
(set! pref (string-append "//div[@id=\"omgkym\"]/section[1]/ul/li["
|
|
(number->string i)
|
|
"]/article"))
|
|
(append!
|
|
lst
|
|
(list
|
|
(list
|
|
(get-xpath-string (string-append pref "/@id") d #t)
|
|
(get-xpath-string (string-append pref "/a/@href") d #t)
|
|
(get-xpath-string (string-append pref "/a/img/@src") d #t)
|
|
(get-xpath-string (string-append pref "/a/div/div/text()") d #f)))))
|
|
(cdr lst))
|
|
|
|
(define (get-trending d)
|
|
(define trending (list ""))
|
|
(define pref "")
|
|
(do ((i 1 (1+ i)))
|
|
((= i
|
|
(xml-child-element-count
|
|
(get-xpath-node "//*[@id=\"trending-bar\"]" d) 0))
|
|
(cdr trending))
|
|
(set! pref (string-append "//*[@id=\"trending-bar\"]/a["
|
|
(number->string i)
|
|
"]"))
|
|
(append!
|
|
trending
|
|
(list
|
|
(list
|
|
(get-xpath-string (string-append pref "/@href") d #t)
|
|
(get-xpath-string (string-append pref "/text()") d #f))))))
|
|
|
|
(define (get-articles d)
|
|
(define pref "")
|
|
(define articles (list ""))
|
|
(do ((i 1 (1+ i)))
|
|
((xpath-null?
|
|
(string-append "/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]") d)
|
|
(cdr articles))
|
|
(set! pref (string-append "/html/body/div[3]/div/div[3]/article["
|
|
(number->string i)
|
|
"]"))
|
|
(append!
|
|
articles
|
|
(list
|
|
(if (not (xpath-null? (string-append pref "/div/section") d))
|
|
(list
|
|
(get-xpath-string (string-append pref "/div/section/h1/a/@href")
|
|
d #t)
|
|
(get-xpath-string (string-append pref "/div/section/h1/a/text()")
|
|
d #f)
|
|
(get-xpath-string (string-append pref "//img/@data-src") d #t)
|
|
(if
|
|
(equal?
|
|
(xml-child-element-count
|
|
(get-xpath-node (string-append pref "/div/section/div[2]") d)
|
|
0) 0)
|
|
""
|
|
(get-xpath-string
|
|
(string-append pref "/div/section/div[2]/p/text()") d #f))
|
|
(get-xpath-string
|
|
(string-append pref "/div/section/p[1]/em/text()") d #f))
|
|
(list
|
|
(get-xpath-string (string-append pref "/h1/a/@href") d #t)
|
|
(get-xpath-string (string-append pref "/h1/a/text()") d #f)
|
|
(get-xpath-string (string-append pref "//img/@data-src") d #t)
|
|
(get-xpath-string (string-append pref "/div[2]/p/text()") d #f)
|
|
(get-xpath-string (string-append pref "/div[3]/em/text()")
|
|
d #f)))))))
|
|
|
|
(define (get-meme d)
|
|
(define bodycopy
|
|
(get-xpath-node
|
|
"/html/body/div[3]/div/article//section[@class=\"bodycopy\"]"
|
|
d))
|
|
(define chld (child1 bodycopy 0))
|
|
(define chld1 %null-pointer)
|
|
(define chld2 %null-pointer)
|
|
(define lst (list ""))
|
|
(define lst1 (list ""))
|
|
(define lst2 (list ""))
|
|
(define skip #f)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
(skip
|
|
(set! skip #f)))
|
|
(cond
|
|
((equal? (name chld) "h2")
|
|
(cond
|
|
((equal?
|
|
(text chld)
|
|
"Search Interest")
|
|
(set! skip #t)
|
|
(set! chld (next chld 0))
|
|
(continue)))
|
|
(append!
|
|
lst
|
|
(list (dump-xml chld))))
|
|
((or
|
|
(equal? (name chld) "blockquote")
|
|
(equal? (name chld) "h3")
|
|
(equal? (name chld) "h4")
|
|
(equal? (name chld) "h5")
|
|
(equal? (name chld) "h6"))
|
|
(append! lst (list (dump-xml chld))))
|
|
((equal? (name chld) "p")
|
|
(if (and
|
|
(not (equal? (dump-xml (child chld)) "<br/>"))
|
|
(not (equal? (dump-xml chld)
|
|
"<p><em>Unavailable</em>.</p>")))
|
|
(append!
|
|
lst
|
|
(list (dump-xml chld)))))
|
|
((equal? (name chld) "div")
|
|
(if (and (not (null-pointer? (child chld)))
|
|
(equal? (name (child1 chld 0)) "p"))
|
|
(append!
|
|
lst
|
|
(list (dump-xml chld))))
|
|
(cond
|
|
((and (equal? (name (attrs chld)) "class")
|
|
(equal? (text (attrs chld)) "collection-item"))
|
|
(append!
|
|
lst2
|
|
(list
|
|
(list
|
|
(dump-xml (child1 chld 0))
|
|
(text (attrs (child (next (child1 chld 0) 1))))
|
|
(text
|
|
(next
|
|
(attrs (child (child (next (child1 chld 0) 1))))
|
|
6))
|
|
(text (attrs (child1 (next (child1 chld 0) 3) 0)))))))))
|
|
((equal? (name chld) "center")
|
|
(set! chld1 (child1 chld 0))
|
|
(set! lst1 (list ""))
|
|
(while (not (null-pointer? chld1))
|
|
(cond
|
|
((and (equal? (name chld1) "a")
|
|
(equal? (name (child chld1)) "img"))
|
|
(set! chld2 (attrs (child chld1)))
|
|
(while (not (null-pointer? chld2))
|
|
(cond
|
|
((equal? (name chld2) "data-src")
|
|
(append!
|
|
lst1
|
|
(list
|
|
(string-append
|
|
"<img src=\""
|
|
(proxy (text chld2))
|
|
"\" />")))
|
|
(break)))
|
|
(set! chld2 (next chld2 0)))))
|
|
(set! chld1 (next chld1 0)))
|
|
(append! lst (cdr lst1))))
|
|
(set! chld (next chld 0)))
|
|
(define final-xpath-null?
|
|
(xpath-null?
|
|
"/html/body/div[3]/div/article/header/section/div/footer/p[2]" d))
|
|
(append
|
|
(append
|
|
(list
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/article/header/section/h1/text()" d #f)
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/article/header//img/@src" d #t)
|
|
(if (xpath-null?
|
|
"/html/body/div[3]/div/article/header/section/div/h5" d)
|
|
""
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/h5" d))
|
|
(if final-xpath-null? ""
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/footer/p[1]" d))
|
|
(if final-xpath-null?
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/footer/p" d)
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/div/article/header/section/div/footer/p[2]" d)))
|
|
(cdr lst))
|
|
(list lst2)))
|
|
|
|
(define (get-meme-list d)
|
|
(define table
|
|
(get-xpath-node
|
|
"//div[@id=\"entries_list\"]/table/tbody"
|
|
d))
|
|
(define lst (list ""))
|
|
(define lst1 (list ""))
|
|
(define chld (child table))
|
|
(define chld1 %null-pointer)
|
|
(define chld2 %null-pointer)
|
|
(define chld3 %null-pointer)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
((equal? (name chld) "tr")
|
|
(set! chld1 (child1 chld 0))
|
|
(while (not (null-pointer? chld1))
|
|
(set! lst1 (list ""))
|
|
(cond ((and (equal? (name chld1) "td")
|
|
(not (null-pointer? (child1 chld1 0)))
|
|
(not (equal? (name (child1 chld1 0)) "div")))
|
|
(set! chld2 (attrs (child1 chld1 0)))
|
|
(while (not (null-pointer? chld2))
|
|
(cond
|
|
((equal? (name chld2) "href")
|
|
(append!
|
|
lst1
|
|
(list (text chld2)))
|
|
(break)))
|
|
(set! chld2 (next chld2 0)))
|
|
(set! chld3 (attrs (child (child1 chld1 0))))
|
|
(while (not (null-pointer? chld3))
|
|
(cond
|
|
((equal? (name chld3) "data-src")
|
|
(append!
|
|
lst1
|
|
(list (text chld3)))
|
|
(break)))
|
|
(set! chld3 (next chld3 0)))
|
|
(append!
|
|
lst1
|
|
(list
|
|
(text (child1 (next (child1 chld1 0) 0) 0))))
|
|
(append! lst (list (cdr lst1)))))
|
|
(set! chld1 (next chld1 1)))))
|
|
(set! chld (next chld 0)))
|
|
(append
|
|
(list
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d)
|
|
(if (xpath-null? "/html/body/div[3]/div/div[1]/header/hgroup/p" d)
|
|
""
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
|
|
d #f))
|
|
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" d)
|
|
(if (xpath-null? "/html/body/div[3]/div/div[3]/section/div/p" d)
|
|
""
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[3]/section/div/p" d)))
|
|
(cdr lst)))
|
|
|
|
(define (get-category-list d)
|
|
(define categories
|
|
(get-xpath-node
|
|
"//div[@id=\"categories-list\"]"
|
|
d))
|
|
(define lst (list ""))
|
|
(define lst1 (list ""))
|
|
(define lst2 (list ""))
|
|
(define lst3 (list ""))
|
|
(define cat #f)
|
|
(define skip #t)
|
|
(define chld (child categories))
|
|
(define chld1 %null-pointer)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
((and (equal? (name chld) "a")
|
|
(not (null-pointer? (child chld)))
|
|
(not (equal? (name (child chld)) "i")))
|
|
(set! chld1 (attrs chld))
|
|
(while (not (null-pointer? chld1))
|
|
(cond
|
|
((equal? (name chld1) "data-category")
|
|
(set! cat #t)
|
|
(break))
|
|
((equal? (name chld1) "data-type")
|
|
(set! cat #f)
|
|
(break)))
|
|
(set! chld1 (next chld1 0)))
|
|
(set! lst1 (list ""))
|
|
(set! chld1 (attrs chld))
|
|
(append! lst1 (list (text (child1 chld 0))))
|
|
(while (not (null-pointer? chld1))
|
|
(cond
|
|
((equal? (name chld1) "href")
|
|
(append! lst1 (list (text chld1))))
|
|
((equal? (name chld1) "data-category")
|
|
(append! lst1 (list (text chld1))))
|
|
((equal? (name chld1) "data-type")
|
|
(append! lst1 (list (text chld1)))))
|
|
(set! chld1 (next chld1 0)))
|
|
(cond
|
|
(cat
|
|
(if skip
|
|
(set! skip #f)
|
|
(append! lst (list (list (cdr lst3) (cdr lst2)))))
|
|
(set! lst3 (list-copy lst1))
|
|
(set! lst1 (list ""))
|
|
(set! lst2 (list "")))
|
|
(else (append! lst2 (list (cdr lst1)))))))
|
|
(set! chld (next chld 0)))
|
|
(append! lst (list (list (cdr lst3) (cdr lst2))))
|
|
(append
|
|
(list (dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d))
|
|
(list (cdr lst))))
|
|
|
|
(define (get-photos d)
|
|
(define gallery (get-xpath-node "//*[@id=\"photo_gallery\"]" d))
|
|
(define lst (list ""))
|
|
(define pair '(() . ()))
|
|
(define chld (child1 gallery 0))
|
|
(define chld1 %null-pointer)
|
|
(define chld2 %null-pointer)
|
|
(define chld3 %null-pointer)
|
|
(define chld4 %null-pointer)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
((equal? (name chld) "div")
|
|
(set! chld1 (attrs chld))
|
|
(while (not (null-pointer? chld1))
|
|
(cond
|
|
((and (equal? (name chld1) "class")
|
|
(equal?
|
|
(text chld1)
|
|
"item"))
|
|
(set! chld2 (child1 chld 0))
|
|
(set! chld3 (attrs chld2))
|
|
(while (not (null-pointer? chld3))
|
|
(cond
|
|
((equal? (name chld3) "href")
|
|
(set-car! pair (text chld3))
|
|
(break)))
|
|
(set! chld3 (next chld3 0)))
|
|
(set! chld4 (attrs (child1 chld2 0)))
|
|
(while (not (null-pointer? chld4))
|
|
(cond
|
|
((equal? (name chld4) "data-src")
|
|
(set-cdr! pair (text chld4))
|
|
(break)))
|
|
(set! chld4 (next chld4 0)))
|
|
(append!
|
|
lst
|
|
(list
|
|
(cons (car pair)
|
|
(cdr pair))))))
|
|
(set! chld1 (next chld1 0)))))
|
|
(set! chld (next chld 1)))
|
|
(append
|
|
(list
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d)
|
|
(if (xpath-null? "/html/body/div[3]/div/div[1]/header/hgroup/p" d)
|
|
""
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
|
|
d #f))
|
|
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" d)
|
|
(if (xpath-null? "/html/body/div[3]/div/div[3]//p" d)
|
|
""
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[3]//p" d)))
|
|
(cdr lst)))
|
|
|
|
(define (get-photo d)
|
|
(list
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[2]/header/h1" d)
|
|
(get-xpath-string "/html/body/div[3]/div/div[2]/div[2]/a/img/@alt" d #t)
|
|
(proxy
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/div[2]/div[2]/a/img/@src"
|
|
d #t))
|
|
(dump-xpath-xml
|
|
"/html/body/div[3]/aside/div[2]/div[@class=\"row\"][4]/p/a" d)))
|
|
|
|
(define (get-videos d)
|
|
(define table
|
|
(get-xpath-node
|
|
"/html/body/div[3]/div/section/div/table/tbody"
|
|
d))
|
|
(define lst (list ""))
|
|
(define pair '(() . ()))
|
|
(define chld (child table))
|
|
(define chld1 %null-pointer)
|
|
(define chld2 %null-pointer)
|
|
(define chld3 %null-pointer)
|
|
(while (not (null-pointer? chld))
|
|
(cond
|
|
((equal? (name chld) "tr")
|
|
(set! chld1 (child chld))
|
|
(while (not (null-pointer? chld1))
|
|
(cond ((equal? (name chld1) "td")
|
|
(set! chld2 (attrs (child1 chld1 1)))
|
|
(while (not (null-pointer? chld2))
|
|
(cond
|
|
((equal? (name chld2) "href")
|
|
(set-car! pair (text chld2))
|
|
(break)))
|
|
(set! chld2 (next chld2 0)))
|
|
(set! chld3 (attrs (child1 chld1 2)))
|
|
(while (not (null-pointer? chld3))
|
|
(cond
|
|
((equal? (name chld3) "data-tiktok-url")
|
|
(set-cdr! pair
|
|
(get-tiktok-thumbnail
|
|
(text chld2)))
|
|
(append! lst (list (cons (car pair)
|
|
(cdr pair))))
|
|
(break))
|
|
((equal? (name chld3) "data-src")
|
|
(set-cdr! pair (text chld3))
|
|
(append! lst (list (cons (car pair)
|
|
(cdr pair))))
|
|
(break)))
|
|
(set! chld3 (next chld3 0)))))
|
|
(set! chld1 (next chld1 0)))))
|
|
(set! chld (next chld 0)))
|
|
(append
|
|
(list
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d)
|
|
(if (xpath-null? "/html/body/div[3]/div/div[1]/header/hgroup/p" d)
|
|
""
|
|
(get-xpath-string
|
|
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
|
|
d #f))
|
|
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" d)
|
|
(dump-xpath-xml "/html/body/div[3]/div/div[3]/section/div/p" d))
|
|
(cdr lst)))
|
|
|
|
(define (get-sidebar-gallery d)
|
|
(define gallery (list ""))
|
|
(define lst (list ""))
|
|
(define xpath-start "")
|
|
(do ((i 1 (1+ i)))
|
|
((> i 2) (cdr gallery))
|
|
(set! lst (list
|
|
(get-xpath-string
|
|
(string-append
|
|
"//*[@id=\"popular_galleries\"]["
|
|
(number->string i)
|
|
"]/h3/text()")
|
|
d #f)))
|
|
(do ((j 1 (+ j 1)))
|
|
((> j 2))
|
|
(do ((k 1 (+ k 1)))
|
|
((> k 2))
|
|
(set! xpath-start (string-append
|
|
"//*[@id=\"popular_galleries\"][" (number->string i)
|
|
"]/table/tbody/tr[" (number->string j) "]/td["
|
|
(number->string k) "]"))
|
|
(append!
|
|
lst
|
|
(list
|
|
(get-xpath-string (string-append xpath-start "/a/@href") d #t)
|
|
(get-xpath-string (string-append xpath-start "/a/img/@alt") d #t)
|
|
(get-xpath-string (string-append xpath-start "/a/img/@data-src") d
|
|
#t)
|
|
(get-xpath-string (string-append xpath-start "/h4/a/text()") d
|
|
#f)))))
|
|
(append!
|
|
gallery
|
|
(list lst))))
|
|
|
|
(define (get-sidebar-trending d)
|
|
(define lst '())
|
|
(cond
|
|
((null-pointer? (get-xpath-node "//*[@id=\"trending_photos\"]" d)) "")
|
|
(else
|
|
(set!
|
|
lst
|
|
(list
|
|
(get-xpath-string "//*[@id=\"trending_photos\"]/h3/a/@href" d #t)
|
|
(get-xpath-string "//*[@id=\"trending_photos\"]/h3/a/text()" d #f)))
|
|
(do ((i 1 (1+ i)))
|
|
((> i 3))
|
|
(do ((j 1 (1+ j)))
|
|
((> j 3))
|
|
(append!
|
|
lst
|
|
(list
|
|
(get-xpath-string
|
|
(string-append
|
|
"//*[@id=\"trending_photos\"]/table/tbody/tr["
|
|
(number->string i)
|
|
"]/td[" (number->string j) "]/a/@href")
|
|
d #t)
|
|
(get-xpath-string
|
|
(string-append
|
|
"//*[@id=\"trending_photos\"]/table/tbody/tr["
|
|
(number->string i)
|
|
"]/td[" (number->string j) "]/a/img/@data-src")
|
|
d #t)))))
|
|
lst)))
|
|
|
|
(define (get-pagination d)
|
|
(if (null-pointer? (get-xpath-node "//*[@class=\"pagination\"]" d)) ""
|
|
(dump-xpath-xml "//*[@class=\"pagination\"]" d)))
|