MeMe/meme/scraping.scm

529 lines
20 KiB
Scheme

;; Copyright (C) 2024 Skylar Widulski <cobra@vern.cc>
;;
;; This file is part of MeMe
;;
;; MeMe is free software: you can redistribute it and/or modify it under the
;; terms of the GNU Affero General Public License as published by the Free
;; Software Foundation, either version 3 of the License, or (at your option) any
;; later version.
;;
;; This program is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
;; for more details.
;;
;; You should have received a copy of the GNU Affero General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
(define-module (meme scraping)
#:use-module (libxml2)
#:use-module (system foreign)
#:use-module (ice-9 string-fun)
#:export (proxy
get-leaderboard
get-trending
get-articles
get-meme
get-meme-list
get-category-list
get-photos
get-photo
get-videos
get-sidebar-gallery
get-sidebar-trending
get-pagination))
(define (proxy url)
(string-append "/proxy?url=" url))
(define (get-leaderboard d)
(define lst (list ""))
(define pref "")
(do ((i 1 (1+ i)))
((> i 5))
(set! pref (string-append "//div[@id=\"omgkym\"]/section[1]/ul/li["
(number->string i)
"]/article"))
(append!
lst
(list
(list
(get-xpath-string (string-append pref "/@id") d #t)
(get-xpath-string (string-append pref "/a/@href") d #t)
(get-xpath-string (string-append pref "/a/img/@src") d #t)
(get-xpath-string (string-append pref "/a/div/div/text()") d #f)))))
(cdr lst))
(define (get-trending d)
(define trending (list ""))
(define pref "")
(do ((i 1 (1+ i)))
((= i
(xml-child-element-count
(get-xpath-node "//*[@id=\"trending-bar\"]" d) 0))
(cdr trending))
(set! pref (string-append "//*[@id=\"trending-bar\"]/a["
(number->string i)
"]"))
(append!
trending
(list
(list
(get-xpath-string (string-append pref "/@href") d #t)
(get-xpath-string (string-append pref "/text()") d #f))))))
(define (get-articles d)
(define pref "")
(define articles (list ""))
(do ((i 1 (1+ i)))
((xpath-null?
(string-append "/html/body/div[3]/div/div[3]/article["
(number->string i)
"]") d)
(cdr articles))
(set! pref (string-append "/html/body/div[3]/div/div[3]/article["
(number->string i)
"]"))
(append!
articles
(list
(if (not (xpath-null? (string-append pref "/div/section") d))
(list
(get-xpath-string (string-append pref "/div/section/h1/a/@href")
d #t)
(get-xpath-string (string-append pref "/div/section/h1/a/text()")
d #f)
(get-xpath-string (string-append pref "//img/@data-src") d #t)
(if
(equal?
(xml-child-element-count
(get-xpath-node (string-append pref "/div/section/div[2]") d)
0) 0)
""
(get-xpath-string
(string-append pref "/div/section/div[2]/p/text()") d #f))
(get-xpath-string
(string-append pref "/div/section/p[1]/em/text()") d #f))
(list
(get-xpath-string (string-append pref "/h1/a/@href") d #t)
(get-xpath-string (string-append pref "/h1/a/text()") d #f)
(get-xpath-string (string-append pref "//img/@data-src") d #t)
(get-xpath-string (string-append pref "/div[2]/p/text()") d #f)
(get-xpath-string (string-append pref "/div[3]/em/text()")
d #f)))))))
(define (get-meme d)
(define bodycopy
(get-xpath-node
"/html/body/div[3]/div/article//section[@class=\"bodycopy\"]"
d))
(define chld (child1 bodycopy 0))
(define chld1 %null-pointer)
(define chld2 %null-pointer)
(define lst (list ""))
(define lst1 (list ""))
(define lst2 (list ""))
(define skip #f)
(while (not (null-pointer? chld))
(cond
(skip
(set! skip #f)))
(cond
((equal? (name chld) "h2")
(cond
((equal?
(text chld)
"Search Interest")
(set! skip #t)
(set! chld (next chld 0))
(continue)))
(append!
lst
(list (dump-xml chld))))
((or
(equal? (name chld) "blockquote")
(equal? (name chld) "h3")
(equal? (name chld) "h4")
(equal? (name chld) "h5")
(equal? (name chld) "h6"))
(append! lst (list (dump-xml chld))))
((equal? (name chld) "p")
(if (and
(not (equal? (dump-xml (child chld)) "<br/>"))
(not (equal? (dump-xml chld)
"<p><em>Unavailable</em>.</p>")))
(append!
lst
(list (dump-xml chld)))))
((equal? (name chld) "div")
(if (and (not (null-pointer? (child chld)))
(equal? (name (child1 chld 0)) "p"))
(append!
lst
(list (dump-xml chld))))
(cond
((and (equal? (name (attrs chld)) "class")
(equal? (text (attrs chld)) "collection-item"))
(append!
lst2
(list
(list
(dump-xml (child1 chld 0))
(text (attrs (child (next (child1 chld 0) 1))))
(text
(next
(attrs (child (child (next (child1 chld 0) 1))))
6))
(text (attrs (child1 (next (child1 chld 0) 3) 0)))))))))
((equal? (name chld) "center")
(set! chld1 (child1 chld 0))
(set! lst1 (list ""))
(while (not (null-pointer? chld1))
(cond
((and (equal? (name chld1) "a")
(equal? (name (child chld1)) "img"))
(set! chld2 (attrs (child chld1)))
(while (not (null-pointer? chld2))
(cond
((equal? (name chld2) "data-src")
(append!
lst1
(list
(string-append
"<img src=\""
(proxy (text chld2))
"\" />")))
(break)))
(set! chld2 (next chld2 0)))))
(set! chld1 (next chld1 0)))
(append! lst (cdr lst1))))
(set! chld (next chld 0)))
(define final-xpath-null?
(xpath-null?
"/html/body/div[3]/div/article/header/section/div/footer/p[2]" d))
(append
(append
(list
(get-xpath-string
"/html/body/div[3]/div/article/header/section/h1/text()" d #f)
(get-xpath-string
"/html/body/div[3]/div/article/header//img/@src" d #t)
(if (xpath-null?
"/html/body/div[3]/div/article/header/section/div/h5" d)
""
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/h5" d))
(if final-xpath-null? ""
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/footer/p[1]" d))
(if final-xpath-null?
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/footer/p" d)
(dump-xpath-xml
"/html/body/div[3]/div/article/header/section/div/footer/p[2]" d)))
(cdr lst))
(list lst2)))
(define (get-meme-list d)
(define table
(get-xpath-node
"//div[@id=\"entries_list\"]/table/tbody"
d))
(define lst (list ""))
(define lst1 (list ""))
(define chld (child table))
(define chld1 %null-pointer)
(define chld2 %null-pointer)
(define chld3 %null-pointer)
(while (not (null-pointer? chld))
(cond
((equal? (name chld) "tr")
(set! chld1 (child1 chld 0))
(while (not (null-pointer? chld1))
(set! lst1 (list ""))
(cond ((and (equal? (name chld1) "td")
(not (null-pointer? (child1 chld1 0)))
(not (equal? (name (child1 chld1 0)) "div")))
(set! chld2 (attrs (child1 chld1 0)))
(while (not (null-pointer? chld2))
(cond
((equal? (name chld2) "href")
(append!
lst1
(list (text chld2)))
(break)))
(set! chld2 (next chld2 0)))
(set! chld3 (attrs (child (child1 chld1 0))))
(while (not (null-pointer? chld3))
(cond
((equal? (name chld3) "data-src")
(append!
lst1
(list (text chld3)))
(break)))
(set! chld3 (next chld3 0)))
(append!
lst1
(list
(text (child1 (next (child1 chld1 0) 0) 0))))
(append! lst (list (cdr lst1)))))
(set! chld1 (next chld1 1)))))
(set! chld (next chld 0)))
(append
(list
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d)
(if (xpath-null? "/html/body/div[3]/div/div[1]/header/hgroup/p" d)
""
(get-xpath-string
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
d #f))
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" d)
(if (xpath-null? "/html/body/div[3]/div/div[3]/section/div/p" d)
""
(dump-xpath-xml "/html/body/div[3]/div/div[3]/section/div/p" d)))
(cdr lst)))
(define (get-category-list d)
(define categories
(get-xpath-node
"//div[@id=\"categories-list\"]"
d))
(define lst (list ""))
(define lst1 (list ""))
(define lst2 (list ""))
(define lst3 (list ""))
(define cat #f)
(define skip #t)
(define chld (child categories))
(define chld1 %null-pointer)
(while (not (null-pointer? chld))
(cond
((and (equal? (name chld) "a")
(not (null-pointer? (child chld)))
(not (equal? (name (child chld)) "i")))
(set! chld1 (attrs chld))
(while (not (null-pointer? chld1))
(cond
((equal? (name chld1) "data-category")
(set! cat #t)
(break))
((equal? (name chld1) "data-type")
(set! cat #f)
(break)))
(set! chld1 (next chld1 0)))
(set! lst1 (list ""))
(set! chld1 (attrs chld))
(append! lst1 (list (text (child1 chld 0))))
(while (not (null-pointer? chld1))
(cond
((equal? (name chld1) "href")
(append! lst1 (list (text chld1))))
((equal? (name chld1) "data-category")
(append! lst1 (list (text chld1))))
((equal? (name chld1) "data-type")
(append! lst1 (list (text chld1)))))
(set! chld1 (next chld1 0)))
(cond
(cat
(if skip
(set! skip #f)
(append! lst (list (list (cdr lst3) (cdr lst2)))))
(set! lst3 (list-copy lst1))
(set! lst1 (list ""))
(set! lst2 (list "")))
(else (append! lst2 (list (cdr lst1)))))))
(set! chld (next chld 0)))
(append! lst (list (list (cdr lst3) (cdr lst2))))
(append
(list (dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d))
(list (cdr lst))))
(define (get-photos d)
(define gallery (get-xpath-node "//*[@id=\"photo_gallery\"]" d))
(define lst (list ""))
(define pair '(() . ()))
(define chld (child1 gallery 0))
(define chld1 %null-pointer)
(define chld2 %null-pointer)
(define chld3 %null-pointer)
(define chld4 %null-pointer)
(while (not (null-pointer? chld))
(cond
((equal? (name chld) "div")
(set! chld1 (attrs chld))
(while (not (null-pointer? chld1))
(cond
((and (equal? (name chld1) "class")
(equal?
(text chld1)
"item"))
(set! chld2 (child1 chld 0))
(set! chld3 (attrs chld2))
(while (not (null-pointer? chld3))
(cond
((equal? (name chld3) "href")
(set-car! pair (text chld3))
(break)))
(set! chld3 (next chld3 0)))
(set! chld4 (attrs (child1 chld2 0)))
(while (not (null-pointer? chld4))
(cond
((equal? (name chld4) "data-src")
(set-cdr! pair (text chld4))
(break)))
(set! chld4 (next chld4 0)))
(append!
lst
(list
(cons (car pair)
(cdr pair))))))
(set! chld1 (next chld1 0)))))
(set! chld (next chld 1)))
(append
(list
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d)
(if (xpath-null? "/html/body/div[3]/div/div[1]/header/hgroup/p" d)
""
(get-xpath-string
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
d #f))
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" d)
(if (xpath-null? "/html/body/div[3]/div/div[3]//p" d)
""
(dump-xpath-xml "/html/body/div[3]/div/div[3]//p" d)))
(cdr lst)))
(define (get-photo d)
(list
(dump-xpath-xml "/html/body/div[3]/div/div[2]/header/h1" d)
(get-xpath-string "/html/body/div[3]/div/div[2]/div[2]/a/img/@alt" d #t)
(proxy
(get-xpath-string
"/html/body/div[3]/div/div[2]/div[2]/a/img/@src"
d #t))
(dump-xpath-xml
"/html/body/div[3]/aside/div[2]/div[@class=\"row\"][4]/p/a" d)))
(define (get-videos d)
(define table
(get-xpath-node
"/html/body/div[3]/div/section/div/table/tbody"
d))
(define lst (list ""))
(define pair '(() . ()))
(define chld (child table))
(define chld1 %null-pointer)
(define chld2 %null-pointer)
(define chld3 %null-pointer)
(while (not (null-pointer? chld))
(cond
((equal? (name chld) "tr")
(set! chld1 (child chld))
(while (not (null-pointer? chld1))
(cond ((equal? (name chld1) "td")
(set! chld2 (attrs (child1 chld1 1)))
(while (not (null-pointer? chld2))
(cond
((equal? (name chld2) "href")
(set-car! pair (text chld2))
(break)))
(set! chld2 (next chld2 0)))
(set! chld3 (attrs (child1 chld1 2)))
(while (not (null-pointer? chld3))
(cond
((equal? (name chld3) "data-tiktok-url")
(set-cdr! pair
(get-tiktok-thumbnail
(text chld2)))
(append! lst (list (cons (car pair)
(cdr pair))))
(break))
((equal? (name chld3) "data-src")
(set-cdr! pair (text chld3))
(append! lst (list (cons (car pair)
(cdr pair))))
(break)))
(set! chld3 (next chld3 0)))))
(set! chld1 (next chld1 0)))))
(set! chld (next chld 0)))
(append
(list
(dump-xpath-xml "/html/body/div[3]/div/div[1]/header/hgroup/h1" d)
(if (xpath-null? "/html/body/div[3]/div/div[1]/header/hgroup/p" d)
""
(get-xpath-string
"/html/body/div[3]/div/div[1]/header/hgroup/p/text()"
d #f))
(dump-xpath-xml "/html/body/div[3]/div/nav/ul" d)
(dump-xpath-xml "/html/body/div[3]/div/div[3]/section/div/p" d))
(cdr lst)))
(define (get-sidebar-gallery d)
(define gallery (list ""))
(define lst (list ""))
(define xpath-start "")
(do ((i 1 (1+ i)))
((> i 2) (cdr gallery))
(set! lst (list
(get-xpath-string
(string-append
"//*[@id=\"popular_galleries\"]["
(number->string i)
"]/h3/text()")
d #f)))
(do ((j 1 (+ j 1)))
((> j 2))
(do ((k 1 (+ k 1)))
((> k 2))
(set! xpath-start (string-append
"//*[@id=\"popular_galleries\"][" (number->string i)
"]/table/tbody/tr[" (number->string j) "]/td["
(number->string k) "]"))
(append!
lst
(list
(get-xpath-string (string-append xpath-start "/a/@href") d #t)
(get-xpath-string (string-append xpath-start "/a/img/@alt") d #t)
(get-xpath-string (string-append xpath-start "/a/img/@data-src") d
#t)
(get-xpath-string (string-append xpath-start "/h4/a/text()") d
#f)))))
(append!
gallery
(list lst))))
(define (get-sidebar-trending d)
(define lst '())
(cond
((null-pointer? (get-xpath-node "//*[@id=\"trending_photos\"]" d)) "")
(else
(set!
lst
(list
(get-xpath-string "//*[@id=\"trending_photos\"]/h3/a/@href" d #t)
(get-xpath-string "//*[@id=\"trending_photos\"]/h3/a/text()" d #f)))
(do ((i 1 (1+ i)))
((> i 3))
(do ((j 1 (1+ j)))
((> j 3))
(append!
lst
(list
(get-xpath-string
(string-append
"//*[@id=\"trending_photos\"]/table/tbody/tr["
(number->string i)
"]/td[" (number->string j) "]/a/@href")
d #t)
(get-xpath-string
(string-append
"//*[@id=\"trending_photos\"]/table/tbody/tr["
(number->string i)
"]/td[" (number->string j) "]/a/img/@data-src")
d #t)))))
lst)))
(define (get-pagination d)
(if (null-pointer? (get-xpath-node "//*[@class=\"pagination\"]" d)) ""
(dump-xpath-xml "//*[@class=\"pagination\"]" d)))