#!/usr/bin/env python

from flask import Flask, render_template, request, redirect, Response, stream_with_context

import requests
import re

from bs4 import BeautifulSoup
from copy import copy


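# Scrapes a secondary listing page (category, rating, author, tag, search,
# collections index) and returns the pieces the list.html template expects.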
def secondary_list(path):
    data = requests.get("https://www.snopes.com/" + path)

    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")

        # Tags
        tags = []
        for tag in soup.select(".tag_wrapper"):
            tag.a["href"] = tag.a["href"].replace("https://www.snopes.com", '')
            tag_link = tag.a["href"]
            tag_name = tag.text
            tags.append([tag_link, tag_name])

        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        if article_type.find("svg") != None:
            article_type.find("svg").decompose()
        head = article_type.find("span")
        if head != None:
            head = head.text
        else:
            head = article_type.text
        article_type_head = soup.find(attrs={"class": ["img_title_wrap"]})
        description = soup.find(attrs={"class": ["desc_cont_wrap"]}).text

        # Article list
        article_list = []
        for div in soup.select(".article_wrapper"):
            div.a["href"] = div.a["href"].replace("https://www.snopes.com", '')
            link = div.a["href"]
            image = "/proxy/?url=" + div.select("img")[0].get("data-src")
            alt = div.select("img")[0].get("alt")
            if len(list(div.select(".article_text"))) != 0:
                text = div.find(attrs={"class": ["article_text"]})
                title = text.select("h3")[0].text
                author = ''
                if text.select(".article_author") != []:
                    author = text.find(attrs={"class": ["article_author"]}).text
                date = text.find(attrs={"class": ["article_date"]}).text
                byline = text.find(attrs={"class": ["article_byline"]}).text
                article_list.append([link, image, alt, title, author, date, byline])

        # Next/previous buttons (default to empty strings on pages without pagination)
        prev_button = next_button = pagenumber = ''
        if soup.select("#both-buttons") != []:
            buttons = soup.select("#both-buttons")[0].select("a")
            prev_button = buttons[0]
            next_button = buttons[1]
            pagenumber = soup.find(attrs={"class": ["pagenumber"]}).text

        return [article_type, description, article_list, prev_button, next_button, pagenumber, tags, head]

    else:
        return data.status_code


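# Scrapes a primary listing page (/fact-check/, /latest/, /top/, /news/,
# /articles/, /classics/) and returns the same structure as secondary_list().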
def article_list(path):
    data = requests.get("https://www.snopes.com/" + path)

    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")

        # Tags
        tags = []
        for tag in soup.select(".tag_wrapper"):
            tag.a["href"] = tag.a["href"].replace("https://www.snopes.com", '')
            tag_link = tag.a["href"]
            tag_name = tag.text
            tags.append([tag_link, tag_name])

        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        article_type.find("svg").decompose()
        head = article_type.find("span")
        if head != None:
            head = head.text
        else:
            head = article_type.text
        article_type_head = soup.find(attrs={"class": ["img_title_wrap"]})
        description = soup.find(attrs={"class": ["page_desc_wrapper"]}).text

        # Article list
        article_list = []
        for div in soup.select(".article_wrapper"):
            div.a["href"] = div.a["href"].replace("https://www.snopes.com", '')
            link = div.a["href"]
            image = "/proxy/?url=" + div.select("img")[0].get("data-src")
            alt = div.select("img")[0].get("alt")
            if len(list(div.select(".article_text"))) != 0:
                text = div.find(attrs={"class": ["article_text"]})
                title = text.select("h3")[0].text
                author = text.find(attrs={"class": ["article_author"]}).text
                date = text.find(attrs={"class": ["article_date"]}).text
                byline = text.find(attrs={"class": ["article_byline"]}).text
                article_list.append([link, image, alt, title, author, date, byline])

        # Next/previous buttons
        buttons = soup.select("#both-buttons")[0].select("a")
        prev_button = buttons[0]
        next_button = buttons[1]
        pagenumber = soup.find(attrs={"class": ["pagenumber"]}).text

        return [article_type, description, article_list, prev_button, next_button, pagenumber, tags, head]

    else:
        return data.status_code


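# Scrapes a single news or general article page: header, cover image, body
# (with asset URLs rewritten through /proxy/), and author bio.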
def news(path):
    data = requests.get("https://www.snopes.com/" + path)

    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")

        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        article_type.find("svg").decompose()
        title_container = soup.find(attrs={"class": ["title-container"]})
        title = title_container.select("h1")[0].text
        subtitle = title_container.select("h2")[0].text
        author_container = title_container.find(attrs={"class": ["author_info_wrapper"]})
        author_button = author_container.select("h3 a")[0]
        author_link = author_button.get('href')
        author = author_button.text
        date = author_container.find(attrs={"class": ["published_date"]}).text

        # Cover image
        cover = soup.find(attrs={"id": ["cover-main"]}).get("src")
        cover_alt = soup.find(attrs={"id": ["cover-main"]}).get("alt")
        cover_desc = ""
        if soup.find(attrs={"class": ["article_img_desc"]}).select("span") != []:
            cover_desc = soup.find(attrs={"class": ["article_img_desc"]}).select("span")[0].text

        # Article
        article = soup.find(attrs={"id": ["article-content"]})

        for div in article.select("div"): div.decompose()
        for script in article.select("script"): script.decompose()

        for a in article.select("p a:has(img)"):
            a["href"] = re.sub(r"^(.*)$", r"/proxy/?url=\1", a["href"])
        article = re.sub(r"(src=\")", r"\1/proxy/?url=", str(article))

        # Author bio
        author_bio = soup.select("div.author_bio p")[0].text

        return [article_type, title, subtitle, author.strip(), author_link, date, cover, cover_alt, cover_desc, article, author_bio]

    else:
        return data.status_code


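# Scrapes a fact-check page; same as news(), plus the rating block, which is
# copied out before the body's <div> elements are stripped.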
def fact_check(path):
    data = requests.get("https://www.snopes.com/fact-check/" + path)

    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")

        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        article_type.find("svg").decompose()
        article_type = article_type.text
        title_container = soup.find(attrs={"class": ["title-container"]})
        title = title_container.select("h1")[0].text
        subtitle = title_container.select("h2")[0].text
        author_container = title_container.find(attrs={"class": ["author_info_wrapper"]})
        author_button = author_container.select("h3 a")[0]
        author_link = author_button.get('href')
        author = author_button.text
        date = author_container.find(attrs={"class": ["published_date"]}).text

        # Cover image
        cover = soup.find(attrs={"id": ["cover-main"]}).get("src")
        cover_alt = soup.find(attrs={"id": ["cover-main"]}).get("alt")
        cover_desc = ""
        if soup.find(attrs={"class": ["article_img_desc"]}).select("span") != []:
            cover_desc = soup.find(attrs={"class": ["article_img_desc"]}).select("span")[0].text

        # Article
        article = soup.find(attrs={"id": ["article-content"]})
        rating = copy(article.find(attrs={"id": ["fact_check_rating_container"]}))

        for div in article.select("div"): div.decompose()
        for script in article.select("script"): script.decompose()
        if article.find("section") != None:
            article.find("section").decompose()

        for a in article.select("p a:has(img)"):
            a["href"] = re.sub(r"^(.*)$", r"/proxy/?url=\1", a["href"])
        article = re.sub(r"(src=\")", r"\1/proxy/?url=", str(article))

        if rating != None:
            rating.find("svg").decompose()
            rating.find("img").decompose()
        else:
            rating = ''

        # Author bio
        author_bio = soup.select("div.author_bio p")[0].text

        return [article_type, title, subtitle, author.strip(), author_link, date, cover, cover_alt, cover_desc, rating, article, author_bio]

    else:
        return data.status_code


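# Flask application and routes. Listing routes share the scrapers above and
# all accept an optional ?pagenum= query parameter for pagination.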
app = Flask(__name__, template_folder="templates", static_folder="static")


@app.route('/fact-check/')
def route_fact_check_list():
    if request.args.get('pagenum') != None:
        data = article_list(f"fact-check/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("fact-check/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/fact-check/category/<category>/')
def route_fact_check_category(category):
    if request.args.get('pagenum') != None:
        data = secondary_list(f"fact-check/category/{category}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"fact-check/category/{category}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/fact-check/rating/<rating>/')
def route_fact_check_rating(rating):
    if request.args.get('pagenum') != None:
        data = secondary_list(f"fact-check/rating/{rating}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"fact-check/rating/{rating}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/latest/')
def route_latest():
    if request.args.get('pagenum') != None:
        data = article_list(f"latest/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("latest/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/top/')
def route_top():
    if request.args.get('pagenum') != None:
        data = article_list(f"top/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("top/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/news/')
def route_news_list():
    if request.args.get('pagenum') != None:
        data = article_list(f"news/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("news/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/news/category/<category>/')
def route_news_category(category):
    if request.args.get('pagenum') != None:
        data = secondary_list(f"news/category/{category}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"news/category/{category}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/articles/')
def route_articles():
    if request.args.get('pagenum') != None:
        data = article_list(f"articles/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("articles/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/classics/')
def route_classics():
    if request.args.get('pagenum') != None:
        data = article_list(f"classics/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("classics/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


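# Individual article pages: news posts, general articles, fact checks, plus
# author/tag/collections listings.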
@app.route('/news/<year>/<month>/<day>/<article>/')
def route_news(year, month, day, article):
    data = news(f"news/{year}/{month}/{day}/{article}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("news.html", data=data)


@app.route('/articles/<article_id>/<article>/')
def route_article(article_id, article):
    data = news(f"articles/{article_id}/{article}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("news.html", data=data)


@app.route('/fact-check/<article>/')
def route_fact_check(article):
    data = fact_check(article)
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("fact-check.html", data=data)


@app.route('/author/<author>/')
def route_author(author):
    if request.args.get('pagenum') != None:
        data = secondary_list(f"author/{author}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"author/{author}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/tag/<tag>/')
def route_tag(tag):
    if request.args.get('pagenum') != None:
        data = secondary_list(f"tag/{tag}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"tag/{tag}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/collections/')
def route_collections():
    if request.args.get('pagenum') != None:
        data = secondary_list(f"collections/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list("collections/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


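# A collection page is an article plus an embedded list of its member
# articles, so it is scraped inline here rather than through the helpers above.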
@app.route('/collections/<collection>/')
def route_collection(collection):
    data = requests.get(f"https://www.snopes.com/collections/{collection}/")
    print(data)
    if data.status_code != 200:
        return Response(render_template(str(data.status_code) + ".html"), data.status_code)
    soup = BeautifulSoup(data.text, "html.parser")

    # Header
    article_type = soup.find(attrs={"class": ["section_title"]})
    article_type.find("svg").decompose()
    article_type = article_type.text
    title_container = soup.find(attrs={"class": ["title-container"]})
    title = title_container.select("h1")[0].text
    subtitle = title_container.select("h2")[0].text
    author_container = title_container.find(attrs={"class": ["author_info_wrapper"]})
    author_button = author_container.select("h3 a")[0]
    author_link = author_button.get('href')
    author = author_button.text
    date = author_container.find(attrs={"class": ["published_date"]}).text

    # Cover image
    cover = soup.find(attrs={"id": ["cover-main"]}).get("src")
    cover_alt = soup.find(attrs={"id": ["cover-main"]}).get("alt")
    cover_desc = ""
    if soup.find(attrs={"class": ["article_img_desc"]}).select("span") != []:
        cover_desc = soup.find(attrs={"class": ["article_img_desc"]}).select("span")[0].text

    # Article
    article = soup.find(attrs={"id": ["article-content"]})
    article_copy = copy(article)

    for div in article.select("div"): div.decompose()
    for script in article.select("script"): script.decompose()

    for a in article.select("p a:has(img)"):
        a["href"] = re.sub(r"^(.*)$", r"/proxy/?url=\1", a["href"])
    article = re.sub(r"(src=\")", r"\1/proxy/?url=", str(article))

    # Author bio
    author_bio = soup.select("div.author_bio p")[0].text

    # Article list (distinct names so the collection's own title/date above are not overwritten)
    article_list = []
    for div in article_copy.select(".article_wrapper"):
        div.a["href"] = div.a["href"].replace("https://www.snopes.com", '')
        link = div.a["href"]
        image = "/proxy/?url=" + div.select("img")[0].get("data-src")
        alt = div.select("img")[0].get("alt")
        if len(list(div.select(".article_text"))) != 0:
            text = div.find(attrs={"class": ["article_text"]})
            item_title = text.select("h3")[0].text
            item_date = text.find(attrs={"class": ["article_date"]}).text
            item_byline = text.find(attrs={"class": ["article_byline"]}).text
            article_list.append([link, image, alt, item_title, item_date, item_byline])

    return render_template("collection.html", data=[article_type, title, subtitle, author_link, str(author).strip(), date, cover, cover_alt, cover_desc, article, author_bio, article_list])


@app.route('/search/')
def route_search_blank():
    if request.args.get('q') == None:
        if request.args.get('pagenum') != None:
            data = secondary_list(f"search/?pagenum={request.args.get('pagenum')}")
        else:
            data = secondary_list("search/")
        if type(data) == int:
            return Response(render_template(str(data) + ".html"), data)
        return render_template("list.html", data=data)
    else:
        return redirect(f"/search/{request.args.get('q')}/")


@app.route('/search/<query>/')
def route_search(query):
    if request.args.get('pagenum') != None:
        data = secondary_list(f"search/{query}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"search/{query}/")
    if type(data) == int:
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/random/')
def route_random():
    data = requests.get("https://www.snopes.com/random/")
    if data.status_code != 200:
        return Response(render_template(str(data.status_code) + ".html"), data.status_code)
    return redirect(data.url.replace("https://www.snopes.com", ''), 307)


@app.route('/sitemap/')
def route_sitemap():
    data = requests.get("https://www.snopes.com/sitemap/")
    if data.status_code != 200:
        return Response(render_template(str(data.status_code) + ".html"), data.status_code)

    soup = BeautifulSoup(data.text, "html.parser")
    title = soup.find("h2").text
    archives = []
    for archive_box in soup.select(".archive_box"):
        archive_type = archive_box.find(attrs={"class": ["section_title"]})
        if archive_type.find("svg") != None:
            archive_type.find("svg").decompose()
        archive_sections = []
        for archive_section_item in archive_box.select(".archive_section_item"):
            archive_section_item.a["href"] = archive_section_item.a["href"].replace("https://www.snopes.com", '')
            archive_sections.append(archive_section_item.a)
        archives.append([archive_type, archive_sections])

    return render_template("sitemap.html", data=[title, archives])


@app.route('/')
def route_home():
    return render_template("index.html")


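# Media proxy: fetches images from Snopes' own hosts (or site-relative paths)
# and serves them from this app; anything else gets a 400.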
@app.route('/proxy/')
def route_proxy():
    url = request.args.get("url")
    if url != None:
        if url.startswith("https://mediaproxy.snopes.com/") or url.startswith("https://media.snopes.com/") or url.startswith("https://www.snopes.com/"):
            data = requests.get(url)
            return Response(data.content, content_type=data.headers["content-type"])
        elif url.startswith("/") and not url.startswith("//"):
            data = requests.get("https://www.snopes.com" + url)
            return Response(data.content, content_type=data.headers["content-type"])
        else:
            return Response(render_template("400.html"), status=400)
    else:
        return Response(render_template("400.html"), status=400)


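# Request hooks and error handlers.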
@app.before_request
def add_slash():
    if not request.path.endswith('/'):
        return redirect(request.path + '/')


@app.errorhandler(404)
def not_found(e):
    # Return the error page with an explicit 404 status code.
    return render_template("404.html"), 404


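# Local development entry point.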
if __name__ == '__main__':
    app.run(port=8001)