#!/usr/bin/env python
"""A lightweight alternative frontend for snopes.com.

Pages are fetched server-side, parsed with BeautifulSoup, and re-rendered
with local Jinja templates; images are routed through the /proxy/ endpoint
so clients never contact snopes.com directly.
"""
from flask import Flask, render_template, request, redirect, Response
import requests
import re
from bs4 import BeautifulSoup
from copy import copy


def secondary_list(path):
    """Scrape a secondary listing page (category, rating, tag, author, search,
    collections). Returns the pieces list.html expects, or the upstream HTTP
    status code on failure."""
    data = requests.get("https://www.snopes.com/" + path)
    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")
        # Tags
        tags = []
        for tag in soup.select(".tag_wrapper"):
            tag.a["href"] = tag.a["href"].replace("https://www.snopes.com", '')
            tag_link = tag.a["href"]
            tag_name = tag.text
            tags.append([tag_link, tag_name])
        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        if article_type.find("svg") is not None:
            article_type.find("svg").decompose()
        head = article_type.find("span")
        if head is not None:
            head = head.text
        else:
            head = article_type.text
        description = soup.find(attrs={"class": ["desc_cont_wrap"]}).text
        # Article list
        article_list = []
        for div in soup.select(".article_wrapper"):
            div.a["href"] = div.a["href"].replace("https://www.snopes.com", '')
            link = div.a["href"]
            image = "/proxy/?url=" + div.select("img")[0].get("data-src")
            alt = div.select("img")[0].get("alt")
            if div.select(".article_text"):
                text = div.find(attrs={"class": ["article_text"]})
                title = text.select("h3")[0].text
                author = ''
                if text.select(".article_author"):
                    author = text.find(attrs={"class": ["article_author"]}).text
                date = text.find(attrs={"class": ["article_date"]}).text
                byline = text.find(attrs={"class": ["article_byline"]}).text
                article_list.append([link, image, alt, title, author, date, byline])
        # Next/previous buttons (absent on unpaginated pages, so default all
        # three to empty strings)
        prev_button = next_button = pagenumber = ''
        if soup.select("#both-buttons"):
            buttons = soup.select("#both-buttons")[0].select("a")
            prev_button = buttons[0]
            next_button = buttons[1]
            pagenumber = soup.find(attrs={"class": ["pagenumber"]}).text
        return [article_type, description, article_list, prev_button, next_button, pagenumber, tags, head]
    else:
        return data.status_code


def article_list(path):
    """Scrape a primary listing page (/fact-check/, /latest/, /top/, /news/,
    /articles/, /classics/). Returns the same shape as secondary_list()."""
    data = requests.get("https://www.snopes.com/" + path)
    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")
        # Tags
        tags = []
        for tag in soup.select(".tag_wrapper"):
            tag.a["href"] = tag.a["href"].replace("https://www.snopes.com", '')
            tag_link = tag.a["href"]
            tag_name = tag.text
            tags.append([tag_link, tag_name])
        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        article_type.find("svg").decompose()
        head = article_type.find("span")
        if head is not None:
            head = head.text
        else:
            head = article_type.text
        description = soup.find(attrs={"class": ["page_desc_wrapper"]}).text
        # Article list
        article_list = []
        for div in soup.select(".article_wrapper"):
            div.a["href"] = div.a["href"].replace("https://www.snopes.com", '')
            link = div.a["href"]
            image = "/proxy/?url=" + div.select("img")[0].get("data-src")
            alt = div.select("img")[0].get("alt")
            if div.select(".article_text"):
                text = div.find(attrs={"class": ["article_text"]})
                title = text.select("h3")[0].text
                author = text.find(attrs={"class": ["article_author"]}).text
                date = text.find(attrs={"class": ["article_date"]}).text
                byline = text.find(attrs={"class": ["article_byline"]}).text
                article_list.append([link, image, alt, title, author, date, byline])
        # Next/previous buttons
        buttons = soup.select("#both-buttons")[0].select("a")
        prev_button = buttons[0]
        next_button = buttons[1]
        pagenumber = soup.find(attrs={"class": ["pagenumber"]}).text
        return [article_type, description, article_list, prev_button, next_button, pagenumber, tags, head]
    else:
        return data.status_code


def news(path):
    """Scrape a news or article page; returns the data news.html expects, or
    the upstream HTTP status code on failure."""
    data = requests.get("https://www.snopes.com/" + path)
    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")
        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        article_type.find("svg").decompose()
        title_container = soup.find(attrs={"class": ["title-container"]})
        title = title_container.select("h1")[0].text
        subtitle = title_container.select("h2")[0].text
        author_container = title_container.find(attrs={"class": ["author_info_wrapper"]})
        author_button = author_container.select("h3 a")[0]
        author_link = author_button.get('href')
        author = author_button.text
        date = author_container.find(attrs={"class": ["published_date"]}).text
        # Cover image
        cover = soup.find(attrs={"id": ["cover-main"]}).get("src")
        cover_alt = soup.find(attrs={"id": ["cover-main"]}).get("alt")
        cover_desc = ""
        if soup.find(attrs={"class": ["article_img_desc"]}).select("span"):
            cover_desc = soup.find(attrs={"class": ["article_img_desc"]}).select("span")[0].text
        # Article body: drop embedded divs and scripts, rewrite images and
        # image links through the local proxy, and make Snopes links relative
        article = soup.find(attrs={"id": ["article-content"]})
        for div in article.select("div"):
            div.decompose()
        for script in article.select("script"):
            script.decompose()
        for a in article.select("p a:has(img)"):
            a["href"] = re.sub(r"^(.*)$", r"/proxy/?url=\1", a["href"])
        article = re.sub(r"(src=\")", r"\1/proxy/?url=", str(article))
        article = article.replace("https://www.snopes.com", '')
        # Author bio
        author_bio = soup.select("div.author_bio p")[0].text
        return [article_type, title, subtitle, author.strip(), author_link, date, cover, cover_alt, cover_desc, article, author_bio]
    else:
        return data.status_code


def fact_check(path):
    """Scrape a fact-check page, including its rating box; returns the data
    fact-check.html expects, or the upstream HTTP status code on failure."""
    data = requests.get("https://www.snopes.com/fact-check/" + path)
    if data.status_code == 200:
        soup = BeautifulSoup(data.text, "html.parser")
        # Header
        article_type = soup.find(attrs={"class": ["section_title"]})
        article_type.find("svg").decompose()
        article_type = article_type.text
        title_container = soup.find(attrs={"class": ["title-container"]})
        title = title_container.select("h1")[0].text
        subtitle = title_container.select("h2")[0].text
        author_container = title_container.find(attrs={"class": ["author_info_wrapper"]})
        author_button = author_container.select("h3 a")[0]
        author_link = author_button.get('href')
        author = author_button.text
        date = author_container.find(attrs={"class": ["published_date"]}).text
        # Cover image
        cover = soup.find(attrs={"id": ["cover-main"]}).get("src")
        cover_alt = soup.find(attrs={"id": ["cover-main"]}).get("alt")
        cover_desc = ""
        if soup.find(attrs={"class": ["article_img_desc"]}).select("span"):
            cover_desc = soup.find(attrs={"class": ["article_img_desc"]}).select("span")[0].text
        # Article body: copy the rating box first so it survives the cleanup
        article = soup.find(attrs={"id": ["article-content"]})
        rating = copy(article.find(attrs={"id": ["fact_check_rating_container"]}))
        for div in article.select("div.snopesad"):
            div.decompose()
        for div in article.select("div"):
            del div["style"]
        for script in article.select("script"):
            script.decompose()
        if article.find("section") is not None:
            article.find("section").decompose()
        for a in article.select("p a:has(img)"):
            a["href"] = re.sub(r"^(.*)$", r"/proxy/?url=\1", a["href"])
        article = re.sub(r"(src=\")", r"\1/proxy/?url=", str(article))
        article = article.replace("https://www.snopes.com", '')
        if rating is not None:
            rating.find("svg").decompose()
            rating.find("img").decompose()
            rating = str(rating).replace("https://www.snopes.com", '')
        else:
            rating = ''
        # Author bio
        author_bio = soup.select("div.author_bio p")[0].text
        return [article_type, title, subtitle, author.strip(), author_link, date, cover, cover_alt, cover_desc, rating, article, author_bio]
    else:
        return data.status_code


app = Flask(__name__, template_folder="templates", static_folder="static")


@app.route('/fact-check/')
def route_fact_check_list():
    if request.args.get('pagenum') is not None:
        data = article_list(f"fact-check/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("fact-check/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/fact-check/category/<category>/')
def route_fact_check_category(category):
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"fact-check/category/{category}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"fact-check/category/{category}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/fact-check/rating/<rating>/')
def route_fact_check_rating(rating):
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"fact-check/rating/{rating}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"fact-check/rating/{rating}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/latest/')
def route_latest():
    if request.args.get('pagenum') is not None:
        data = article_list(f"latest/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("latest/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/top/')
def route_top():
    if request.args.get('pagenum') is not None:
        data = article_list(f"top/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("top/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/news/')
def route_news_list():
    if request.args.get('pagenum') is not None:
        data = article_list(f"news/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("news/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/news/category/<category>/')
def route_news_category(category):
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"news/category/{category}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"news/category/{category}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/articles/')
def route_articles():
    if request.args.get('pagenum') is not None:
        data = article_list(f"articles/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("articles/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/classics/')
def route_classics():
    if request.args.get('pagenum') is not None:
        data = article_list(f"classics/?pagenum={request.args.get('pagenum')}")
    else:
        data = article_list("classics/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)
@app.route('/news/<year>/<month>/<day>/<article>/')
def route_news(year, month, day, article):
    data = news(f"news/{year}/{month}/{day}/{article}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("news.html", data=data)
@app.route('/articles/<article_id>/<article>/')
def route_article(article_id, article):
    data = news(f"articles/{article_id}/{article}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("news.html", data=data)
@app.route('/fact-check/<article>/')
def route_fact_check(article):
    data = fact_check(article)
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("fact-check.html", data=data)


@app.route('/author/<author>/')
def route_author(author):
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"author/{author}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"author/{author}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/tag/<tag>/')
def route_tag(tag):
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"tag/{tag}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"tag/{tag}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/collections/')
def route_collections():
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"collections/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list("collections/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/collections/<collection>/')
def route_collection(collection):
    data = requests.get(f"https://www.snopes.com/collections/{collection}/")
    if data.status_code != 200:
        return Response(render_template(str(data.status_code) + ".html"), data.status_code)
    soup = BeautifulSoup(data.text, "html.parser")
    # Header
    article_type = soup.find(attrs={"class": ["section_title"]})
    article_type.find("svg").decompose()
    article_type = article_type.text
    title_container = soup.find(attrs={"class": ["title-container"]})
    title = title_container.select("h1")[0].text
    subtitle = title_container.select("h2")[0].text
    author_container = title_container.find(attrs={"class": ["author_info_wrapper"]})
    author_button = author_container.select("h3 a")[0]
    author_link = author_button.get('href')
    author = author_button.text
    date = author_container.find(attrs={"class": ["published_date"]}).text
    # Cover image
    cover = soup.find(attrs={"id": ["cover-main"]}).get("src")
    cover_alt = soup.find(attrs={"id": ["cover-main"]}).get("alt")
    cover_desc = ""
    if soup.find(attrs={"class": ["article_img_desc"]}).select("span"):
        cover_desc = soup.find(attrs={"class": ["article_img_desc"]}).select("span")[0].text
    # Article body: keep a copy so the collection's article cards survive the
    # div stripping below
    article = soup.find(attrs={"id": ["article-content"]})
    article_copy = copy(article)
    for div in article.select("div"):
        div.decompose()
    for script in article.select("script"):
        script.decompose()
    for a in article.select("p a:has(img)"):
        a["href"] = re.sub(r"^(.*)$", r"/proxy/?url=\1", a["href"])
    article = re.sub(r"(src=\")", r"\1/proxy/?url=", str(article))
    # Author bio
    author_bio = soup.select("div.author_bio p")[0].text
    # Article list (item_title/item_date so the loop does not clobber the
    # page-level title and date rendered below)
    article_list = []
    for div in article_copy.select(".article_wrapper"):
        div.a["href"] = div.a["href"].replace("https://www.snopes.com", '')
        link = div.a["href"]
        image = "/proxy/?url=" + div.select("img")[0].get("data-src")
        alt = div.select("img")[0].get("alt")
        if div.select(".article_text"):
            text = div.find(attrs={"class": ["article_text"]})
            item_title = text.select("h3")[0].text
            item_date = text.find(attrs={"class": ["article_date"]}).text
            byline = text.find(attrs={"class": ["article_byline"]}).text
            article_list.append([link, image, alt, item_title, item_date, byline])
    return render_template("collection.html", data=[article_type, title, subtitle, author_link, author.strip(), date, cover, cover_alt, cover_desc, article, author_bio, article_list])
@app.route('/search/')
def route_search_blank():
    if request.args.get('q') is None:
        if request.args.get('pagenum') is not None:
            data = secondary_list(f"search/?pagenum={request.args.get('pagenum')}")
        else:
            data = secondary_list("search/")
        if isinstance(data, int):
            return Response(render_template(str(data) + ".html"), data)
        return render_template("list.html", data=data)
    else:
        return redirect(f"/search/{request.args.get('q')}/")


@app.route('/search/<query>/')
def route_search(query):
    if request.args.get('pagenum') is not None:
        data = secondary_list(f"search/{query}/?pagenum={request.args.get('pagenum')}")
    else:
        data = secondary_list(f"search/{query}/")
    if isinstance(data, int):
        return Response(render_template(str(data) + ".html"), data)
    return render_template("list.html", data=data)


@app.route('/random/')
def route_random():
    # Let Snopes pick the random article, then bounce the client to the
    # local equivalent of wherever it redirected.
    data = requests.get("https://www.snopes.com/random/")
    if data.status_code != 200:
        return Response(render_template(str(data.status_code) + ".html"), data.status_code)
    return redirect(data.url.replace("https://www.snopes.com", ''), 307)


@app.route('/sitemap/')
def route_sitemap():
    data = requests.get("https://www.snopes.com/sitemap/")
    if data.status_code != 200:
        return Response(render_template(str(data.status_code) + ".html"), data.status_code)
    soup = BeautifulSoup(data.text, "html.parser")
    title = soup.find("h2").text
    archives = []
    for archive_box in soup.select(".archive_box"):
        archive_type = archive_box.find(attrs={"class": ["section_title"]})
        if archive_type.find("svg") is not None:
            archive_type.find("svg").decompose()
        archive_sections = []
        for archive_section_item in archive_box.select(".archive_section_item"):
            archive_section_item.a["href"] = archive_section_item.a["href"].replace("https://www.snopes.com", '')
            archive_sections.append(archive_section_item.a)
        archives.append([archive_type, archive_sections])
    return render_template("sitemap.html", data=[title, archives])


@app.route('/')
def route_home():
    return render_template("index.html")


@app.route('/proxy/')
def route_proxy():
    # Only fetch Snopes-owned hosts or site-relative paths, so this endpoint
    # cannot be abused as an open proxy.
    url = request.args.get("url")
    if url is not None:
        if url.startswith(("https://mediaproxy.snopes.com/", "https://media.snopes.com/", "https://www.snopes.com/")):
            data = requests.get(url)
            return Response(data.content, content_type=data.headers["content-type"])
        elif url.startswith("/") and not url.startswith("//"):
            data = requests.get("https://www.snopes.com" + url)
            return Response(data.content, content_type=data.headers["content-type"])
        else:
            return Response(render_template("400.html"), status=400)
    else:
        return Response(render_template("400.html"), status=400)


@app.before_request
def add_slash():
    # Canonicalise every URL to its trailing-slash form, preserving any
    # query string (a bare path redirect would otherwise drop ?q=... etc.).
    if not request.path.endswith('/'):
        query = request.query_string.decode()
        return redirect(request.path + '/' + (f"?{query}" if query else ''))


@app.errorhandler(404)
def not_found(e):
    return render_template("404.html"), 404


if __name__ == '__main__':
    app.run(port=8001)
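# A minimal smoke test, assuming the server is saved as app.py and the
# templates/ directory from this repo is present (the fact-check slug below
# is a made-up placeholder; substitute any real snopes.com slug):
#
#   $ python app.py
#   $ curl http://localhost:8001/
#   $ curl http://localhost:8001/fact-check/example-claim/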