User:IngenuityBot/getpagemetadata.py
# Requires Python 3.10+ (match statements and "X | Y" type unions).
import re
import requests
from html.parser import HTMLParser

# Domains the bot recognizes, mapped to the wikilinked name of each source.
urls = {
    "cbc.ca": "[[CBC News]]",
    "ctvnews.ca": "[[CTV News]]",
    "globalnews.ca": "[[Global News]]",
    "thestar.com": "[[Toronto Star]]",
    "washingtonpost.com": "[[The Washington Post]]",
    "nytimes.com": "[[The New York Times]]",
    "theglobeandmail.com": "[[The Globe and Mail]]",
    "nationalpost.com": "[[National Post]]",
    "apnews.com": "[[Associated Press]]",
    "reuters.com": "[[Reuters]]",
    "bbc.com": "[[BBC News]]",
    "theguardian.com": "[[The Guardian]]",
    "aljazeera.com": "[[Al Jazeera]]",
    "npr.org": "[[NPR]]",
    "nbcnews.com": "[[NBC News]]",
    "usatoday.com": "[[USA Today]]",
    "latimes.com": "[[Los Angeles Times]]",
    "wsj.com": "[[The Wall Street Journal]]",
    "politico.com": "[[Politico]]",
    "bloomberg.com": "[[Bloomberg News]]",
    "axios.com": "[[Axios (website)|Axios]]",
    "businessinsider.com": "[[Business Insider]]",
    "thehill.com": "[[The Hill (newspaper)|The Hill]]",
    "nypost.com": "[[New York Post]]",
    "chicagotribune.com": "[[Chicago Tribune]]",
    "vox.com": "[[Vox (website)|Vox]]",
    "slate.com": "[[Slate (magazine)|Slate]]",
    "theatlantic.com": "[[The Atlantic]]",
    "newyorker.com": "[[The New Yorker]]",
    "time.com": "[[Time (magazine)|Time]]",
    "smh.com.au": "[[The Sydney Morning Herald]]",
    "space.com": "[[Space.com]]",
    "rollingstone.com": "[[Rolling Stone]]",
    "nzherald.co.nz": "[[The New Zealand Herald]]",
    "news.com.au": "[[News.com.au]]",
    "nasa.gov": "[[NASA]]",
    "msnbc.com": "[[MSNBC]]",
    "thejc.com": "[[The Jewish Chronicle]]",
    "theconversation.com": "[[The Conversation (website)|The Conversation]]",
    "hollywoodreporter.com": "[[The Hollywood Reporter]]",
    "gizmodo.com": "[[Gizmodo]]",
    "thediplomat.com": "[[The Diplomat]]",
    "deadline.com": "[[Deadline Hollywood]]",
    "abcnews.go.com": "[[ABC News]]",
    "cnn.com": "[[CNN]]",
    "theverge.com": "[[The Verge]]",
    "theage.com.au": "[[The Age]]",
    "arstechnica.com": "[[Ars Technica]]",
    "avclub.com": "[[The A.V. Club]]",
    "buzzfeednews.com": "[[BuzzFeed News]]",
    "csmonitor.com": "[[The Christian Science Monitor]]",
    "cnet.com": "[[CNET]]",
    "telegraph.co.uk": "[[The Daily Telegraph]]",
    "ew.com": "[[Entertainment Weekly]]",
    "forbes.com": "[[Forbes]]",
    "ign.com": "[[IGN]]",
    "qz.com": "[[Quartz (publication)|Quartz]]",
    "scientificamerican.com": "[[Scientific American]]",
    "scmp.com": "[[South China Morning Post]]",
    "variety.com": "[[Variety (magazine)|Variety]]",
    "vogue.com": "[[Vogue (magazine)|Vogue]]",
    "wired.com": "[[Wired (magazine)|Wired]]"
}

class Parser(HTMLParser):
    """Collect every start tag on the page, plus its <meta> key/value pairs."""

    def __init__(self) -> None:
        super().__init__()
        self.elements = []
        self.metadata = {}

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        self.elements.append((tag, attrs))
        if tag == "meta":
            # A <meta> tag keeps its key in either "name" or "property"
            # (Open Graph tags such as og:title use "property").
            dataname, content = "", ""
            for attr in attrs:
                if attr[0] == "name" or attr[0] == "property":
                    dataname = attr[1]
                elif attr[0] == "content":
                    content = attr[1]
            if dataname and content:
                self.metadata[dataname] = content
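
# A minimal sketch of using the parser on its own; the HTML snippet below is
# illustrative, not taken from any real page:
#
#     parser = Parser()
#     parser.feed('<meta property="og:title" content="Example headline">')
#     parser.metadata  # {'og:title': 'Example headline'}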

def getarchiveurl(url):
    # Ask the Wayback Machine availability API for the snapshot closest to now.
    response = requests.get("https://archive.org/wayback/available?url=" + url).json()
    if "closest" in response["archived_snapshots"]:
        snapshot = response["archived_snapshots"]["closest"]
        return snapshot["url"], snapshot["timestamp"]
    else:
        return ("", "")

def getmetadatabysite(url, site, metadata, elements):
    # Build the generic result from Open Graph fields first. Keep it under a
    # separate name: rebinding "metadata" (as the original code did) would
    # shadow the raw <meta> dict that the per-site cases below still read.
    result = {
        "title": metadata["og:title"] if "og:title" in metadata else "",
        "date": metadata["article:published_time"] if "article:published_time" in metadata else "",
        "website": site
    }
    match site:
        case "[[The New York Times]]":
            result["date"] = metadata["article:published_time"] if "article:published_time" in metadata else ""
        case "[[CBC News]]":
            # CBC puts the publication date in a <time datetime="..."> element
            # rather than in a meta tag.
            for item in elements:
                if item[0] == "time":
                    for attr in item[1]:
                        if attr[0] == "datetime":
                            result["date"] = attr[1]
    return result
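
# The field names here mirror {{cite web}} parameters, so a successful call
# presumably produces something like (values illustrative):
#
#     {"title": "Example headline", "date": "2023-01-01T12:00:00Z",
#      "website": "[[CBC News]]"}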

def getpagemetadata(url):
    # Match the URL against the known domains. re.escape keeps the dots in
    # the domain literal, and the leading [./] stops a domain from matching
    # in the middle of an unrelated hostname.
    website = ""
    for item in urls:
        if re.findall(r"[./]" + re.escape(item), url):
            website = urls[item]
    if not website:
        return
    # Some sites refuse the default requests user agent, so send a browser's.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0"
    }
    content = requests.get(url, headers=headers).text
    parser = Parser()
    parser.feed(content)
    metadata = getmetadatabysite(url, website, parser.metadata, parser.elements)
    if not metadata:
        return
    archive_url, archive_date = getarchiveurl(url)
    if archive_url:
        metadata["archive-url"] = archive_url
        # Wayback timestamps are YYYYMMDDhhmmss; keep only YYYY-MM-DD.
        metadata["archive-date"] = archive_date[:4] + "-" + archive_date[4:6] + "-" + archive_date[6:8]
        metadata["url-status"] = "live"
    metadata["url"] = url
    return metadata
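
# A minimal usage sketch; the URL is illustrative, and the keys present in
# the result depend on what metadata the page exposes and on whether the
# Wayback Machine has a snapshot:
if __name__ == "__main__":
    print(getpagemetadata("https://www.cbc.ca/news/example-article"))
    # → {"title": ..., "date": ..., "website": "[[CBC News]]",
    #    "archive-url": ..., "archive-date": ..., "url-status": "live",
    #    "url": "https://www.cbc.ca/news/example-article"}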