User:IngenuityBot/fixpages.py
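"""Scan the wiki pages listed in pages.txt for bare-URL <ref> tags pointing at
one of the news domains listed below, fetch metadata for each URL, and append
suggested {{cite web}} replacements to results.txt."""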
import requests, re
from getpagemetadata import getpagemetadata
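# Domains the bot recognizes, mapped to wikilinked names of their publications.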
urls = {
"cbc.ca": "[[CBC News]]",
"ctvnews.ca": "[[CTV News]]",
"globalnews.ca": "[[Global News]]",
"thestar.com": "[[Toronto Star]]",
"washingtonpost.com": "[[The Washington Post]]",
"nytimes.com": "[[The New York Times]]",
"theglobeandmail.com": "[[The Globe and Mail]]",
"nationalpost.com": "[[National Post]]",
"apnews.com": "[[Associated Press]]",
"reuters.com": "[[Reuters]]",
"bbc.com": "[[BBC News]]",
"theguardian.com": "[[The Guardian]]",
"aljazeera.com": "[[Al Jazeera]]",
"npr.org": "[[NPR]]",
"nbcnews.com": "[[NBC News]]",
"usatoday.com": "[[USA Today]]",
"latimes.com": "[[Los Angeles Times]]",
"wsj.com": "[[The Wall Street Journal]]",
"politico.com": "[[Politico]]",
"bloomberg.com": "[[Bloomberg News]]",
"axios.com": "[[Axios (website)|Axios]]",
"businessinsider.com": "[[Business Insider]]",
"thehill.com": "[[The Hill (newspaper)|The Hill]]",
"nypost.com": "[[New York Post]]",
"chicagotribune.com": "[[Chicago Tribune]]",
"vox.com": "[[Vox (website)|Vox]]",
"slate.com": "[[Slate (magazine)|Slate]]",
"theatlantic.com": "[[The Atlantic]]",
"newyorker.com": "[[The New Yorker]]",
"time.com": "[[Time (magazine)|Time]]",
"smh.com.au": "[[The Sydney Morning Herald]]",
"space.com": "[[Space.com]]",
"rollingstone.com": "[[Rolling Stone]]",
"nzherald.co.nz": "[[The New Zealand Herald]]",
"news.com.au": "[[News.com.au]]",
"nasa.gov": "[[NASA]]",
"msnbc.com": "[[MSNBC]]",
"thejc.com": "[[The Jewish Chronicle]]",
"theconversation.com": "[[The Conversation (website)|The Conversation]]",
"hollywoodreporter.com": "[[The Hollywood Reporter]]",
"gizmodo.com": "[[Gizmodo]]",
"thediplomat.com": "[[The Diplomat]]",
"deadline.com": "[[Deadline Hollywood]]",
"abcnews.go.com": "[[ABC News]]",
"cnn.com": "[[CNN]]",
"theverge.com": "[[The Verge]]",
"theage.com.au": "[[The Age]]",
"afp.com": "[[Agence France-Presse]]",
"arstechica.com": "[[Ars Technica]]",
"theaustralian.com.au": "[[The Australian]]",
"avclub.com": "[[The A.V. Club]]",
"buzzfeednews.com": "[[BuzzFeed News]]",
"csmonitor.com": "[[The Christian Science Monitor]]",
"cnet.com": "[[CNET]]",
"telegraph.co.uk": "[[The Daily Telegraph]]",
"ew.com": "[[Entertainment Weekly]]",
"forbes.com": "[[Forbes]]",
"foxnews.com": "[[Fox News]]",
"ign.com": "[[IGN]]",
"qz.com": "[[Quartz (publication)|Quartz]]",
"scientificamerican.com": "[[Scientific American]]",
"scmp.com": "[[South China Morning Post]]",
"variety.com": "[[Variety (magazine)|Variety]]",
"vogue.com": "[[Vogue (magazine)|Vogue]]",
"vox.com": "[[Vox (website)|Vox]]",
"wired.com": "[[Wired (magazine)|Wired]]"
}
def get_wikipedia_content(titles):
    # Fetch the current wikitext of up to 50 pages per API request,
    # recursing on the remainder of the list.
    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles[:50])
    }
    response = requests.get(endpoint, params=params)
    data = response.json()
    pages = data["query"]["pages"]
    content_dict = {}
    for _, page_data in pages.items():
        # Missing or empty pages have no "revisions" entry; skip them.
        try:
            content_dict[page_data["title"]] = page_data["revisions"][0]["*"]
        except (KeyError, IndexError):
            pass
    if len(titles) > 50:
        content_dict.update(get_wikipedia_content(titles[50:]))
    return content_dict
def get_wikipedia_pages():
    with open("pages.txt", "r") as f:
        return f.read().split("\n")

def parse_time(timestamp):
    return timestamp.split("T")[0]
def metadata_to_wikitext(metadata):
    # A citation needs at least a title and a URL to be worth emitting.
    if not metadata.get("title") or not metadata.get("url"):
        return None
    # Pipes inside a template parameter must be escaped as {{!}}.
    metadata["title"] = metadata["title"].replace('|', '{{!}}')
    args = []
    for key, value in metadata.items():
        if value:
            args.append(f"|{key}={value}")
    argtext = " ".join(args)
    return "{{cite web " + argtext + "}}"
def main():
    pages = get_wikipedia_pages()
    print(f"Fetching content of {len(pages)} pages...")
    content = get_wikipedia_content(pages)
    print("Done.")
    # Match <ref>http://...</ref> (optionally named, bracketed, or tagged
    # with {{bare ...}}) and capture the bare URL.
    regex = re.compile(r"<ref(?:\s+name=\"?[^>]+\"?)?>\[?(http[^ <\]]+)]?(?: ?{{bare[^}]+?}})?</ref>")
    for page in pages:
        if page not in content:
            continue
        matches = regex.findall(content[page])
        if not matches:
            continue
        # Keep only URLs whose domain appears in the recognized list above.
        to_replace = []
        for match in matches:
            for domain in urls:
                if re.search(r"[./]" + re.escape(domain), match):
                    to_replace.append(match)
                    break
        if not to_replace:
            continue
        for bare_url in to_replace:
            metadata = getpagemetadata(bare_url)
            metadata["date"] = parse_time(metadata["date"]) if metadata.get("date") else None
            wikitext = metadata_to_wikitext(metadata)
            if not wikitext:
                continue
            print(f"Original URL: {bare_url}")
            print(f"\tReplaced with: {wikitext}\n")
            with open("results.txt", "a") as f:
                f.write(f"{bare_url}\n{wikitext}\n\n")
if __name__ == "__main__":
    main()