
User:IngenuityBot/fixpages.py

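# Scans a list of Wikipedia pages (read from pages.txt, one title per line)
# for bare-URL references to the news sites listed below, builds filled-in
# {{cite web}} templates for them, and appends each proposed replacement to
# results.txt for review.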
import requests, re
from getpagemetadata import getpagemetadata
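# getpagemetadata(url), imported from a separate helper module, is assumed to
# return a dict of cite-web parameters (at least "title" and "url", plus an
# ISO-format "date" when one is available), based on how it is used in main().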

urls = {
    "cbc.ca": "[[CBC News]]",
    "ctvnews.ca": "[[CTV News]]",
    "globalnews.ca": "[[Global News]]",
    "thestar.com": "[[Toronto Star]]",
    "washingtonpost.com": "[[The Washington Post]]",
    "nytimes.com": "[[The New York Times]]",
    "theglobeandmail.com": "[[The Globe and Mail]]",
    "nationalpost.com": "[[National Post]]",
    "apnews.com": "[[Associated Press]]",
    "reuters.com": "[[Reuters]]",
    "bbc.com": "[[BBC News]]",
    "theguardian.com": "[[The Guardian]]",
    "aljazeera.com": "[[Al Jazeera]]",
    "npr.org": "[[NPR]]",
    "nbcnews.com": "[[NBC News]]",
    "usatoday.com": "[[USA Today]]",
    "latimes.com": "[[Los Angeles Times]]",
    "wsj.com": "[[The Wall Street Journal]]",
    "politico.com": "[[Politico]]",
    "bloomberg.com": "[[Bloomberg News]]",
    "axios.com": "[[Axios (website)|Axios]]",
    "businessinsider.com": "[[Business Insider]]",
    "thehill.com": "[[The Hill (newspaper)|The Hill]]",
    "nypost.com": "[[New York Post]]",
    "chicagotribune.com": "[[Chicago Tribune]]",
    "vox.com": "[[Vox (website)|Vox]]",
    "slate.com": "[[Slate (magazine)|Slate]]",
    "theatlantic.com": "[[The Atlantic]]",
    "newyorker.com": "[[The New Yorker]]",
    "time.com": "[[Time (magazine)|Time]]",
    "smh.com.au": "[[The Sydney Morning Herald]]",
    "space.com": "[[Space.com]]",
    "rollingstone.com": "[[Rolling Stone]]",
    "nzherald.co.nz": "[[The New Zealand Herald]]",
    "news.com.au": "[[News.com.au]]",
    "nasa.gov": "[[NASA]]",
    "msnbc.com": "[[MSNBC]]",
    "thejc.com": "[[The Jewish Chronicle]]",
    "theconversation.com": "[[The Conversation (website)|The Conversation]]",
    "hollywoodreporter.com": "[[The Hollywood Reporter]]",
    "gizmodo.com": "[[Gizmodo]]",
    "thediplomat.com": "[[The Diplomat]]",
    "deadline.com": "[[Deadline Hollywood]]",
    "abcnews.go.com": "[[ABC News]]",
    "cnn.com": "[[CNN]]",
    "theverge.com": "[[The Verge]]",
    "theage.com.au": "[[The Age]]",
    "afp.com": "[[Agence France-Presse]]",
    "arstechica.com": "[[Ars Technica]]",
    "theaustralian.com.au": "[[The Australian]]",
    "avclub.com": "[[The A.V. Club]]",
    "buzzfeednews.com": "[[BuzzFeed News]]",
    "csmonitor.com": "[[The Christian Science Monitor]]",
    "cnet.com": "[[CNET]]",
    "telegraph.co.uk": "[[The Daily Telegraph]]",
    "ew.com": "[[Entertainment Weekly]]",
    "forbes.com": "[[Forbes]]",
    "foxnews.com": "[[Fox News]]",
    "ign.com": "[[IGN]]",
    "qz.com": "[[Quartz (publication)|Quartz]]",
    "scientificamerican.com": "[[Scientific American]]",
    "scmp.com": "[[South China Morning Post]]",
    "variety.com": "[[Variety (magazine)|Variety]]",
    "vogue.com": "[[Vogue (magazine)|Vogue]]",
    "vox.com": "[[Vox (website)|Vox]]",
    "wired.com": "[[Wired (magazine)|Wired]]"
}


def get_wikipedia_content(titles):
    # Fetch the current wikitext of up to 50 pages per request from the
    # MediaWiki API; 50 titles is the standard per-request limit.
    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles[:50])
    }
    response = requests.get(endpoint, params=params)
    data = response.json()
    pages = data["query"]["pages"]
    content_dict = {}
    for _, page_data in pages.items():
        try:
            content_dict[page_data["title"]] = page_data["revisions"][0]["*"]
        except (KeyError, IndexError):
            # Missing or inaccessible pages have no "revisions" entry; skip them.
            pass
    # Process any remaining titles in further batches of 50.
    if len(titles) > 50:
        content_dict.update(get_wikipedia_content(titles[50:]))
    return content_dict


def get_wikipedia_pages():
    # Read the page titles to process, one per line, ignoring blank lines.
    with open("pages.txt", "r") as f:
        return [line for line in f.read().split("\n") if line.strip()]


def parse_time(timestamp):
    # Reduce an ISO 8601 timestamp (e.g. "2023-05-01T12:00:00Z") to its date part.
    return timestamp.split("T")[0]


def metadata_to_wikitext(metadata):
    # A citation needs at least a title and a URL to be worth generating.
    if not metadata.get("title") or not metadata.get("url"):
        return None

    # Escape pipes in the title so they don't break the template syntax.
    metadata["title"] = metadata["title"].replace('|', '{{!}}')

    args = []

    # Include only the parameters that actually have a value.
    for key, value in metadata.items():
        if value:
            args.append(f"|{key}={value}")

    argtext = " ".join(args)
    return "{{cite web " + argtext + "}}"


def main():
    pages = get_wikipedia_pages()
    print(f"Fetching content of {len(pages)} pages...")
    content = get_wikipedia_content(pages)
    print("Done.")

    # Match bare-URL references such as <ref>http://...</ref> or
    # <ref name="...">[http://...]</ref>, optionally followed by a
    # {{bare URL ...}} maintenance tag; the URL itself is captured.
    regex = re.compile(r"<ref(?:\s+name=\"?[^>]+\"?)?>\[?(http[^ <\]]+)]?(?: ?{{bare[^}]+?}})?</ref>")

    for page in pages:
        if page not in content:
            continue

        matches = re.findall(regex, content[page])

        if not matches:
            continue

        # Keep only the URLs that belong to one of the recognised news domains.
        to_replace = []

        for match in matches:
            for url in urls:
                if re.search(r"[./]" + re.escape(url), match):
                    to_replace.append(match)
                    break

        if not to_replace:
            continue

        for bare_url in to_replace:
            metadata = getpagemetadata(bare_url)
            # Trim the timestamp down to a plain date when one was found.
            metadata["date"] = parse_time(metadata["date"]) if metadata.get("date") else None
            wikitext = metadata_to_wikitext(metadata)
            if not wikitext:
                continue
            print(f"Original URL: {bare_url}")
            print(f"\tReplaced with: {wikitext}\n")

            # Log each proposed replacement for later review.
            with open("results.txt", "a") as f:
                f.write(f"{bare_url}\n{wikitext}\n\n")


if __name__ == "__main__":
    main()