User:WildBot/wildBot.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
WildBot:
Links to members of Category:All disambiguation pages (169,187 entries) and
Category:Redirects from incomplete disambiguations (3,215 entries)
will be considered ambiguous links. Links to articles or redirects whose
titles contain "(disambiguation)" will not be considered ambiguous.
The bot will operate off a cached copy of this list, updated periodically
via an API call to categorymembers to retrieve new additions, and periodic
checks against its watchlist (containing all known disambiguation pages;
assuming there's no technical limitation with a watchlist having 172,402
pages) to check for removals. If I'm granted a toolserver account,
maintaining this list might be better done via SQL queries.
Periodically (I propose every minute) the bot queries the API for pages
created since the last query in namespaces 0 (mainspace), 6 (file),
10 (template) and 14 (category).
New redirects are excluded.
New disambiguation pages are excluded.
Each new page will be checked for ambiguous links. If a page has
ambiguous links, a message will be left on its talk page.
Affected pages will be monitored,
and the template changed or removed as the article changes.
"""
import re
import sys, traceback, threading
import wikipedia
import dab_template_placer, article_queue, watchlist_monitor, category_filter, haltpage_excluder
import codecs
__metaclass__ = type
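# ConfigFileProducer: reads page titles from a local text file and pushes the
# corresponding Page objects onto the queue; used by the commented-out
# 'revist' thread in MAIN to replay a stored list of pages through the bot.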
class ConfigFileProducer( threading.Thread ):
    def __init__(self, shutdown, queue, site=None):
        self.shutdown = shutdown
        self.queue = queue
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        self.source_file = 'disambiguations/sample.txt'
        threading.Thread.__init__(self)
    def run(self):
        try:
            f = codecs.open(self.source_file, 'r', 'utf-8')
            try:
                for logtext in f:
                    print '>>%s<<' % logtext.strip()
                    page = wikipedia.Page(self.site, logtext.strip())
                    if page:
                        self.queue.add_page(page)
            finally:
                f.close()
        except:
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise
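# NewPagesProducer: polls for newly created pages in the given namespace(s)
# and feeds them to the queue.  It uses the API (list=recentchanges,
# rctype=new) when available, otherwise it scrapes Special:Newpages.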
class NewPagesProducer( threading.Thread ):
    def __init__(self, shutdown, queue, namespace=0, site=None):
        self.shutdown = shutdown
        self.queue = queue
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        self.namespace = namespace
        self.number_to_fetch = 50
        threading.Thread.__init__(self)
    def newpages(self, get_redirect = False):
        """
        Yield new articles (as Page objects) from Special:Newpages.

        Starts with the newest articles, fetches a batch, then fetches
        again. If there are no new pages it blocks, sleeping between
        subsequent fetches, until one appears.
        """
        seen = set()
        try:
            # Probe whether this site exposes the API at all.
            d = self.site.apipath()
            del d
        except NotImplementedError:
            wikipedia.config.use_api = False
        while not self.shutdown.isSet():
            if wikipedia.config.use_api and self.site.versionnumber() >= 10:
                params = {
                    'action': 'query',
                    'list': 'recentchanges',
                    'rctype': 'new',
                    'rcnamespace': self.namespace,
                    'rclimit': int(self.number_to_fetch),
                    'rcprop': ['ids', 'title', 'ns'],
                    'rcshow': ['!bot', '!redirect'],
                }
                # After the initial batch, only small incremental fetches are needed.
                self.number_to_fetch = 7
                data = wikipedia.query.GetData(params, self.site)['query']['recentchanges']
                for np in data:
                    if np['pageid'] not in seen:
                        seen.add(np['pageid'])
                        page = wikipedia.Page(self.site, np['title'], defaultNamespace=np['ns'])
                        yield page
            else:
                path = self.site.newpages_address(n=self.number_to_fetch,
                                                  namespace=self.namespace)
                # The throttling is important here, so always enabled.
                wikipedia.get_throttle()
                html = self.site.getUrl(path)
                entryR = re.compile('<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
                    ' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
                    ' .?<a href=".+?" title=".+?:(?P<username>.+?)">')
                for m in entryR.finditer(html):
                    date = m.group('date')
                    title = m.group('title')
                    title = title.replace('&quot;', '"')
                    length = int(re.sub("[,.]", "", m.group('length')))
                    loggedIn = u''
                    username = m.group('username')
                    comment = u''
                    if title not in seen:
                        seen.add(title)
                        page = wikipedia.Page(self.site, title)
                        yield page
            self.shutdown.wait(30)
    def run(self):
        try:
            # Feed every new page into the queue for the downstream filters.
            for page in self.newpages(get_redirect = False):
                wikipedia.output(u'New Page: %s' % page.title())
                self.queue.add_page(page)
        except:
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise
# MAIN
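# Overview of the thread wiring set up below:
#   NewPagesProducer -> new_pages -> CategoryFilter (drops dab pages)
#       -> nondab_pages -> NewPageChecker (places talk-page messages)
#   WatchlistProducer -> changed_pages_1 -> HaltpageFilter (monitors the halt
#       page) -> changed_pages_2 -> TalkCleaner (updates/removes the template)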
if __name__ == "__main__":
    # Shutdown event
    shutdown = threading.Event()
    # communications queues for new articles
    new_pages = article_queue.ArticleQueue()
    nondab_pages = article_queue.ArticleQueue()
    # communications queues for changed pages
    changed_pages_1 = article_queue.ArticleQueue()
    changed_pages_2 = article_queue.ArticleQueue()
    try:
        print str(sys.stdout.encoding)
        for arg in wikipedia.handleArgs():
            wikipedia.output(u'Warning: argument "%s" not understood; ignoring.' % arg)
        # WildBot Task 1
        # start message-placing and message-updating threads
        # message placer stores some statistics
        # message updater co-operates with watchlister to ensure only new changes are acted on
        TalkPageDabMsger = dab_template_placer.NewPageChecker(shutdown, nondab_pages)
        TalkPageDabMsger.start()
        # start thread to monitor stop page(s) and stop the world if they change
        halt_pages = []
        halt_pages.append('User:WildBot/Halt')
        halt_checker = haltpage_excluder.HaltpageFilter(
            shutdown, changed_pages_1, changed_pages_2, halt_pages)
        halt_checker.start()
        # start thread to remove dab pages from the new_pages queue
        dab_cats = []
        dab_cats.append('Category:All disambiguation pages')
        dab_page_remover = category_filter.CategoryFilter(
            shutdown, new_pages, nondab_pages, dab_cats)
        dab_page_remover.start()
        # start the thread that finds new pages (namespaces 0, 6, 10 and 14)
        newpages_watcher = NewPagesProducer(shutdown, new_pages, [0, 6, 10, 14])
        newpages_watcher.start()
        # start checking for changes to clear off the template
        TalkPageCleaner = dab_template_placer.TalkCleaner(shutdown, changed_pages_2)
        TalkPageCleaner.start()
        # start watchlist thread
        changes_watcher = watchlist_monitor.WatchlistProducer(shutdown, changed_pages_1)
        changes_watcher.start()
        # revist = ConfigFileProducer(shutdown, new_pages)
        # revist.start()
    except:
        shutdown.set()
        # Unblock any threads waiting on the queues, then shut down cleanly.
        new_pages.add_page(None)
        changed_pages_1.add_page(None)
        changed_pages_2.add_page(None)
        wikipedia.stopme()
        raise