User:WildBot/wildBot.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
WildBot:
Links to members of Category:All disambiguation pages (169,187 entries) and
Category:Redirects from incomplete disambiguations (3,215 entries)
will be considered ambiguous links. Links to articles or redirects whose
titles contain "(disambiguation)" will not be considered ambiguous.
The bot will operate off a cached copy of this list, updated periodically
via an API call to categorymembers to retrieve new additions, and periodic
checks against its watchlist (containing all known disambiguation pages;
assuming there's no technical limitation with a watchlist having 172,402
pages) to check for removals. If I'm granted a toolserver account,
maintaining this list might be better done via SQL queries.
Periodically (I propose every minute) the bot queries the API for pages
created since the last query in namespaces 0 (mainspace), 6 (file),
10 (template) and 14 (category).
New redirects are excluded.
New disambiguation pages are excluded.
Each new page will be checked for ambiguous links. If a page has
ambiguous links, a message will be left on its talk page.
Affected pages will be monitored,
and the template changed or removed as the article changes.
"""
import re
import sys, traceback, threading
import wikipedia
import dab_template_placer, article_queue, watchlist_monitor, category_filter, haltpage_excluder
import codecs
__metaclass__ = type
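# ConfigFileProducer: reads page titles from a local text file and pushes the
# corresponding Page objects onto the queue; used by the commented-out
# 'revist' thread in MAIN to replay a stored list of pages through the bot.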
class ConfigFileProducer( threading.Thread ):
    def __init__(self, shutdown, queue, site=None):
        self.shutdown = shutdown
        self.queue = queue
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        self.source_file = 'disambiguations/sample.txt'
        threading.Thread.__init__(self)
    def run(self):
        try:
            f = codecs.open(self.source_file, 'r', 'utf-8')
            try:
                for logtext in f:
                    print '>>%s<<' % logtext.strip()
                    page = wikipedia.Page(self.site, logtext.strip())
                    if page:
                        self.queue.add_page(page)
            finally:
                f.close()
        except:
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise
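# NewPagesProducer: polls for newly created pages in the given namespace(s)
# and feeds them to the queue.  It uses the API (list=recentchanges,
# rctype=new) when available, otherwise it scrapes Special:Newpages.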
class NewPagesProducer( threading.Thread ):
    def __init__(self, shutdown, queue, namespace=0, site=None):
        self.shutdown = shutdown
        self.queue = queue
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        self.namespace = namespace
        self.number_to_fetch = 50
        threading.Thread.__init__(self)
    def newpages(self, get_redirect = False):
        """
        Yield new articles (as Page objects) from Special:Newpages.

        Starts with the newest articles, fetches a batch, then fetches
        again. If there are no new pages it blocks, sleeping between
        subsequent fetches, until one appears.
        """
        seen = set()
        try:
            # Probe whether this site exposes the API at all.
            d = self.site.apipath()
            del d
        except NotImplementedError:
            wikipedia.config.use_api = False
        while not self.shutdown.isSet():
            if wikipedia.config.use_api and self.site.versionnumber() >= 10:
                params = {
                    'action': 'query',
                    'list': 'recentchanges',
                    'rctype': 'new',
                    'rcnamespace': self.namespace,
                    'rclimit': int(self.number_to_fetch),
                    'rcprop': ['ids', 'title', 'ns'],
                    'rcshow': ['!bot', '!redirect'],
                }
                # After the initial batch, only small incremental fetches are needed.
                self.number_to_fetch = 7
                data = wikipedia.query.GetData(params, self.site)['query']['recentchanges']
                for np in data:
                    if np['pageid'] not in seen:
                        seen.add(np['pageid'])
                        page = wikipedia.Page(self.site, np['title'], defaultNamespace=np['ns'])
                        yield page
            else:
                path = self.site.newpages_address(n=self.number_to_fetch,
                                                  namespace=self.namespace)
                # The throttling is important here, so always enabled.
                wikipedia.get_throttle()
                html = self.site.getUrl(path)
                entryR = re.compile('<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
                    ' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
                    ' .?<a href=".+?" title=".+?:(?P<username>.+?)">')
                for m in entryR.finditer(html):
                    date = m.group('date')
                    title = m.group('title')
                    title = title.replace('&quot;', '"')
                    length = int(re.sub("[,.]", "", m.group('length')))
                    loggedIn = u''
                    username = m.group('username')
                    comment = u''
                    if title not in seen:
                        seen.add(title)
                        page = wikipedia.Page(self.site, title)
                        yield page
            self.shutdown.wait(30)
    def run(self):
        try:
            # Feed every new page into the queue for the downstream filters.
            for page in self.newpages(get_redirect = False):
                wikipedia.output(u'New Page: %s' % page.title())
                self.queue.add_page(page)
        except:
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise
# MAIN
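# Overview of the thread wiring set up below:
#   NewPagesProducer -> new_pages -> CategoryFilter (drops dab pages)
#       -> nondab_pages -> NewPageChecker (places talk-page messages)
#   WatchlistProducer -> changed_pages_1 -> HaltpageFilter (monitors the halt
#       page) -> changed_pages_2 -> TalkCleaner (updates/removes the template)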
if __name__ == "__main__":
    # Shutdown event
    shutdown = threading.Event()
    # communications queues for new articles
    new_pages = article_queue.ArticleQueue()
    nondab_pages = article_queue.ArticleQueue()
    # communications queues for changed pages
    changed_pages_1 = article_queue.ArticleQueue()
    changed_pages_2 = article_queue.ArticleQueue()
    try:
        print str(sys.stdout.encoding)
        for arg in wikipedia.handleArgs():
            wikipedia.output(u'Warning: argument "%s" not understood; ignoring.' % arg)
        # WildBot Task 1
        # start message-placing and message-updating threads
        # message placer stores some statistics
        # message updater co-operates with watchlister to ensure only new changes are acted on
        TalkPageDabMsger = dab_template_placer.NewPageChecker(shutdown, nondab_pages)
        TalkPageDabMsger.start()
        # start thread to monitor stop page(s) and stop the world if they change
        halt_pages = []
        halt_pages.append('User:WildBot/Halt')
        halt_checker = haltpage_excluder.HaltpageFilter(
            shutdown, changed_pages_1, changed_pages_2, halt_pages)
        halt_checker.start()
        # start thread to remove dab pages from the new_pages queue
        dab_cats = []
        dab_cats.append('Category:All disambiguation pages')
        dab_page_remover = category_filter.CategoryFilter(
            shutdown, new_pages, nondab_pages, dab_cats)
        dab_page_remover.start()
        # start the thread that finds new pages (namespaces 0, 6, 10 and 14)
        newpages_watcher = NewPagesProducer(shutdown, new_pages, [0, 6, 10, 14])
        newpages_watcher.start()
        # start checking for changes to clear off the template
        TalkPageCleaner = dab_template_placer.TalkCleaner(shutdown, changed_pages_2)
        TalkPageCleaner.start()
        # start watchlist thread
        changes_watcher = watchlist_monitor.WatchlistProducer(shutdown, changed_pages_1)
        changes_watcher.start()
        # revist = ConfigFileProducer(shutdown, new_pages)
        # revist.start()
    except:
        shutdown.set()
        # Unblock any threads waiting on the queues, then shut down cleanly.
        new_pages.add_page(None)
        changed_pages_1.add_page(None)
        changed_pages_2.add_page(None)
        wikipedia.stopme()
        raise