User:AfdlBot/Source
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# AfdlBot - Bot to maintain Wikipedia Afdl pages, see http://en.wikipedia.org/wiki/User:AfdlBot
# Written by an Anon E. Mouse Wikipedia editor, http://en.wikipedia.org/wiki/User:AnonEMouse
# June 2007
#
# Intended potential users of this bot:
# Wikipedia:WikiProject Macintosh/Deletion,
# Wikipedia:WikiProject Georgia Tech/Deletion
# Wikipedia:WikiProject Pornography/Deletion
# Wikipedia:WikiProject Video games/Deletion
# Wikipedia:WikiProject Warcraft/Deletion
# Wikipedia:WikiProject Webcomics/Deletion
# TODO: Templates, (and tfdl), mfdl?
# TODO: actually read categories for discussion, rather than just maintaining the already listed ones
# TODO: Deletion review as for categories
# TODO: follow down config-input categories, get all subcategories
# TODO: WikiProject Deletion Sorting and subst:ed Afds
# TODO: mark Afd when listing

import sys,re,wikipedia,codecs,time,config
from datetime import date, timedelta

site = wikipedia.getSite()
logfile = open("afd.log", 'w')
afdlBotPage = 'User:AfdlBot/Configuration'
afdlProjects = []

def followRedirects(title):
    """ Gets the page content, and as long as it's a redirect, follows those redirects. """
    page = wikipedia.Page(site, title)
    try:
        page.get()
        return page
    except wikipedia.NoPage:
        return page
    except wikipedia.SectionError:
        return page
    except wikipedia.IsRedirectPage, arg:
        return followRedirects(arg.args[0])

def parseDate(string):
    """ Accept YYYY-MM-DD or 17:55, June 8, 2007 formats, return a date object"""
    if string == None:
        return None
    # print 'parsing "' + string + '"'
    try:
        return date(*(time.strptime(string, '%Y-%m-%d')[0:3]))
    except ValueError:
        try:
            return date(*(time.strptime(string, '%H:%M, %B %d, %Y')[0:3]))
        except ValueError:
            try:
                return date(*(time.strptime(string, '%H:%M, %d %B %Y')[0:3]))
            except ValueError:
                return None

# Regular sub-expression to match 4 kinds of wikilinked dates
# [[2007]]-[[03-15]]
# [[2007-03-15]]
# [[July 3]], [[2007]]
# [[3 July]], [[2007]]
# Note {len(May)=3, len(September)=9}
datexpr = r'(\[\[\d{4}\]\]-\[\[\d\d\-\d\d\]\])|' + \
          r'(\[\[\d{4}-\d\d-\d\d\]\])|' + \
          r'(\[\[[A-Za-z]{3,9} \d\d\]\],?\s*\[\[\d{4}\]\])|' + \
          r'(\[\[\d\d [A-Za-z]{3,9}\]\],?\s*\[\[\d{4}\]\])'

def parseWikidate(string):
    """ Parses the above 4 formats into a date object. """
    if string == None:
        return None
    string = re.sub('[\]\[,]', '', string)
    # print 'newstring=', string
    # print 'parsing "' + string + '"'
    try:
        return date(*(time.strptime(string, '%Y-%m-%d')[0:3]))
    except ValueError:
        try:
            return date(*(time.strptime(string, '%B %d %Y')[0:3]))
        except ValueError:
            try:
                return date(*(time.strptime(string, '%d %B %Y')[0:3]))
            except ValueError:
                return None

def dateToPageName(d):
    """Need to produce single digits for days < 10. For example, 2007 June 9."""
    return d.strftime('%Y %B ') + str(d.day)

# The result was '''Speedy delete per G3'''. [[User talk:PeaceNT|'''''Peacent''''']] 02:29, 9 June 2007 (UTC)
Rresult = re.compile(r".*\<!--Template:Afd top.*The result was[^']*'''(?P<result>[^']*)'''.*" +
                     r"(?P<date>[0-9][0-9]:[0-9][0-9], [0-9][0-9] [A-Za-z]+ [0-9]{4}) \(UTC\)", re.DOTALL)

# Don't keep afdls more than a month old
archivedate = date.today() + timedelta(days=-31)

class AfdlProject(object):
    """A project (or other party) maintaining a list of Afds with Afdls."""

    # Regular expression matching an Afdl template
    Rafdl = re.compile(r'\s*(\*\s*)?{{\s*Afdl(\|(?P<params>.+?))?}}\s*', re.DOTALL | re.IGNORECASE)

    def parseAfdls(self, text):
        """ Parses Afdls from text, returns unparsed text."""
        last = 0
        rest = ''
        for m in AfdlProject.Rafdl.finditer(text):
            paramString = m.group('params')
            params = []
            if paramString:
                params = paramString.split('|')
            aa = AfdArticleFromAfdlParams(params)
            if aa:
                if m.start() > last:
                    rest += text[last:m.start()] + '\n'
                last = m.end()
                if aa.result:
                    if aa.closedate > archivedate:
                        self.closedAfds.append(aa)
                    else:
                        self.newArchived += 1
                else:
                    self.openAfds.append(aa)
        if last < len(text):
            rest += text[last:] + '\n'
        return rest

    # Matching a categories for deletion line is tough since it's not a template, but free form text.
    # * [[:Category:Anime and manga inspired webcomics]] to [[:Category:Anime and manga webcomics]] at [[Wikipedia:Categories for deletion/Log/2006 July 11#Category:Anime and manga inspired webcomics to Category:Anime and manga webcomics]] '''Kept''' ''([[July 10]] [[2006]] – [[July 20]] [[2006]])''
    # *[[:Category:Keentoons]] at [[Wikipedia:Categories for discussion/Log/2007 April 30#Category:Keentoons]]
    # * [[:Category:WikiProject Webcomics members]] at [[Wikipedia:Categories for deletion/Log/2006 July 20#WikiProject participants]] ''[[July 20]] [[2006]] – [[July 31]] [[2006]]''
    # * [[:Category:Big-bust models and performers]] ([[2007]]-[[03-15]] – [[2007]]-[[03-21]]) '''No consensus'''
    # *[[:Category:Naturally busty porn stars]] at [[Wikipedia:Categories for discussion/Log/2007 March 8#Category:Naturally busty porn stars]] ([[2007]]-[[03-08]] - ([[2007]]-[[03-14]]) '''Delete'''
    # * [[:Category:Adult video games]] at [[Wikipedia:Categories for deletion/Log/2006 May 12#Category:Adult video games to Category:Erotic computer and video games]] '''renamed to''' [[:Category:Erotic computer and video games]]
    # * [[:Category:Artifex]] at [[Wikipedia:Categories for discussion/Log/2007 April 17#Category:Artifex]] '''Speedy Delete'''
    Rcfd = re.compile(r'\s*(\*\s*)?\[\[:(?P<category>Category:[^\]]+)\]\](?P<optional>.*?)' +
                      r'(\s+at\s+\[\[(?P<cfd>Wikipedia:Categories[ _]for[ _]d[a-z]+/Log/[^\]]+)\]\])?\s*' +
                      r"((?P<optional2>.*?)'''(?P<result>[^'\n]+)''')?(?P<rest>.*)", re.IGNORECASE)
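    # (Illustrative walk-through, not part of the original comments: on a line like the
    #  "Naturally busty porn stars" example above, Rcfd is meant to capture
    #    category = 'Category:Naturally busty porn stars'
    #    cfd      = 'Wikipedia:Categories for discussion/Log/2007 March 8#Category:Naturally busty porn stars'
    #    result   = 'Delete'
    #  while the wikilinked date span stays in the free-text groups (optional/optional2/rest),
    #  to be pulled back out by Rdatespan in parseCfds below.)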
    # Similarly for deletion review.
    # * [[Air Force Amy]] at [[Wikipedia:Deletion_review/Log/2007 May 5#Air Force Amy]] ([[2007-04-05]]—[[2007-04-06]]) '''Keep rewritten article.'''
    #* [[List of male performers in gay porn films]] at [[Wikipedia:Deletion review/Log/2007 April 18#List of male performers in gay porn films]] ([[2007-04-18]]—[[23 April]] [[2007]]) '''Deletion overturned'''
    Rdrv = re.compile(r'\s*(\*\s*)?\[\[:?(?P<target>[^\]]+)\]\](?P<optional>.*?)\s+at\s+' +
                      r'\[\[(?P<cfd>Wikipedia:Deletion[ _]review/Log/[^\]]+)\]\]\s*' +
                      r"((?P<optional2>.*?)'''(?P<result>[^'\n]+)''')?(?P<rest>.*)", re.IGNORECASE)

    Rdatespan = re.compile(r'\s*\(\s*(?P<fromdate>' + datexpr + r')\s*[^\[\)]*\s*(?P<todate>' + datexpr + r')\s*\)\s*')
    # Rdatespan = re.compile(r'\s*\(\s*(?P<fromdate>' +datexpr+ u')\s*-|(–)\s*(?P<todate>' +datexpr+ r')\s*\)\s*')
    # Rdatespan = re.compile(r'\s*\(\s*(?P<fromdate>\[\[\S+\]\])\s*(?P<dash>-|(–))\s*(?P<todate>\[\[\S+\]\])\)\s*')
    # Rdatespan = re.compile(r'\s*\(\s*(?P<fromdate>' +datexpr+ ')\s*(-|(—)|(–))\s*(?P<todate>' +datexpr+ ')\s*\)\s*')

    def parseCfds(self, text):
        """ Parses Cfd listings from text, returns unparsed text."""
        last = 0
        rest = ''
        for m in AfdlProject.Rcfd.finditer(text):
            # print 'match=', m.group()
            # print 'category=', m.group('category')
            # print 'cfd=', m.group('cfd')
            # print 'optional=', m.group('optional')
            # print 'optional2=', m.group('optional2')
            # print 'result=', m.group('result')
            # print 'rest=', m.group('rest')
            cfdname = m.group('cfd')
            if cfdname:
                cfd = wikipedia.Page(site, m.group('cfd'))
            else:
                cfd = None
            category = wikipedia.Page(site, m.group('category'))
            cfdrest = ''
            if m.group('optional'):
                cfdrest += ' ' + m.group('optional')
                cfdrest = cfdrest.strip()
            if m.group('optional2'):
                cfdrest += ' ' + m.group('optional2')
                cfdrest = cfdrest.strip()
            if m.group('rest'):
                cfdrest += ' ' + m.group('rest')
                cfdrest = cfdrest.strip()
            datespan = AfdlProject.Rdatespan.search(cfdrest)
            fromdate = None
            todate = None
            if datespan:
                # print 'datespan=', datespan.group()
                # print 'fromdate=', datespan.group('fromdate')
                # print 'dash=', datespan.group('dash')
                # print 'todate=', datespan.group('todate')
                cfdrest = AfdlProject.Rdatespan.sub('', cfdrest)
                fromdate = parseWikidate(datespan.group('fromdate'))
                todate = parseWikidate(datespan.group('todate'))
            if fromdate and not cfd:
                cfd = wikipedia.Page(site, 'Wikipedia:Categories for discussion/Log/' + dateToPageName(fromdate))
            # Todo: check if cfd page links to category?
            c = CfdCategory(cfd, category, fromdate, todate, m.group('result'), cfdrest)
            if c.startdate:  # in other words, if it's legitimate
                if m.start() > last:
                    rest += text[last:m.start()] + '\n'
                last = m.end()
                if c.result:
                    if not c.closedate or c.closedate > archivedate:
                        self.closedAfds.append(c)
                    else:
                        self.newArchived += 1
                else:
                    self.openAfds.append(c)
        if last < len(text):
            rest += text[last:] + '\n'
        print 'rest after cfds=', rest
        return rest

    # Regular expression that matches an Afdl list page
    RafdlPage = re.compile(r'(?P<header>.*)' +
                           r'==\s*Open\s*==\s*(?P<open>.*)' +
                           r'==\s*Closed\s*==\s*(?P<closed>.*)', re.DOTALL)  # Todo: separate footer?
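    # (Illustrative sketch, not from the original comments, of the list-page layout RafdlPage
    #  expects -- a free-form header, then the two sections this bot rewrites:
    #    ...project-specific header text...
    #    == Open ==
    #    * {{Afdl|...}} and CfD lines still under discussion
    #    == Closed ==
    #    * {{Afdl|...}} and CfD lines with a result
    #  afdlsText() below regenerates everything from '== Open ==' onwards.)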
    def __init__(self, listpage, articleCategories, articleTemplates, talkCategories, talkTemplates):
        # print listpage, articleTemplates, articleCategories, talkTemplates, talkCategories
        self.listpage = listpage
        self.articleTemplates = articleTemplates
        self.articleCategories = articleCategories
        self.talkTemplates = talkTemplates
        self.talkCategories = talkCategories
        self.openAfds = []
        self.closedAfds = []
        # Count the number of useful changes that would be made to list page when writing
        # - if none, don't write anything
        self.newOpen = 0
        self.newClosed = 0
        self.newArchived = 0
        # Todo: self.archivedAfds = []
        match = AfdlProject.RafdlPage.match(listpage.get())
        if not match:
            print 'Could not parse', listpage, '!!'
            logfile.write('Could not parse ' + str(listpage) + '!!\n')
            return
        self.header = match.group('header')
        openmatch = match.group('open')
        openmatch = AfdlProject.Rdateheader.sub('', openmatch)
        closedmatch = match.group('closed')
        closedmatch = AfdlProject.Rdateheader.sub('', closedmatch)
        self.opentext = self.parseAfdls(openmatch)
        self.opentext = self.parseCfds(self.opentext)
        # Some of the formerly open Afds will have just been closed, count them
        self.newClosed = len(self.closedAfds)
        self.closedtext = self.parseAfdls(closedmatch)
        self.closedtext = self.parseCfds(self.closedtext)

    def __str__(self):
        """A console representation of the AfdlProject"""
        return str(self.listpage)

    def logAfd(self, page, afd, reason, spec):
        """ Add an article and its afd to the project lists. Log this in a file for fun."""
        # print self.listpage, page.title(), afd.title(), reason, spec
        aa = AfdArticle(afd, page)
        # print aa
        # Consider if article has been deleted or redirected
        if aa.result:
            # Todo: should we check archivedate? Or should we put it on the page at least once?
            self.closedAfds.append(aa)
            self.newClosed += 1
        else:
            self.openAfds.append(aa)
            self.newOpen += 1
        logfile.write(self.listpage.title() + '\t' + page.title() + '\t' + afd.title() + '\t' + reason + ':' + spec + '\n')
        logfile.flush()

    def checkAfdArticle(self, afd, article, talkpage):
        """ Check if an Afd for an article qualifies to be added to the project lists.
        Returns True if qualifies (and has been added), False if not. """
        # check for articles already in Afd list, those don't even need to be "gotten"
        for open in self.openAfds:
            if open.afd == afd and open.article == article:
                # print afd, 'matches', open
                if Rresult.match(afd.get()):
                    # afd has a result, in other words, was closed
                    self.openAfds.remove(open)
                    self.logAfd(article, afd, 'listed as open on', self.listpage.title())
                return True
        for closed in self.closedAfds:
            if closed.afd == afd and closed.article == article:
                return True
        if len(self.articleCategories) > 0:
            for cat in article.categories():
                if cat.title().capitalize() in self.articleCategories:
                    self.logAfd(article, afd, 'article category', cat.title())
                    return True
        if len(self.articleTemplates) > 0:
            for template in article.templates():
                if template.title().capitalize() in self.articleTemplates:
                    self.logAfd(article, afd, 'article template', template)
                    return True
        # Do we need to check talk page?
        if len(self.talkCategories) + len(self.talkTemplates) <= 0:
            return False
        if not talkpage.exists():
            return False
        if len(self.talkCategories) > 0:
            for cat in talkpage.categories():
                if cat.capitalize() in self.talkCategories:
                    self.logAfd(article, afd, 'talk category', cat.title())
                    return True
        if len(self.talkTemplates) > 0:
            for template in talkpage.templates():
                if template.capitalize() in self.talkTemplates:
                    self.logAfd(article, afd, 'talk template', template)
                    return True
        return False

    # Regular expression that matches the date header generated below
    Rdateheader = re.compile(r"^\s*'''\d\d? [A-Za-z]{3,9}'''\s+\(\[\[.*\|AfD*\]\].*\)[ \t]*\n", re.MULTILINE)

    def afdsByTime(self, list):
        list.sort()
        lastdate = None
        result = ''
        for afd in list:
            if afd.startdate != lastdate:
                # print 'changing lastdate', lastdate, 'to', afd.startdate
                lastdate = afd.startdate
                # '''19 June''' ([[Wikipedia:Articles for deletion/Log/2007 June 19|AfD]],
                # [[Wikipedia:Categories for discussion/Log/2007 June 19|CfD]])
                datename = dateToPageName(afd.startdate)
                result += afd.startdate.strftime("'''%d %B'''") \
                          + ' ([[Wikipedia:Articles for deletion/Log/' + datename + '|AfD]],' \
                          + ' [[Wikipedia:Categories for discussion/Log/' + datename + '|CfD]])' + '\n'
            result += '* ' + str(afd) + '\n'
            logfile.write('* ' + str(afd).encode(config.console_encoding, 'replace') + '\n')
        return result

    def afdlsText(self):
        """Returns the AfdArticle lists in this project, also to logfile."""
        logfile.write(str(self) + '\n')
        text = self.header + '== Open ==\n' + self.opentext
        text += self.afdsByTime(self.openAfds)
        text += '== Closed ==\n' + self.closedtext
        text += self.afdsByTime(self.closedAfds)
        # Todo: archived Afds by alphabetical order?
        logfile.write(text.encode(config.console_encoding, 'replace'))
        return text

# end class AfdlProject

class AfdArticle(object):
    """An article for deletion, with its article (usually but not always 1-1)."""

    def __init__(self, afd, article, startdate=None, closedate=None, result=None):
        # print afd, article, startdate, closedate, result
        self.article = article
        self.afd = afd
        if startdate:
            self.startdate = startdate
        else:
            # An approximation - assuming first edit created
            # print 'getting version history'
            edits = afd.getVersionHistory(reverseOrder = True, getAll = True)
            if not edits:
                return  # an AfD must have a startdate
            self.startdate = parseDate(edits[0][1])
        if result and closedate:
            self.result = result
            self.closedate = closedate
        else:
            # print 'getting afd'
            afdtext = afd.get()
            match = Rresult.match(afdtext)
            if match:
                if result and len(result) > 0:
                    self.result = result
                else:
                    self.result = match.group('result')
                # print self.result
                if closedate:
                    self.closedate = closedate
                else:
                    self.closedate = parseDate(match.group('date'))
            else:
                self.result = self.closedate = None
        # print self

    def __str__(self):
        """A console representation of the AfdArticle"""
        return self.afdl()

    def __cmp__(self, other):
        """Allows sorting AfdArticles. Descending order by startdate. """
        return cmp(other.startdate, self.startdate)

    def afdl(self):
        """{{afdl|Rebecca Cummings|Rebecca Cummings (2nd nomination)|2007-05-30|2007-06-04|Delete}}"""
        retval = '{{afdl|' + self.article.title() + '|'
        if self.afd.title() != 'Wikipedia:Articles for deletion/' + self.article.title():
            retval += self.afd.title()
        retval += '|' + self.startdate.strftime('%Y-%m-%d') + '|'
        if self.result:
            # 2007-03-16
            retval += self.closedate.strftime('%Y-%m-%d') + '|' + self.result
        else:
            retval += '|'
        retval += '}}'
        return retval

# end class AfdArticle

def AfdArticleFromAfdlParams(afdlp):
    """Reads an AfdArticle from ['article', 'AfD name', 'open YYYY-MM-DD', 'close YYYY-MM-DD', 'result'].
    Last 3 params optional. """
    # print afdlp
    if not afdlp or len(afdlp) < 1:
        return None
    if len(afdlp) > 1 and len(afdlp[1]) > 0:
        afdname = afdlp[1]
    else:
        afdname = 'Wikipedia:Articles for deletion/' + afdlp[0]
    afd = wikipedia.Page(site, afdname)
    # if not afd.exists(): return
    article = wikipedia.Page(site, afdlp[0])
    # Any missing params will be read from the afd
    if len(afdlp) > 4:
        aa = AfdArticle(afd, article, parseDate(afdlp[2]), parseDate(afdlp[3]), afdlp[4])
    elif len(afdlp) > 2:
        aa = AfdArticle(afd, article, parseDate(afdlp[2]), None, None)
    else:
        aa = AfdArticle(afd, article, None, None, None)
    # No AFD
    if not hasattr(aa, 'startdate'):
        return None
    return aa

class CfdCategory(AfdArticle):
    """Some special treatment for Categories for discussion/deletion debates."""

    # Parse date and subsection out of a cfd link
    Rcfdlink = re.compile(r'Wikipedia:Categories for d[a-z]+/Log/(?P<date>[A-Za-z0-9_ ]+)(#.*)?')

    def __init__(self, cfd, category, startdate, closedate, result, rest):
        # print cfd, category, startdate, closedate, result, rest
        self.article = category
        self.afd = cfd
        self.startdate = startdate
        self.closedate = closedate
        self.result = result
        self.rest = rest  # any unparsed stuff
        if not startdate:
            match = CfdCategory.Rcfdlink.match(cfd.title())
            if match:  # If not, should throw error
                self.startdate = parseDate(match.group('date'))
            else:
                # Throw error?
                return
        # Todo: parse result and closedate from cfd?
        # if result and not closedate:
        #     self.closedate = self.startdate + timedelta(10)  # Nasty hack
        # print self

    def __str__(self):
        """A console representation of the CfdCategory"""
        # *[[:Category:Naturally busty porn stars]] at [[Wikipedia:Categories for discussion/Log/2007 March 8#Category:Naturally busty porn stars]] ([[2007]]-[[03-08]] - ([[2007]]-[[03-14]]) '''Delete'''
        result = '[[:' + self.article.title() + ']] at [[' + self.afd.title() + ']] ([[' + str(self.startdate) + ']] -'
        if self.closedate:
            result += ' [[' + str(self.closedate) + ']]'
        result += ')'
        if self.result:
            result += " '''" + self.result + "'''"
        result += self.rest
        return result

# end class CfdCategory(AfdArticle)

def readAfdlProjects(projpagename):
    """ Reads specifications of all AfdlProjects on input page. """
    projPage = followRedirects(projpagename)
    # Afd List:, article templates:, article categories:, talk templates:
    # The Afd List one is mandatory, the rest are optional
    Rspec = re.compile(r'==[^=]*' +
                       r'^\*\s*Afd List:[^\[]*\[\[(?P<listpage>[^\]]+)\]\][^\*=$]*$' +
                       '[^=]*', re.IGNORECASE | re.MULTILINE)
    # Note that the category includes the word 'Category:' but the template doesn't include the
    # word 'Template:'. This is to match the results of the Page methods.
    Rtemplate = re.compile(r'\[\[Template:(?P<template>[^\]]+)\]\]', re.IGNORECASE)
    Rcategory = re.compile(r'\[\[:(?P<category>Category:[^\]]+)\]\]', re.IGNORECASE)
    RartCat = re.compile(r'(^\*\s*Article categories:[^\*$]*$)', re.IGNORECASE)
    RartTem = re.compile(r'(^\*\s*Article templates:[^\*$]*$)', re.IGNORECASE)
    RtalkCat = re.compile(r'(^\*\s*Talk categories:[^\*$]*$)', re.IGNORECASE)
    RtalkTem = re.compile(r'(^\*\s*Talk templates:[^\*$]*$)', re.IGNORECASE)
    for match in Rspec.finditer(projPage.get()):
        # print match
        listpagename = match.group('listpage')
        listPage = followRedirects(listpagename)
        if not listPage.exists():
            continue
        articleTemplates = []
        articleCategories = []
        talkTemplates = []
        talkCategories = []
        # print 'listpage=', listpage
        for line in match.group().splitlines():
            # print line
            if RartCat.match(line):
                for cat in Rcategory.finditer(line):
                    articleCategories.append(cat.group('category').capitalize())
                # print articleCategories
            if RartTem.match(line):
                for template in Rtemplate.finditer(line):
                    articleTemplates.append(template.group('template').capitalize())
                # print articleTemplates
            if RtalkCat.match(line):
                for cat in Rcategory.finditer(line):
                    talkCategories.append(cat.group('category').capitalize())
                # print talkCategories
            if RtalkTem.match(line):
                for template in Rtemplate.finditer(line):
                    talkTemplates.append(template.group('template').capitalize())
                # print talkTemplates
        afdlProjects.append(AfdlProject(listPage, articleCategories, articleTemplates, talkCategories, talkTemplates))

# Regular expression that matches a "subst"ed Afd debate
# {{Wikipedia:Articles for deletion/Dr. John E. Douglas}}
Rafd = re.compile(r'{{\s*(?P<afd>Wikipedia:Articles for deletion/(?P<title>[^}]*))}}')

def processAfdList(afdListName):
    """ Searches input page name for Afds of pages matching project categories and templates. """
    print 'Processing', afdListName
    listpage = followRedirects(afdListName)
    if not listpage.exists():
        return
    listtext = listpage.get()
    # print listtext
    for match in Rafd.finditer(listtext):
        # print 'match=', match.group()
        afdname = match.group('afd')
        # print afdname
        afdtitle = match.group('title')
        afd = followRedirects(afdname)
        # print afd.linkedPages()
        # need to follow every link, to deal with multiple nominations in one AFD
        checked = []  # only follow a link once per Afd
        for article in afd.linkedPages():
            # print 'article', article, 'section', article.section()
            if article.section() != None:
                continue
            if ':' in article.title():
                continue  # mainspace pages only
            if article in checked:
                continue
            checked.append(article)
            article = followRedirects(article.title())
            if not article.exists():
                continue
            # print 'considering ', article
            # print article.templatesWithParams()
            for (template, params) in article.templatesWithParams():
                if template == 'AfDM' and 'page='+afdtitle in params:
                    talkpage = wikipedia.Page(site, 'Talk:' + article.title())
                    for proj in afdlProjects:
                        # check them all: even if listed in one, may be listed in many
                        proj.checkAfdArticle(afd, article, talkpage)
                    break  # assume only one AfDM template per article

def main():
    args = wikipedia.handleArgs()  # take out the global params
    readAfdlProjects(afdlBotPage)
    # for proj in afdlProjects:
    #     print proj
    #     print proj.afdlsText()
    # return
    if len(args) > 0:
        for arg in args:
            processAfdList(arg)
    else:
        checkdate = date.today() + timedelta(days=+2)
        lastdate = date.today() + timedelta(days=-12)
        while checkdate > lastdate:
            checkdate = checkdate + timedelta(days=-1)
            # Wikipedia:Articles_for_deletion/Log/2007_June_9
            checkpagename = 'Wikipedia:Articles_for_deletion/Log/' + dateToPageName(checkdate)
            processAfdList(checkpagename)
    for proj in afdlProjects:
        print proj, 'newOpen', proj.newOpen, 'newClosed', proj.newClosed, 'newArchived', proj.newArchived
        if proj.newOpen + proj.newClosed + proj.newArchived > 0:
            comment = ''
            if proj.newOpen > 0:
                comment += '+' + str(proj.newOpen) + ' open'
            if proj.newClosed > 0:
                if len(comment) > 0:
                    comment += ', '
                comment += '+' + str(proj.newClosed) + ' closed'
            if proj.newArchived > 0:
                if len(comment) > 0:
                    comment += ', '
                comment += '-' + str(proj.newArchived) + ' archived'
            comment += ' deletion discussions'
            print comment
            text = proj.afdlsText()
            print text
            proj.listpage.put(text, comment, watchArticle = True, minorEdit = False)

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
        logfile.close()