User:CobraBot/Code2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import wikipedia
import pagegenerators
import re
import warnings
from time import sleep
from contextlib import closing
from sys import stdout
from oclc import isbn2oclc, ParsingProblem, NotInOclc
from congress import isbn2classes, NotInLoc
from json import dump, load

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

TEMPLATE_PREFIX = u"Template:"

SITE = wikipedia.getSite()


def pagesUsingTemplate(templateName):
    """Yield all pages that transclude the given template."""
    transclusionPageName = unicode(SITE.namespace(10)) + u":" + templateName
    transclusionPage = wikipedia.Page(SITE, transclusionPageName)
    gen = pagegenerators.ReferringPageGenerator(transclusionPage,
                                                onlyTemplateInclusion=True)
    return gen


class BailOut(StandardError):
    """Immediately stop processing the current page"""


class AlreadyFilled(BailOut):
    """Field already filled in"""


class CobraBot:
    # EDIT_SUMMARY = u'Adding [[OCLC]]# to book infobox based on [[ISBN]] ([[User:CobraBot|CobraBot]]; PLEASE [[User talk:CobraBot|report any problems]])'
    EDIT_SUMMARY = u'Adding [[Dewey Decimal Classification|Dewey Decimal]] and/or [[Library of Congress Classification|LCC]] to book infobox based on [[ISBN]] ([[User:CobraBot|CobraBot]]; PLEASE [[User talk:CobraBot|report any problems]])'
    BOOK_INFOBOX = u"Infobox Book"
    # The hyphen plus the various Unicode dashes that turn up inside ISBNs.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # A field value ends at the next pipe or at the closing braces.
    TERMINATOR = re.compile(u"(}})|\\|")
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))",
                               re.IGNORECASE)
    GENERIC_PARAM = u"\\|[ \t\n]*%s[ \t\n]*=[ \t\n]*"
    # OCLC_PARAM = GENERIC_PARAM % u"oclc"
    DEWEY_PARAM = GENERIC_PARAM % u"dewey"
    LOC_PARAM = GENERIC_PARAM % u"congress"
    ISBN_MIN_LEN = 10
    OFFSET_FILE = 'N.json'
    # Dewey value some catalog records carry for fiction instead of a real
    # class number; filtered out in treat().
    JUST_FIC = "[Fic]"

    def __init__(self, automatic, debug):
        """
        Constructor. Parameters:
            * automatic - If True, saves changes without asking for
                          confirmation.
            * debug     - If True, doesn't make any real changes, but only
                          shows what would have been changed.
        """
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        self.log = open("skipped.log", 'a')
        self.log.write("BEGIN NEW SESSION\n")
        self.automatic = automatic
        # Set the edit summary message.
        wikipedia.setAction(self.EDIT_SUMMARY)

    def run(self):
        with closing(open(self.OFFSET_FILE, 'r')) as f:
            N = load(f)
        # Resume where the previous session left off by skipping the first N pages.
        print "Advancing by %s..." % N
        stdout.flush()
        for i in xrange(N):
            next(self.generator)
        print "Done advancing!"
        stdout.flush()
        try:
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()
            # Save the new offset, rewound by five pages in case the last few
            # weren't fully processed.
            with closing(open(self.OFFSET_FILE, 'w')) as f:
                dump(N + pageIndex - 5, f)

    def runManual(self):
        index = 0
        while True:
            title = raw_input("Page: ").decode('utf8')
            page = wikipedia.Page(None, title)
            self.treat(page, index)
            index += 1

    #########

    def partition(self, text):
        """Split the article into (before infobox, infobox, after infobox)."""
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            wikipedia.output(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")
            raise BailOut("SKIPPING: Page either uses 'Book infobox' alias or is false positive")
        boxStart = boxmatch.start()
        boxEnd = boxStart + re.search(u"\\}\\}", text[boxStart:]).end()
        prebox = text[:boxStart]
        box = text[boxStart:boxEnd]
        postbox = text[boxEnd:]
        return prebox, box, postbox

    def checkForField(self, box, field_regex):
        paramMatch = re.search(field_regex, box)
        if paramMatch:  # the field is present, e.g. |dewey=
            fieldValAndRest = box[paramMatch.end():]
            fieldTermMatch = self.TERMINATOR.search(fieldValAndRest)
            value = fieldValAndRest[:fieldTermMatch.start()].strip()  # e.g. | dewey = VALUE |
            if value:  # the field already has a value
                wikipedia.output(u"SKIPPING: param already filled")
                raise AlreadyFilled("SKIPPING: param already filled")
            else:  # remove the empty field; it gets re-added after the ISBN
                # print "REMOVED:", repr(paramMatch.group())
                box = box[:paramMatch.start()] + box[paramMatch.start() + len(paramMatch.group()):]
                # print "NEW BOX:"
                # print box
                return box
        return box

    def findIsbnVal(self, box):
        paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box)
        if not paramMatch:  # no ISBN present
            wikipedia.output(u"SKIPPING: No isbn param present")
            raise BailOut("SKIPPING: No isbn param present")
        isbnValAndRest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(isbnValAndRest)
        isbnVal = isbnValAndRest[:termMatch.start()]
        relIsbnTerm = termMatch.start()
        isbnTerm = paramMatch.end() + relIsbnTerm
        isbnFrag = isbnValAndRest[:relIsbnTerm]
        if '[[' in isbnFrag and ']]' not in isbnFrag:
            wikipedia.output(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
            raise BailOut("SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, isbnTerm

    def removeDashes(self, isbn):
        for dash in self.DASHES:
            isbn = isbn.replace(dash, '')
        return isbn

    def checkForNA(self, isbn):
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            wikipedia.output(u"SKIPPING: ISBN Not/Applicable")
            raise BailOut("SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal)
        return wordMatch.group()

    def normalize(self, string):
        return (string.replace(u' ', u'').replace(u"-", u'')
                .replace(u"and", u"&").replace(u',', u'')
                .replace(u'.', u'').replace(u"'", u'')
                .replace(u'"', u'').replace(u"’", u'')
                .lower().replace(u"the", u''))

    def treat(self, page, pageIndex):
        """
        Loads the given page, makes the changes, and saves it.
        """
        print "=================================================================="
        print "PAGE TITLE:", page.title()
        print "PAGE#:", pageIndex + 1
        print "EDIT COUNT:", self.editCount
        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return
        try:
            # Load the page
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        try:
            prebox, box, postbox = self.partition(text)
            # print "BOX:"
            # print box
            doLOC = True
            try:
                box = self.checkForField(box, self.LOC_PARAM)
            except AlreadyFilled:
                doLOC = False
            doDewey = True
            try:
                box = self.checkForField(box, self.DEWEY_PARAM)
            except AlreadyFilled:
                doDewey = False
            if not doDewey and not doLOC:
                return  # skip since both filled in

            isbnVal, isbnTerm = self.findIsbnVal(box)
            # print "INITIAL ISBN:", repr(isbnVal)
            isbnVal = self.removeDashes(isbnVal).strip()
            # print "ISBN SANS DASH:", repr(isbnVal)
            isbnVal = self.removeExtraISBN(isbnVal)
            self.checkForNA(isbnVal)
            # print "ISBN SANS ISBN:", repr(isbnVal)
            if not isbnVal:  # empty |isbn=
                wikipedia.output(u"SKIPPING: Empty isbn param")
                raise BailOut("SKIPPING: Empty isbn param")
            isbn = self.firstWord(isbnVal)
            # print "ONE TRUE ISBN:", isbn
            if not self.automatic:
                print "ISBN#:", isbn
            if len(isbn) < self.ISBN_MIN_LEN:
                wikipedia.output(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
                raise BailOut("SKIPPING: Malformed ISBN, too short (%s)" % isbn)
            if not re.search("[0-9]", isbn):
                wikipedia.output(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
                raise BailOut("SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)

            # do lookup
            try:
                loc, dewey = isbn2classes(isbn)
            except NotInLoc:
                wikipedia.output(u"SKIPPED: Given ISBN not in LOC database")
                raise BailOut("SKIPPED: Given ISBN not in LOC database")
            except RuntimeError as e:
                wikipedia.output(u"ABORTED: Problem looking up data (%s)" % e.message)
                raw_input("Enter to continue")
                raise BailOut(e.message)
            doDewey &= dewey is not None and dewey != self.JUST_FIC

            # try:
            #     oclc, oclcTitle = isbn2oclc(isbn)
            # except ParsingProblem:
            #     wikipedia.output(u"SKIPPED: Problem parsing OCLC response")
            #     raw_input("Enter to continue")
            #     raise BailOut("SKIPPED: Problem parsing OCLC response")
        except BailOut as e:
            try:
                self.log.write(page.title().encode('utf8') + "; " + e.message + "\n")
            except:
                pass
            return

        print "LOC Class:", loc
        print "Dewey Class:", dewey
        # print "OCLC#:", oclc

        # if not self.automatic:
        #     wikiCanon = self.normalize(page.title().split(u"(")[0])
        #     oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        #     titlesMatch = oclcCanon.startswith(wikiCanon)
        #     if titlesMatch:
        #         print
        #         print "--Canonical titles DO MATCH--"
        #     else:
        #         print "!!Canonical titles DON'T MATCH!!"
        #         print "PAGE TITLE:", page.title()
        #         print "OCLC TITLE:", oclcTitle
        #         print wikiCanon
        #         print oclcCanon

        # Splice the new fields into the infobox right after the |isbn= value.
        addition = ""
        if doDewey:
            addition = "| dewey= " + dewey + (" " if self.debug else "\n")
        if doLOC:
            addition += "| loc= " + loc + (" " if self.debug else "\n")
        box = box[:isbnTerm] + addition + box[isbnTerm:]
        text = prebox + box + postbox

        # only save if something was changed
        if text != page.get():
            # Show the title of the page we're working on.
            if not self.automatic:
                # Highlight the title in purple.
                wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<"
                                 % page.title())
                # show what was changed
                wikipedia.showDiff(page.get(), text)
            if not self.debug:
                if not self.automatic:
                    choice = wikipedia.inputChoice(u'Do you want to accept these changes?',
                                                   ['Yes', 'No'], ['y', 'N'], 'N')
                    if choice == 'n':
                        return
                try:
                    # Save the page
                    page.put(text)
                except wikipedia.LockedPage:
                    wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
                except wikipedia.EditConflict:
                    wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
                except wikipedia.SpamfilterError as error:
                    wikipedia.output(u'Cannot change %s because of spam blacklist entry %s'
                                     % (page.title(), error.url))
                else:
                    self.editCount += 1


def main():
    DEBUG = True  # False
    AUTO = False
    bot = CobraBot(AUTO, DEBUG)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bot.run()
        #bot.runManual()


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
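
The infobox handling above is entirely regex-driven: partition() slices the article into the text before the infobox, the infobox itself, and the remainder (stopping at the first "}}", so a nested template inside the infobox would truncate it), and findIsbnVal() locates the |isbn= value so that treat() can splice the new | dewey= and | loc= fields in right after it. The following self-contained sketch exercises the same two regexes outside the bot; the sample wikitext, ISBN, and Dewey number are invented for illustration only.

# -*- coding: utf-8 -*-
# Standalone sketch of the regex slicing done by CobraBot.partition() and
# CobraBot.findIsbnVal(). The sample wikitext below is invented.
import re

TERMINATOR = re.compile(u"(}})|\\|")
INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))",
                           re.IGNORECASE)

SAMPLE = (u"Lead sentence.\n"
          u"{{Infobox Book\n"
          u"| name = Example Title\n"
          u"| isbn = 0-123-45678-9\n"
          u"}}\n"
          u"Article body.")

def partition(text):
    # (before infobox, infobox, after infobox); the first "}}" closes the box.
    boxmatch = INFOBOX_START.search(text)
    boxStart = boxmatch.start()
    boxEnd = boxStart + re.search(u"\\}\\}", text[boxStart:]).end()
    return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

def findIsbnVal(box):
    # Return the raw |isbn= value and the offset where new fields get spliced in.
    paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box)
    termMatch = TERMINATOR.search(box[paramMatch.end():])
    isbnVal = box[paramMatch.end():paramMatch.end() + termMatch.start()]
    return isbnVal, paramMatch.end() + termMatch.start()

prebox, box, postbox = partition(SAMPLE)
isbnVal, isbnTerm = findIsbnVal(box)
print "ISBN value:", repr(isbnVal.strip())
# Splice a (made-up) Dewey field in after the ISBN, as treat() does:
print prebox + box[:isbnTerm] + "| dewey= 005.133\n" + box[isbnTerm:] + postbox

The oclc and congress modules are companion lookup scripts that aren't shown on this page. Judging only from the call sites above, congress.isbn2classes(isbn) returns a (loc, dewey) pair, raises NotInLoc on a miss, and may hand back None or "[Fic]" in the dewey slot, which treat() filters out. A hypothetical stub along these lines would satisfy that contract for offline dry runs; it is not the real module.

# congress.py -- hypothetical stand-in for the real (unpublished) LOC module.
# Only the interface inferred from CobraBot's call sites is reproduced here.

class NotInLoc(Exception):
    """Raised when the ISBN has no record in the catalog."""

_FAKE_CATALOG = {
    # invented record: ISBN -> (LCC class, Dewey class)
    "0123456789": ("PS3566.Y55", "813.54"),
}

def isbn2classes(isbn):
    """Return (LCC class, Dewey class) for a normalized ISBN."""
    try:
        return _FAKE_CATALOG[isbn]
    except KeyError:
        raise NotInLoc(isbn)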