User:BG19bot/Fix section headings

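# Bot script for the CheckWiki section-heading errors (IDs 7, 19, 25 and 83):
# it downloads the article lists from the Toolserver CheckWiki tool and
# normalises the heading levels on each page.  Written for the Python 2
# pywikipedia framework.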
import wikipedia
import re
import urllib
import time
import datetime

ISSUECOUNT = 1274   #Change this number to the maximum number of articles in any of the CheckWiki categories (IDs 7, 19, 25, 83).

def timestamp():
    return datetime.datetime.today().strftime("%m/%d/%y %I:%M:%S%p: ")

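# re.sub callback: rewrites a level-1 heading ("=Title=") as a level-2 heading
# ("==Title==") and records that a change was made.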
def l1check(m):
    global changed
    global changes
    changed = True
    changes += 1
    return "\n==" + m.group(1) + "=="

text = ""
site = wikipedia.getSite()
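# Matches a level 2-6 heading on its own line, e.g. "\n== Title ==", capturing
# the run of "=" signs so the heading level can be read back.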
headerregex = re.compile("\n(?P<level>={2,6})(.*?)(?P=level)(?=\s*\n)")
edits = 0
exceptions = 0

for checkid in [7,19,25,83]:
    for i in range(0,ISSUECOUNT,500):
        print i
        u = urllib.urlopen("http://toolserver.org/~sk/cgi-bin/checkwiki/checkwiki.cgi?project=enwiki&view=bots&id=" + str(checkid) + "&limit=500&offset=" + str(i))
        stuff = u.read()
        text += stuff[stuff.find("<pre>")+6:]
        u.close()
    
articles = text.split("\n")

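# Drop the closing-HTML junk lines (and stray ".co" lines) that come back with
# the raw dump, then strip any leftover tags from the remaining titles.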
while "</pre></body></html>" in articles:
    articles.remove("</pre></body></html>")
while ".co" in articles:
    articles.remove(".co")
for i in range(len(articles)):
    articles[i] = articles[i].replace("</pre>", "").replace("</body>", "").replace("</html>", "").decode("utf-8")

print "\n\n" + str(len(articles)) + " articles found.\n\n"

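# Main loop: resolve redirects, skip missing or protected pages, then apply
# the heading fixes below and save the page if anything changed.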
for i in range(len(articles)):
    try:
        #This block of code checks a control page, and stops the bot from operating if it is non-empty.  Commenting it out for Magioladitis.
        """
        if i % 10 == 0:
            control = wikipedia.Page(site, "User:Snotbot/control").get(force=True)
            if control != u'':
                print "\n\n\n\n\n" + timestamp() + "User:Snotbot/control found to be non-empty!  Pausing.....\n\n"
                while control != u'':
                    time.sleep(30)
                    print ".",
                    control = wikipedia.Page(site, "User:Snotbot/control").get(force=True)
                print "\n\n"
                print timestamp() + "Resuming..."
        """
            
        page = wikipedia.Page(site, articles[i])
        print timestamp() + "(" + str(i) + "): " + page.title() + "...",
        if not page.exists():
            print " Doesn't exist."
            continue
        while page.isRedirectPage():
            page = page.getRedirectTarget()
        if not page.exists():
            print " Doesn't exist (post-redirect)."
            continue
        if not page.canBeEdited():
            print " Page cannot be edited (probably protected)."
            continue
        
        data = page.get()
        changed = False
        changes = 0

        #Make sure there are no level 1 headers
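        # l1check rewrites "=Title=" as "==Title==" and counts the change.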
        if "<math>" not in data:    #this check can mangle math formulas, so skip it
            data = re.sub("\n=([^=\n]+?)=", l1check, data)
        
        #Check that the first heading is level 2
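        # headerregex needs a preceding newline, so a heading at the very start
        # of the page text has to be matched with an anchored pattern instead.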
        if data[0] == "=":
            firstheading = re.match("(?P<level>={2,6})(.*?)(?P=level)(?=\s*\n)", data)
        else:
            firstheading = headerregex.search(data)

        if firstheading:
            fhlevel = len(firstheading.group("level"))
            if fhlevel > 2:
                currentlevel = fhlevel
                while currentlevel == fhlevel:
                    changed = True
                    changes += 1
                    newstr = "\n==" + firstheading.group(2) + "==\n"
                    data = data[:firstheading.start()] + newstr + data[firstheading.end():]
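                    # Resume the regex search near the end of the rewritten
                    # heading; the offset allows for the "=" signs removed.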
                    index = firstheading.end() - ((currentlevel - 1) * 2) - 1
                    firstheading = headerregex.search(data, index)
                    if not firstheading:
                        break
                    else:
                        currentlevel = len(firstheading.group("level"))
            

        #Check for hierarchy problems
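        # A heading should be at most one level deeper than the one before it.
        # lastchanged remembers the last correction (original level -> new
        # level) so later headings at the same original level are corrected to
        # match.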
        if data[0] == "=":
            lastlevel = len(re.match("=*", data).group(0))
        else:
            lastlevel = 6

        index = 0
        lastchanged = [-1, -1]
        level = headerregex.search(data, index)
        while level:
            currentlevel = len(level.group("level"))
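            # Sibling of a heading that was already corrected: give it the
            # same new level.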
            if currentlevel == lastchanged[0]:
                newlevel = "=" * lastchanged[1]
                newstr = "\n" + newlevel + level.group(2) + newlevel + "\n"
                data = data[:level.start()] + newstr + data[level.end():]
                index = level.end() - ((currentlevel - lastchanged[1]) * 2) - 1
                changed = True
                changes += 1
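            # Heading is nested more than one level below its predecessor:
            # pull it up to lastlevel + 1 and remember the mapping for any
            # siblings that follow.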
            elif currentlevel > lastlevel + 1:
                newlevel = "=" * (lastlevel + 1)
                newstr = "\n" + newlevel + level.group(2) + newlevel + "\n"
                data = data[:level.start()] + newstr + data[level.end():]
                index = level.end() - ((currentlevel - lastlevel + 1) * 2) - 1
                lastlevel += 1
                lastchanged = [currentlevel, lastlevel]
                changed = True
                changes += 1
            else:
                lastlevel = currentlevel
                index = level.end() - 1
                lastchanged = [-1, -1]
            level = headerregex.search(data, index)
        if changed:
            print " Found " + str(changes) + " header problems."
            page.put(data, "Fixing section headings [[Wikipedia:Bots/Requests for approval/Snotbot 5|(task 5)]]", maxTries=3)
            edits += 1
        else:
            print " No header problems found."
    except KeyboardInterrupt:
        break
    except wikipedia.MaxTriesExceededError:
        print "\n\n\n\n"
        print "#^" * 200
        print timestamp() + "Maximum number of tries exceeded on updating " + page.title() + "\n\n\n\n"
    except:
        print "\n\n\n\n"
        print "*=" * 200
        print timestamp() + "UNHANDLED EXCEPTION ON " + page.title() + "\n\n\n\n"
        exceptions += 1
        continue

print "\n\n\n\n\n" + timestamp() + "Done!  Out of " + str(len(articles)) + " articles on the list, I made " + str(edits) + " edits."
print str(exceptions) + " unhandled exceptions."