# User:BG19bot/Fix section headings
import wikipedia
import re
import urllib
import time
import datetime
# Upper bound for the listing offsets requested from checkwiki.
# Change this number to the maximum number of articles in any of the
# checkwiki categories being processed (ids 7, 19, 25, 83).
ISSUECOUNT = 1274
def timestamp():
    """Return the current local time formatted as a log-line prefix.

    The result looks like ``"03/14/12 09:26:53PM: "``: month/day/two-digit
    year, 12-hour clock with AM/PM marker, followed by a colon and a space
    so it can be prepended directly to a message.
    """
    return datetime.datetime.today().strftime("%m/%d/%y %I:%M:%S%p: ")
def l1check(m):
    """Replacement callback for ``re.sub`` that turns a level-1 heading into level 2.

    ``m`` is a match object whose group 1 holds the heading text found
    between the single ``=`` markers. The returned replacement is a newline
    followed by the same text wrapped in ``==``.

    Side effects: sets the module-level flag ``changed`` to True and
    increments the module-level counter ``changes``. ``changes`` must
    already exist at module level before this is called, otherwise the
    increment raises NameError.
    """
    global changed
    global changes
    changed = True
    changes += 1
    return "\n==" + m.group(1) + "=="
# ---------------------------------------------------------------------------
# Main script (Python 2, pywikipedia "wikipedia" module).
#
#   1. Download the article lists for checkwiki error ids 7, 19, 25 and 83.
#   2. For every listed article, repair its section headings:
#        a. promote level-1 headings (=X=) to level 2,
#        b. make the first heading (and the run of same-level headings that
#           follows it) level 2,
#        c. close gaps in the hierarchy (e.g. == followed directly by ====).
#   3. Save the page if anything changed and print a summary at the end.
#
# Every print below passes a single parenthesised expression, which the
# Python 2 print statement treats exactly like the un-parenthesised form.
# ---------------------------------------------------------------------------


def _relevel(data, match, depth):
    """Rewrite the heading found by ``match`` so it uses ``depth`` '=' signs.

    Returns ``(new_data, resume_index)``. ``resume_index`` points at the last
    '=' of the rewritten heading, i.e. just before the newline that ends the
    replacement, so a following search can use that newline as the start of
    an immediately adjacent heading.
    """
    marks = "=" * depth
    # NOTE(review): the trailing "\n" is added on top of the line break that
    # already follows the matched heading (the pattern only looks ahead at
    # it), so every rewritten heading gains one extra newline after it.
    # Kept as in the original -- confirm this is intended.
    newstr = "\n" + marks + match.group(2) + marks + "\n"
    data = data[:match.start()] + newstr + data[match.end():]
    return data, match.start() + len(newstr) - 2


text = ""
site = wikipedia.getSite()
# A heading line of level 2-6 preceded by a newline; group "level" is the run
# of '=' signs and group 2 the heading text.
headerregex = re.compile(r"\n(?P<level>={2,6})(.*?)(?P=level)(?=\s*\n)")
# Same pattern without the leading newline, for a heading at the very start
# of the page text.
leadingheaderregex = re.compile(r"(?P<level>={2,6})(.*?)(?P=level)(?=\s*\n)")
# A level-1 heading: a single '=' on each side of text containing no '='.
l1regex = re.compile(r"\n=([^=\n]+?)=")
edits = 0
exceptions = 0

# --- 1. Collect the article titles -----------------------------------------
for checkid in [7, 19, 25, 83]:
    for offset in range(0, ISSUECOUNT, 500):
        print(offset)
        u = urllib.urlopen("http://toolserver.org/~sk/cgi-bin/checkwiki/checkwiki.cgi?project=enwiki&view=bots&id=" + str(checkid) + "&limit=500&offset=" + str(offset))
        try:
            stuff = u.read()
        finally:
            # Close the connection even when the read fails.
            u.close()
        prestart = stuff.find("<pre>")
        if prestart == -1:
            # Without this guard find() == -1 would make the slice below
            # start at offset 5 and feed arbitrary HTML into the title list.
            print("  No <pre> listing in response; skipping.")
            continue
        # +6 skips "<pre>" and the single character that follows it.
        text += stuff[prestart + 6:]

articles = []
for line in text.split("\n"):
    # Drop the closing-markup line and the stray ".co" entries.
    if line in ("</pre></body></html>", ".co"):
        continue
    # Closing tags can also be glued onto a title when one response is
    # concatenated directly onto the next.
    title = line.replace("</pre>", "").replace("</body>", "").replace("</html>", "").decode("utf-8")
    if title:
        # Blank lines are not article titles; skip them.
        articles.append(title)
print("\n\n" + str(len(articles)) + " articles found.\n\n")

# --- 2. Process every article ----------------------------------------------
for i, title in enumerate(articles):
    # Name used in error reports; always bound, even if creating the Page
    # object itself fails.
    label = title
    try:
        # This block of code checks a control page, and stops the bot from
        # operating if it is non-empty. Commenting it out for Magioladitis.
        #
        # if i % 10 == 0:
        #     control = wikipedia.Page(site, "User:Snotbot/control").get(force=True)
        #     if control != u'':
        #         print "\n\n\n\n\n" + timestamp() + "User:Snotbot/control found to be non-empty! Pausing.....\n\n"
        #         while control != u'':
        #             time.sleep(30)
        #             print ".",
        #             control = wikipedia.Page(site, "User:Snotbot/control").get(force=True)
        #         print "\n\n"
        #         print timestamp() + "Resuming..."
        page = wikipedia.Page(site, title)
        label = page.title()
        # The trailing comma keeps the cursor on this line (Python 2 print).
        print(timestamp() + "(" + str(i) + "): " + label + "..."),
        if not page.exists():
            print(" Doesn't exist.")
            continue
        # NOTE(review): a circular redirect chain would keep this loop
        # running forever; no guard existed in the original either.
        while page.isRedirectPage():
            page = page.getRedirectTarget()
        label = page.title()
        if not page.exists():
            print(" Doesn't exist (post-redirect).")
            continue
        if not page.canBeEdited():
            print(" Page cannot be edited (probably protected).")
            continue
        data = page.get()
        # Module-level state: l1check() updates both through ``global``.
        changed = False
        changes = 0

        # 2a. Make sure there are no level 1 headers.
        # The rewrite can mangle math formulas, so pages containing <math>
        # are left alone for this step.
        if "<math>" not in data:
            data = l1regex.sub(l1check, data)

        # 2b. Check that the first heading is level 2.
        # startswith() instead of data[0] so an empty page does not raise.
        if data.startswith("="):
            firstheading = leadingheaderregex.match(data)
        else:
            firstheading = headerregex.search(data)
        if firstheading:
            fhlevel = len(firstheading.group("level"))
            if fhlevel > 2:
                currentlevel = fhlevel
                # Promote the first heading and every directly following
                # heading of that same level; stop at the first heading of
                # a different level.
                while currentlevel == fhlevel:
                    changed = True
                    changes += 1
                    data, index = _relevel(data, firstheading, 2)
                    firstheading = headerregex.search(data, index)
                    if not firstheading:
                        break
                    currentlevel = len(firstheading.group("level"))

        # 2c. Check for hierarchy problems.
        if data.startswith("="):
            lastlevel = len(re.match("=*", data).group(0))
        else:
            # 6 is the deepest level, so the first heading found is never
            # treated as a skipped level.
            lastlevel = 6
        index = 0
        # [original level, replacement level] of the most recent gap fix, so
        # later headings written at that same original level get the same
        # replacement; [-1, -1] means no fix is being carried forward.
        lastchanged = [-1, -1]
        level = headerregex.search(data, index)
        while level:
            currentlevel = len(level.group("level"))
            if currentlevel == lastchanged[0]:
                data, index = _relevel(data, level, lastchanged[1])
                changed = True
                changes += 1
            elif currentlevel > lastlevel + 1:
                # Skipped at least one level: pull the heading up so it sits
                # exactly one level below the previous heading.
                lastlevel += 1
                data, index = _relevel(data, level, lastlevel)
                lastchanged = [currentlevel, lastlevel]
                changed = True
                changes += 1
            else:
                lastlevel = currentlevel
                index = level.end() - 1
                lastchanged = [-1, -1]
            level = headerregex.search(data, index)

        if changed:
            print(" Found " + str(changes) + " header problems.")
            page.put(data, "Fixing section headings [[Wikipedia:Bots/Requests for approval/Snotbot 5|(task 5)]]", maxTries=3)
            edits += 1
        else:
            print(" No header problems found.")
    except KeyboardInterrupt:
        # Ctrl-C stops processing but still prints the summary below.
        break
    except wikipedia.MaxTriesExceededError:
        print("\n\n\n\n")
        print("#^" * 200)
        print(timestamp() + "Maximum number of tries exceeded on updating " + label + "\n\n\n\n")
    except Exception as err:
        # Exception rather than a bare except, so SystemExit is not
        # swallowed; the article is reported and the run carries on.
        print("\n\n\n\n")
        print("*=" * 200)
        print(timestamp() + "UNHANDLED EXCEPTION ON " + label + "\n\n\n\n")
        print(repr(err))
        exceptions += 1

# --- 3. Summary --------------------------------------------------------------
print("\n\n\n\n\n" + timestamp() + "Done! Out of " + str(len(articles)) + " articles on the list, I made " + str(edits) + " edits.")
print(str(exceptions) + " unhandled exceptions.")