# Source: wiki page "User:Salix alba/subsup.py"
import sys
import re
# Reads an XML page dump on stdin (presumably a MediaWiki export: it keys on
# <title>, <ns> and </page> elements) and prints the title of every page in
# namespace 0 whose <sub>/<sup> open and close tags do not balance.
#
# Usage:  python subsup.py [-d] < dump.xml
#   -d    debug dump: also print each title line, every line whose tags are
#         unbalanced (with per-line counts) and per-page totals.

# True when the first command-line argument is exactly "-d".
dump = len(sys.argv) > 1 and sys.argv[1] == '-d'

titleRE = re.compile(r'<title>(.*)</title>')
nsRE = re.compile(r'<ns>(.*)</ns>')
# Matches an opening or closing sub/sup tag, either literally or in its
# entity-escaped form (&lt;sub&gt;).
# NOTE(review): the original pattern only showed the literal form, but its
# debug output carried no-op replace('<','<') calls, which looks like
# "&lt;"/"&gt;" text lost in extraction; XML dumps escape markup inside page
# text, so both spellings are accepted here. Confirm against real input.
subsupRE = re.compile(r'(?:<|&lt;)/?su[pb](?:>|&gt;)')
pageEndRE = re.compile(r'</page>')


def _unescape(text):
    """Return text with the &lt; and &gt; entities turned back into < and >."""
    return text.replace('&lt;', '<').replace('&gt;', '>')


def check_pages(lines, dump=False):
    """Yield report strings for pages with unbalanced sub/sup tags.

    Args:
        lines: iterable of input lines (each normally ending in a newline).
        dump: when true, yield debugging detail instead of bare titles.

    Yields:
        Strings to be printed one per ``print`` call.  In normal mode: the
        title of each namespace-0 page whose <sub> or <sup> open/close counts
        differ.  In dump mode: every title line as read, every line whose own
        tag counts differ (prefixed by "<sub> </sub> <sup> </sup>" counts),
        and for each namespace-0 page containing any such tag the line
        "title supOpen supClose subOpen subClose".
    """
    title = ""
    ns = None  # text of the most recent <ns> element, e.g. '0'
    sup_open = sup_close = sub_open = sub_close = 0

    for line in lines:
        m = titleRE.search(line)
        if m:
            # A new title starts a new page: restart the per-page totals.
            title = m.group(1)
            sup_open = sup_close = sub_open = sub_close = 0
            if dump:
                yield line

        m = nsRE.search(line)
        if m:
            ns = m.group(1).strip()

        # Normalise escaped tags so both spellings are counted together.
        tags = [_unescape(tag) for tag in subsupRE.findall(line)]
        c1 = tags.count('<sub>')
        c2 = tags.count('</sub>')
        c3 = tags.count('<sup>')
        c4 = tags.count('</sup>')
        sub_open += c1
        sub_close += c2
        sup_open += c3
        sup_close += c4

        if dump and (c1 != c2 or c3 != c4):
            yield '%d %d %d %d %s' % (c1, c2, c3, c4, _unescape(line))

        if not pageEndRE.search(line):
            continue
        # The namespace is captured as text, so compare with '0'; the
        # original compared the string with the integer 0, which is never
        # equal, so no page was ever reported.
        if ns != '0':
            continue
        if not (sup_open or sup_close or sub_open or sub_close):
            continue

        if dump:
            yield '%s %d %d %d %d' % (
                title, sup_open, sup_close, sub_open, sub_close)
        elif sup_open != sup_close or sub_open != sub_close:
            yield title


def main():
    """Run the check over stdin, printing each report as it is produced."""
    for report in check_pages(sys.stdin, dump):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(report)


if __name__ == '__main__':
    main()