User:AdultSwim/reflist
Appearance
This script returns all references used on a page. I have not tested it on pages with nested or broken references, strange ref names, or pages with other issues. --AdultSwim (talk) 22:10, 25 July 2008 (UTC)
from BeautifulSoup import BeautifulStoneSoup
import wikipedia
import pagegenerators
import re
# Module-level state shared by the argument-handling code further down.

# Factory whose handleArg() is asked, for each command-line argument,
# whether it can turn that argument into a page generator.
genFactory = pagegenerators.GeneratorFactory()
# Last generator produced by genFactory.handleArg(), or None if no argument
# produced one.  NOTE(review): nothing visible in this script reads it.
gen = None
# Command-line arguments that were neither "-history" nor accepted by the
# generator factory; they are joined with spaces to form the page title.
pageTitleParts = []
# Default page title, used when no title words are given on the command line.
pageTitle = "Kim Deal" #if you don't specify a title, this is what you get
# Set to True by the "-history" argument: scan old revisions of the page
# instead of only the current text.
history = False
#Call the script with: C:\path\ref.py Page Title
#Don't worry about spaces in the title; they are OK
#Special characters like '&' are a known problem, though
#For those, try quoting the title: C:\path\ref.py "Page&Title"
#For titles with unicode characters, just replace the pageTitle at the top of this script
def getrefs(text):
    """Return all <ref>...</ref> elements found in a piece of wikitext.

    text -- the wikitext to scan.

    Returns whatever BeautifulStoneSoup(text).findAll('ref') yields after
    the self-closing ref stubs have been removed from the text.
    """
    # Beautiful Soup does not like the wikimarkup ref stubs such as
    # <ref name="Stub"/>: it tries to complete the tag and ends up nesting
    # the references that follow.  Strip every self-closing <ref .../>
    # first.  The pattern accepts any attributes (e.g. group="..." before
    # name="...") and any letter case, not just a leading lower-case
    # `name` attribute; `\b` keeps <references/> from matching.
    stub_pattern = re.compile(r'<\s*ref\b[^>]*/\s*>', re.IGNORECASE)
    text = stub_pattern.sub("", text)
    soup = BeautifulStoneSoup(text)
    return soup.findAll('ref')
def print_array(refarray):
    """Write each reference in refarray via wikipedia.output().

    Every reference is converted with str() and is followed by an empty
    line so that consecutive references are visually separated.
    """
    for reference in refarray:
        # One call for the reference itself, one for the blank separator.
        for line in (str(reference), ""):
            wikipedia.output(line)
# --- Command-line handling -------------------------------------------------
# "-history" switches to history mode; an argument accepted by the generator
# factory is stored as the generator; anything else is treated as one word
# of the page title.
for arg in wikipedia.handleArgs():
    if arg.startswith("-history"):
        history = True
    else:
        generator = genFactory.handleArg(arg)
        if generator:
            gen = generator
        else:
            pageTitleParts.append(arg)

# Title words from the command line override the default pageTitle.
if pageTitleParts:
    pageTitle = ' '.join(pageTitleParts)
page = wikipedia.Page(wikipedia.getSite(), pageTitle)

if history:
    # --- History mode: collect the distinct references of old revisions ---
    answer = wikipedia.input(u'Please enter the number of edits to retrieve (0 for all): ')
    # The answer arrives as text, so it must be converted before it is
    # compared with 0; otherwise the "0 for all" branch can never be taken.
    # A non-numeric answer raises ValueError here.
    editcount = int(answer)
    if editcount == 0:
        wikipedia.output("Searching all previous versions")
        vh = page.getVersionHistory(getAll = True)
    else:
        wikipedia.output("Searching %s previous versions" % editcount)
        vh = page.getVersionHistory(revCount = editcount)

    refarray = []
    for entry in vh:
        # entry[0] is passed on as the oldid of the revision to fetch.
        wikipedia.output(entry[0])
        text = page.getEditPage(oldid=entry[0])[0]
        # Keep only references not seen in an earlier revision.  The inner
        # loop uses its own name so it no longer shadows `entry`.
        for ref in getrefs(text):
            if ref not in refarray:
                refarray.append(ref)
    wikipedia.output("")
    print_array(refarray)
else:
    # --- Default mode: references of the current page text only ---
    text = page.get()
    refarray = getrefs(text)
    print_array(refarray)
This page is GFDL. Feel free to use it for personal or educational purposes, but give a shout-out if you repost it or build on it further.