User:PDFbot/pdfbot.py
#!/usr/bin/python 
# -*- coding: utf-8  -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref:TemplateName]

Command line options:

-file:       Update pages listed in a text file.
-ref:        Update pages transcluding from a given page.
-cat:        Update pages from the given category.
-links:      Update pages linked from a given page.
-page:       Update only the given page.
-ns:         Restrict processing to the given namespace number (may be given more than once).
-delay:      Set the read and write delays, in seconds.
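
Example (illustrative invocation):
	python pdfbot.py -ref:Template:PDFlink -delay:30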

"""

#
# (c) Dispenser, 2007
#

import re, sys, time
import wikipedia, pagegenerators, catlib
import httplib, socket, urlparse, urllib
import codecs
try:
	import commonfixes
except ImportError:
	wikipedia.output('Unable to import commonfixes')
	commonfixes = None
try:
	import reflinks
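	# Monkey-patch ReferencesRobot.put_page below so reflinks hands its rewritten
	# text back to this bot instead of saving; PDFbot writes the page itself later.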
	def my_reflink_put_page(self, page, new):
		self.page = page
		self.new_text = new
	reflinks.ReferencesRobot.put_page=my_reflink_put_page
except ImportError:
	wikipedia.output('Unable to import reflinks')
	reflinks = None

# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Marumari )
listof404pages = '404-links.txt'

# Define global constants
readDelay  = 10	# seconds
writeDelay = 30 # seconds
mix_prefix = ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
SI_prefix  = ('bytes', '[[Kilobyte|kB]]', '[[Megabyte|MB]]', '[[Gigabyte|GB]]')
IEC_prefix = ('bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]')
# The following characters separate the url from the title: []"<>\ \n
# {|} is included since we're in a template
urlpattern = re.compile(r'http[s]?://[^][<>\s"{|}]*', re.IGNORECASE)
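# For example (hypothetical wikitext):
#	urlpattern.search(u'[http://www.example.com/report.pdf Report]').group(0)
#	-> u'http://www.example.com/report.pdf'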
httpHeader = {
	'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
	'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Keep-Alive': '30',
	'Connection': 'keep-alive',
}

def checkLink(location, useHEAD = True, counter = 5):
	try:
		while counter >= 0 and location:
			(scheme, site, path, query, frag) = urlparse.urlsplit(location)
			query = query and '?' + query or ''
			path = path or '/'
			if scheme == "http":
				conn = httplib.HTTPConnection(site)
			elif scheme == "https":
				conn = httplib.HTTPSConnection(site)
			else:
				return (location, -1, 'Unsupported Protocol', None, None)
			conn.set_debuglevel(0)
			socket.setdefaulttimeout(300)
			
			try:
				request = path.encode('ascii') + query.encode('ascii')
			except UnicodeEncodeError:
				encoding = 'utf-8'
				noencode = '~!^*()_-=&/|,.?;'
				request = unicode(urllib.quote(path.encode(encoding) + query.encode(encoding), noencode))
				
			if useHEAD:
				conn.request('HEAD', request, None, httpHeader)
			else:
				conn.request('GET', request, None, httpHeader)
			
			response = conn.getresponse()
			redirect = response.msg.getheader('location')
			content_length = response.msg.getheader('content-length')
			content_type   = response.msg.getheader('content-type')
			conn.close()
			
			counter -= 1
			if redirect:
				wikipedia.output( u'STATUS:	HTTP %s Moved: %s to %s' % (response.status, location, redirect) )
				if redirect.startswith("http"):
					location = urlparse.urljoin(location, redirect)
				else:
					location = redirect
			else:
				location = None
		return (location, response.status, response.reason, content_length, content_type)
	except httplib.HTTPException, arg:
		wikipedia.output(u'ERROR:	HTTP %s %s' % (arg, location))
		return (location, 52, "", None, None)
	except socket.timeout:
		return (location, 110, 'Connection timeout', None, None)
	except socket.error, arg:
		wikipedia.output(u'ERROR:	Socket %s %s' % (arg, location))
		return (location, arg[0], arg[1], None, None)
	except KeyboardInterrupt:
		raise
	except Exception, e: # catches those weird ones
		print u'Exception raised: %s' % e
		return (location, 0, "Exception %s" % e, None, None)
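
# checkLink() follows a bounded number of HTTP redirects (the counter argument)
# and returns a tuple of (pending_location, status, reason, content_length,
# content_type); the first element is None once no further redirect is pending.
# A sketch against a hypothetical URL that answers 200 with a PDF:
#	checkLink('http://www.example.com/report.pdf')
#	-> (None, 200, 'OK', '2345678', 'application/pdf')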
	
def binary_notation(size, base = 1024., prefix = IEC_prefix):
	"""
	Convert a byte count to a human-readable value
	"""
	a = float(size)
	exponent = 0
	while a >= 1000.:
		a /= base
		exponent += 3

	# Truncate and remove trailing dot
	byteSigs = str(a)[:4]
	if byteSigs.endswith('.'):
		byteSigs = byteSigs[:3]
	return byteSigs + '&nbsp;' + prefix[exponent / 3]
	# return '%3.3g&nbsp;%s' % (byteSigs, prefix[exponent / 3])
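	# For instance, binary_notation(2345678, prefix = mix_prefix) gives
	# '2.23&nbsp;MB', while the default IEC prefixes give '2.23&nbsp;[[Mebibyte|MiB]]'.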

def fix_broken_links(link):
	"""
	Returns link replacement for known broken links
	"""

	# Moving of resources
	link = link.replace('virginiadot.org/infoservice/', 'virginiadot.org/info/')
	link = link.replace('virginiadot.org/comtravel/', 'virginiadot.org/info/')
	link = link.replace('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/')
	link = link.replace('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/')
	
	# 301 Permanent Redirects
	link = link.replace('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/')
	link = link.replace('official-documents.co.uk/', 'official-documents.gov.uk/')
	link = link.replace('http://bmj.bmjjournals.com/', 'http://www.bmj.com/')
	link = link.replace('http://bris.ac.uk/', 'http://www.bristol.ac.uk/')
	link = link.replace('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/')
	link = link.replace('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/')
	link = link.replace('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/')
	link = link.replace('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/')
	link = link.replace('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/')
	link = link.replace('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/')
	link = link.replace('http://www.aapt.org.au/', 'http://www.ausapt.org.au/')
	link = link.replace('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/')
	
	return link
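
# For example (the path here is only illustrative), fix_broken_links() rewrites
#	'http://bris.ac.uk/poverty/downloads/report.pdf'
# into
#	'http://www.bristol.ac.uk/poverty/downloads/report.pdf'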

def update_size_parameter(template_text):
	m = re.search(r'(?s)\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*?(, (?P<size>[0-9]+) byte.*)?\}\}', fix_broken_links(template_text))
	link_text = m.group('text')
	location  = urlpattern.search(link_text).group(0)
	
	old_size = int(m.group('size') or 0)
	
	parameter_prefix = ''
	if '=' in link_text:
		parameter_prefix = '2='
	
	# Convert indirect HTML character references
	location = wikipedia.html2unicode(location)
	
	(redirect, response, reason, content_length, media_type) = checkLink(location)
	try:
		content_length = int(content_length)
	except (TypeError, ValueError):
		content_length = None
	if media_type and content_length and content_length != old_size:
		# I should really put in 404 error handling code, but this has been working just fine.
		if 'pdf' in media_type or 'octet-stream' in media_type or 'application/download' in media_type:
			# This was the old format using the comment
			#	return  u'{{%s|%s|%s%s<!-- %s, %d bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length )
			# However, comment was filled with generally non-useful information
			return  (not (old_size == 0) or template_text.count('|')<2, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length, prefix = mix_prefix)))
		else:
			wikipedia.output(u'FIXME:	Bad response: code: %d, type: %s, location: %s' % (response, media_type, location))
	# If anything else return template_text back
	if old_size:
		return  (False, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(old_size, prefix = mix_prefix)))
	else:
		return (False, template_text)
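
# A sketch of what update_size_parameter() returns, assuming the hypothetical URL
# answers with a 2,345,678-byte application/pdf response:
#	update_size_parameter(u'{{PDFlink|[http://www.example.com/report.pdf Report]}}')
#	-> (True, u'{{PDFlink|[http://www.example.com/report.pdf Report]|2.23&nbsp;MB}}')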

def process_article(page):
		try:
			deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
		except IOError:
			wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and ungzip it into the same directory')
			raise
		wikipedia.output('Getting page %s' % page.aslink())
		wikitext = page.get()
		
		# Fix Casing (Reduces the number of possible expressions)
		wikitext = re.sub(r'(?i)\{\{\s*(template:|)pdf', r'{{PDF', wikitext)
		wikitext = wikitext.replace('{{PDFLink', '{{PDFlink')
		
		# State point.  Count any changes as needing an update if they're after this line
		state0 = wikitext
		
		# [http {{PDF}}]
		wikitext = re.sub(r'(\[\w+://[^][<>"\s]+\s[^][\n]+?)\s*(\{\{(PDFlink|PDF)\}\})', r'\2\1', wikitext)
		
		# Convert hard coded pdf links  (ex: [http link] (pdf) )
		wikitext = re.sub(r'(\[\w+://[^][]*\]) *\((\[\[[^|\]]*)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\])?\)', r'{{PDFlink|\1}}', wikitext)
		
		# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
		wikitext = re.sub(r"[(]?\{\{(PDFlink|PDF)\}\}[)]? *((?P<quote>'*)\[\w+://[^][]*\](?P=quote)?)", r'{{\1|\2}}', wikitext)
		wikitext = re.sub(r'("?\[\w+://[^]]*\]"?)([^a-zA-Z0-9()]*) *[(]?\{\{(PDFlink|PDF) *\}\}[)]?', r'{{\3|\1}}\2', wikitext)
		
		# Convert with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
		if '{{PDF' in wikitext:
			wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[\w+://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(Portable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{PDFlink|\2}}\3', wikitext)
			wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[\w+://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)

		# Experimental: move {{PDF}} back in <ref> tag
		wikitext = re.sub(r'(<ref[^][{}<>]*>[^][<>=]*?)("?\[\w+://[^][<>\s"]+[^]\n]*\]"?)([^{}<>]*)\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
		
		# State point.  Correction of {{PDFlink}} template
		genfixState = wikitext
		
		# Remove PDFlink from citation templates
		# {{cite |format={{PDF}}}}
		wikitext = re.sub(r'(?s)(format *= *)(PDF|pdf)?[(]?\{\{PDF[^{}]*?\}\}[)]?', r'\1PDF', wikitext)
		#  {{cite.*?}}{{PDF}}
		wikitext = re.sub(r'(?s)(\{\{ *[Cc]ite[^}]*)(\}\}[^\w() ]*) *[(]?\{\{(PDF|PDFlink)\}\}[)]?', r'\1 |format=PDF\2', wikitext)
		# {{cite | lang= EN {{PDF}} }}
		wikitext = re.sub(r'(?s)(\{\{ *[Cc]ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
		# {{PDF| {{template...}} }}
		wikitext = re.sub(r'(?s)\{\{(PDFlink|PDF)\|\s*(\{\{[^{}]+?(\|[^{|}]+)?\}\})[\s|]*\}\}', r'\2', wikitext)
		# {{citation|url={{PDFlink|...}} }}
		wikitext = re.sub(r'(?i)\{\{(([Cc]itation|[Cc]ite)[^{}]+?)\{\{(PDFlink|PDF)\|([^{}]*?)(\|[^{|}]+)?\}\}', r'{{\1\4', wikitext)
		
		# State point.  Removal of {{PDFlink}} in certain instances
		state2 = wikitext
		cleantext = wikitext
		# This is ugly, since we need the comments to check the relative filesize
		for m in re.finditer(r'<!--.*?-->|<nowiki[^>]*>.*?</nowiki>', cleantext):
			if '{{PDF' in m.group():
				cleantext = cleantext.replace(m.group(), '')
		
		sizechange = 0
		for m in re.finditer(r'\{\{(?:PDFlink|PDF)\|[^{}]+?\}\}', cleantext):
			if 'http://' in m.group() or 'https://' in m.group():
				(changed, replacetext) = update_size_parameter(m.group())
				sizechange += changed and 1 or 0
	#			print "update page? %s"%(sizechange, )
				wikitext    = wikitext.replace(m.group(), replacetext)
				# Uncomment the line below to see the replacement text
	#			wikipedia.output(u'OUTPUT:	%s' % replacetext)
		
		for s in re.findall(ur'(?ui)\{\{(?:cite[\w\s]+)\|[^{}]+?\}\}', cleantext):
			murl = re.search('\|\s*url\s*=\s*(?P<url>http[s]?://[^][<>"\s|]+)(\||}})', s)
			if murl and 'PDF' in murl.group().upper() and (not re.search(ur'\|\s*format\s*=\s*[^\s{|}]+', s) or not re.search(ur'\|\s*(access\w+)\s*=\s*([^{|}]+?)\s*(?=[{|}])', s)) and not re.search(ur'\|\s*archiveurl\s*=\s*[^\s{|}]+', s):
				repl_url = fix_broken_links(murl.group('url'))
				(redirect, response, reason, content_length, media_type) = checkLink(repl_url)
				# media_type not given
				if not media_type:
					continue
				# Gone/Not Found error code
				elif (response == 410 or (response == 404 and (u'\t%s\t' % murl.group(1) in deadLinks))) and repl_url == murl.group('url'):
					wikitext = wikitext.replace(s, s + time.strftime("{{dead link|bot=PDFbot|date=%B %Y}}"))
				# valid PDF
				# python2.6code: any(item in media_type.lower() for item in ('pdf', 'octet-stream'))
				elif 'pdf' in media_type.lower() or 'octet-stream' in media_type.lower():
					replacetext = s
					replacetext = replacetext.replace(murl.group(), murl.group().replace(murl.group('url'), repl_url))
					if re.search(ur'\|\s*format\s*=\s*[^{|}]*[|}]', replacetext):
						# fill in the format=
						replacetext = re.sub(r'(\|\s*format\s*= ??)(\n* *[{|}])', r'\1PDF\2', replacetext)
					else:
						# add format=PDF (third last parameter)
						replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+(\s*= *)[^{|}]+)(\s*\|[^{|}]+)\}\}', r'\1\3format\4PDF\2\5}}', replacetext)

					accessed = re.search(ur'\|\s*(access\w+)\s*=\s*[^{|}\s]+', replacetext)
					# no access-anything filled in, add/fill accessdate
					if not accessed:
						# fill out accessdate if it exists
						replacetext = re.sub(r'(\|\s*accessdate\s*= ??)(?=\n* *[{|}])', time.strftime(r'\g<1>%Y-%m-%d'), replacetext)
						# if template doesn't contain accessdate then add it (last parameter)
						if not re.search(r'\|\s*accessdate\s*=', replacetext):
							replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+?(\s*= *)[^{|}]+?)(\s*)\}\}', time.strftime(r'\1\2\3accessdate\g<4>%Y-%m-%d\5}}'), replacetext)
							#replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+(\s*= *)[^{|}]+)(\s*\|[^{|}]+)\}\}', time.strftime(r'\1\2\5\3accessdate\g<4>%Y-%m-%d}}'), replacetext)

					# put back in
					wikitext = wikitext.replace(s, replacetext)
					sizechange += 1
					# Output the replacement text
					wikipedia.output(u'OUTPUT:  %s' % replacetext)

		# remove duplicate {{dead link}}
		dead_templates = r'[Dd]ead[ _]*link|[Dd]l|[Dd]l-s|404|[Bb]roken[ _]+link|[Cc]leanup-link'
		wikitext = re.sub('(\{\{(?:%s)[^}]*?\}\})+((</ref>)?\{\{(?:%s)[^}]*?\}\})'%(dead_templates, dead_templates), r'\2', wikitext)
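		# e.g. the doubled-up (hypothetical) '{{dead link}}{{Dead link|bot=PDFbot|date=May 2009}}'
		# collapses to just '{{Dead link|bot=PDFbot|date=May 2009}}'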

		# Figure out an edit message of what we did
		if sizechange:
			if state2 != state0:
				EditMsg = "Updating %d PDF%s and fixes" % (sizechange, sizechange>1 and 's' or '')
			else:
				EditMsg = "Updating %d PDF%s" % (sizechange, sizechange>1 and 's' or '')
		else:
			# state0: renamed templates
			# genfix: fixPDFlink
			# state2: removePDFlink
			#wikitext: -
			EditMsg = "General fixes for PDFs"
			if wikitext == state0:
				pass # text stayed the same
			elif wikitext == genfixState:
				EditMsg = "Correct {{PDFlink}} syntax"
			elif wikitext == state2:
				if genfixState == state0: # no fixes
					EditMsg = "Remove incorrect {{PDFlink}}"
				else: #fixes+removal
					pass
		wikipedia.setAction(EditMsg)
		
		updateSizes = wikitext

		# Fix equal sign problem
		# moved here to avoid changing edit message
		wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
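		# For example (hypothetical link), '{{PDFlink|[http://www.example.com/q.pdf a=b]}}'
		# becomes '{{PDFlink|1=[http://www.example.com/q.pdf a=b]}}' so that the '='
		# is not mistaken for a named template parameter.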
		
		# Alert me if the page contains a {{PDFlink}} with no URL in it
		if re.search(r'\{\{PDF(link|)\|[^:]+\}\}', wikitext):
			wikipedia.output(u'FIXME:	No link in {{PDFlink}} on %s' % page.aslink())
		
		# If the text has changed at all since the state point, upload it
		if (wikitext != state0 and sizechange) or state2 != state0 or updateSizes != wikitext:
			wikipedia.output('PDFs updated: % 3d' % sizechange)

			# [[pdf]] -> [[PDF]]
			wikitext = re.sub(r'\[\[pdf(?=[|\]])', '[[PDF', wikitext)

			# {{cite | format = pdf }}
			wikitext = re.sub(r'(?s)(?:([|]\s*format\s*=\s*)(?:\[\[|)[Pp][Dd][Ff](?:\]\]|))+(\s*[{|}])', r'\1PDF\2', wikitext)
			
			# Too many to fix individually as we come across them, so we don't count these with the fixes
			# Unlink PDF in format parameters
			wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)\[\[(adobe|portable|document|file|format|pdf|\.|\s|\(|\)|\|)+\]\]', r'\1PDF', wikitext)
			wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)(\s*\.?(adobe|portable|document|file|format|pdf|\(|\)))+?(?=\s*[|}])', r'\1PDF', wikitext)
			
			# Apply common fixes if available
			if commonfixes:
				wikitext = commonfixes.fix(page, text=wikitext)

			# Apply reflinks if available
			if reflinks:
				# Hackish hook: inject our updated text so reflinks operates on it
				page._contents = wikitext
				if page.get() != wikitext:
					wikipedia.output("Injected text wasn't returned with page.get()")
				elif reflinks.linksInRef.search(wikitext):
					reflinksbot = reflinks.ReferencesRobot(iter([page]))
					reflinksbot.run()
					if hasattr(reflinksbot, 'new_text'):
						if reflinksbot.page != page:
							raise RuntimeError('pages not the same')
						wikitext = reflinksbot.new_text
					# Reset edit summary
					wikipedia.setAction(EditMsg)
					

			try:
				wikipedia.output(u'WRITE:	Delta length of % 3d bytes.' % (len(wikitext)-len(state0)))
				page.put(wikitext)
			except Exception, e:
				wikipedia.output(u'ERROR:	Except %s raised while writing.' % e)
			
			# Pause to reduce load on the servers
			time.sleep(writeDelay)
		else:
			wikipedia.put_throttle()
			time.sleep(readDelay)
			pass
		
def main():
	site  = wikipedia.getSite()
	gen = None
	namespaces = [0]
	
	for arg in wikipedia.handleArgs():
		if arg.startswith('-ref:'):
			referredPage = wikipedia.Page(site, arg[5:])
			gen = pagegenerators.ReferringPageGenerator(referredPage)
		elif arg.startswith('-file:'):
			gen = pagegenerators.TextfilePageGenerator(arg[6:])
		elif arg.startswith('-cat:'):
			cat = catlib.Category(site, arg[5:])
			gen = pagegenerators.CategorizedPageGenerator(cat)
		elif arg.startswith('-links:'):
			pagelinks = wikipedia.Page(wikipedia.getSite(), arg[7:])
			gen = pagegenerators.LinkedPageGenerator(pagelinks)
		elif arg.startswith('-page:'):
			page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
			gen = iter([page])
		elif arg.startswith('-ns:'):
			namespaces.append(int(arg[4:]))
		elif arg.startswith('-delay:'):
			global readDelay, writeDelay
			readDelay = int(arg[7:])
			writeDelay = int(arg[7:])

	if not gen:
		wikipedia.showHelp(u'pdfbot')
		return

	wikipedia.output(u'Delays are %s s for reads and %s s for writes' % (readDelay, writeDelay,) )
	
	if namespaces != []:
		gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
	
	gen = pagegenerators.RedirectFilterPageGenerator(gen)
	
	for page in gen:
		if page.site().messages:
			wikipedia.output(u'Messages left on talk page, halting.')
			return
		process_article(page)
	wikipedia.output(u'Finished updating')

if __name__ == "__main__":
	try:
		main()
	finally:
		wikipedia.stopme()