User:Gaming Euan
Appearance
#!/usr/bin/env python
import re
import urllib
import urllib2

import yaml
class WikipediaError(Exception):
    """Error raised by the Wikipedia client in this module.

    The single argument carries the failure detail supplied at the raise
    site: an HTTP status code, a URL error reason, or a message string.
    """
class Wikipedia:
    """Small HTTP client for one language edition of Wikipedia.

    Provides raw article source (``article``), image bytes (``image``) and
    full-text search results (``search``). Every network failure is reported
    as ``WikipediaError``.
    """

    # URL templates; the first placeholder is always the language subdomain.
    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'

    # Maximum number of redirect hops article() follows before giving up,
    # so a redirect loop cannot recurse forever.
    max_redirects = 10

    # "#REDIRECT [[Target]]", case-insensitive, with optional whitespace
    # between the keyword and the link.
    _redirect_re = re.compile(r'#REDIRECT\s*\[\[([^\[\]]+)\]\]', re.IGNORECASE)

    def __init__(self, lang):
        """Create a client for the ``<lang>.wikipedia.org`` edition."""
        self.lang = lang

    def __fetch(self, url):
        """Open *url* and return the response object from ``urllib2.urlopen``.

        Raises:
            WikipediaError: with the HTTP status code on an HTTP error, or
                with the failure reason on any other URL error.
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        try:
            return urllib2.urlopen(request)
        # HTTPError is a subclass of URLError, so it must be handled first.
        except urllib2.HTTPError as e:
            raise WikipediaError(e.code)
        except urllib2.URLError as e:
            raise WikipediaError(e.reason)

    @classmethod
    def _redirect_target(cls, content):
        """Return the title a ``#REDIRECT`` page points to, or None.

        A trailing ``#section`` anchor is dropped because it is not part of
        the page title.
        """
        match = cls._redirect_re.match(content)
        if match is None:
            return None
        target = match.group(1).partition('#')[0].strip()
        return target or None

    @staticmethod
    def _thumb_url(original_url, lang, image, thumb):
        """Build the thumbnail URL for an already-resolved image URL.

        Appends ``/<thumb>px-<image>`` and inserts a ``thumb/`` path segment
        after the repository directory (``/commons/`` or ``/<lang>/``).
        Only the first matching segment is rewritten: a later directory that
        happens to equal the language code (e.g. ``/d/de/``) is left alone.
        """
        url = original_url + '/' + str(thumb) + 'px-' + image
        if '/commons/' in url:
            return url.replace('/commons/', '/commons/thumb/', 1)
        return url.replace('/' + lang + '/', '/' + lang + '/thumb/', 1)

    @staticmethod
    def _clean_snippet(snippet):
        """Strip markup tags from a search snippet and tidy its spacing."""
        text = re.sub(r'(?m)<.*?>', '', snippet)
        text = re.sub(r'\s+', ' ', text)
        # Removing tags leaves stray spaces before punctuation.
        text = text.replace(' . ', '. ')
        text = text.replace(' , ', ', ')
        return text.strip()

    def article(self, article):
        """Return the raw source of *article*, following redirects.

        Raises:
            WikipediaError: if the request fails, a redirect page has no
                parsable target, or more than ``max_redirects`` hops occur.
        """
        title = article
        for _ in range(self.max_redirects + 1):
            url = self.url_article % (self.lang, urllib.quote_plus(title))
            content = self.__fetch(url).read()
            if not content.upper().startswith('#REDIRECT'):
                return content
            title = self._redirect_target(content)
            if title is None:
                raise WikipediaError('Can\'t found redirect article.')
        raise WikipediaError('Too many redirects.')

    def image(self, image, thumb=None):
        """Return the bytes of *image*, or of its thumbnail.

        Args:
            image: file name as used on the wiki.
            thumb: optional thumbnail width in pixels (string or int). When
                given, the thumbnail is returned instead of the full file.

        Raises:
            WikipediaError: if any request fails.
        """
        # Quoted like the title and query in the other methods.
        url = self.url_image % (self.lang, urllib.quote_plus(image))
        result = self.__fetch(url)
        if not thumb:
            return result.read()
        # Only the final (redirected) URL is needed for the thumbnail; the
        # full-size body is not downloaded.
        resolved = result.geturl()
        result.close()
        return self.__fetch(
            self._thumb_url(resolved, self.lang, image, thumb)).read()

    def search(self, query, page=1, limit=10):
        """Search for *query* and return one page of results.

        Args:
            query: search text.
            page: 1-based page number.
            limit: number of results per page.

        Returns:
            A list of dicts with the keys ``title``, ``snippet`` (markup
            removed) and ``wordcount``; empty when nothing matched.

        Raises:
            WikipediaError: if the request fails.
        """
        offset = (page - 1) * limit
        url = self.url_search % (self.lang, urllib.quote_plus(query),
                                 offset, limit)
        content = self.__fetch(url).read()
        # SECURITY NOTE(review): yaml.load() on data received over the
        # network can construct arbitrary Python objects. Switching to
        # yaml.safe_load() is recommended once confirmed that the response
        # only contains plain mappings, lists and scalars.
        parsed = yaml.load(content)
        hits = parsed['query']['search'] or []
        return [
            {
                'title': hit['title'].strip(),
                'snippet': self._clean_snippet(hit['snippet']),
                'wordcount': hit['wordcount'],
            }
            for hit in hits
        ]
if __name__ == '__main__':
    # Smoke test: calls each public method once against the 'simple'
    # edition. Requires network access; any failure surfaces as an
    # uncaught exception, otherwise 'OK' is printed.
    wiki = Wikipedia('simple')
    wiki.article('Uruguay')
    wiki.image('Bono_at_the_2009_Tribeca_Film_Festival.jpg', '640')
    wiki.search('Wikipedia')
    # Parenthesized form prints the same text on Python 2 and Python 3.
    print('OK')