User:Umeboshi/Tools/enwiki-xml-splitter
Appearance
< User:Umeboshi | Tools
#!/usr/bin/python
# This is a script to help split the large xml database dump
# Use 7-zip to extract to stdout -- 7z e -so /path/to/archive.7z | enwiki-xml-splitter
# The page nodes will be extracted into the current directory until the pages per archive
# limit is reached. Then those pages are put in a new 7z archive and removed.
# Both page and archive filenames use 9 digit zero padded numbers.
# Arguments to the -z option need to be quoted.
import codecs
import glob
import os
import sys
import xml.parsers.expat
from hashlib import md5
from optparse import OptionParser
from xml.sax.saxutils import escape, quoteattr
#from xattr import xattr
# Command-line handling.  Everything below runs at import time and leaves the
# module-level settings (archive_prefix, index_filename, zipcmd,
# pages_per_archive, opts) that the helper functions and ParserHandler read.
usage = """usage: %prog [options]
This is a script to help split the large xml database dump
Use 7-zip to extract to stdout -- 7z e -so /path/to/archive.7z | enwiki-xml-splitter
The page nodes will be extracted into the current directory until the pages per archive
limit is reached. Then those pages are put in a new 7z archive and removed.
Both page and archive filenames use 9 digit zero padded numbers.
Arguments to the -z option need to be quoted.
The index file is a text file that matches page titles to page-#.xml files and their
respective archives.
"""
parser = OptionParser(usage=usage)
parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                  default=False, help="this does absolutely nothing")
parser.add_option('--archive-prefix', action='store', dest='archive_prefix',
                  default='enwiki-archive', help="prefix for archive filenames")
parser.add_option('--index-file', action='store', dest='index_filename',
                  default='enwiki-indexfile', help="filename for indexfile")
parser.add_option('--archive-path', action='store', dest='archive_path',
                  default='', help="path to place archives and index in (default .)")
parser.add_option('-p', '--pages-per-archive', action='store', dest='pages_per_archive',
                  default=10, type='int',
                  help="number of page files collected before an archive is made (default 10)")
parser.add_option('-z', '--zipcmd', action='store', dest='zipcmd',
                  default='7z a -t7z -mfb=64 -mx=7',
                  help="archiver command, quoted as one argument; the archive "
                       "name and the page files are appended to it")
parser.add_option('-k', '--keep-pages', action='store_false', dest='remove_pages',
                  default=True, help="do not delete page files after archiving them")
opts, args = parser.parse_args(sys.argv[1:])

# The handler computes "(pagenum - 1) % pages_per_archive", so zero would
# raise ZeroDivisionError on the first <page>; reject it (and negatives) here.
if opts.pages_per_archive < 1:
    parser.error('--pages-per-archive must be at least 1')

# os.path.join('', name) returns name unchanged, so the default empty
# archive path needs no special case.
archive_prefix = os.path.join(opts.archive_path, opts.archive_prefix)
index_filename = os.path.join(opts.archive_path, opts.index_filename)
zipcmd = opts.zipcmd
pages_per_archive = opts.pages_per_archive
def archivefilename(archivenum):
    """Return the path of archive number *archivenum*.

    The result is the module-level ``archive_prefix``, a dash, the number
    zero-padded to nine digits, and the ``.7z`` extension.
    """
    padded_number = '%09d' % archivenum
    return '%s-%s.7z' % (archive_prefix, padded_number)
def pagefilename(pagenum):
    """Return the file name used for page number *pagenum*.

    The number is zero-padded to nine digits, e.g. ``page-000000042.xml``.
    The name carries no directory part.
    """
    padded_number = '%09d' % pagenum
    return 'page-' + padded_number + '.xml'
def new_pagefile(pagenum):
    """Create the file for page *pagenum* and return it opened for writing.

    The file is named by ``pagefilename(pagenum)``, is truncated if it
    already exists, and encodes written text as UTF-8.  The caller owns the
    returned file object and is responsible for closing it.
    """
    # The name is computed once; the original built it twice and left the
    # first result in an unused local.
    return codecs.open(pagefilename(pagenum), 'w', encoding='utf8')
def make_indexline(archivenum, pagenum, title):
    """Build one index-file line for a page.

    The line has the form ``<archive>,<page file>:<TAB><title><NEWLINE>``.
    Only the base name of the archive is recorded, so the index does not
    depend on the directory the archives were written to.
    """
    archive_name = os.path.basename(archivefilename(archivenum))
    location = ','.join((archive_name, pagefilename(pagenum)))
    return '%s:\t%s\n' % (location, title)
def archive_pagefile(pagenum, archivenum, remove=True):
    """Add a single page file to an archive, then optionally delete it.

    pagenum    -- number of the page file to archive
    archivenum -- number of the archive that receives the file
    remove     -- delete the page file once it has been archived

    The archiver is run through the shell as
    ``<zipcmd> <archive name> <page name>``.  If it exits with a non-zero
    status a message is written to stderr and the page file is kept, so a
    failed run never destroys the only copy of a page.
    """
    pfilename = pagefilename(pagenum)
    afilename = archivefilename(archivenum)
    print('archiving file %s to archive %s' % (pfilename, afilename))
    status = os.system('%s %s %s' % (zipcmd, afilename, pfilename))
    if status != 0:
        sys.stderr.write('archiver exited with status %s; keeping %s\n'
                         % (status, pfilename))
        return
    if remove:
        os.remove(pfilename)
def archive_pagefiles(archivenum, remove=True):
    """Put every ``page-*.xml`` file of the current directory in one archive.

    archivenum -- number of the archive to create
    remove     -- delete the page files afterwards

    An archive that already exists is left untouched (this is what lets an
    interrupted run be resumed).  Otherwise the archiver is run through the
    shell as ``<zipcmd> <archive name> page-*.xml``.  If it exits with a
    non-zero status a message is written to stderr and the page files are
    kept, so a failed run never destroys the only copy of the pages.

    NOTE(review): the source this was recovered from had lost its
    indentation; the clean-up is assumed to run in the "skipping" case as
    well as after a successful archive -- confirm against the original.
    """
    afilename = archivefilename(archivenum)
    if os.path.exists(afilename):
        print('skipping archive %s' % afilename)
    else:
        print('creating archive %s' % afilename)
        status = os.system('%s %s page-*.xml' % (zipcmd, afilename))
        if status != 0:
            sys.stderr.write('archiver exited with status %s; keeping page files\n'
                             % status)
            return
    if remove:
        # Same files the old "rm -f page-*.xml" removed, without needing a
        # Unix shell.
        for pfilename in glob.glob('page-*.xml'):
            os.remove(pfilename)
class ParserHandler(object):
    """Expat callbacks that split the dump into one file per ``<page>``.

    Markup is re-serialised into the current page file as it is parsed.
    Page file number 1 is opened by the constructor and therefore receives
    everything that precedes the first ``<page>``; each ``<page>`` start tag
    then switches to a new file.  Every ``pages_per_archive`` pages the
    accumulated files are handed to ``archive_pagefiles``.

    While the archive that the current pages belong to already exists on
    disk, nothing is written or indexed, so a previous run can be resumed.
    """

    # Cache for _current_archive_exists(): the archive number that was last
    # checked and the answer obtained for it.
    _checked_archivenum = None
    _checked_exists = False

    def __init__(self):
        """Open the first page file and the index file (append mode)."""
        self.pagenum = 1
        self.archivenum = 1
        self.outfile = new_pagefile(self.pagenum)
        self.indexfile = codecs.open(index_filename, 'a', encoding='utf8')
        self.inpage = False
        self.intitle = False
        self.current_title = None

    def _current_archive_exists(self):
        """Return True if the archive for the current pages is already on disk.

        This is asked on every callback, so the filesystem is consulted only
        once per archive number.  This class never creates the archive for
        the number it is currently filling (archive_pagefiles is called just
        before the number is incremented), so the answer cannot change in
        between through this script's own actions.
        """
        if self._checked_archivenum != self.archivenum:
            self._checked_exists = os.path.exists(archivefilename(self.archivenum))
            self._checked_archivenum = self.archivenum
        return self._checked_exists

    def start_element(self, name, attrs):
        """Handle a start tag: rotate files on ``<page>``, then echo the tag.

        name  -- element name
        attrs -- mapping of attribute names to values
        """
        if name == 'page':
            self.pagenum += 1
            self.inpage = True
            # Close (and thereby flush) the previous page file before it can
            # be archived and removed; otherwise buffered text of the last
            # page in a batch may never reach the archive.  Closing an
            # already closed file is a no-op.
            self.outfile.close()
            if not (self.pagenum - 1) % pages_per_archive:
                archive_pagefiles(self.archivenum, remove=opts.remove_pages)
                self.archivenum += 1
            # make sure empty file stays out of archive
            if not self._current_archive_exists():
                self.outfile = new_pagefile(self.pagenum)
        elif name == 'title':
            self.intitle = True
        if not self._current_archive_exists():
            # quoteattr() adds the surrounding quotes and escapes the value;
            # a bare key=value pair is not well-formed XML.
            attributes = ' '.join('%s=%s' % (key, quoteattr(value))
                                  for key, value in attrs.items())
            tag = '%s %s' % (name, attributes) if attributes else name
            # NOTE: the newline after each start tag is part of the existing
            # output format; it ends up inside the element's text.
            self.outfile.write('<%s>\n' % tag)

    def end_element(self, name):
        """Handle an end tag: echo it and, on ``</page>``, index the page."""
        if not self._current_archive_exists():
            self.outfile.write('</%s>' % name)
        if name == 'page':
            self.inpage = False
            if not self._current_archive_exists():
                print('indexing %s' % self.current_title)
                indexline = make_indexline(self.archivenum, self.pagenum,
                                           self.current_title)
                self.indexfile.write(indexline)
            else:
                print('skipping %s' % self.current_title)
            self.current_title = None
        if name == 'title':
            self.intitle = False

    def char_data(self, data):
        """Handle text: collect the title and echo the text to the page file.

        Expat may deliver one text node in several pieces, so title text is
        accumulated until ``</page>`` resets it.
        """
        if self.intitle:
            if self.current_title is None:
                self.current_title = data
            else:
                self.current_title += data
        if not self._current_archive_exists():
            # Expat has already decoded entities such as &lt; and &amp;, so
            # they must be escaped again for the output to stay valid XML.
            # The title kept for the index stays unescaped.
            self.outfile.write(escape(data))
# Driver: feed the XML dump arriving on stdin through the splitting handler.
ph = ParserHandler()
p = xml.parsers.expat.ParserCreate()
p.StartElementHandler = ph.start_element
p.EndElementHandler = ph.end_element
p.CharacterDataHandler = ph.char_data
# Expat needs a byte stream.  On Python 3 that is sys.stdin.buffer; on
# Python 2 sys.stdin has no such attribute and is used directly.
infile = getattr(sys.stdin, 'buffer', sys.stdin)
try:
    p.ParseFile(infile)
finally:
    # Flush the last page and the index even when parsing fails part-way.
    ph.outfile.close()
    ph.indexfile.close()
# The pages written since the last full batch are already indexed under the
# current archive number, so create that archive as well.
archive_pagefiles(ph.archivenum, remove=opts.remove_pages)