GMRKB.ReadMWDump Parser
(Redirected from GMRKB.ReadMWDump)
Jump to navigation
Jump to search
A GMRKB.ReadMWDump Parser is a MediaWiki XML File Parser.
- See: xml.etree.ElementTree.
References
2018
# ReadMWDump.py package definition
import xml.etree.ElementTree as etree
class GetAllPages(object):
    """Iterator over the <page> blocks of a MediaWiki XML dump.

    Each item produced is a dict describing one page:

    * 'text'     -- text of the last <text> element seen inside the page,
                    or "" when it is missing or empty.
    * 'redirect' -- the 'title' attribute of the page's <redirect> element,
                    or "" when the page has no redirect.
    * 'title'    -- text of the page's <title> element; the key is only
                    present when a <title> element was seen.

    The dump is parsed incrementally, so pages are produced as soon as
    their closing tag has been read.
    """

    def __init__(self, inputFilename):
        """Begin an incremental parse of the dump.

        inputFilename is handed unchanged to ElementTree.iterparse.
        """
        # Both event kinds are needed: <redirect> is read on 'start' (its
        # data lives in an attribute), everything else on 'end' (when the
        # element text is complete).
        self.tokenParser = etree.iterparse(inputFilename, events=('start', 'end'))
        # Fields collected so far for the page currently being parsed.
        self.currentPage = {}

    def __iter__(self):
        """Return self; the object is its own iterator."""
        return self

    def __next__(self):
        """Return the dict for the next page in the dump.

        Raises StopIteration once the whole dump has been consumed.
        """
        for event, elem in self.tokenParser:
            tagName = strip_tag_name(elem)
            if event == 'end':
                if tagName == 'page':
                    # Found the end of </page> block
                    ret = self.currentPage
                    self.currentPage = {}
                    # Make sure that "text" is always populated, even for
                    # empty pages (0 bytes of text).
                    if not ret.get('text'):
                        ret['text'] = ""
                    if not ret.get('redirect'):
                        ret['redirect'] = ""
                    # Everything needed has been copied into ret; drop the
                    # element's children and text so memory does not grow
                    # with the size of the dump.
                    elem.clear()
                    return ret
                elif tagName == 'title':
                    self.currentPage['title'] = elem.text
                elif tagName == 'text':
                    self.currentPage['text'] = elem.text
            elif event == 'start':
                if tagName == 'redirect':
                    # Attributes are already available on the 'start' event.
                    # Tolerate a <redirect> without a title instead of
                    # raising KeyError.
                    self.currentPage['redirect'] = elem.attrib.get('title', "")
        # Entire XML dump was parsed, nothing more to read
        raise StopIteration

    # Python 2 spells the iterator method "next"; keep it as an alias so
    # existing callers (and Python 2 for-loops) continue to work.
    next = __next__
def strip_tag_name(elem):
    """Return elem.tag with any leading '{namespace}' prefix removed.

    Everything up to and including the last '}' in the tag is dropped;
    a tag that contains no '}' is returned unchanged.
    """
    # rpartition yields ('', '', tag) when '}' is absent, so index 2 is
    # the right answer in both cases.
    return elem.tag.rpartition("}")[2]
# example.py calling program
# set PYTHONIOENCODING=utf8 in MSWindows
import ReadMWDump

for Page in ReadMWDump.GetAllPages('rkb-mediawiki-20181005-1210.xml'):
    print("Found page [" + Page['title'] + "] with text [" + Page['text'] + "], redirect=" + Page['redirect'])