|
from lxml import etree |
|
import resource |
|
# See: http://stackoverflow.com/questions/9809469/python-sax-to-lxml-for-80gb-xml#9814580
|
from dateutil.parser import * |
|
import dateutil |
|
from datetime import datetime |
|
import calendar |
|
|
|
# Child tags of an AGENCY element that are needed to build log lines.
_AGENCY_FIELDS = ('START_DATE', 'END_DATE', 'TITLE')


def _print_gource_event(agency, date_key, action_code):
    """Print one gource custom-log line for the date stored under date_key.

    The line has the form ``<epoch>|andrew|<action_code>|<TITLE>``
    (see https://code.google.com/p/gource/wiki/CustomLogFormat).
    Nothing is printed when *date_key* is absent from *agency*.

    Any failure while building the line (unparseable date, missing or empty
    TITLE, ...) is reported by printing the exception followed by the raw
    date value; it is never propagated, so one bad record does not stop the
    run.
    NOTE(review): diagnostics go to stdout, the same stream as the log
    lines, as before -- consider routing them to stderr.
    """
    if date_key not in agency:
        return
    try:
        # NOTE(review): presumably '0000' marks an unknown month/day and is
        # mapped to January 1st -- TODO confirm against the data. Be aware
        # that str.replace substitutes every occurrence of '0000', not only
        # a trailing one.
        raw_date = agency[date_key].replace('0000', '0101')
        # Fully qualified call: this module defines its own parse(), so the
        # bare name does not refer to dateutil's parser here.
        when = dateutil.parser.parse(raw_date)
        epoch = calendar.timegm(when.utctimetuple())
        print(str(epoch) + "|andrew|" + action_code + "|" + agency['TITLE'])
    except Exception as e:
        print(e)
        print(agency[date_key])


def parse(fp):
    """Stream AGENCY records from an XML source and print gource log lines.

    fp is handed to ``etree.iterparse`` unchanged. For every element whose
    tag is ``AGENCY`` the text of its direct START_DATE, END_DATE and TITLE
    children is collected; an "A" line is printed for START_DATE and a "D"
    line for END_DATE when those children are present.

    Each record is emitted on its own end event. (Previously the lines were
    printed from the data of the preceding AGENCY, so output lagged by one
    record and the final AGENCY in the file was never emitted.)

    Returns None; all output goes through print.
    """
    context = etree.iterparse(fp, events=('end',))

    for _event, elem in context:
        if elem.tag != 'AGENCY':
            continue

        # On the 'end' event the element is fully parsed, so its children
        # can be read right away. If a tag repeats, the last one wins.
        agency = {}
        for child in elem:
            if child.tag in _AGENCY_FIELDS:
                agency[child.tag] = child.text

        _print_gource_event(agency, 'START_DATE', 'A')
        _print_gource_event(agency, 'END_DATE', 'D')

        # Release the finished record and drop already-processed preceding
        # siblings so the in-memory tree does not grow with the input.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        # Debugging aid for checking peak memory:
        # print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
|
|
|
|
|
# Open in binary mode so the XML parser receives the raw bytes and applies
# the document's own encoding declaration instead of a locale-dependent
# text decoding.
with open("../agency-sample.xml", "rb") as f:
    parse(f)
|
|