"""Turn <AGENCY> records from an XML file into a Gource custom log on stdout.

Each agency yields up to two log lines of the form
``<epoch>|andrew|<A or D>|<TITLE>``: an ``A`` (added) line for its
START_DATE and a ``D`` (deleted) line for its END_DATE.
Log format: https://code.google.com/p/gource/wiki/CustomLogFormat
"""
import calendar
import resource  # only needed by the optional memory probe at the end of parse()
import sys
from datetime import datetime

import dateutil.parser
from dateutil.parser import *  # noqa: F401,F403 -- kept so existing names stay available
from lxml import etree

# Direct children of <AGENCY> that are copied into the per-record dict.
_AGENCY_FIELDS = ('START_DATE', 'END_DATE', 'TITLE')


def _emit_event(agency, date_key, change_type):
    """Print one Gource log line for ``agency[date_key]``.

    Args:
        agency: dict mapping child tag names to their text (text may be None).
        date_key: 'START_DATE' or 'END_DATE'.
        change_type: Gource change code written into the line ('A' or 'D').

    Nothing is printed when the date is missing or empty.  Any failure while
    building or printing the line (unparseable date, missing/empty TITLE, ...)
    is reported on stderr together with the raw date, and the record is
    skipped; diagnostics go to stderr so they never end up inside the log
    that is written to stdout.
    """
    raw_date = agency.get(date_key)
    if not raw_date:
        # Absent tag or empty element: there is no date to report.
        return
    try:
        # NOTE(review): presumably this turns an unknown month/day
        # ('YYYY0000') into January 1st.  str.replace matches '0000'
        # anywhere in the string, not only at the end -- confirm against
        # the real date format.
        parsed = dateutil.parser.parse(raw_date.replace('0000', '0101'))
        epoch = calendar.timegm(parsed.utctimetuple())
        print(str(epoch) + "|andrew|" + change_type + "|" + agency['TITLE'])
    except Exception as e:
        # Best effort: one bad record must not abort the whole run.
        sys.stderr.write("%s\n%s\n" % (e, raw_date))


def parse(fp):
    """Stream-parse ``fp`` and print Gource log lines for every <AGENCY>.

    Args:
        fp: filename or file-like object accepted by ``lxml.etree.iterparse``.

    The file is processed incrementally; each <AGENCY> element is released
    as soon as it has been handled, so memory use does not grow with the
    size of the input.
    See http://stackoverflow.com/questions/9809469/python-sax-to-lxml-for-80gb-xml#9814580
    """
    for _event, elem in etree.iterparse(fp, events=('end',)):
        if elem.tag != 'AGENCY':
            continue

        # Collect the fields of *this* agency first, then emit them, so that
        # every record -- including the last one in the file -- is written.
        agency = {}
        for child in elem:
            if child.tag in _AGENCY_FIELDS:
                agency[child.tag] = child.text

        _emit_event(agency, 'START_DATE', 'A')
        _emit_event(agency, 'END_DATE', 'D')

        # Drop the processed element and its already-seen preceding siblings
        # so the partially built tree stays small.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Optional memory probe:
    # print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)


# Binary mode lets the XML parser handle the document's encoding itself.
with open("../agency-sample.xml", "rb") as f:
    parse(f)