"""Emit a Gource custom log built from the AGENCY records of an XML file.

Originally added as admin/gourceimporter/import.py.  Every AGENCY element
contributes up to two pipe-separated lines on standard output (format
reference: https://code.google.com/p/gource/wiki/CustomLogFormat):

    <epoch of START_DATE>|andrew|A|<TITLE>
    <epoch of END_DATE>|andrew|D|<TITLE>

Problems with individual records are reported on standard error so that
standard output contains nothing but log lines.
"""
import calendar
import resource
import sys
from datetime import datetime

import dateutil
# NOTE: this wildcard import also binds the name ``parse``; the ``parse``
# function defined below rebinds it.  Date parsing always goes through the
# fully qualified ``dateutil.parser.parse``.
from dateutil.parser import *
from lxml import etree

# Child elements of AGENCY that are needed to build a log line.
_AGENCY_FIELDS = ('START_DATE', 'END_DATE', 'TITLE')


def _to_epoch(date_text):
    """Return *date_text* as integer seconds since the epoch.

    Raises whatever ``dateutil.parser.parse`` raises for text it cannot
    interpret (``ValueError`` / ``OverflowError``).
    """
    # NOTE(review): presumably this turns a zeroed month/day such as
    # 19010000 into 1 January (19010101) -- confirm against the real data.
    # str.replace rewrites the first '0000' it finds, so 20000000 becomes
    # 20101000, which the date parser is likely to reject.
    normalised = date_text.replace('0000', '0101')
    moment = dateutil.parser.parse(normalised)
    # timegm() interprets the time tuple as UTC.
    return calendar.timegm(moment.utctimetuple())


def _emit(agency, date_field, action):
    """Print one log line for *agency*, or explain on stderr why not.

    agency     -- dict mapping the tags in _AGENCY_FIELDS to element text
    date_field -- 'START_DATE' or 'END_DATE'
    action     -- action code written to the log ('A' or 'D')

    A missing or empty date is skipped silently.
    """
    date_text = agency.get(date_field)
    if not date_text:
        return

    title = agency.get('TITLE')
    if title is None:
        sys.stderr.write("skipping %s %r: agency has no TITLE\n"
                         % (date_field, date_text))
        return

    try:
        epoch = _to_epoch(date_text)
    except (ValueError, OverflowError, TypeError) as err:
        # Best effort: one bad date must not abort the whole import.
        sys.stderr.write("skipping %s %r: %s\n" % (date_field, date_text, err))
        return

    print(str(epoch) + "|andrew|" + action + "|" + title)


def parse(fp):
    """Stream the XML in *fp* and print log lines for every AGENCY element.

    fp -- a filename or a file object opened in binary mode.

    The document is read incrementally with ``etree.iterparse`` and each
    AGENCY element is released once handled (approach taken from
    http://stackoverflow.com/questions/9809469/python-sax-to-lxml-for-80gb-xml#9814580).
    """
    for _event, elem in etree.iterparse(fp, events=('end',)):
        if elem.tag != 'AGENCY':
            continue

        # Read this agency's own children *before* emitting, so that each
        # record (including the last one in the file) is written out.
        agency = {}
        for child in elem:
            if child.tag in _AGENCY_FIELDS:
                agency[child.tag] = child.text

        _emit(agency, 'START_DATE', 'A')
        _emit(agency, 'END_DATE', 'D')

        # Release the handled element, then drop the already-processed
        # siblings that the parent still references.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
        # To inspect memory usage while tuning, print
        # resource.getrusage(resource.RUSAGE_SELF).ru_maxrss here.


if __name__ == "__main__":
    # Binary mode: the XML parser does its own decoding.
    with open("../agency-sample.xml", "rb") as f:
        parse(f)