gource master
[disclosr.git] / admin / gourceimporter / import.py
blob:a/admin/gourceimporter/import.py -> blob:b/admin/gourceimporter/import.py
  from lxml import etree
  import resource
  # http://stackoverflow.com/questions/9809469/python-sax-to-lxml-for-80gb-xml#9814580
  from dateutil.parser import *
  import dateutil
  from datetime import datetime
  import calendar
   
  def parse(fp):
  context = etree.iterparse(fp, events=('end',))
  agency = {}
  for action, elem in context:
  if elem.tag=='AGENCY':
  # processing goes here
  # https://code.google.com/p/gource/wiki/CustomLogFormat
   
  #print agency
  if 'START_DATE' in agency:
  try:
  epoch = calendar.timegm(dateutil.parser.parse(agency['START_DATE'].replace('0000','0101')).utctimetuple())
  print str(epoch) + "|andrew|A|" + agency['TITLE']
  except Exception, e:
  print e
  print agency['START_DATE']
  pass
  if 'END_DATE' in agency:
  try:
  epoch = calendar.timegm(dateutil.parser.parse(agency['END_DATE'].replace('0000','0101')).utctimetuple())
  print str(epoch) + "|andrew|D|" + agency['TITLE']
  except Exception, e:
  print agency['END_DATE']
  pass
   
   
  agency = {}
  #print etree.tostring(elem)
  #print len(list(elem))
  for child in list(elem):
  if child.tag in ['START_DATE','END_DATE','TITLE']:
  agency[child.tag] = child.text
  else:
  #print child.tag
  pass
  elem.clear()
  while elem.getprevious() is not None:
  del elem.getparent()[0]
  #memory usage
  #print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
   
   
  with open("../agency-sample.xml") as f:
  parse(f)