gource master
[disclosr.git] / admin / gourceimporter / import.py
blob:a/admin/gourceimporter/import.py -> blob:b/admin/gourceimporter/import.py
--- a/admin/gourceimporter/import.py
+++ b/admin/gourceimporter/import.py
@@ -1,1 +1,53 @@
+from lxml import etree
+import resource
+# http://stackoverflow.com/questions/9809469/python-sax-to-lxml-for-80gb-xml#9814580
+from dateutil.parser import *
+import dateutil
+from datetime import datetime
+import calendar
 
+def parse(fp):
+    """Stream AGENCY elements from *fp* and print Gource custom-log lines.
+
+    For each AGENCY element a line ``<epoch>|andrew|A|<TITLE>`` is printed for
+    its START_DATE and ``<epoch>|andrew|D|<TITLE>`` for its END_DATE; see
+    https://code.google.com/p/gource/wiki/CustomLogFormat
+
+    Records are emitted when their own element closes, so the last AGENCY in
+    the input is reported too (it used to be collected but never printed).
+
+    Args:
+        fp: source passed straight to ``etree.iterparse``.
+    """
+    for _action, elem in etree.iterparse(fp, events=('end',)):
+        if elem.tag != 'AGENCY':
+            continue
+        # An 'end' event fires once the element is closed, so its children
+        # can be read right away.
+        agency = {}
+        for child in elem:
+            if child.tag in ('START_DATE', 'END_DATE', 'TITLE'):
+                agency[child.tag] = child.text
+        for key, change in (('START_DATE', 'A'), ('END_DATE', 'D')):
+            if key not in agency:
+                continue
+            try:
+                # NOTE(review): '0000' is presumably an "unknown month/day"
+                # placeholder rewritten to 1 January -- confirm with the data.
+                when = dateutil.parser.parse(agency[key].replace('0000', '0101'))
+                epoch = calendar.timegm(when.utctimetuple())
+                print(str(epoch) + "|andrew|" + change + "|" + agency['TITLE'])
+            except Exception as e:
+                # Best effort: report the unusable record and carry on.
+                print(e)
+                print(agency[key])
+        # Release this element and its already-handled preceding siblings to
+        # keep memory use down (resource.getrusage(...).ru_maxrss measures it).
+        elem.clear()
+        while elem.getprevious() is not None:
+            del elem.getparent()[0]
+
+
+with open("../agency-sample.xml", "rb") as f:  # bytes: the XML parser applies the document's own encoding
+    parse(f)
+