gource master
[disclosr.git] / admin / gourceimporter / import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from lxml import etree
import resource
# http://stackoverflow.com/questions/9809469/python-sax-to-lxml-for-80gb-xml#9814580
from dateutil.parser import *
import dateutil
from datetime import datetime
import calendar
 
def _print_gource_event(agency, date_key, action):
    """Print one gource custom-log line for the date stored under date_key.

    Does nothing when date_key is not in agency. Any failure while converting
    the date or building the line is reported on stderr and that event is
    skipped, so one bad record does not abort the whole import.

    agency   -- dict mapping child tag name to element text for one AGENCY
    date_key -- key of the date to convert ('START_DATE' or 'END_DATE')
    action   -- single-letter type field written to the log line
    """
    if date_key not in agency:
        return
    # Local import: only this helper needs sys.
    import sys
    raw_date = agency[date_key]
    try:
        # NOTE(review): presumably '0000' marks an unknown month/day and is
        # mapped to 1 January, but replace() rewrites a '0000' run anywhere
        # in the string (e.g. '20000101') -- confirm the input date format.
        when = dateutil.parser.parse(raw_date.replace('0000', '0101'))
        epoch = calendar.timegm(when.utctimetuple())
        # Line layout: https://code.google.com/p/gource/wiki/CustomLogFormat
        print(str(epoch) + "|andrew|" + action + "|" + agency['TITLE'])
    except Exception as e:
        # Deliberately broad (best-effort per record). Diagnostics go to
        # stderr so they never end up inside the log written to stdout.
        sys.stderr.write("skipping %s %r: %r\n" % (date_key, raw_date, e))


def parse(fp):
    """Stream AGENCY elements from fp and print gource log lines for them.

    For every AGENCY element the text of its START_DATE, END_DATE and TITLE
    children is collected; a line with type 'A' is printed for the start date
    and one with type 'D' for the end date, when those children exist.

    Each AGENCY is emitted as soon as its end tag is seen, so the last one in
    the document is included as well.

    fp -- filename or file object accepted by lxml's etree.iterparse
    """
    wanted = ('START_DATE', 'END_DATE', 'TITLE')
    for _event, elem in etree.iterparse(fp, events=('end',)):
        if elem.tag != 'AGENCY':
            continue

        # On the 'end' event all children of the element are available.
        agency = {}
        for child in elem:
            if child.tag in wanted:
                agency[child.tag] = child.text

        _print_gource_event(agency, 'START_DATE', 'A')
        _print_gource_event(agency, 'END_DATE', 'D')

        # Free the processed element and the siblings before it so memory
        # use stays flat on very large inputs.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
 
 
# Script entry point: convert the sample agency XML into gource log lines.
# The file is opened in binary mode so the XML parser receives raw bytes and
# handles the document's encoding itself.
with open("../agency-sample.xml", "rb") as f:
    parse(f)