ABN import formatting
[dcaas.git] / data / xls2csv.py
blob:a/data/xls2csv.py -> blob:b/data/xls2csv.py
--- a/data/xls2csv.py
+++ b/data/xls2csv.py
@@ -1,17 +1,38 @@
-''' DCaaS Services XLS to CSV converter
-''' Converts the DCaaS XLS file to CSV while removing line breaks and other special characters
-''' Alex Sadleir <maxious@lambdacomplex.org>
+"""DCaaS Services XLS to CSV converter
+   Converts the DCaaS XLS file to CSV while removing line breaks and other special characters
+   Alex Sadleir <maxious@lambdacomplex.org>
+
+"""
+
 from xlrd import *
 import unicodecsv
 import glob
 import codecs
 
-def cleanseValue(value):
+def cleanseValue(column,value):
 	if isinstance(value, basestring):
 		value = "".join([x if ord(x) < 128 else '' for x in value])
 		value = value.replace('<','&gt;').replace('>','&lt;').replace('\n','<br/>').replace('\r','<br/>')
 	else:
 		value = str(value)
+	if column == 'ACNABN' and value != None:
+		value = value.replace(' ','').replace(".0","").replace('"','').replace(",","")
+		value = value.replace("ABN"," ").replace("ACN"," ").replace("/"," ").replace(":"," ")
+		value = value.replace("  "," ").strip()
+		valueParts = value.split(' ')
+		if len(valueParts) == 2:
+			if len(valueParts[0]) > len(valueParts[1]): 
+				value = valueParts[0]
+			if len(valueParts[0]) < len(valueParts[1]):
+				value = valueParts[1]
+		if value == "351222885465":
+			value = "35122885465"			
+		if value == "067060937171":
+			value = "67060937171"			
+		if value == "148179846":
+			value = "49148179846"			
+		if value == "":
+			value = "62081001194" #verizon
 	return value
 
 for filename in glob.glob("*.xls"):
@@ -23,9 +44,14 @@
 		s = b.sheet_by_name(sheet)
 		rbc = open(outputfile,'w')
 		bcw = unicodecsv.writer(rbc,unicodecsv.excel)
+		rowNum = 0
+		headerRow = []
 		for row in range(s.nrows):
 		    this_row = []
 		    for col in range(s.ncols):
-		    	this_row.append(cleanseValue(s.cell_value(row,col)))
+		    	this_row.append(cleanseValue(headerRow[col] if col < len(headerRow) else None , s.cell_value(row,col)))
 		    bcw.writerow(this_row)
+		    if rowNum == 0:
+			headerRow = this_row
+		    rowNum = rowNum + 1