ABN import formatting
[dcaas.git] / data / xls2csv.py
blob:a/data/xls2csv.py -> blob:b/data/xls2csv.py
''' DCaaS Services XLS to CSV converter """DCaaS Services XLS to CSV converter
''' Converts the DCaaS XLS file to CSV while removing line breaks and other special characters Converts the DCaaS XLS file to CSV while removing line breaks and other special characters
''' Alex Sadleir <maxious@lambdacomplex.org> Alex Sadleir <maxious@lambdacomplex.org>
   
  """
   
from xlrd import * from xlrd import *
import unicodecsv import unicodecsv
import glob import glob
import codecs import codecs
   
def cleanseValue(value): def cleanseValue(column,value):
if isinstance(value, basestring): if isinstance(value, basestring):
value = "".join([x if ord(x) < 128 else '' for x in value]) value = "".join([x if ord(x) < 128 else '' for x in value])
value = value.replace('<','&gt;').replace('>','&lt;').replace('\n','<br/>').replace('\r','<br/>') value = value.replace('<','&gt;').replace('>','&lt;').replace('\n','<br/>').replace('\r','<br/>')
else: else:
value = str(value) value = str(value)
  if column == 'ACNABN' and value != None:
  value = value.replace(' ','').replace(".0","").replace('"','').replace(",","")
  value = value.replace("ABN"," ").replace("ACN"," ").replace("/"," ").replace(":"," ")
  value = value.replace(" "," ").strip()
  valueParts = value.split(' ')
  if len(valueParts) == 2:
  if len(valueParts[0]) > len(valueParts[1]):
  value = valueParts[0]
  if len(valueParts[0]) < len(valueParts[1]):
  value = valueParts[1]
  if value == "351222885465":
  value = "35122885465"
  if value == "067060937171":
  value = "67060937171"
  if value == "148179846":
  value = "49148179846"
  if value == "":
  value = "62081001194" #verizon
return value return value
   
for filename in glob.glob("*.xls"): for filename in glob.glob("*.xls"):
print filename print filename
b = open_workbook(filename) b = open_workbook(filename)
for sheet in b.sheet_names()[2:]: for sheet in b.sheet_names()[2:]:
outputfile = filename.replace(".xls","") + sheet.replace(".","-").replace(" ","_").replace("&","and") + ".csv" outputfile = filename.replace(".xls","") + sheet.replace(".","-").replace(" ","_").replace("&","and") + ".csv"
print outputfile print outputfile
s = b.sheet_by_name(sheet) s = b.sheet_by_name(sheet)
rbc = open(outputfile,'w') rbc = open(outputfile,'w')
bcw = unicodecsv.writer(rbc,unicodecsv.excel) bcw = unicodecsv.writer(rbc,unicodecsv.excel)
  rowNum = 0
  headerRow = []
for row in range(s.nrows): for row in range(s.nrows):
this_row = [] this_row = []
for col in range(s.ncols): for col in range(s.ncols):
this_row.append(cleanseValue(s.cell_value(row,col))) this_row.append(cleanseValue(headerRow[col] if col < len(headerRow) else None , s.cell_value(row,col)))
bcw.writerow(this_row) bcw.writerow(this_row)
  if rowNum == 0:
  headerRow = this_row
  rowNum = rowNum + 1