fix import of bulleted lists
[dcaas.git] / data / xls2csv.py
blob:a/data/xls2csv.py -> blob:b/data/xls2csv.py
"""DCaaS Services XLS to CSV converter """DCaaS Services XLS to CSV converter
Converts the DCaaS XLS file to CSV while removing line breaks and other special characters Converts the DCaaS XLS file to CSV while removing line breaks and other special characters
Alex Sadleir <maxious@lambdacomplex.org> Alex Sadleir <maxious@lambdacomplex.org>
   
""" """
   
from xlrd import * from xlrd import *
import unicodecsv import unicodecsv
import glob import glob
import codecs import codecs
   
def cleanseValue(column,value): def cleanseValue(column,value):
if isinstance(value, basestring): if isinstance(value, basestring):
value = "".join([x if ord(x) < 128 else '' for x in value]) value = "".join([x if ord(x) < 128 else '' for x in value.replace(u"\u2022", u"\n&bull;")])
value = value.replace('<','&gt;').replace('>','&lt;').replace('\n','<br/>').replace('\r','<br/>').strip() value = value.replace('<','&gt;').replace('>','&lt;').replace('\n','<br/>').replace('\r','<br/>').strip()
else: else:
value = str(value).strip() value = str(value).strip()
if column == 'ACNABN' and value != None: if column == 'ACNABN' and value != None:
value = value.replace(' ','').replace(".0","").replace('"','').replace(",","") value = value.replace(' ','').replace(".0","").replace('"','').replace(",","")
value = value.replace("ABN"," ").replace("ACN"," ").replace("/"," ").replace(":"," ") value = value.replace("ABN"," ").replace("ACN"," ").replace("/"," ").replace(":"," ")
value = value.replace(" "," ").strip() value = value.replace(" "," ").strip()
valueParts = value.split(' ') valueParts = value.split(' ')
if len(valueParts) == 2: if len(valueParts) == 2:
if len(valueParts[0]) > len(valueParts[1]): if len(valueParts[0]) > len(valueParts[1]):
value = valueParts[0] value = valueParts[0]
if len(valueParts[0]) < len(valueParts[1]): if len(valueParts[0]) < len(valueParts[1]):
value = valueParts[1] value = valueParts[1]
if value == "351222885465": if value == "351222885465":
value = "35122885465" value = "35122885465"
if value == "067060937171": if value == "067060937171":
value = "67060937171" value = "67060937171"
if value == "148179846": if value == "148179846":
value = "49148179846" value = "49148179846"
if value == "": if value == "":
value = "62081001194" #verizon value = "62081001194" #verizon
return value return value
   
for filename in glob.glob("*.xls"): for filename in glob.glob("*.xls"):
print filename print filename
b = open_workbook(filename) b = open_workbook(filename)
for sheet in b.sheet_names()[2:]: for sheet in b.sheet_names()[2:]:
outputfile = filename.replace(".xls","") + sheet.replace(".","-").replace(" ","_").replace("&","and") + ".csv" outputfile = filename.replace(".xls","") + sheet.replace(".","-").replace(" ","_").replace("&","and") + ".csv"
print outputfile print outputfile
s = b.sheet_by_name(sheet) s = b.sheet_by_name(sheet)
rbc = open(outputfile,'w') rbc = open(outputfile,'w')
bcw = unicodecsv.writer(rbc,unicodecsv.excel) bcw = unicodecsv.writer(rbc,unicodecsv.excel)
rowNum = 0 rowNum = 0
headerRow = [] headerRow = []
for row in range(s.nrows): for row in range(s.nrows):
this_row = [] this_row = []
for col in range(s.ncols): for col in range(s.ncols):
this_row.append(cleanseValue(headerRow[col] if col < len(headerRow) else None , s.cell_value(row,col))) this_row.append(cleanseValue(headerRow[col] if col < len(headerRow) else None , s.cell_value(row,col)))
bcw.writerow(this_row) bcw.writerow(this_row)
if rowNum == 0: if rowNum == 0:
headerRow = this_row headerRow = this_row
rowNum = rowNum + 1 rowNum = rowNum + 1