--- a/data/xls2csv.py +++ b/data/xls2csv.py @@ -1,17 +1,38 @@ -''' DCaaS Services XLS to CSV converter -''' Converts the DCaaS XLS file to CSV while removing line breaks and other special characters -''' Alex Sadleir +"""DCaaS Services XLS to CSV converter + Converts the DCaaS XLS file to CSV while removing line breaks and other special characters + Alex Sadleir + +""" + from xlrd import * import unicodecsv import glob import codecs -def cleanseValue(value): +def cleanseValue(column,value): if isinstance(value, basestring): - value = "".join([x if ord(x) < 128 else '' for x in value]) - value = value.replace('<','>').replace('>','<').replace('\n','
').replace('\r','
') + value = "".join([x if ord(x) < 128 else '' for x in value.replace(u"\u2022", u"\n•")]) + value = value.replace('<','>').replace('>','<').replace('\n','
').replace('\r','
').strip() else: - value = str(value) + value = str(value).strip() + if column == 'ACNABN' and value != None: + value = value.replace(' ','').replace(".0","").replace('"','').replace(",","") + value = value.replace("ABN"," ").replace("ACN"," ").replace("/"," ").replace(":"," ") + value = value.replace(" "," ").strip() + valueParts = value.split(' ') + if len(valueParts) == 2: + if len(valueParts[0]) > len(valueParts[1]): + value = valueParts[0] + if len(valueParts[0]) < len(valueParts[1]): + value = valueParts[1] + if value == "351222885465": + value = "35122885465" + if value == "067060937171": + value = "67060937171" + if value == "148179846": + value = "49148179846" + if value == "": + value = "62081001194" #verizon return value for filename in glob.glob("*.xls"): @@ -23,9 +44,14 @@ s = b.sheet_by_name(sheet) rbc = open(outputfile,'w') bcw = unicodecsv.writer(rbc,unicodecsv.excel) + rowNum = 0 + headerRow = [] for row in range(s.nrows): this_row = [] for col in range(s.ncols): - this_row.append(cleanseValue(s.cell_value(row,col))) + this_row.append(cleanseValue(headerRow[col] if col < len(headerRow) else None , s.cell_value(row,col))) bcw.writerow(this_row) + if rowNum == 0: + headerRow = this_row + rowNum = rowNum + 1