From: Maxious
Date: Thu, 09 May 2013 13:32:29 +0000
Subject: handle invalid input in file size and description markup for datagov import
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=cb9fbd0ac1b59d2a5656bf823ae8daa22e44ae08
---
handle invalid input in file size and description markup for datagov import

Former-commit-id: 6c34fd9f8a95d5075ab5fb36f9ee2721736dcaec
---

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -16,10 +16,16 @@
 def add_package_resource_cachedurl(ckan, package_name, url, name, format, license_id, size, **kwargs):
     if "xls" in url:
         format = "xls"
+    if "pdf" in url:
+        format = "pdf"
+    if "xlsx" in url:
+        format = "xlsx"
     (returned_url, mime_type, content) = scrape.fetchURL(scrape.docsdb, url, "dataset_resource", "AGIMO", False)
-    if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+    if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls"]:
         format = "xls"
+    if mime_type in ["application/xlsx","application/x-xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+        format = "xlsx"
 
     if content != None:
         tf = tempfile.NamedTemporaryFile(delete=False)
@@ -196,6 +202,8 @@
         agency = doc.value['metadata']["Agency"]
         if agency == "APS":
             agency = "Australian Public Service Commission"
+        if agency == "Department of Broadband, Communications and the Digital Ecomomy":
+            agency = "Department of Broadband, Communications and the Digital Economy"
         if agency == "Shared Services, Treasury Directorate":
             agency = "Shared Services Procurement, Treasury Directorate"
         if agency == "Treasury - Shared Services":
@@ -254,7 +262,7 @@
             'author': creator,
             'maintainer': creator,
             'license_id': get_license_id(doc.value['metadata']['DCTERMS.License']),
-            'notes': html2text.html2text(doc.value['metadata']['Description']),
+            'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!a','-').replace('AC/a!aC/',"'").replace("AC/a!E",":").replace("A "," "),
             'owner_org': org_id,
             'extras': extras,
             'private': (pkg_name not in goodcsvdata and pkg_name not in goodotherdata)

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -112,7 +112,7 @@
     else:
         if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
            print "Uh oh, trying to scrape URL again too soon!" + hash
-            if "_attachments" in doc.keys():
+            if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys():
                last_attachment_fname = doc["_attachments"].keys()[-1]
                last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
                content = last_attachment.read()
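
For illustration only, a minimal standalone sketch of how the new URL and mime-type checks in add_package_resource_cachedurl decide a resource format. The helper name detect_resource_format, the constants, and the example URLs are assumptions and are not part of the repository; the mime-type lists are copied from the diff above.

```python
# Hypothetical helper (not in disclosr): mirrors the format-detection checks
# added to add_package_resource_cachedurl in documents/datagov-export.py.
XLS_MIME_TYPES = [
    "application/vnd.ms-excel", "application/msexcel", "application/x-msexcel",
    "application/x-ms-excel", "application/x-excel", "application/x-dos_ms_excel",
    "application/xls", "application/x-xls",
]
XLSX_MIME_TYPES = [
    "application/xlsx", "application/x-xlsx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]


def detect_resource_format(url, mime_type, default_format=None):
    """Guess a CKAN resource format from the URL first, then the mime type."""
    format = default_format
    # URL hints: "xlsx" is tested after "xls" so the more specific match wins.
    if "xls" in url:
        format = "xls"
    if "pdf" in url:
        format = "pdf"
    if "xlsx" in url:
        format = "xlsx"
    # Mime-type hints: leave the format unchanged when the type is unrecognised.
    if mime_type in XLS_MIME_TYPES:
        format = "xls"
    if mime_type in XLSX_MIME_TYPES:
        format = "xlsx"
    return format


if __name__ == "__main__":
    assert detect_resource_format("http://example.org/data.xlsx", None) == "xlsx"
    assert detect_resource_format("http://example.org/report", "application/xls") == "xls"
```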