Handle zero-size cached files and mis-encoded description markup in the datagov import
Former-commit-id: 6c34fd9f8a95d5075ab5fb36f9ee2721736dcaec
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -16,10 +16,16 @@
def add_package_resource_cachedurl(ckan, package_name, url, name, format, license_id, size,**kwargs):
if "xls" in url:
format = "xls"
+ if "pdf" in url:
+ format = "pdf"
+ if "xlsx" in url:
+ format = "xlsx"
(returned_url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
url, "dataset_resource", "AGIMO", False)
- if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+ if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls"]:
format = "xls"
+ if mime_type in ["application/xlsx","application/x-xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+ format = "xlsx"
if content != None:
tf = tempfile.NamedTemporaryFile(delete=False)
@@ -196,6 +202,8 @@
agency = doc.value['metadata']["Agency"]
if agency == "APS":
agency = "Australian Public Service Commission"
+ if agency == "Department of Broadband, Communications and the Digital Ecomomy":
+ agency = "Department of Broadband, Communications and the Digital Economy"
if agency == "Shared Services, Treasury Directorate":
agency = "Shared Services Procurement, Treasury Directorate"
if agency == "Treasury - Shared Services":
@@ -254,7 +262,7 @@
'author': creator,
'maintainer': creator,
'license_id': get_license_id(doc.value['metadata']['DCTERMS.License']),
- 'notes': html2text.html2text(doc.value['metadata']['Description']),
+ 'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!a','-').replace('AC/a!aC/',"'").replace("AC/a!E",":").replace("A "," "),
'owner_org': org_id,
'extras': extras,
'private': (pkg_name not in goodcsvdata and pkg_name not in goodotherdata)
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -112,7 +112,7 @@
else:
if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
print "Uh oh, trying to scrape URL again too soon!" + hash
- if "_attachments" in doc.keys():
+ if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys():
last_attachment_fname = doc["_attachments"].keys()[-1]
last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
content = last_attachment.read()