handle invalid input in file size and description markup for datagov import


Former-commit-id: 6c34fd9f8a95d5075ab5fb36f9ee2721736dcaec

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -16,10 +16,16 @@
 def add_package_resource_cachedurl(ckan, package_name, url, name, format, license_id, size,**kwargs):
     if "xls" in url:
 	format = "xls"
+    if "pdf" in url:
+	format = "pdf"
+    if "xlsx" in url:
+	format = "xlsx"
     (returned_url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
                                                 url, "dataset_resource", "AGIMO", False)
-    if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+    if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls"]:
 	format = "xls"
+    if mime_type in ["application/xlsx","application/x-xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+	format = "xlsx"
 
     if content != None:
 	    tf = tempfile.NamedTemporaryFile(delete=False)
@@ -196,6 +202,8 @@
                 agency = doc.value['metadata']["Agency"]
                 if agency == "APS":
                     agency = "Australian Public Service Commission"
+                if agency == "Department of Broadband, Communications and the Digital Ecomomy":
+                    agency = "Department of Broadband, Communications and the Digital Economy"
                 if agency == "Shared Services, Treasury Directorate":
                     agency = "Shared Services Procurement, Treasury Directorate"
                 if agency == "Treasury - Shared Services":
@@ -254,7 +262,7 @@
                         'author': creator,
                         'maintainer': creator,
                         'license_id': get_license_id(doc.value['metadata']['DCTERMS.License']),
-                        'notes': html2text.html2text(doc.value['metadata']['Description']),
+                        'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!aC/',"'").replace('AC/a!a','-').replace("AC/a!E",":").replace("A "," "),
                         'owner_org': org_id,
                         'extras': extras,
                         'private': (pkg_name not in goodcsvdata and pkg_name not in goodotherdata)

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -112,7 +112,7 @@
     else:
         if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
             print "Uh oh, trying to scrape URL again too soon!" + hash
-	    if "_attachments" in doc.keys():
+	    if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys():
 	            last_attachment_fname = doc["_attachments"].keys()[-1]
 	            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
         	    content = last_attachment.read()