more export fixes

- escape the quotes inside the byDate view's title filter (refreshDesignDoc.php)
- point the CKAN exporters at ckan.data.gov.au via a server variable
- pass format and license_id through to CKAN resources, infer the format from
  the URL extension and the served mime type, and add file extensions so
  Windows users get usable downloads
- link, rather than cache, resources for API-backed datasets
- map the CC BY-ND and CC BY-NC-ND licences to their own IDs
- fix a missing "." in the genericScrapers date cleanup and use absolute
  scraper paths in runScrapers.sh

Former-commit-id: 4ca7f1f81d51d7c094d904b6aa1e176cf5958630

--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -8,7 +8,7 @@
 $obj->_id = "_design/" . urlencode("app");
 $obj->language = "javascript";
 $obj->views->all->map = "function(doc) {   emit(doc._id, doc); };";
-$obj->views->byDate->map = "function(doc) {   emit(doc.date, doc); };";
+$obj->views->byDate->map = "function(doc) {  if (doc.title != \"Disclosure Log Updated\") { emit(doc.date, doc); } };";
 $obj->views->byDateMonthYear->map = "function(doc) {   emit(doc.date, doc); };";
 $obj->views->byDateMonthYear->reduce = "_count";
 $obj->views->byAgencyID->map = "function(doc) {   emit(doc.agencyID, doc); };";

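For reference, a minimal sketch (using couchdb-python, as the export scripts below do; the database name "disclosr-documents" is an assumption) of checking that the filtered byDate view no longer emits the log-update stubs:

    import couchdb

    couch = couchdb.Server('http://127.0.0.1:5984/')
    docsdb = couch['disclosr-documents']
    # every emitted row should now be a real document, never a "Disclosure Log Updated" stub
    for row in docsdb.view('app/byDate', descending=True, limit=10):
        print row.key, row.value.get('title')
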
--- a/documents/datagov-export-groups.py
+++ b/documents/datagov-export-groups.py
@@ -10,7 +10,11 @@
 # Instantiate the CKAN client.
 #ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',    api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
 api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
-ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
+# toggle between the disclosurelo.gs and data.gov.au CKAN servers here
+#server = 'data.disclosurelo.gs'
+server = 'ckan.data.gov.au'
+api_key = 'c30eb6f5-0f90-47e0-bf05-9b1b4e3a461a'
+
+ckan = ckanclient.CkanClient(base_location='http://' + server + '/api',
                              api_key=api_key)
 couch = couchdb.Server('http://127.0.0.1:5984/')
 #couch = couchdb.Server('http://192.168.1.113:5984/')

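Since this hunk switches servers by reassigning plain variables, the endpoint and key could instead come from the environment; a minimal sketch (the CKAN_SERVER and CKAN_API_KEY variable names are assumptions, not part of the codebase):

    import os
    import ckanclient

    # fall back to the production values used above when the environment is unset
    server = os.environ.get('CKAN_SERVER', 'ckan.data.gov.au')
    api_key = os.environ.get('CKAN_API_KEY', 'c30eb6f5-0f90-47e0-bf05-9b1b4e3a461a')
    ckan = ckanclient.CkanClient(base_location='http://' + server + '/api', api_key=api_key)
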
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -13,22 +13,38 @@
     pass
 
 import tempfile
-def add_package_resource_cachedurl(ckan, package_name, url, name, format, size, **kwargs):
+def add_package_resource_cachedurl(ckan, package_name, url, name, format, license_id, size, **kwargs):
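+    # datasets backed by live APIs or feeds: leave their resources as links instead of uploading cached copies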
+    excluded_apis = "recent-earthquakes,sentinel-hotspots,abc-local-stations,action-bus-service-gtfs-feed-act,current-and-future-road-reports-traffic-restrictions-in-south-australia,journey-planner-data-act,live-traffic-cameras-nsw,precis-forecast-national,precis-forecast-new-south-wales,precis-forecast-northern-territory,precis-forecast-queensland,precis-forecast-south-australia,precis-forecast-tasmania,precis-forecast-victoria,precis-forecast-western-australia,register-of-penalty-notices-nsw,trove-people-and-organisations-data,weather-data-services-radar,act-emergency-services-agency-esa-28093-current-incidents,act-emergency-services-agency-esa-news-alerts,act-government-news-and-events,act-government-summaries-of-cabinet-outcomes,act-magistrates-court-judgements,act-supreme-court-judgements,act-supreme-court-sentences,actpla-latest-news,all-vacant-act-government-jobs,community-engagement-current-engagements,community-engagement-news,edd-media-releases,edd-news-and-events,freedom-of-information-foi-summaries,libraries-act-announcements,nsw-rural-fire-service-current-incidents,nsw-rural-fire-service-major-updates,south-australian-road-crash-statistics,weather-warnings-for-new-south-wales-australian-capital-territory,weather-warnings-for-northern-territory,weather-warnings-for-queensland,weather-warnings-for-south-australia,weather-warnings-for-tasmania,weather-warnings-for-victoria,weather-warnings-for-western-australia".split(",")
+    if "xls" in url:
+	format = "xls"
+    if "pdf" in url:
+	format = "pdf"
+    if "xlsx" in url:
+	format = "xlsx"
     (returned_url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
                                                 url, "dataset_resource", "AGIMO", False)
-    if content != None:
-	    tf = tempfile.NamedTemporaryFile(delete=False)
+    # let the served Content-Type override the URL-based guess
+    if mime_type in ["application/vnd.ms-excel", "application/msexcel", "application/x-msexcel", "application/x-ms-excel", "application/x-excel", "application/x-dos_ms_excel", "application/xls", "application/x-xls"]:
+        format = "xls"
+    elif mime_type in ["application/xlsx", "application/x-xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
+        format = "xlsx"
+
+    # put a file extension on the cached copy so Windows users download a usable file
+    suffix = name.encode("ascii", "ignore").replace("/", "")
+    if len(suffix) < 5 or (suffix[-4] != "." and suffix[-5] != "."):
+        suffix = suffix + "." + format
+    if content is not None and package_name not in excluded_apis:
+	    # the temp file carries the real extension and is cleaned up automatically on close
+	    tf = tempfile.NamedTemporaryFile(suffix=suffix)
 	    tfName = os.path.abspath(tf.name)
 	    print tfName
 	    tf.seek(0)
 	    tf.write(content)
 	    tf.flush()
-	    ckan.add_package_resource (package_name, tfName, name=name)
+	    ckan.add_package_resource(package_name, tfName, name=name, format=format, license_id=license_id)
     else:
 	print "fetch error"
-	ckan.add_package_resource(package_name, url, name=name, resource_type='data',
+	return ckan.add_package_resource(package_name, url, name=name, resource_type='data',
                                                       format=format,
-                                                      size=size)
+                                                      size=size, mimetype=mime_type, license_id=license_id)
 
 # Instantiate the CKAN client.
 api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
@@ -147,15 +163,15 @@
     return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
 
 
-def get_licence_id(licencename):
+def get_license_id(licencename):
     map = {
         "Creative Commons - Attribution-Share Alike 2.0 Australia (CC-SA)\nThe downloadable version of the database is licensed under CC-BY-SA Creative Commons Attribution Share Alike and contains only the database fields that are released under that license. These fields are object title, object number, object description as well as temporal, spatial and dimension details. It also contains a persistent URL for each record.": 'cc-by-sa',
         "CreativeCommonsAttributionNonCommercial30AustraliaCCBYNC30": 'cc-nc',
         'Otherpleasespecify': 'notspecified',
         '': 'notspecified',
         "Publicly available data": 'notspecified',
-        "CreativeCommonsAttributionNoDerivativeWorks30AustraliaCCBYND30": "other-closed",
-        "CreativeCommonsAttributionNonCommercialNoDerivs30AustraliaCCBYNCND30": "other-closed",
+        "CreativeCommonsAttributionNoDerivativeWorks30AustraliaCCBYND30": "cc-by-nd",
+        "CreativeCommonsAttributionNonCommercialNoDerivs30AustraliaCCBYNCND30": "cc-nc-nd",
         'CreativeCommonsAttribution30AustraliaCCBY30': 'cc-by',
         "Creative Commons - Attribution 2.5 Australia (CC-BY)": 'cc-by',
         'CreativeCommonsAttributionCCBY25': 'cc-by',
@@ -185,12 +201,14 @@
             pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_',
                               doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]);
             print pkg_name
             if pkg_name != "":
 
                 #add to or create organization using direct API
                 agency = doc.value['metadata']["Agency"]
                 if agency == "APS":
                     agency = "Australian Public Service Commission"
+                if agency == "Department of Broadband, Communications and the Digital Ecomomy":
+                    agency = "Department of Broadband, Communications and the Digital Economy"
                 if agency == "Shared Services, Treasury Directorate":
                     agency = "Shared Services Procurement, Treasury Directorate"
                 if agency == "Treasury - Shared Services":
@@ -248,8 +266,8 @@
                         'tags': tags, #tags are mandatory?
                         'author': creator,
                         'maintainer': creator,
-                        'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
-                        'notes': html2text.html2text(doc.value['metadata']['Description']),
+                        'license_id': get_license_id(doc.value['metadata']['DCTERMS.License']),
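+                        # strip garbled punctuation sequences (apparently mojibake) from the description text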
+                        'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!a','-').replace('AC/a!aC/',"'").replace("AC/a!E",":").replace("A "," "),
                         'owner_org': org_id,
                         'extras': extras,
                         'private': (pkg_name not in goodcsvdata and pkg_name not in goodotherdata)
@@ -292,9 +310,8 @@
                                     name = resource['name']
                                 print resource
                                 add_package_resource_cachedurl(ckan, pkg_name, url_fix(resource['href']), name,
-                                                          format,
-                                                          human2bytes(resource.get('size', '0B')),
-                                                          resource_type='data')
+                                                          format, get_license_id(doc.value['metadata']['DCTERMS.License']),
+                                                          human2bytes(resource.get('size', '0B')))
                         else:
                             print "resources already exist"
                     except CkanApiError, e:

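The suffix check above only appends an extension when the name does not already end in a three- or four-character one; a standalone sketch of the same rule (the helper name is illustrative):

    # append "." + format unless the name already ends in a 3- or 4-char extension
    def ensure_extension(name, format):
        suffix = name.encode("ascii", "ignore").replace("/", "")
        if len(suffix) < 5 or (suffix[-4] != "." and suffix[-5] != "."):
            suffix = suffix + "." + format
        return suffix

    print ensure_extension(u"budget 2012", "xls")   # budget 2012.xls
    print ensure_extension(u"budget.xlsx", "xls")   # budget.xlsx
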
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -202,7 +202,7 @@
     def getDate(self, content, entry, doc):
         strdate = ''.join(content.stripped_strings).strip()
         (a, b, c) = strdate.partition("(")
-        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")replace("Janrurary", "January").replace("1012","2012"))
+        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("Janrurary", "January").replace("1012", "2012"))
         print strdate
         try:
 		edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")

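For a concrete check of the corrected replace chain (the sample date string is illustrative; dateutil usage matches the scraper's):

    from dateutil.parser import parse

    strdate = "Janrurary 12 1012 (approximate)".partition("(")[0]
    strdate = strdate.replace("Octber", "October").replace("Janrurary", "January").replace("1012", "2012")
    print parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")  # 2012-01-12
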
--- a/documents/runScrapers.sh
+++ b/documents/runScrapers.sh
@@ -1,7 +1,8 @@
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+echo "$DIR"
 cd $DIR
 echo "" > /tmp/disclosr-error
-for f in scrapers/*.py; do
+for f in "$DIR"/scrapers/*.py; do
 	echo "Processing $f file..";
 	md5=`md5sum /tmp/disclosr-error`
 	python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error;

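Note: the 3>&1 1>&2 2>&3 redirection in the loop swaps stdout and stderr, so tee appends each scraper's error output to /tmp/disclosr-error while normal output still reaches the terminal; comparing the md5sum taken just before each run then shows whether that scraper appended new errors.
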
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -112,7 +112,7 @@
     else:
         if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
             print "Uh oh, trying to scrape URL again too soon!" + hash
-	    if "_attachments" in doc.keys():
+	    # skip the cached attachment when its recorded file_size is "0" (likely a failed earlier fetch)
+	    if ('file_size' not in doc or doc['file_size'] != "0") and "_attachments" in doc:
 	            last_attachment_fname = doc["_attachments"].keys()[-1]
 	            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
         	    content = last_attachment.read()