From: Alex Sadleir <maxious@lambdacomplex.org>
Date: Sat, 09 Feb 2013 12:26:38 +0000
Subject: export no-html and licence
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=65934a8aa047453aaa512d4079b451613f41e4bd
---
export no-html and licence


Former-commit-id: 39dafe9fefec609588df4f189c2364dae8edd246
---


--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -7,9 +7,39 @@
     pass
 
 # Instantiate the CKAN client.
-ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
-    api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
-# (use your own api_key from http://thedatahub.org/user/me )
+#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',    api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
+ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',    api_key='72f90359-0396-438c-804f-a26a24336747')  # NOTE(review): API key is hard-coded in source -- consider loading it from config/env
+#couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')  # NOTE(review): hard-coded server address -- presumably environment-specific; confirm
+
+# http://stackoverflow.com/a/7778368/684978
+from HTMLParser import HTMLParser
+import htmlentitydefs
+
+class HTMLTextExtractor(HTMLParser):  # collects the text of fed HTML, decoding character/entity references
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.result = []  # text fragments, appended in the order the handlers are called
+
+    def handle_data(self, d):
+        self.result.append(d)
+
+    def handle_charref(self, number):
+        codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number)  # leading x/X means hex
+        self.result.append(unichr(codepoint))
+
+    def handle_entityref(self, name):
+        codepoint = htmlentitydefs.name2codepoint.get(name)  # None when the name is not a known entity
+        self.result.append(u'&%s;' % name if codepoint is None else unichr(codepoint))  # keep unknown ones as text
+
+    def get_text(self):
+        return u''.join(self.result)
+
+def html_to_text(html):  # feeds html through a fresh HTMLTextExtractor and returns its collected text
+    extractor = HTMLTextExtractor()
+    extractor.feed(html)
+    return extractor.get_text()
+
 # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
 SYMBOLS = {
     'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
@@ -88,7 +118,24 @@
     return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
     #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
 
-couch = couchdb.Server('http://127.0.0.1:5984/')
+def get_licence_id(licencename):
+    """Return the licence id string for a known licence name; raise Exception if it is unmapped."""
+    licence_ids = {
+        "Creative Commons - Attribution-Share Alike 2.0 Australia (CC-SA)\nThe downloadable version of the database is licensed under CC-BY-SA Creative Commons Attribution Share Alike and contains only the database fields that are released under that license. These fields are object title, object number, object description as well as temporal, spatial and dimension details. It also contains a persistent URL for each record.": 'cc-by-sa',
+        "CreativeCommonsAttributionNonCommercial30AustraliaCCBYNC30": 'cc-nc',
+        'Otherpleasespecify': 'notspecified', '': 'notspecified',
+        "Publicly available data": 'notspecified',
+        "CreativeCommonsAttributionNoDerivativeWorks30AustraliaCCBYND30": "other-closed",
+        "CreativeCommonsAttributionNonCommercialNoDerivs30AustraliaCCBYNCND30": "other-closed",
+        'CreativeCommonsAttribution30AustraliaCCBY30': 'cc-by',
+        "Creative Commons - Attribution 2.5 Australia (CC-BY)": 'cc-by',
+        'CreativeCommonsAttributionCCBY25': 'cc-by',
+        "PublicDomain": 'other-pd',
+    }
+    if licencename not in licence_ids:  # unknown names are an error rather than silently defaulted
+        raise Exception(licencename + " not found")
+    return licence_ids[licencename]
+
 docsdb = couch['disclosr-documents']
 
 if __name__ == "__main__":
@@ -100,17 +147,20 @@
             tags = doc.value['metadata']["Keywords / Tags"]
             if not hasattr(tags, '__iter__'):
                 tags = [tags]
-            [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]
+            [re.sub('[^a-zA-Z0-9-_()]', '', tag).replace('&', 'and').lower() for tag in tags]
             package_entity = {
                 'name': pkg_name,
                 'title': doc.value['metadata']['DCTERMS.Title'],
                 'url': doc.value['metadata']['DCTERMS.Source.URI'],
-                'tags': tags,
+
                 'author': doc.value['metadata']["DCTERMS.Creator"],
                 'maintainer': doc.value['metadata']["DCTERMS.Creator"],
-                'licence_id': doc.value['metadata']['DCTERMS.License'], #todo licence id mapping
-                'notes': doc.value['metadata']['Description'],
+                'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
+                'notes': html_to_text(doc.value['metadata']['Description']),
             }
+            if len(tags) > 0:
+                package_entity['tags'] = tags
+                print tags
             try:
                 #print doc.id
                 ckan.package_register_post(package_entity)
@@ -122,7 +172,7 @@
                         ckan.last_status, pkg_name, e.args))
 
             print package_entity
-            #todo add to organisation (author/creator/maintainer)
+            #todo add to organisation (author/creator/maintainer) http://docs.ckan.org/en/latest/apiv3.html#examples ckan.logic.action.update.package_owner_org_update
             #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group
             if 'Download' in doc.value['metadata'].keys():
                 try:
@@ -130,7 +180,7 @@
                     resources = pkg.get('resources', [])
                     if len(resources) < len(doc.value['metadata']['Download']):
                         for resource in doc.value['metadata']['Download']:
-                            #print resource
+                            print resource
                             # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
                             # (KML/KMZ) / (Shapefile) /(Other)
                             format = "plain"

--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -13,7 +13,7 @@
     if atag.has_key('href'):
         url = scrape.fullurl(listurl, atag['href'])
         (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
-            url, "data", "AGIMO")
+            url, "data", "AGIMO", False)
         hash = scrape.mkhash(scrape.canonurl(url))
         doc = scrape.docsdb.get(hash)
         if "metadata" not in doc.keys() or True:

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -12,6 +12,11 @@
 import urlparse
 import socket
 
+#couch = couchdb.Server('http://192.168.1.148:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')  # NOTE(review): hard-coded server address -- presumably environment-specific; confirm
+#couch = couchdb.Server('http://127.0.0.1:5984/')
+
+
 def mkhash(input):
     return hashlib.md5(input).hexdigest().encode("utf-8")
 
@@ -104,14 +109,11 @@
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
     else:
-        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14):
+        if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
             print "Uh oh, trying to scrape URL again too soon!" + hash
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
             content = last_attachment
-            return (doc['url'], doc['mime_type'], content.read())
-        if scrape_again == False:
-            print "Not scraping this URL again as requested"
             return (doc['url'], doc['mime_type'], content.read())
 
     req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
@@ -207,9 +209,6 @@
                     #print linkurl
                     scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
 
-#couch = couchdb.Server('http://192.168.1.148:5984/')
-#couch = couchdb.Server('http://192.168.1.113:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']