From: Alex Sadleir
Date: Fri, 22 Feb 2013 12:38:36 +0000
Subject: add dataqld
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=f31cd9d6de3ced75f644b7e7736510a08c419b27

---
add dataqld

Former-commit-id: 421bd9295854615553f74ab2a00773dcdc878775
---

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -121,28 +121,31 @@
     # Collect the package metadata.
     pkg_name = doc.value['url'].replace("http://data.gov.au/dataset/",'').replace('/','');
     tags = []
-    if len(doc.value['metadata']["Keywords / Tags"]) > 0:
-        if hasattr(doc.value['metadata']["Keywords / Tags"], '__iter__'):
-            tags = tags + doc.value['metadata']["Keywords / Tags"]
-        else:
-            tags = tags + [doc.value['metadata']["Keywords / Tags"]]
-    if 'data.gov.au Category' in doc.value['metadata'].keys() and len(doc.value['metadata']['data.gov.au Category']) > 0:
-        if hasattr(doc.value['metadata']['data.gov.au Category'], '__iter__'):
-            tags = tags + doc.value['metadata']['data.gov.au Category']
-        else:
-            tags = tags + [doc.value['metadata']['data.gov.au Category']]
-    tags = [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower() for tag in tags if tag]
-    print tags
-    package_entity = {
-        'name': pkg_name,
-        'title': doc.value['metadata']['DCTERMS.Title'],
-        'url': doc.value['metadata']['DCTERMS.Source.URI'],
-        'tags': tags, #tags are mandatory?
-        'author': doc.value['metadata']["DCTERMS.Creator"],
-        'maintainer': doc.value['metadata']["DCTERMS.Creator"],
-        'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
-        'notes': html2text.html2text(doc.value['metadata']['Description']),
-    }
+    if doc.value['agencyID'] == "AGIMO":
+        if len(doc.value['metadata']["Keywords / Tags"]) > 0:
+            if hasattr(doc.value['metadata']["Keywords / Tags"], '__iter__'):
+                tags = tags + doc.value['metadata']["Keywords / Tags"]
+            else:
+                tags = tags + [doc.value['metadata']["Keywords / Tags"]]
+        if 'data.gov.au Category' in doc.value['metadata'].keys() and len(doc.value['metadata']['data.gov.au Category']) > 0:
+            if hasattr(doc.value['metadata']['data.gov.au Category'], '__iter__'):
+                tags = tags + doc.value['metadata']['data.gov.au Category']
+            else:
+                tags = tags + [doc.value['metadata']['data.gov.au Category']]
+        tags = [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower() for tag in tags if tag]
+        print tags
+        package_entity = {
+            'name': pkg_name,
+            'title': doc.value['metadata']['DCTERMS.Title'],
+            'url': doc.value['metadata']['DCTERMS.Source.URI'],
+            'tags': tags, #tags are mandatory?
+            'author': doc.value['metadata']["DCTERMS.Creator"],
+            'maintainer': doc.value['metadata']["DCTERMS.Creator"],
+            'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
+            'notes': html2text.html2text(doc.value['metadata']['Description']),
+        }
+    if doc.value['agencyID'] == "qld":
+        package_entity = doc.value['metadata']
     try:
         print package_entity
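Note: the datagov-export.py change scopes the existing tag handling to AGIMO-harvested records. Both metadata fields may hold either a single string or a list, so they are flattened into one list and then sanitised into CKAN-safe tag names. A standalone sketch of that logic follows; the helper name and the sample metadata are invented for illustration, not part of the commit:

    import re

    def normalise_tags(metadata):
        # Hypothetical helper mirroring the tag handling in datagov-export.py:
        # each field may be a single string or a list of strings.
        tags = []
        for field in ("Keywords / Tags", "data.gov.au Category"):
            value = metadata.get(field)
            if not value:
                continue
            # mirrors the hasattr(value, '__iter__') check in the diff,
            # with an extra str guard so it also behaves on Python 3
            if hasattr(value, '__iter__') and not isinstance(value, str):
                tags.extend(value)
            else:
                tags.append(value)
        # CKAN tag names: swap '&' for 'and', drop anything outside
        # [a-zA-Z0-9-_.], and lowercase the result.
        return [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower()
                for tag in tags if tag]

    print(normalise_tags({"Keywords / Tags": "Health & Hospitals",
                          "data.gov.au Category": ["Budget", "Spending"]}))
    # ['healthandhospitals', 'budget', 'spending']

For "qld" records the metadata captured by the new dataqld.py scraper (below) is passed through as the package entity instead.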
--- /dev/null
+++ b/documents/dataqld.py
@@ -1,1 +1,28 @@
+import sys, os
+import time
+import scrape
+from bs4 import BeautifulSoup
+from unidecode import unidecode
+import ckanclient
+
+# Instantiate the CKAN client.
+ckan = ckanclient.CkanClient(base_location='https://data.qld.gov.au/api')
+
+# Get the package list.
+package_list = ckan.package_register_get()
+for package_name in package_list:
+    # Get the details of a package.
+    (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+        "https://data.qld.gov.au/dataset/"+package_name , "data", "qld", False)
+    hash = scrape.mkhash(scrape.canonurl(url))
+    print hash
+    doc = scrape.docsdb.get(hash)
+    if "metadata" not in doc.keys() or True:
+        ckan.package_entity_get(package_name)
+        package_entity = ckan.last_message
+        doc['type'] = "dataset"
+        doc['metadata'] = package_entity
+        print package_entity
+        scrape.docsdb.save(doc)
+
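Note: dataqld.py drives the old ckanclient library against the Queensland CKAN API: package_register_get() enumerates every dataset name, then each dataset's metadata is fetched and stored in the scrapers' CouchDB alongside the fetched HTML. A minimal sketch of that enumerate-then-fetch pattern on its own, using only the calls the script itself makes; the bare exception handler is added for illustration and is not in the commit:

    import ckanclient

    ckan = ckanclient.CkanClient(base_location='https://data.qld.gov.au/api')

    for package_name in ckan.package_register_get():
        try:
            ckan.package_entity_get(package_name)
            package_entity = ckan.last_message  # parsed response for the dataset
            print(package_entity.get('title'))
        except Exception as e:
            print("skipping %s: %s" % (package_name, e))

Note also that the guard 'if "metadata" not in doc.keys() or True:' in the script is always true, so every dataset is re-fetched on each run.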
--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -5,44 +5,53 @@
 from unidecode import unidecode

-listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
-(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
-    listurl, "gazette", "AGD")
-for line in listhtml.split('\n'):
-    soup = BeautifulSoup(line)
-    #print line
-    for row in soup.find_all('tr'):
-        print line
-        if row.has_key('valign'):
-            i = 0
-            date = ""
-            id = ""
-            type = ""
-            description = ""
-            name = ""
-            url = ""
-            for col in soup.find_all('td'):
-                #print ''.join(col.stripped_strings)
-                if i == 0:
-                    date = ''.join(col.stripped_strings)
-                if i == 1:
-                    id = ''.join(col.stripped_strings)
-                if i == 2:
-                    type = ''.join(col.stripped_strings)
-                if i == 3:
-                    description = ''.join(col.stripped_strings)
-                    for link in col.findAll('a'):
-                        if link.has_key("href"):
-                            url = link['href']
-                            name = ''.join(link.stripped_strings)
-                print [date,id,type,description,name, url]
-                i = i +1;
-            #url = scrape.fullurl(listurl, atag['href'])
-            #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
-            #    url, "data", "AGIMO")
-            #hash = scrape.mkhash(scrape.canonurl(url))
-            #doc = scrape.docsdb.get(hash)
-            #print doc['metadata']
-            #scrape.docsdb.save(doc)
-            #time.sleep(2)
+items = 3950
+items = 1
+while True:
+    print str(items) + " (" +str(items/25) +" screens to go)"
+    listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
+    (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
+        listurl, "gazette", "AGD", False)
+    for line in listhtml.split('\n'):
+        soup = BeautifulSoup(line)
+        #print line
+        for row in soup.find_all('tr'):
+            print line
+            if row.has_key('valign'):
+                i = 0
+                date = ""
+                id = ""
+                type = ""
+                description = ""
+                name = ""
+                url = ""
+                for col in soup.find_all('td'):
+                    #print ''.join(col.stripped_strings)
+                    if i == 0:
+                        date = ''.join(col.stripped_strings)
+                    if i == 1:
+                        id = ''.join(col.stripped_strings)
+                    if i == 2:
+                        type = ''.join(col.stripped_strings)
+                    if i == 3:
+                        description = ''.join(col.stripped_strings)
+                        for link in col.findAll('a'):
+                            if link.has_key("href"):
+                                url = link['href']
+                                name = ''.join(link.stripped_strings)
+                                print str(items) + " (" +str(items/25) +" screens to go)"
+                                print [date, id, type, description, name, url]
+                                itemurl = scrape.fullurl(listurl, url)
+                                (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+                                    itemurl, "gazette", "AGD", False)
+                                hash = scrape.mkhash(scrape.canonurl(itemurl))
+                                doc = scrape.docsdb.get(hash)
+                                doc['metadata'] = {"date": date, "id": id, "type":type, "description":description,"name": name,"url": url}
+                                scrape.docsdb.save(doc)
+                                #time.sleep(2)
+                    i = i + 1;
+    items = items - 25
+    if items <= 0:
+        break

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -13,8 +13,8 @@
 import socket

 #couch = couchdb.Server('http://192.168.1.148:5984/')
-#couch = couchdb.Server('http://192.168.1.113:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
+#couch = couchdb.Server('http://127.0.0.1:5984/')

 def mkhash(input):
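Note: the scrape.py hunk only switches the hard-coded CouchDB server from 127.0.0.1 to 192.168.1.113. For context, a minimal sketch of the get-or-create-then-save pattern the scrapers use with couchdb-python; the database name and document id below are assumptions for illustration, not taken from this commit:

    import couchdb

    couch = couchdb.Server('http://192.168.1.113:5984/')  # host selected in this commit
    docsdb = couch['disclosr-documents']  # illustrative database name

    doc_id = 'example-hash'  # illustrative document id
    doc = docsdb.get(doc_id)  # returns None if the document does not exist
    if doc is None:
        doc = {'_id': doc_id}
    doc['metadata'] = {'name': 'example'}
    docsdb.save(doc)  # creates or updates the document, returns (id, rev)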