import
Former-commit-id: 997dc9ece51a37dc25779ca4125d0960cdc195c9
--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -40,6 +40,8 @@
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
+
+$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true);
--- /dev/null
+++ b/documents/datagov-export.py
@@ -1,1 +1,154 @@
+import ckanclient
+import couchdb
+from ckanclient import CkanApiError
+import re
+class LoaderError(Exception):
+ pass
+
+# Instantiate the CKAN client.
+ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
+ api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
+# (use your own api_key from http://thedatahub.org/user/me )
+# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
+SYMBOLS = {
+ 'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
+ 'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
+ 'zetta', 'iotta'),
+ 'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
+ 'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
+ 'zebi', 'yobi'),
+}
+
+def human2bytes(s):
+ """
+ Attempts to guess the string format based on default symbols
+ set and return the corresponding bytes as an integer.
+ When unable to recognize the format ValueError is raised.
+
+ >>> human2bytes('0 B')
+ 0
+    >>> human2bytes('1 KB')
+    1024
+    >>> human2bytes('1 MB')
+    1048576
+ >>> human2bytes('1 Gi')
+ 1073741824
+ >>> human2bytes('1 tera')
+ 1099511627776
+
+ >>> human2bytes('0.5kilo')
+ 512
+ >>> human2bytes('0.1 byte')
+ 0
+ >>> human2bytes('1 k') # k is an alias for K
+ 1024
+ >>> human2bytes('12 foo')
+ Traceback (most recent call last):
+ ...
+ ValueError: can't interpret '12 foo'
+ """
+ init = s
+ num = ""
+ while s and s[0:1].isdigit() or s[0:1] == '.':
+ num += s[0]
+ s = s[1:]
+ num = float(num)
+ letter = s.strip()
+ for name, sset in SYMBOLS.items():
+ if letter in sset:
+ break
+ else:
+ if letter == 'k':
+            # treat 'k' as an alias for 'KB' as per: http://goo.gl/kTQMs
+            sset = SYMBOLS['customary']
+            letter = 'KB'
+ else:
+ raise ValueError("can't interpret %r" % init)
+ prefix = {sset[0]: 1}
+ for i, s in enumerate(sset[1:]):
+ prefix[s] = 1 << (i + 1) * 10
+ return int(num * prefix[letter])
+
+# https://github.com/okfn/ckanext-importlib
+def munge(name):
+ # convert spaces to underscores
+ name = re.sub(' ', '_', name).lower()
+ # convert symbols to dashes
+ name = re.sub('[:]', '_-', name).lower()
+ name = re.sub('[/]', '-', name).lower()
+ # take out not-allowed characters
+ name = re.sub('[^a-zA-Z0-9-_]', '', name).lower()
+ # remove double underscores
+ name = re.sub('__', '_', name).lower()
+ return name
+
+
+def name_munge(input_name):
+ return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
+ #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
+
+couch = couchdb.Server('http://127.0.0.1:5984/')
+docsdb = couch['disclosr-documents']
+
+if __name__ == "__main__":
+ for doc in docsdb.view('app/datasets'):
+ print doc.id
+ if doc.value['url'] != "http://data.gov.au/data/":
+ # Collect the package metadata.
+ pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100])
+ tags = doc.value['metadata']["Keywords / Tags"]
+ if not hasattr(tags, '__iter__'):
+ tags = [tags]
+            tags = [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]
+ package_entity = {
+ 'name': pkg_name,
+ 'title': doc.value['metadata']['DCTERMS.Title'],
+ 'url': doc.value['metadata']['DCTERMS.Source.URI'],
+ 'tags': tags,
+ 'author': doc.value['metadata']["DCTERMS.Creator"],
+ 'maintainer': doc.value['metadata']["DCTERMS.Creator"],
+                'license_id': doc.value['metadata']['DCTERMS.License'], #todo licence id mapping
+ 'notes': doc.value['metadata']['Description'],
+ }
+ try:
+ #print doc.id
+ ckan.package_register_post(package_entity)
+ except CkanApiError, e:
+ if ckan.last_status == 409:
+ print "already exists"
+ else:
+ raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
+ ckan.last_status, pkg_name, e.args))
+
+ print package_entity
+ #todo add to organisation (author/creator/maintainer)
+ #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group
+ if 'Download' in doc.value['metadata'].keys():
+ try:
+ pkg = ckan.package_entity_get(pkg_name)
+ resources = pkg.get('resources', [])
+ if len(resources) < len(doc.value['metadata']['Download']):
+ for resource in doc.value['metadata']['Download']:
+ #print resource
+ # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
+ # (KML/KMZ) / (Shapefile) /(Other)
+ format = "plain"
+ if resource['format'] == '(XML)':
+ format = 'xml'
+ if resource['format'] == '(CSV/XLS)':
+ format = 'csv'
+ name = resource['href']
+ if 'name' in resource.keys():
+ name = resource['name']
+ ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
+ format=format, size=human2bytes(resource['size'].replace(',', '')))
+ else:
+ print "resources already exist"
+ except CkanApiError, e:
+ if ckan.last_status == 404:
+ print "parent dataset does not exist"
+ else:
+ raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
+ ckan.last_status, pkg_name, e.args))
+
--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -34,7 +34,7 @@
if last_title == "Description":
doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
elif last_title == "Download":
- doc['metadata'][last_title] = {}
+ doc['metadata'][last_title] = []
for item in child.find_all("li"):
link = item.find("a")
format = item.find(property="dc:format")
@@ -42,7 +42,7 @@
"format": format.string.strip(), "size": format.next_sibling.string.strip()}
if link.string != None:
linkobj["name"] = link.string.strip()
- doc['metadata'][last_title][] = linkobj
+ doc['metadata'][last_title].append(linkobj)
else:
atags = child.find_all('a')
--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -1,1 +1,24 @@
+import sys, os
+import time
+import scrape
+from bs4 import BeautifulSoup
+from unidecode import unidecode
+
+listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"
+(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
+ listurl, "gazette", "AGD")
+soup = BeautifulSoup(listhtml)
+for row in soup.find_all('tr'):
+ if row.has_key('valign'):
+        for col in row.find_all('td'):
+ print col.string
+ #url = scrape.fullurl(listurl, atag['href'])
+ #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+ # url, "data", "AGIMO")
+ #hash = scrape.mkhash(scrape.canonurl(url))
+ #doc = scrape.docsdb.get(hash)
+ #print doc['metadata']
+ #scrape.docsdb.save(doc)
+ #time.sleep(2)
+
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -104,7 +104,7 @@
if doc == None:
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
else:
- if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000):
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 60 * 24 * 14):
print "Uh oh, trying to scrape URL again too soon!" + hash
last_attachment_fname = doc["_attachments"].keys()[-1]
last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
@@ -208,8 +208,8 @@
scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
#couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://192.168.1.113:5984/')
-#couch = couchdb.Server('http://127.0.0.1:5984/')
+#couch = couchdb.Server('http://192.168.1.113:5984/')
+couch = couchdb.Server('http://127.0.0.1:5984/')
# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']
--- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
@@ -26,8 +26,8 @@
ScraperImplementation().doScrape()
except Exception, err:
sys.stderr.write('ERROR: %s\n' % str(err))
- print ‘Error Reason: ‘, err.__doc__
- print ‘Exception: ‘, err.__class__
+ print "Error Reason: ", err.__doc__
+ print "Exception: ", err.__class__
print traceback.format_exc()
if amon_available:
data = {
--- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
+++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
@@ -14,5 +14,3 @@
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper)
ScraperImplementation().doScrape()
-www.finance.gov.au/foi/disclosure-log/foi-rss.xml
-
--- a/getAgency.php
+++ b/getAgency.php
@@ -178,7 +178,7 @@
echo '<div><i class="icon-shopping-cart" style="float:left"></i><p style="margin-left:16px;">';
$keys = array_keys($row['statistics']['budget']);
$lastkey = $keys[count($keys)-1];
- echo money_format("%#10i",(float)$row['statistics']['budget'][$lastkey]['value']).' <small>('.$lastkey.' budget)</small>';
+ echo "$".number_format(floatval($row['statistics']['budget'][$lastkey]['value'])).' <small>('.$lastkey.' budget)</small>';
echo '</div>';
}
echo ' </ul>
@@ -245,9 +245,9 @@
</div><!--/span-->
<div class="span9">
<div class="hero-unit">
- <h1>Hello, world!</h1>
- <p>This is a template for a simple marketing or informational website. It includes a large callout called the hero unit and three supporting pieces of content. Use it as a starting point to create something more unique.</p>
- <p><a class="btn btn-primary btn-large">Learn more »</a></p>
+ <h1>Australian Government Agencies</h1>
+ <p>Explore collected information about Australian Government Agencies below.</p>
+
</div>
<div class="row-fluid">
<div class="span4">';