From: Maxious
Date: Thu, 07 Feb 2013 08:20:43 +0000
Subject: import
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=d6c9103f693ef96468328f4c5a9104102f01c0dd

---
import

Former-commit-id: 997dc9ece51a37dc25779ca4125d0960cdc195c9
---

--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -40,6 +40,8 @@
 $obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
 $obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
 $obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
+
+$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
 $obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
 $docdb->save($obj, true);
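The new datasets view is what the export script below walks. As a minimal sketch, assuming (as the export script does) that the design document is named app and that CouchDB is running at the same local address used elsewhere in this commit, it can be queried from python-couchdb like so:

    import couchdb

    couch = couchdb.Server('http://127.0.0.1:5984/')
    docsdb = couch['disclosr-documents']
    # each row pairs doc._id with the whole document, per the map function above
    for row in docsdb.view('app/datasets'):
        print row.id, row.value['url']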
--- /dev/null
+++ b/documents/datagov-export.py
@@ -1,1 +1,154 @@
+import ckanclient
+import couchdb
+from ckanclient import CkanApiError
+import re
+
+
+class LoaderError(Exception):
+    pass
+
+# Instantiate the CKAN client.
+ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
+                             api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
+# (use your own api_key from http://thedatahub.org/user/me )
+
+# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
+SYMBOLS = {
+    'customary': ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
+    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
+                      'zetta', 'iotta'),
+    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
+    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
+                'zebi', 'yobi'),
+}
+
+
+def human2bytes(s):
+    """
+    Attempts to guess the string format based on default symbols
+    set and return the corresponding bytes as an integer.
+    When unable to recognize the format ValueError is raised.
+
+    >>> human2bytes('0 B')
+    0
+    >>> human2bytes('1 K')
+    1024
+    >>> human2bytes('1 M')
+    1048576
+    >>> human2bytes('1 Gi')
+    1073741824
+    >>> human2bytes('1 tera')
+    1099511627776
+
+    >>> human2bytes('0.5kilo')
+    512
+    >>> human2bytes('0.1 byte')
+    0
+    >>> human2bytes('1 k')  # k is an alias for K
+    1024
+    >>> human2bytes('12 foo')
+    Traceback (most recent call last):
+        ...
+    ValueError: can't interpret '12 foo'
+    """
+    init = s
+    num = ""
+    while s and (s[0:1].isdigit() or s[0:1] == '.'):
+        num += s[0]
+        s = s[1:]
+    num = float(num)
+    letter = s.strip()
+    # sizes scraped from data.gov.au use two-letter suffixes such as "KB"/"MB";
+    # normalise them to the single-letter customary symbols before lookup
+    if len(letter) == 2 and letter.upper().endswith('B'):
+        letter = letter[0]
+    for name, sset in SYMBOLS.items():
+        if letter in sset:
+            break
+    else:
+        if letter == 'k':
+            # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs
+            sset = SYMBOLS['customary']
+            letter = letter.upper()
+        else:
+            raise ValueError("can't interpret %r" % init)
+    prefix = {sset[0]: 1}
+    for i, s in enumerate(sset[1:]):
+        prefix[s] = 1 << (i + 1) * 10
+    return int(num * prefix[letter])
+
+
+# https://github.com/okfn/ckanext-importlib
+def munge(name):
+    # convert spaces to underscores
+    name = re.sub(' ', '_', name).lower()
+    # convert symbols to dashes
+    name = re.sub('[:]', '_-', name).lower()
+    name = re.sub('[/]', '-', name).lower()
+    # take out not-allowed characters
+    name = re.sub('[^a-zA-Z0-9-_]', '', name).lower()
+    # remove double underscores
+    name = re.sub('__', '_', name).lower()
+    return name
+
+
+def name_munge(input_name):
+    return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
+    #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
+
+
+couch = couchdb.Server('http://127.0.0.1:5984/')
+docsdb = couch['disclosr-documents']
+
+if __name__ == "__main__":
+    for doc in docsdb.view('app/datasets'):
+        print doc.id
+        if doc.value['url'] != "http://data.gov.au/data/":
+            # Collect the package metadata.
+            pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100])
+            tags = doc.value['metadata']["Keywords / Tags"]
+            if not hasattr(tags, '__iter__'):
+                tags = [tags]
+            # sanitise tags down to CKAN-safe strings
+            tags = [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]
+            package_entity = {
+                'name': pkg_name,
+                'title': doc.value['metadata']['DCTERMS.Title'],
+                'url': doc.value['metadata']['DCTERMS.Source.URI'],
+                'tags': tags,
+                'author': doc.value['metadata']["DCTERMS.Creator"],
+                'maintainer': doc.value['metadata']["DCTERMS.Creator"],
+                'license_id': doc.value['metadata']['DCTERMS.License'],  #todo licence id mapping
+                'notes': doc.value['metadata']['Description'],
+            }
+            try:
+                #print doc.id
+                ckan.package_register_post(package_entity)
+            except CkanApiError, e:
+                if ckan.last_status == 409:
+                    print "already exists"
+                else:
+                    raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
+                        ckan.last_status, pkg_name, e.args))
+
+            print package_entity
+            #todo add to organisation (author/creator/maintainer)
+            #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group
+            if 'Download' in doc.value['metadata'].keys():
+                try:
+                    pkg = ckan.package_entity_get(pkg_name)
+                    resources = pkg.get('resources', [])
+                    if len(resources) < len(doc.value['metadata']['Download']):
+                        for resource in doc.value['metadata']['Download']:
+                            #print resource
+                            # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
+                            # (KML/KMZ) / (Shapefile) / (Other)
+                            format = "plain"
+                            if resource['format'] == '(XML)':
+                                format = 'xml'
+                            if resource['format'] == '(CSV/XLS)':
+                                format = 'csv'
+                            name = resource['href']
+                            if 'name' in resource.keys():
+                                name = resource['name']
+                            ckan.add_package_resource(pkg_name, resource['href'], name=name,
+                                                      resource_type='data', format=format,
+                                                      size=human2bytes(resource['size'].replace(',', '')))
+                    else:
+                        print "resources already exist"
+                except CkanApiError, e:
+                    if ckan.last_status == 404:
+                        print "parent dataset does not exist"
+                    else:
+                        raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
+                            ckan.last_status, pkg_name, e.args))
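A quick, hypothetical sanity check for the helpers above. It assumes human2bytes and name_munge are in scope (for example, pasted at the bottom of documents/datagov-export.py itself, since the hyphenated filename cannot be imported directly), and the sample strings merely mimic data.gov.au metadata values:

    # hypothetical inputs mirroring resource['size'] and DCTERMS.Title values
    print human2bytes('52.9 KB')   # -> 54169 (52.9 * 1024, truncated to int)
    print human2bytes('1.2 MB')    # -> 1258291
    print name_munge('Average Annual Rainfall & Temperature')
    # -> 'averageannualrainfallandtemperature'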
--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -34,7 +34,7 @@
         if last_title == "Description":
             doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
         elif last_title == "Download":
-            doc['metadata'][last_title] = {}
+            doc['metadata'][last_title] = []
             for item in child.find_all("li"):
                 link = item.find("a")
                 format = item.find(property="dc:format")
@@ -42,7 +42,7 @@
                            "format": format.string.strip(),
                            "size": format.next_sibling.string.strip()}
                 if link.string != None:
                     linkobj["name"] = link.string.strip()
-                doc['metadata'][last_title][] = linkobj
+                doc['metadata'][last_title].append(linkobj)
         else:
             atags = child.find_all('a')

--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -1,1 +1,24 @@
+import sys, os
+import time
+import scrape
+from bs4 import BeautifulSoup
+from unidecode import unidecode
+
+listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"
+(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
+                                             listurl, "gazette", "AGD")
+soup = BeautifulSoup(listhtml)
+for row in soup.find_all('tr'):
+    if row.has_key('valign'):
+        # print each cell in this gazette listing row
+        for col in row.find_all('td'):
+            print col.string
+            #url = scrape.fullurl(listurl, atag['href'])
+            #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+            #                                         url, "data", "AGIMO")
+            #hash = scrape.mkhash(scrape.canonurl(url))
+            #doc = scrape.docsdb.get(hash)
+            #print doc['metadata']
+            #scrape.docsdb.save(doc)
+            #time.sleep(2)
+
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -104,7 +104,7 @@
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
     else:
-        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000):
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14):
             print "Uh oh, trying to scrape URL again too soon!" + hash
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
@@ -208,8 +208,8 @@
                     scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)

 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://192.168.1.113:5984/')
-#couch = couchdb.Server('http://127.0.0.1:5984/')
+#couch = couchdb.Server('http://192.168.1.113:5984/')
+couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']

--- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
@@ -26,8 +26,8 @@
         ScraperImplementation().doScrape()
     except Exception, err:
         sys.stderr.write('ERROR: %s\n' % str(err))
-        print ‘Error Reason: ‘, err.__doc__
-        print ‘Exception: ‘, err.__class__
+        print "Error Reason: ", err.__doc__
+        print "Exception: ", err.__class__
         print traceback.format_exc()
     if amon_available:
         data = {

--- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
+++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
@@ -14,5 +14,3 @@
     print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper)
     ScraperImplementation().doScrape()
-www.finance.gov.au/foi/disclosure-log/foi-rss.xml
-
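For context on the Download change in datagov.py above: the scraper now appends one link object per download row, and datagov-export.py iterates that list to register CKAN resources. A minimal sketch with made-up values, in the shape both scripts agree on:

    # made-up 'Download' entry in the shape datagov.py now stores
    doc = {'metadata': {'Download': []}}
    linkobj = {"format": "(CSV/XLS)",
               "size": "52.9 KB",
               "href": "http://data.gov.au/rainfall.csv",
               "name": "Rainfall data"}
    doc['metadata']['Download'].append(linkobj)
    for resource in doc['metadata']['Download']:
        print resource['href'], resource['size']   # one CKAN resource each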