From: Alex Sadleir Date: Sat, 09 Feb 2013 12:26:38 +0000 Subject: export no-html and licence X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=65934a8aa047453aaa512d4079b451613f41e4bd --- export no-html and licence Former-commit-id: 39dafe9fefec609588df4f189c2364dae8edd246 --- --- a/documents/datagov-export.py +++ b/documents/datagov-export.py @@ -7,9 +7,39 @@ pass # Instantiate the CKAN client. -ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', - api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') -# (use your own api_key from http://thedatahub.org/user/me ) +#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') +ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', api_key='72f90359-0396-438c-804f-a26a24336747') +#couch = couchdb.Server('http://127.0.0.1:5984/') +couch = couchdb.Server('http://192.168.1.113:5984/') + +# http://stackoverflow.com/a/7778368/684978 +from HTMLParser import HTMLParser +import htmlentitydefs + +class HTMLTextExtractor(HTMLParser): + def __init__(self): + HTMLParser.__init__(self) + self.result = [ ] + + def handle_data(self, d): + self.result.append(d) + + def handle_charref(self, number): + codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number) + self.result.append(unichr(codepoint)) + + def handle_entityref(self, name): + codepoint = htmlentitydefs.name2codepoint[name] + self.result.append(unichr(codepoint)) + + def get_text(self): + return u''.join(self.result) + +def html_to_text(html): + s = HTMLTextExtractor() + s.feed(html) + return s.get_text() + # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/ SYMBOLS = { 'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'), @@ -88,7 +118,24 @@ return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and')) #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and') -couch = couchdb.Server('http://127.0.0.1:5984/') +def get_licence_id(licencename): + map = { + "Creative Commons - Attribution-Share Alike 2.0 Australia (CC-SA)\nThe downloadable version of the database is licensed under CC-BY-SA Creative Commons Attribution Share Alike and contains only the database fields that are released under that license. These fields are object title, object number, object description as well as temporal, spatial and dimension details. It also contains a persistent URL for each record.": 'cc-by-sa', + "CreativeCommonsAttributionNonCommercial30AustraliaCCBYNC30": 'cc-nc', + 'Otherpleasespecify': 'notspecified', + '': 'notspecified', + "Publicly available data": 'notspecified', + "CreativeCommonsAttributionNoDerivativeWorks30AustraliaCCBYND30": "other-closed", + "CreativeCommonsAttributionNonCommercialNoDerivs30AustraliaCCBYNCND30": "other-closed", + 'CreativeCommonsAttribution30AustraliaCCBY30': 'cc-by', + "Creative Commons - Attribution 2.5 Australia (CC-BY)": 'cc-by', + 'CreativeCommonsAttributionCCBY25': 'cc-by', + "PublicDomain": 'other-pd', + } + if licencename not in map.keys(): + raise Exception(licencename + " not found"); + return map[licencename]; + docsdb = couch['disclosr-documents'] if __name__ == "__main__": @@ -100,17 +147,20 @@ tags = doc.value['metadata']["Keywords / Tags"] if not hasattr(tags, '__iter__'): tags = [tags] - [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags] + [re.sub('[^a-zA-Z0-9-_()]', '', tag).replace('&', 'and').lower() for tag in tags] package_entity = { 'name': pkg_name, 'title': doc.value['metadata']['DCTERMS.Title'], 'url': doc.value['metadata']['DCTERMS.Source.URI'], - 'tags': tags, + 'author': doc.value['metadata']["DCTERMS.Creator"], 'maintainer': doc.value['metadata']["DCTERMS.Creator"], - 'licence_id': doc.value['metadata']['DCTERMS.License'], #todo licence id mapping - 'notes': doc.value['metadata']['Description'], + 'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']), + 'notes': html_to_text(doc.value['metadata']['Description']), } + if len(tags) > 0: + package_entity['tags'] = tags + print tags try: #print doc.id ckan.package_register_post(package_entity) @@ -122,7 +172,7 @@ ckan.last_status, pkg_name, e.args)) print package_entity - #todo add to organisation (author/creator/maintainer) + #todo add to organisation (author/creator/maintainer) http://docs.ckan.org/en/latest/apiv3.html#examples ckan.logic.action.update.package_owner_org_update #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group if 'Download' in doc.value['metadata'].keys(): try: @@ -130,7 +180,7 @@ resources = pkg.get('resources', []) if len(resources) < len(doc.value['metadata']['Download']): for resource in doc.value['metadata']['Download']: - #print resource + print resource # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html # (KML/KMZ) / (Shapefile) /(Other) format = "plain" --- a/documents/datagov.py +++ b/documents/datagov.py @@ -13,7 +13,7 @@ if atag.has_key('href'): url = scrape.fullurl(listurl, atag['href']) (url, mime_type, html) = scrape.fetchURL(scrape.docsdb, - url, "data", "AGIMO") + url, "data", "AGIMO", False) hash = scrape.mkhash(scrape.canonurl(url)) doc = scrape.docsdb.get(hash) if "metadata" not in doc.keys() or True: --- a/documents/scrape.py +++ b/documents/scrape.py @@ -12,6 +12,11 @@ import urlparse import socket +#couch = couchdb.Server('http://192.168.1.148:5984/') +couch = couchdb.Server('http://192.168.1.113:5984/') +#couch = couchdb.Server('http://127.0.0.1:5984/') + + def mkhash(input): return hashlib.md5(input).hexdigest().encode("utf-8") @@ -104,14 +109,11 @@ if doc == None: doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'} else: - if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14): + if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): print "Uh oh, trying to scrape URL again too soon!" + hash last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) content = last_attachment - return (doc['url'], doc['mime_type'], content.read()) - if scrape_again == False: - print "Not scraping this URL again as requested" return (doc['url'], doc['mime_type'], content.read()) req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") @@ -207,9 +209,6 @@ #print linkurl scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID) -#couch = couchdb.Server('http://192.168.1.148:5984/') -#couch = couchdb.Server('http://192.168.1.113:5984/') -couch = couchdb.Server('http://127.0.0.1:5984/') # select database agencydb = couch['disclosr-agencies'] docsdb = couch['disclosr-documents']