From: Maxious Date: Sat, 22 Jun 2013 06:57:06 +0000 Subject: more export fixes X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=835cba9e5eae8d1d8768b5cb52bb5c24dfbda04d --- more export fixes Former-commit-id: 4ca7f1f81d51d7c094d904b6aa1e176cf5958630 --- --- a/admin/refreshDesignDoc.php +++ b/admin/refreshDesignDoc.php @@ -8,7 +8,7 @@ $obj->_id = "_design/" . urlencode("app"); $obj->language = "javascript"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; -$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; +$obj->views->byDate->map = "function(doc) { if (doc.title != "Disclosure Log Updated") { emit(doc.date, doc); } };"; $obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDateMonthYear->reduce = "_count"; $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; --- a/documents/datagov-export-groups.py +++ b/documents/datagov-export-groups.py @@ -9,8 +9,12 @@ # Instantiate the CKAN client. #ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') -api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a' -ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', +api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc' +server = 'data.disclosurelo.gs' +api_key = 'c30eb6f5-0f90-47e0-bf05-9b1b4e3a461a' +server = 'ckan.data.gov.au' + +ckan = ckanclient.CkanClient(base_location='http://' + server + '/api', api_key=api_key) couch = couchdb.Server('http://127.0.0.1:5984/') #couch = couchdb.Server('http://192.168.1.113:5984/') @@ -39,41 +43,43 @@ groups = {} for doc in docsdb.view('app/datasetGroups'): group_name = doc.key - pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_', - doc.value.replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]); - if group_name in groups.keys(): - groups[group_name] = list(set(groups[group_name] + [pkg_name])) - else: - groups[group_name] = [pkg_name] + if group_name != "Not specified": + pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_', + doc.value.replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]); + if group_name in groups.keys(): + groups[group_name] = list(set(groups[group_name] + [pkg_name])) + else: + groups[group_name] = [pkg_name] # add dataset to group(s) for group_name in groups.keys(): - group_url = name_munge(group_name[:100]) - print group_name - print groups[group_name] - try: - # Update the group details - group_entity = ckan.group_entity_get(group_url) - print "group "+group_name+" exists" - if 'packages' in group_entity.keys(): - group_entity['packages'] = list(set(group_entity['packages'] + groups[group_name])) - else: - group_entity['packages'] = groups[group_name] - ckan.group_entity_put(group_entity) - except CkanApiError, e: - if ckan.last_status == 404: - print "group "+group_name+" does not exist, creating" - group_entity = { - 'name': group_url, - 'title': group_name, - 'description': group_name, - 'packages': groups[group_name] - } - #print group_entity - ckan.group_register_post(group_entity) - elif ckan.last_status == 409: - print "group already exists" - else: - raise LoaderError('Unexpected status %s adding to group under \'%s\': %r' % ( - ckan.last_status, pkg_name, e.args)) + if group_name != "Not specified": + group_url = name_munge(group_name[:100]) + print group_name + print groups[group_name] + try: + # Update the group details + group_entity = ckan.group_entity_get(group_url) + print "group "+group_name+" exists" + if 'packages' in group_entity.keys(): + group_entity['packages'] = list(set(group_entity['packages'] + groups[group_name])) + else: + group_entity['packages'] = groups[group_name] + ckan.group_entity_put(group_entity) + except CkanApiError, e: + if ckan.last_status == 404: + print "group "+group_name+" does not exist, creating" + group_entity = { + 'name': group_url, + 'title': group_name, + 'description': group_name, + 'packages': groups[group_name] + } + #print group_entity + ckan.group_register_post(group_entity) + elif ckan.last_status == 409: + print "group already exists" + else: + raise LoaderError('Unexpected status %s adding to group under \'%s\': %r' % ( + ckan.last_status, pkg_name, e.args)) --- a/documents/datagov-export.py +++ b/documents/datagov-export.py @@ -1,21 +1,85 @@ +# coding=utf-8 import ckanclient import couchdb from ckanclient import CkanApiError import re import html2text # aaronsw :( import ckanapi # https://github.com/open-data/ckanapi - +import scrape +import datetime, os, hashlib +import urllib2 class LoaderError(Exception): pass +import tempfile +def add_package_resource_cachedurl(ckan, package_name, url, name, format, license_id, size,**kwargs): + excluded_apis = "recent-earthquakes,sentinel-hotspots,abc-local-stations,action-bus-service-gtfs-feed-act,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,journey-planner-data-act,live-traffic-cameras-nsw,precis-forecast-national,precis-forecast-national,precis-forecast-new-south-wales,precis-forecast-new-south-wales,precis-forecast-northern-territory,precis-forecast-northern-territory,precis-forecast-queensland,precis-forecast-queensland,precis-forecast-south-australia,precis-forecast-south-australia,precis-forecast-tasmania,precis-forecast-tasmania,precis-forecast-victoria,precis-forecast-victoria,precis-forecast-western-australia,precis-forecast-western-australia,register-of-penalty-notices-nsw,sentinel-hotspots,trove-people-and-organisations-data,weather-data-services-radar,abc-local-stations,act-emergency-services-agency-esa-28093-current-incidents,act-emergency-services-agency-esa-news-alerts,act-government-news-and-events,act-government-summaries-of-cabinet-outcomes,act-magistrates-court-judgements,act-supreme-court-judgements,act-supreme-court-sentences,actpla-latest-news,all-vacant-act-government-jobs,community-engagement-current-engagements,community-engagement-news,edd-media-releases,edd-news-and-events,freedom-of-information-foi-summaries,libraries-act-announcements,nsw-rural-fire-service-current-incidents,nsw-rural-fire-service-major-updates,precis-forecast-new-south-wales,precis-forecast-south-australia,precis-forecast-tasmania,precis-forecast-victoria,sentinel-hotspots,south-australian-road-crash-statistics,trove-people-and-organisations-data,weather-warnings-for-new-south-wales-australian-capital-territory,weather-warnings-for-northern-territory,weather-warnings-for-queensland,weather-warnings-for-south-australia,weather-warnings-for-tasmania,weather-warnings-for-victoria,weather-warnings-for-western-australia".split(",") + if "xls" in url: + format = "xls" + if "pdf" in url: + format = "pdf" + if "xlsx" in url: + format = "xlsx" + (returned_url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + url, "dataset_resource", "AGIMO", False) + if mime_type in ["application/vnd.ms-excel","application/msexcel","application/x-msexcel","application/x-ms-excel","application/x-excel","application/x-dos_ms_excel","application/xls","application/x-xls"]: + format = "xls" + if mime_type in ["application/xlsx","application/x-xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]: + format = "xlsx" + + #put file extensions on for windows users downloading files + suffix = name.encode("ascii","ignore").replace("/","") + if len(suffix) < 5 or (suffix[-4] != "." and suffix[-5] != "."): + suffix = suffix + "." + format + if content != None and package_name not in excluded_apis: + tf = tempfile.NamedTemporaryFile(suffix=suffix) + tfName = os.path.abspath(tf.name) + print tfName + tf.seek(0) + tf.write(content) + tf.flush() + ckan.add_package_resource (package_name, tfName, name=name, format=format, license_id=license_id) + else: + print "fetch error" + return ckan.add_package_resource(package_name, url, name=name, resource_type='data', + format=format, + size=size, mimetype=mime_type, license_id=license_id) + # Instantiate the CKAN client. api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc' -ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', +server = 'data.disclosurelo.gs' + +ckan = ckanclient.CkanClient(base_location='http://' + server + '/api', api_key=api_key) -ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key) +ckandirect = ckanapi.RemoteCKAN('http://' + server, api_key=api_key) couch = couchdb.Server('http://127.0.0.1:5984/') #couch = couchdb.Server('http://192.168.1.113:5984/') + +import urllib +import urlparse + + +def url_fix(s, charset='utf-8'): + """Sometimes you get an URL by a user that just isn't a real + URL because it contains unsafe characters like ' ' and so on. This + function can fix some of the problems in a similar way browsers + handle data entered by the user: + + >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') + 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' + + :param charset: The target charset for the URL if the url was + given as unicode string. + """ + if isinstance(s, unicode): + s = s.encode(charset, 'ignore') + if not urlparse.urlparse(s).scheme: + s = "http://" + s + scheme, netloc, path, qs, anchor = urlparse.urlsplit(s) + path = urllib.quote(path, '/%') + qs = urllib.quote_plus(qs, ':&=') + return urlparse.urlunsplit((scheme, netloc, path, qs, anchor)) # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/ SYMBOLS = { @@ -57,7 +121,7 @@ ValueError: can't interpret '12 foo' """ if s == None: - return 0 + return 0 s = s.replace(',', '') init = s num = "" @@ -99,15 +163,15 @@ return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and')) -def get_licence_id(licencename): +def get_license_id(licencename): map = { "Creative Commons - Attribution-Share Alike 2.0 Australia (CC-SA)\nThe downloadable version of the database is licensed under CC-BY-SA Creative Commons Attribution Share Alike and contains only the database fields that are released under that license. These fields are object title, object number, object description as well as temporal, spatial and dimension details. It also contains a persistent URL for each record.": 'cc-by-sa', "CreativeCommonsAttributionNonCommercial30AustraliaCCBYNC30": 'cc-nc', 'Otherpleasespecify': 'notspecified', '': 'notspecified', "Publicly available data": 'notspecified', - "CreativeCommonsAttributionNoDerivativeWorks30AustraliaCCBYND30": "other-closed", - "CreativeCommonsAttributionNonCommercialNoDerivs30AustraliaCCBYNCND30": "other-closed", + "CreativeCommonsAttributionNoDerivativeWorks30AustraliaCCBYND30": "cc-by-nd", + "CreativeCommonsAttributionNonCommercialNoDerivs30AustraliaCCBYNCND30": "cc-nc-nd", 'CreativeCommonsAttribution30AustraliaCCBY30': 'cc-by', "Creative Commons - Attribution 2.5 Australia (CC-BY)": 'cc-by', 'CreativeCommonsAttributionCCBY25': 'cc-by', @@ -117,6 +181,9 @@ raise Exception(licencename + " not found"); return map[licencename]; +goodcsvdata = "afl-in-victoria,annual-budget-initiatives-by-suburb-brisbane-city-council,athletics-in-victoria-gfyl,bicycle-racks-mosman-municipal-council,boat-ramps-brisbane-city-council,brisbane-access-ratings-database,bus-stops-brisbane-city-council,cemeteries-brisbane-city-council,cfa-locations,citycycle-stations-brisbane-city-council,community-gardens-brisbane-city-council,community-halls-brisbane-city-council,cooking-classes-gfyl,court-locations-victoria,customer-service-centres-brisbane-city-council,dance-in-victoria-gfyl,disability-activity-gfyl,dog-parks-brisbane-city-council,ferry-terminals-brisbane-city-council,fishing-club-in-victoria-gfyl,fitness-centres-in-victoria-gfyl,gardens-reserves-gfyl,golf-courses-brisbane-city-council,gymnastics-in-victoria-gfyl,historic-cemeteries-brisbane-city-council,ice-skating-centres-gfyl,immunisation-clinics-brisbane-city-council,libraries-brisbane-city-council,licenced-venues-victoria,lifesaving-locations-victoria,loading-zones-brisbane-city-council,major-projects-victoria,markets-in-victoria,martial-arts-in-victoria-gfyl,melbourne-water-use-by-postcode,members-of-parliament-both-houses-nsw,members-of-the-legislative-assembly-nsw,members-of-the-legislative-council-nsw,mfb-locations-vic,ministers-of-the-nsw-parliament,mosman-local-government-area,mosman-rider-route,mosman-wwii-honour-roll,neighbourhood-houses-gfyl,news-feeds-mosman-municipal-council,off-street-car-parks-mosman-municipal-council,orienteering-clubs-gfyl,parking-meter-areas-brisbane-city-council,parks-and-reserves-mosman-municipal-council,parks-brisbane-city-council,personal-training-gfyl,picnic-areas-brisbane-city-council,playgrounds-brisbane-city-council,playgrounds-mosman-municipal-council,police-region-crime-statistics-victoria,police-service-area-crime-statistics-victoria,pony-clubs-in-victoria-gfyl,prison-locations-victoria,public-amenities-maintained-by-mosman-council,public-art-brisbane-city-council,public-internet-locations-vic,public-toilets-brisbane-city-council,racecourse-locations-victoria,recent-development-applications-mosman-municipal-council,recreation-groups-gfyl,recreational-fishing-spots,regional-business-centres-brisbane-city-council,reports-of-swooping-birds-mosman-municipal-council,restricted-parking-areas-brisbane-city-council,rollerskating-centres-in-victoria-gfyl,sailing-clubs-gfyl,school-locations-victoria,shadow-ministers-of-the-nsw-parliament,skate-parks-gfyl,sporting-clubs-and-organisations-gfyl,stakeboard-parks-brisbane-city-council,state-bodies-gfyl,street-names-brisbane-city-council,suburbs-and-adjoining-suburbs-brisbane-city-council,swimming-pools-brisbane-city-council,swimming-pools-gfyl,tennis-courts-brisbane-city-council,top-40-book-club-reads-brisbane-city-council,tracks-and-trails-gfyl,triathlon-clubs-gfyl,urban-water-restrictions-victoria,veterinary-services-in-mosman,victorian-microbreweries,volunteering-centres-services-and-groups-victoria,walking-groups-gfyl,ward-offices-brisbane-city-council,waste-collection-days-brisbane-city-council,waste-transfer-stations-brisbane-city-council,water-consumption-in-melbourne,water-sports-in-victoria-gfyl,wifi-hot-spots-brisbane-city-council,yoga-pilates-and-tai-chi-in-victoria-gfyl,2809cycling-in-new-south-wales-what-the-data-tells-us2809-and-related-data,act-barbecue-bbq-locations,act-tafe-locations,ausindustry-locations,austender-contract-notice-export,austender-contract-notice-export,austender-contract-notice-export,austender-contract-notice-export,austender-contract-notice-export,austender-contract-notice-export,austender-contract-notice-export,austender-contract-notice-export,australian-gas-light-company-maps,australian-gas-light-company-maps,australian-ports,australian-public-service-statistical-bulletin-2011-12,australian-public-service-statistical-bulletin-snapshot-at-december-31-2011,australian-public-service-statistical-bulletin-tables-0910,austrics-timetable-set,capital-works-call-tender-schedule,collection-item-usage-state-library-of-victoria,country-and-commodity-trade-data-spreadsheet,country-and-commodity-trade-data-spreadsheet-2,country-by-level-of-processing-trade-data-spreadsheet,crime-incident-type-and-frequency-by-capital-city-and-nationally,csiro-locations,data-from-the-oaic-public-sector-information-survey-2012,data-from-the-oaic-public-sector-information-survey-2012,data-from-the-oaic-public-sector-information-survey-2012,department-of-finance-and-deregulation-office-locations,digitised-maps,diisr-division-locations-excluding-ausindustry-enterprise-connect-and-nmi,diisr-locations,diisr-portfolio-agency-locations-excluding-csiro,distance-to-legal-service-providers-from-disadvantaged-suburbs,enterprise-connect-locations,fire-insurance-maps-sydney-block-plans-1919-1940,fire-insurance-maps-sydney-block-plans-1919-1940,first-fleet-collection,first-fleet-collection,first-fleet-maps,first-fleet-maps,freedom-of-information-annual-estimated-costs-and-staff-time-statistical-data-2011-12,freedom-of-information-quarterly-request-and-review-statistical-data-2011-12,freedom-of-information-requests-estimated-costs-and-charges-collected-1982-83-to-2011-12,higher-education-course-completions,higher-education-enrolments,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,historical-australian-government-contract-data,journey-planner-data-nt,library-catalogue-search-terms-state-library-of-victoria,location-of-act-schools,location-of-centrelink-offices,location-of-european-wasps-nests,location-of-lawyers-and-legal-service-providers-by-town,location-of-legal-assistance-service-providers,location-of-medicare-offices,location-of-medicare-offices,maps-of-the-southern-hemisphere-16th-18th-centuries,maps-of-the-southern-hemisphere-16th-18th-centuries,music-queensland,national-measurement-institute-locations,new-south-wales-officers-and-men-of-the-australian-imperial-force-a-i-f-and-the-australian-naval-for,new-south-wales-officers-and-men-of-the-australian-imperial-force-a-i-f-and-the-australian-naval-for,photographs-of-nsw-life-pre-1955,photographs-of-nsw-life-pre-1955,photographs-of-sydney-before-1885,photographs-of-sydney-before-1885,picture-queensland,plgr-28093-playgrounds-act,police-station-locations,queensland-public-libraries,rare-printed-books,rare-printed-books,real-estate-maps,regional-australia-funding-projects,sa-memory-state-library-of-south-australia,search-engine-terms-state-library-of-victoria,south-australian-photographs-state-library-of-south-australia,south-australian-sheet-music-state-library-of-south-australia,sydney-bond-store-maps-1894,sydney-bond-store-maps-1894,sydney-maps-1917,sydney-maps-1917,tafe-institute-locations-victoria,tafe-sa-campus-locations,tolt-public-toilets-act,victorian-public-library-branches-state-library-of-victoria,western-australia-public-library-network,world-war-one-photographs-by-frank-hurley,world-war-one-photographs-by-frank-hurley,citycat-timetables-brisbane-city-council,cityferry-timetables-brisbane-city-council,cost-of-salinity-to-local-infrastructure-1996-97-summary-of-component-costs-of-salinity-by-reporting,cost-of-salinity-to-local-infrastructure-1996-97-summary-of-component-costs-of-salinity-by-reporting,downstream-cost-calculator-model-and-data-for-199697-or-2001-prices,economics-of-australian-soil-conditions-199697-limiting-factor-or-relative-yield-min-of-ry_salt2000-,geographical-names-register-gnr-of-nsw,victorian-dryland-salinity-assessment-2000-d01cac_ramsar_final-xls,victorian-dryland-salinity-assessment-2000-d02cac_fauna_final-xls,victorian-dryland-salinity-assessment-2000-d03cac_fauna_dist_final-xls,victorian-dryland-salinity-assessment-2000-dc04cac_hydrol_final-xls,victorian-dryland-salinity-assessment-2000-dc05cac_wetland_final-xls,victorian-dryland-salinity-assessment-2000-dc06cac_util_final-xls,victorian-dryland-salinity-assessment-2000-dc07cac_road_final-xls,victorian-dryland-salinity-assessment-2000-dc08cac_towns_final-xls,victorian-dryland-salinity-assessment-2000-dc09cac_flora_final-xls,victorian-dryland-salinity-assessment-2000-dc10cac_flora_dist_final-xls,victorian-dryland-salinity-assessment-2000-dc12cac_infrastructure-xls,victorian-dryland-salinity-assessment-2000-dc13cac_natural_envt-xls,victorian-dryland-salinity-assessment-2000-dc14cac_agriculture-xls,victorian-dryland-salinity-assessment-2000-dc16cac_agric_cost-xls,victorian-dryland-salinity-assessment-2000-dc17cac_shallow_wt-xls,victorian-dryland-salinity-assessment-2000-dc18cac_agric_cost_time-xls,victorian-dryland-salinity-assessment-2000-dc21cac_water_resources_new-xls,victorian-dryland-salinity-assessment-2000-dc22cac_risk-xls,licensed-broadcasting-transmitter-data,nsw-crime-data,recorded-crime-dataset-nsw,crime-statistics-in-nsw-by-month,2001-02-to-2007-08-local-government-survey-victoria,2009-green-light-report,annual-statistical-reports-fire-brigades-nsw-200304,annual-statistical-reports-fire-brigades-nsw-200405,annual-statistical-reports-fire-brigades-nsw-200506,annual-statistical-reports-fire-brigades-nsw-200607,arts-on-the-map,assets-and-liabilities-of-australian-located-operations,assets-of-australian-located-operations,assets-of-australian-located-operations-by-country,assets-of-financial-institutions,back-issues-of-monthly-banking-statistics,banks-assets,banks-consolidated-group-capital,banks-consolidated-group-impaired-assets,banks-consolidated-group-off-balance-sheet-business,banks-liabilities,building-societies-selected-assets-and-liabilities,byteback2842-locations-vic,cash-management-trusts,city-of-melbourne-street-furniture-database,community-services-nsw,consolidated-exposures-immediate-and-ultimate-risk-basis,consolidated-exposures-immediate-risk-basis-foreign-claims-by-country,consolidated-exposures-immediate-risk-basis-international-claims-by-country,consolidated-exposures-ultimate-risk-basis,consolidated-exposures-ultimate-risk-basis-foreign-claims-by-country,cosolidated-exposures-immediate-risk-basis,credit-unions-selected-assets-and-liabilities,daily-net-foreign-exchange-transactions,detox-your-home,education-national-assessment-program-literacy-and-numeracy-nsw,employment-data-by-nsw-regions,excise-beer-clearance-data-updated-each-month-beer-clearance-summary-data,finance-companies-and-general-financiers-selected-assets-and-liabilities,foreign-exchange-transactions-and-holdings-of-official-reserve-assets,half-yearly-life-insurance-bulletin-december-2010,health-behaviours-in-nsw,international-liabilities-by-country-of-the-australian-located-operations-of-banks-and-rfcs,liabilities-and-assets-monthly,liabilities-and-assets-weekly,liabilities-of-australian-located-operations,life-insurance-offices-statutory-funds,managed-funds,monetary-policy-changes,money-market-corporations-selected-assets-and-liabilities,monthly-airport-traffic-data-for-top-ten-airports-january-1985-to-december-2008,monthly-banking-statistics-april-2011,monthly-banking-statistics-june-2011,monthly-banking-statistics-may-2011,open-market-operations-2009-to-current,projected-households-vic-rvic-msd-2006-2056,projected-population-by-age-and-sex-vic-rvic-msd-2006-2056,public-unit-trust,quarterly-bank-performance-statistics,quarterly-general-insurance-performance-statistics-march-2011,quarterly-superannuation-performance-march-2011,recorded-crime-dataset-nsw,residential-land-bulletin,resourcesmart-retailers,resourcesmart-retailers-vic,road-fatalities-nsw,securitisation-vehicles,selected-asset-and-liabilities-of-the-private-non-financial-sectors,seperannuation-funds-outside-life-offices,solar-report-vic,towns-in-time-victoria,vif2008-projected-population-by-5-year-age-groups-and-sex-sla-lga-ssd-sd-2006-2026,vif2008-projected-population-totals-and-components-vic-rvic-msd-2006-2056,vif2008-projected-population-totals-sla-lga-ssd-sd-2006-2026,arts-festivals-victoria,arts-organisations-victoria,arts-spaces-and-places-victoria,ausgrid-average-electricity-use,collecting-institutions-victoria,indigenous-arts-organisations-victoria,latest-coastal-weather-observations-for-coolangatta-qld,top-10-fiction-books-brisbane-city-council".split(",") +goodotherdata = "abc-local-stations,abc-local-stations,abc-local-stations,act-emergency-services-agency-esa-28093-current-incidents,act-government-news-and-events,act-government-summaries-of-cabinet-outcomes,act-magistrates-court-judgements,act-supreme-court-judgements,act-supreme-court-sentences,action-bus-service-gtfs-feed-act,actpla-latest-news,agricultural-commodities-for-199697-linked-to-profit-function-surfaces,agricultural-structure-classification,agricultural-structure-classification,all-vacant-act-government-jobs,annual-family-income-1996-1997-to-1998-1999-three-year-average,apvma-pubcris-dataset-for-registered-agricultural-and-veterinary-chemical-products-and-approved-acti,argus-newspaper-collection-of-photographs-state-library-of-victoria,assessment-of-terrestrial-biodiversity-2002-biodiversity-audit-data-entry-system-bades,assessment-of-terrestrial-biodiversity-2002-database,assisted-immigration-1848-1912-index,ausgrid-average-electricity-use,ausgrid-average-electricity-use-2011,ausindustry-locations,ausindustry-locations,austender-contract-notice-export,australian-broadband-guarantee,australian-broadband-guarantee,australian-data-access,australian-dryland-salinity-assessment-spatial-data-12500000-nlwra-2001,australian-dryland-salinity-assessment-spatial-data-12500000-nlwra-2001,australian-groundwater-flow-systems-national-land-and-water-resources-audit-january-2000,australian-groundwater-flow-systems-national-land-and-water-resources-audit-january-2000,australian-irrigation-areas-raster-version-1a-national-land-and-water-resources-audit,australian-irrigation-areas-raster-version-1a-national-land-and-water-resources-audit,australian-irrigation-areas-vector-version-1a-national-land-and-water-resources-audit,australian-irrigation-areas-vector-version-1a-national-land-and-water-resources-audit,australian-public-service-statistical-bulletin-2010-11,australian-water-resources-assessment-2000-database,australiana-index-state-library-of-victoria,available-water-capacity-for-australian-areas-of-intensive-agriculture-of-layer-1-a-horizon-top-soil,bicycle-racks-mosman-municipal-council,bikeways-briisbane-city-council,bikeways-briisbane-city-council,boreholes-in-the-murray-basin-southeastern-australia,boreholes-in-the-murray-basin-southeastern-australia,british-convict-transportation-registers,calculated-annual-and-monthly-potential-evaporation-mm,calculated-annual-and-monthly-potential-evaporation-mm,canberra-suburb-boundaries,catchment-and-subcatchments-grid,cemeteries-brisbane-city-council,cemeteries-brisbane-city-council,coal-fields-in-the-murray-basin-southeastern-australia,coal-fields-in-the-murray-basin-southeastern-australia,commonwealth-agencies,commonwealth-electoral-boundaries-archive-2009,commonwealth-electoral-boundaries-archive-2009,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-all-infrastructure-buildings-road-rail-a,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-all-infrastructure-buildings-road-rail-a,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-increase-to-local-infrastructure-based-o,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-all-infrastructure-buildings-road-rai,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-all-infrastructure-buildings-road-rai,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-all-infrastructure-buildings-road-rai,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-all-infrastructure-buildings-road-rai,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-the-general-infrastructure-component-,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-the-rail-component-of-infrastructure-,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-the-general-infrastructure-component-bui,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-the-road-component-of-infrastructure-bas,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-the-road-component-of-infrastructure-bas,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-to-the-bridge-component-of-infrastructur,cost-of-salinity-to-local-infrastructure-1996-97-total-cost-to-the-bridge-component-of-infrastructur,country-by-level-of-processing-trade-data-spreadsheet-2,country-by-level-of-processing-trade-data-spreadsheet-2011-12,crime-incidents-data-2004-international-crime-victimisation-survey-icvs-australian-component,cropping-management-practices-1998-1999,csiro-locations,csiro-locations,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,cybersafety-outreach-program,cybersafety-outreach-program,data-source-for-polygonal-data-used-by-the-asris-project-in-generation-of-modelled-surfaces,department-of-finance-and-deregulation-office-locations,department-of-finance-and-deregulation-office-locations,depositional-path-length,digital-enterprise,digital-hubs,digitised-maps,diisr-division-locations-excluding-ausindustry-enterprise-connect-and-nmi,diisr-division-locations-excluding-ausindustry-enterprise-connect-and-nmi,diisr-locations,diisr-portfolio-agency-locations-excluding-csiro,diisr-portfolio-agency-locations-excluding-csiro,directory-gov-au-full-data-export,distance-to-ridges,economics-of-australian-soil-conditions-199697-factor-most-limiting-yield-aciditysodicitysalinity,economics-of-australian-soil-conditions-199697-gross-benefit-acidity-hayr,economics-of-australian-soil-conditions-199697-gross-benefit-of-the-limiting-factor-hayr,economics-of-australian-soil-conditions-199697-gross-benefit-salinity-hayr,economics-of-australian-soil-conditions-199697-gross-benefit-sodicity-hayr,economics-of-australian-soil-conditions-199697-impact-cost-of-salinity-2000-2020-hayr,economics-of-australian-soil-conditions-199697-relative-yield-from-acidity,economics-of-australian-soil-conditions-199697-relative-yield-from-salinity-in-2000,economics-of-australian-soil-conditions-199697-relative-yield-from-salinity-in-2020,economics-of-australian-soil-conditions-199697-relative-yield-from-sodicity,edd-media-releases,edd-news-and-events,egovernment-resource-centre-website-analytics,elevation-of-the-pre-tertiary-basement-in-the-murray-basin,elevation-of-the-pre-tertiary-basement-in-the-murray-basin,enterprise-connect-locations,enterprise-connect-locations,equivalent-fresh-water-head-difference-between-the-shallowest-and-deepest-aquifers,equivalent-fresh-water-head-difference-between-the-shallowest-and-deepest-aquifers,erosion-gully-density,erosion-path-length,estimated-proportion-of-farms-carrying-out-landcare-related-work-1998-1999,estimated-value-of-agricultural-operations-evao-1996-1997,farm-equity-ratio-1996-1997-to-1998-1999-three-year-average,farm-family-cash-income-1196-1997-to-1998-1999-three-year-average,farmer-population-1996,farms-with-significant-degradation-problems-irrigation-salinity-1998-1999,farms-with-significant-degradation-problems-irrigation-salinity-1998-1999-2,farms-with-significant-degradation-problems-soil-acidity-1998-1999,forests-of-australia-2003,freedom-of-information-foi-summaries,geology-lithology-12-500-000-scale,glenorchy-city-council-building-footprints,glenorchy-city-council-building-footprints,glenorchy-city-council-building-footprints,glenorchy-city-council-kerbs,glenorchy-city-council-kerbs,glenorchy-city-council-kerbs,glenorchy-city-council-stormwater-pipes,glenorchy-city-council-stormwater-pipes,glenorchy-city-council-stormwater-pipes,glenorchy-city-council-stormwater-pits,glenorchy-city-council-stormwater-pits,glenorchy-city-council-stormwater-pits,groundwater-sdl-resource-units,groundwater-sdl-resource-units,groundwater-sdl-resource-units,higher-qualifications-of-farmers-and-farm-managers-1996,historical-australian-government-contract-data,historical-australian-government-contract-data,hydrologic-indicator-sites,hydrologic-indicator-sites,immigration-land-orders-1861-1874,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-biota-condition-sub-in,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-catchment-condition-in,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-feral-animal-density,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-human-population-densi,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-impoundment-density,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-industrial-point-sourc,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-intensive-agricultural,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-land-condition-sub-ind,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-native-vegetation-frag,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-nutrient-point-source-,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-pesticide-hazard,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-predicted-2050-salinit,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-protected-areas,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-rivers-in-acidificatio,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-rivers-in-salt-hazard,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-rivers-through-forests,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-soil-acidification-haz,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-soil-degradation-hazar,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-suspended-sediment-loa,indicators-of-catchment-condition-in-the-intensive-land-use-zone-of-australia-weed-density,integrated-vegetation-cover-2003-version-1,john-t-collins-collection-state-library-of-victoria,journal-of-the-h-m-s-endeavour-1768-1771,journey-planner-data-act,krantz-sheldon-architectural-images,land-use-of-australia-version-3-28093-20012002,lands-surveys-historic-map-series-western-australia,latest-coastal-weather-observations-for-coolangatta-qld,launceston-city-council-addresses,launceston-city-council-building-footprints,launceston-city-council-contours,launceston-city-council-detail-survey-drawing-file,launceston-city-council-drainage,launceston-city-council-fences,launceston-city-council-pavement,launceston-city-council-railway,launceston-city-council-roads,libraries-act-announcements,licensed-broadcasting-transmitter-data,linc-tasmania,look-up-table-of-auslig-river-basins-of-australia-1997,major-water-resources-infrastructure-part-of-the-australian-water-resources-assessment-2000-database,mean-annual-concentration-of-mineral-nitrogen-in-soil-water-mgn-kgh20-in-the-pre-1788-scenario,mean-annual-concentration-of-mineral-nitrogen-in-soil-water-mgn-kgh20-in-the-pre-1788-scenario,mean-annual-concentration-of-mineral-nitrogen-in-soil-water-mgn-kgh20-in-the-present-day-scenario,mean-annual-concentration-of-mineral-nitrogen-in-soil-water-mgn-kgh20-in-the-present-day-scenario,mean-annual-deep-drainage-mm-y-in-the-pre-1788-scenario,mean-annual-deep-drainage-mm-y-in-the-pre-1788-scenario,mean-annual-deep-drainage-mm-y-in-the-present-day-scenario,mean-annual-deep-drainage-mm-y-in-the-present-day-scenario,mean-annual-transpiration-from-the-plant-canopy-for-the-pre-1788-scenario,mean-annual-transpiration-from-the-plant-canopy-for-the-pre-1788-scenario,mean-annual-transpiration-from-the-plant-canopy-for-the-present-day-scenario,mean-annual-transpiration-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-april-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-april-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-august-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-august-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-august-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-august-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-december-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-december-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-december-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-december-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-february-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-february-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-january-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-january-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-january-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-january-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-july-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-july-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-july-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-july-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-june-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-june-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-june-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-june-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-march-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-march-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-march-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-march-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-march-from-the-plant-canopy-for-the-present-day-scenario-2,mean-transpiration-in-march-from-the-plant-canopy-for-the-present-day-scenario-2,mean-transpiration-in-may-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-may-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-may-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-may-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-november-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-november-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-november-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-november-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-october-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-october-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-october-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-october-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-september-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-september-from-the-plant-canopy-for-the-pre-1788-scenario,mean-transpiration-in-september-from-the-plant-canopy-for-the-present-day-scenario,mean-transpiration-in-september-from-the-plant-canopy-for-the-present-day-scenario,mildenhall-photographs-of-early-canberra,mobility-map-brisbane-city,mobility-map-mt-coot-tha,mosman-local-government-area,mosman-rider-route,mosman-wwii-honour-roll,mosman-wwii-honour-roll,murray-darling-basin-water-resource-plan-areas-groundwater,murray-darling-basin-water-resource-plan-areas-groundwater,murray-darling-basin-water-resource-plan-areas-surface-water,murray-darling-basin-water-resource-plan-areas-surface-water,music-queensland,national-broadband-network,national-broadband-network,national-broadband-network-2011-10,national-broadband-network-2011-10,national-broadband-network-2011-12,national-broadband-network-2011-12,national-broadband-network-2012,national-broadband-network-28093-august-2011,national-broadband-network-28093-august-2011,national-broadband-network-28093-july-2011,national-broadband-network-28093-july-2011,national-broadband-network-february-2012,national-broadband-network-february-2012,national-broadband-network-september-2011,national-broadband-network-september-2011,national-library-of-australia-sheet-music-collection,national-measurement-institute-locations,national-parks-and-asset-locations-south-australia,national-public-toilet-map,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2000,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2020,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,new-south-wales-dryland-salinity-assessment-2000-assessmet-of-dryland-salinity-extent-2050,nsw-newspapers-catalogue-data,nsw-rural-fire-service-current-incidents,nsw-rural-fire-service-major-updates,off-street-car-parks-mosman-municipal-council,open-database-brisbane-city-council,ost-of-salinity-to-local-infrastructure-1996-97-total-cost-of-the-rail-component-of-infrastructure-b,parking-areas-brisbane-city-council,parking-areas-brisbane-city-council,parks-and-reserves-mosman-municipal-council,parks-brisbane-city-council,parks-brisbane-city-council,picture-australia-metadata,picture-queensland,picture-queensland,playgrounds-mosman-municipal-council,police-station-locations,police-station-locations,port-phillip-papers-state-library-of-victoria,precis-forecast-national,precis-forecast-national,precis-forecast-new-south-wales,precis-forecast-new-south-wales,precis-forecast-new-south-wales,precis-forecast-northern-territory,precis-forecast-northern-territory,precis-forecast-queensland,precis-forecast-queensland,precis-forecast-south-australia,precis-forecast-south-australia,precis-forecast-south-australia,precis-forecast-tasmania,precis-forecast-tasmania,precis-forecast-tasmania,precis-forecast-victoria,precis-forecast-victoria,precis-forecast-victoria,precis-forecast-western-australia,precis-forecast-western-australia,public-amenities-maintained-by-mosman-council,radio-and-television-broadcasting-stations-book-internet-edition,real-estate-maps,recent-earthquakes,regional-development-australia,regional-development-australia-2011-september-2011,regional-development-australia-may-2012,reports-of-swooping-birds-mosman-municipal-council,sentinel-hotspots,sentinel-hotspots,slq-catalogue-searches,slq-catalogue-searches,slv-rural-water,slv-shipping,slwa-digital-photographic-collection,south-australian-boat-ramp-locator,south-australian-road-crash-statistics,state-library-of-victoria-online-image-collection,state-library-of-victoria-online-image-collection-inc-high-res,state-of-the-service-report-2010-11-australian-public-service-employee-survey-results,state-of-the-service-report-2010-11-australian-public-service-employee-survey-results,statistical-local-areas-1996-for-agricultural-structure-classification,surface-water-gauging-stations-part-of-the-australian-water-resources-assessment-2000-database,surface-water-gauging-stations-part-of-the-australian-water-resources-assessment-2000-database,surface-water-sdl-resource-units,surface-water-sdl-resource-units,tasmanian-herbarium,tasmanian-museum-and-art-gallery-faunal-collection".split(",") + docsdb = couch['disclosr-documents'] @@ -134,98 +201,123 @@ pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_', doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]); print pkg_name - #add to or create organization using direct API - org_name = name_munge(doc.value['metadata']["Agency"][:100]) - if org_name not in orgs_list: - orgs_list = ckandirect.action.organization_list()['result'] - #print orgs_list + if pkg_name != "" : + + #add to or create organization using direct API + agency = doc.value['metadata']["Agency"] + if agency == "APS": + agency = "Australian Public Service Commission" + if agency == "Department of Broadband, Communications and the Digital Ecomomy": + agency = "Department of Broadband, Communications and the Digital Economy" + if agency == "Shared Services, Treasury Directorate": + agency = "Shared Services Procurement, Treasury Directorate" + if agency == "Treasury - Shared Services": + agency = "Shared Services Procurement, Treasury Directorate" + if agency == "Territory and Municipal Services (TAMS)": + agency = "Territory and Municipal Services Directorate" + if agency == "State Library of NSW": + agency = "State Library of New South Wales" + org_name = name_munge(agency[:100]) if org_name not in orgs_list: - try: - print "org not found, creating " + org_name - ckandirect.action.organization_create(name=org_name, title=doc.value['metadata']["Agency"], - description=doc.value['metadata']["Agency"]) - orgs_list.append(org_name) - except ckanapi.ValidationError, e: - print e - raise LoaderError('Unexpected status') - else: - print "org found, adding dataset to " + org_name - - # cache org names -> id mapping - if org_name not in orgs_ids: - org = ckandirect.action.organization_show(id=org_name) - orgs_ids[org_name] = org["result"]["id"] - org_id = orgs_ids[org_name] - print "org id is "+org_id - tags = [] - if doc.value['agencyID'] == "AGIMO": - if len(doc.value['metadata']["Keywords / Tags"]) > 0: - if hasattr(doc.value['metadata']["Keywords / Tags"], '__iter__'): - tags = tags + doc.value['metadata']["Keywords / Tags"] + orgs_list = ckandirect.action.organization_list()['result'] + #print orgs_list + if org_name not in orgs_list: + try: + print "org not found, creating " + org_name + ckandirect.action.organization_create(name=org_name, title=agency, + description=agency) + orgs_list.append(org_name) + except ckanapi.ValidationError, e: + print e + raise LoaderError('Unexpected status') else: - tags = tags + [doc.value['metadata']["Keywords / Tags"]] - - tags = [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower() for tag in tags if tag] - #print tags - package_entity = { - 'name': pkg_name, - 'title': doc.value['metadata']['DCTERMS.Title'], - 'url': doc.value['metadata']['DCTERMS.Source.URI'], - 'tags': tags, #tags are mandatory? - 'author': doc.value['metadata']["DCTERMS.Creator"], - 'maintainer': doc.value['metadata']["DCTERMS.Creator"], - 'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']), - 'notes': html2text.html2text(doc.value['metadata']['Description']), - 'owner_org': org_id - #todo add missing key values like jurasdiction - } - if doc.value['agencyID'] == "qld": - package_entity = doc.value['metadata'] - - try: - #print package_entity - ckan.package_register_post(package_entity) - except CkanApiError, e: - if ckan.last_message == "{\"name\": [\"That URL is already in use.\"]}": - print "package already exists" - else: - print ckan.last_message - raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % ( - ckan.last_status, pkg_name, e.args)) - pkg = ckan.package_entity_get(pkg_name) - - # add resources (downloadable data files) - if 'Download' in doc.value['metadata'].keys(): + print "org found, adding dataset to " + org_name + + # cache org names -> id mapping + if org_name not in orgs_ids: + org = ckandirect.action.organization_show(id=org_name) + orgs_ids[org_name] = org["result"]["id"] + org_id = orgs_ids[org_name] + print "org id is " + org_id + tags = [] + creator = doc.value['metadata']["DCTERMS.Creator"] + if doc.value['agencyID'] == "AGIMO": + if len(doc.value['metadata']["Keywords / Tags"]) > 0: + if hasattr(doc.value['metadata']["Keywords / Tags"], '__iter__'): + tags = tags + doc.value['metadata']["Keywords / Tags"] + else: + tags = tags + [doc.value['metadata']["Keywords / Tags"]] + + tags = [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower() for tag in tags if tag] + #print tags + extras = [] + + for extra_key in doc.value['metadata'].keys(): + if extra_key not in ["Description", "Content-Language", "DCTERMS.Description", + "Keywords / Tags", + "data.gov.au Category", "Download", "Permalink", "DCTERMS.Identifier"]: + if doc.value['metadata'][extra_key] != None and doc.value['metadata'][extra_key] != "": + extras.append([extra_key, doc.value['metadata'][extra_key]]) + + package_entity = { + 'name': pkg_name, + 'title': doc.value['metadata']['DCTERMS.Title'], + 'url': doc.value['metadata']['DCTERMS.Source.URI'], + 'tags': tags, #tags are mandatory? + 'author': creator, + 'maintainer': creator, + 'license_id': get_license_id(doc.value['metadata']['DCTERMS.License']), + 'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!a','-').replace('AC/a!aC/',"'").replace("AC/a!E",":").replace("A "," "), + 'owner_org': org_id, + 'extras': extras, + 'private': (pkg_name not in goodcsvdata and pkg_name not in goodotherdata) + } + try: - - resources = pkg.get('resources', []) - if len(resources) < len(doc.value['metadata']['Download']): - for resource in doc.value['metadata']['Download']: - - # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html - # (KML/KMZ) / (Shapefile) /(Other) - format = "plain" - if resource['format'] == '(XML)': - format = 'xml' - if resource['format'] == '(CSV/XLS)': - format = 'csv' - if resource['format'] == '(Shapefile)': - format = 'shp' - if resource['format'] == '(KML/KMZ)': - format = 'kml' - name = resource['href'] - if 'name' in resource.keys(): - name = resource['name'] - print resource - ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data', - format=format, - size=human2bytes(resource.get('size','0B'))) + #print package_entity + ckan.package_register_post(package_entity) + except CkanApiError, e: + if ckan.last_message == "{\"name\": [\"That URL is already in use.\"]}": + print "package already exists" else: - print "resources already exist" - except CkanApiError, e: - if ckan.last_status == 404: - print "parent dataset does not exist" - else: + print ckan.last_message raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % ( ckan.last_status, pkg_name, e.args)) - + pkg = ckan.package_entity_get(pkg_name) + + + # add resources (downloadable data files) + if 'Download' in doc.value['metadata'].keys(): + try: + + resources = pkg.get('resources', []) + if len(resources) < len(doc.value['metadata']['Download']): + for resource in doc.value['metadata']['Download']: + + # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html + # (KML/KMZ) / (Shapefile) /(Other) + format = "plain" + if resource['format'] == '(XML)': + format = 'xml' + if resource['format'] == '(CSV/XLS)': + format = 'csv' + if resource['format'] == '(Shapefile)': + format = 'shp' + if resource['format'] == '(KML/KMZ)': + format = 'kml' + name = resource['href'] + if 'name' in resource.keys(): + name = resource['name'] + print resource + add_package_resource_cachedurl(ckan, pkg_name, url_fix(resource['href']), name, + format, get_license_id(doc.value['metadata']['DCTERMS.License']), + human2bytes(resource.get('size', '0B'))) + else: + print "resources already exist" + except CkanApiError, e: + if ckan.last_status == 404: + print "parent dataset does not exist" + else: + raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % ( + ckan.last_status, pkg_name, e.args)) + --- /dev/null +++ b/documents/datagov-resourcereport.py @@ -1,1 +1,81 @@ +import couchdb +couch = couchdb.Server('http://127.0.0.1:5984/') +#couch = couchdb.Server('http://192.168.1.113:5984/') +import urllib +import urlparse +import httplib2 +import httplib +import csv + + +def url_fix(s, charset='utf-8'): + """Sometimes you get an URL by a user that just isn't a real + URL because it contains unsafe characters like ' ' and so on. This + function can fix some of the problems in a similar way browsers + handle data entered by the user: + + :param charset: The target charset for the URL if the url was + given as unicode string. + """ + if isinstance(s, unicode): + s = s.encode(charset, 'ignore') + if not urlparse.urlparse(s).scheme: + s = "http://"+s + scheme, netloc, path, qs, anchor = urlparse.urlsplit(s) + path = urllib.quote(path, '/%') + qs = urllib.quote_plus(qs, ':&=') + return urlparse.urlunsplit((scheme, netloc, path, qs, anchor)) + +# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/ +SYMBOLS = { + 'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'), + 'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', + 'zetta', 'iotta'), + 'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), + 'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', + 'zebi', 'yobi'), +} + + +docsdb = couch['disclosr-documents'] +out = csv.writer(open("output.csv","w"), delimiter=',',quoting=csv.QUOTE_ALL) +if __name__ == "__main__": + for doc in docsdb.view('app/datasets'): + if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld": + # Collect the package metadata. + pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_', + doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]); + if 'Download' in doc.value['metadata'].keys() and len(doc.value['metadata']['Download']) > 0: + for resource in doc.value['metadata']['Download']: + # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html + # (KML/KMZ) / (Shapefile) /(Other) + format = "plain" + if resource['format'] == '(XML)': + format = 'xml' + if resource['format'] == '(CSV/XLS)': + format = 'csv' + if resource['format'] == '(Shapefile)': + format = 'shp' + if resource['format'] == '(KML/KMZ)': + format = 'kml' + name = resource['href'] + if 'name' in resource.keys(): + name = resource['name'] + if resource['href'].startswith("ftp"): + out.writerow([pkg_name, url_fix(resource['href']), name,format, "ftp", ""]) + else: + try: + h = httplib2.Http(disable_ssl_certificate_validation=True) + resp = h.request(url_fix(resource['href']), 'HEAD') + content_type = resp[0]['content-type'] if 'content-type' in resp[0].keys() else "" + out.writerow([pkg_name.encode('ascii', 'ignore'), url_fix(resource['href']).encode('ascii', 'ignore'), name.encode('ascii', 'ignore'),format, resp[0]['status'], content_type]) + except httplib2.ServerNotFoundError: + out.writerow([pkg_name.encode('ascii', 'ignore'), url_fix(resource['href']).encode('ascii', 'ignore'), name.encode('ascii', 'ignore'),format, "500","badurl"]) + except httplib.InvalidURL: + out.writerow([pkg_name.encode('ascii', 'ignore'), url_fix(resource['href']).encode('ascii', 'ignore'), name.encode('ascii', 'ignore'),format, "500","badurl"]) + except httplib2.RelativeURIError: + out.writerow([pkg_name.encode('ascii', 'ignore'), url_fix(resource['href']).encode('ascii', 'ignore'), name.encode('ascii', 'ignore'),format, "500","badurl"]) + else: + out.writerow([pkg_name.encode('ascii', 'ignore')]) + --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -202,7 +202,7 @@ def getDate(self, content, entry, doc): strdate = ''.join(content.stripped_strings).strip() (a, b, c) = strdate.partition("(") - strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")replace("Janrurary", "January").replace("1012","2012")) + strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January").replace("1012","2012")) print strdate try: edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") --- a/documents/runScrapers.sh +++ b/documents/runScrapers.sh @@ -1,7 +1,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +echo $DIR cd $DIR echo "" > /tmp/disclosr-error -for f in scrapers/*.py; do +for f in $DIR/scrapers/*.py; do echo "Processing $f file.."; md5=`md5sum /tmp/disclosr-error` python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; --- a/documents/scrape.py +++ b/documents/scrape.py @@ -112,7 +112,7 @@ else: if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): print "Uh oh, trying to scrape URL again too soon!" + hash - if "_attachments" in doc.keys(): + if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys(): last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) content = last_attachment.read()