From: Maxious Date: Sat, 22 Jun 2013 06:57:06 +0000 Subject: more export fixes X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=835cba9e5eae8d1d8768b5cb52bb5c24dfbda04d --- more export fixes Former-commit-id: 4ca7f1f81d51d7c094d904b6aa1e176cf5958630 --- --- a/admin/refreshDesignDoc.php +++ b/admin/refreshDesignDoc.php @@ -8,7 +8,7 @@ $obj->_id = "_design/" . urlencode("app"); $obj->language = "javascript"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; -$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; +$obj->views->byDate->map = "function(doc) { if (doc.title != "Disclosure Log Updated") { emit(doc.date, doc); } };"; $obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDateMonthYear->reduce = "_count"; $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; --- a/documents/datagov-export-groups.py +++ b/documents/datagov-export-groups.py @@ -10,7 +10,11 @@ # Instantiate the CKAN client. #ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc' -ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', +server = 'data.disclosurelo.gs' +api_key = 'c30eb6f5-0f90-47e0-bf05-9b1b4e3a461a' +server = 'ckan.data.gov.au' + +ckan = ckanclient.CkanClient(base_location='http://' + server + '/api', api_key=api_key) couch = couchdb.Server('http://127.0.0.1:5984/') #couch = couchdb.Server('http://192.168.1.113:5984/') --- a/documents/datagov-export.py +++ b/documents/datagov-export.py @@ -14,6 +14,7 @@ import tempfile def add_package_resource_cachedurl(ckan, package_name, url, name, format, license_id, size,**kwargs): + excluded_apis = "recent-earthquakes,sentinel-hotspots,abc-local-stations,action-bus-service-gtfs-feed-act,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,current-and-future-road-reports-traffic-restrictions-in-south-australia,journey-planner-data-act,live-traffic-cameras-nsw,precis-forecast-national,precis-forecast-national,precis-forecast-new-south-wales,precis-forecast-new-south-wales,precis-forecast-northern-territory,precis-forecast-northern-territory,precis-forecast-queensland,precis-forecast-queensland,precis-forecast-south-australia,precis-forecast-south-australia,precis-forecast-tasmania,precis-forecast-tasmania,precis-forecast-victoria,precis-forecast-victoria,precis-forecast-western-australia,precis-forecast-western-australia,register-of-penalty-notices-nsw,sentinel-hotspots,trove-people-and-organisations-data,weather-data-services-radar,abc-local-stations,act-emergency-services-agency-esa-28093-current-incidents,act-emergency-services-agency-esa-news-alerts,act-government-news-and-events,act-government-summaries-of-cabinet-outcomes,act-magistrates-court-judgements,act-supreme-court-judgements,act-supreme-court-sentences,actpla-latest-news,all-vacant-act-government-jobs,community-engagement-current-engagements,community-engagement-news,edd-media-releases,edd-news-and-events,freedom-of-information-foi-summaries,libraries-act-announcements,nsw-rural-fire-service-current-incidents,nsw-rural-fire-service-major-updates,precis-forecast-new-south-wales,precis-forecast-south-australia,precis-forecast-tasmania,precis-forecast-victoria,sentinel-hotspots,south-australian-road-crash-statistics,trove-people-and-organisations-data,weather-warnings-for-new-south-wales-australian-capital-territory,weather-warnings-for-northern-territory,weather-warnings-for-queensland,weather-warnings-for-south-australia,weather-warnings-for-tasmania,weather-warnings-for-victoria,weather-warnings-for-western-australia".split(",") if "xls" in url: format = "xls" if "pdf" in url: @@ -27,8 +28,12 @@ if mime_type in ["application/xlsx","application/x-xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]: format = "xlsx" - if content != None: - tf = tempfile.NamedTemporaryFile(delete=False) + #put file extensions on for windows users downloading files + suffix = name.encode("ascii","ignore").replace("/","") + if len(suffix) < 5 or (suffix[-4] != "." and suffix[-5] != "."): + suffix = suffix + "." + format + if content != None and package_name not in excluded_apis: + tf = tempfile.NamedTemporaryFile(suffix=suffix) tfName = os.path.abspath(tf.name) print tfName tf.seek(0) @@ -196,7 +201,7 @@ pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_', doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]); print pkg_name - if pkg_name != "": + if pkg_name != "" : #add to or create organization using direct API agency = doc.value['metadata']["Agency"] @@ -262,7 +267,7 @@ 'author': creator, 'maintainer': creator, 'license_id': get_license_id(doc.value['metadata']['DCTERMS.License']), - 'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!a','-').replace('AC/a!aC/',"'").replace("AC/a!E",":")replace("A "," "), + 'notes': html2text.html2text(doc.value['metadata']['Description']).replace('AC/a!a','-').replace('AC/a!aC/',"'").replace("AC/a!E",":").replace("A "," "), 'owner_org': org_id, 'extras': extras, 'private': (pkg_name not in goodcsvdata and pkg_name not in goodotherdata) --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -202,7 +202,7 @@ def getDate(self, content, entry, doc): strdate = ''.join(content.stripped_strings).strip() (a, b, c) = strdate.partition("(") - strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")replace("Janrurary", "January").replace("1012","2012")) + strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January").replace("1012","2012")) print strdate try: edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") --- a/documents/runScrapers.sh +++ b/documents/runScrapers.sh @@ -1,7 +1,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +echo $DIR cd $DIR echo "" > /tmp/disclosr-error -for f in scrapers/*.py; do +for f in $DIR/scrapers/*.py; do echo "Processing $f file.."; md5=`md5sum /tmp/disclosr-error` python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error;