From: Alex Sadleir Date: Thu, 11 Apr 2013 12:22:35 +0000 Subject: datagov fixes X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=c64b3fe20debb60737f29c859d2bcf41ef0f70a7 --- datagov fixes Former-commit-id: ed3ba96db4beeb126f802a3168476e27f298aeb8 --- --- a/documents/datagov-export.py +++ b/documents/datagov-export.py @@ -12,8 +12,8 @@ #ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', api_key='482a9dd2-a976-4adf-ac77-d71d92a98a52') -#couch = couchdb.Server('http://127.0.0.1:5984/') -couch = couchdb.Server('http://192.168.1.113:5984/') +couch = couchdb.Server('http://127.0.0.1:5984/') +#couch = couchdb.Server('http://192.168.1.113:5984/') # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/ SYMBOLS = { @@ -91,6 +91,7 @@ def name_munge(input_name): return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and')) + #[:100] #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and') @@ -117,9 +118,9 @@ if __name__ == "__main__": for doc in docsdb.view('app/datasets'): print doc.id - if doc.value['url'] != "http://data.gov.au/data/": + if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld": # Collect the package metadata. - pkg_name = doc.value['url'].replace("http://data.gov.au/dataset/",'').replace('/',''); _ + pkg_name = doc.value['url'].replace("http://data.gov.au/dataset/",'').replace('/',''); tags = [] if doc.value['agencyID'] == "AGIMO": if len(doc.value['metadata']["Keywords / Tags"]) > 0: @@ -185,6 +186,8 @@ } print group_entity ckan.group_register_post(group_entity) + elif ckan.last_status == 409: + print "group already exists" else: raise LoaderError('Unexpected status %s adding to group under \'%s\': %r' % ( ckan.last_status, pkg_name, e.args)) --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -72,8 +72,7 @@ edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", - "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)} + "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description), "diff": diff} foidocsdb.save(doc) else: print "already saved" @@ -200,16 +199,11 @@ return table.find_all('tr') def getDate(self, content, entry, doc): - strdate = ''.join(content.stripped_strings).strip() - (a, b, c) = strdate.partition("(") - strdate = self.remove_control_chars(a.replace("Octber", "October").replace("Janrurary", "January").replace("1012","2012")) - print strdate - try: - edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") - except ValueError: - print >> sys.stderr, "ERROR date invalid %s " % strdate - print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip() - edate = date.today().strftime("%Y-%m-%d") + date = ''.join(content.stripped_strings).strip() + (a, b, c) = date.partition("(") + date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janurary","January")) + print date + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate doc.update({'date': edate}) return @@ -272,7 +266,8 @@ 'Summary of FOIrequest received by agency/minister', 'Summary of FOI request 
received', 'Description of FOI Request', "FOI request", 'Results 1 to 67 of 67'] - if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '': + if doc['title'] not in badtitles\ + and doc['description'] != '': print "saving" foidocsdb.save(doc) else: @@ -282,6 +277,6 @@ print "header row" else: - print >> sys.stderr, "ERROR number of columns incorrect" + print "ERROR number of columns incorrect" print row --- a/documents/runScrapers.sh +++ b/documents/runScrapers.sh @@ -1,21 +1,13 @@ -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -cd $DIR -echo "" > /tmp/disclosr-error +rm /tmp/disclosr-error for f in scrapers/*.py; do echo "Processing $f file.."; - md5=`md5sum /tmp/disclosr-error` - python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; - md52=`md5sum /tmp/disclosr-error` - if [ "$md5" != "$md52" ]; then - echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; - fi + python $f 2>/tmp/disclosr-error; if [ "$?" -ne "0" ]; then echo "error"; - sleep 1; + sleep 2; fi done if [ -s /tmp/disclosr-error ] ; then - echo "emailling logs.."; mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; fi --- a/documents/scrape.py +++ b/documents/scrape.py @@ -7,7 +7,6 @@ from urlparse import urljoin import time import os -import sys import mimetypes import urllib import urlparse @@ -90,7 +89,7 @@ def getLastAttachment(docsdb, url): hash = mkhash(url) doc = docsdb.get(hash) - if doc != None and "_attachments" in doc.keys(): + if doc != None: last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) return last_attachment @@ -104,7 +103,7 @@ req = urllib2.Request(url) print "Fetching %s (%s)" % (url, hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": - print >> sys.stderr, "Not a valid HTTP url" + print "Not a valid HTTP url" return (None, None, None) doc = docsdb.get(hash) if doc == None: @@ -112,15 +111,10 @@ else: if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): print "Uh oh, trying to scrape URL again too soon!" + hash - if "_attachments" in doc.keys(): - last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc, last_attachment_fname) - content = last_attachment.read() - mime_type = doc['mime_type'] - else: - content = None - mime_type = None - return (doc['url'], mime_type, content) + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) + content = last_attachment + return (doc['url'], doc['mime_type'], content.read()) req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags @@ -165,13 +159,13 @@ #store as attachment epoch-filename except (urllib2.URLError, socket.timeout) as e: - print >> sys.stderr,"error!" + print "error!" 
error = "" if hasattr(e, 'reason'): error = "error %s in downloading %s" % (str(e.reason), url) elif hasattr(e, 'code'): error = "error %s in downloading %s" % (e.code, url) - print >> sys.stderr, error + print error doc['error'] = error docsdb.save(doc) return (None, None, None) --- a/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py +++ b/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py @@ -18,13 +18,13 @@ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(htcontent) - rowtitle = soup.find(class_ = "wc-title").find("h1").string - if rowtitle != None: - description = rowtitle + ": " - for row in soup.find(class_ ="wc-content").find_all('td'): + for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'): if row != None: - for text in row.stripped_strings: - description = description + text + "\n" + rowtitle = row.find('th').string + if rowtitle != None: + description = description + "\n" + rowtitle + ": " + for text in row.find('td').stripped_strings: + description = description + text for atag in row.find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) @@ -37,7 +37,7 @@ def getColumnCount(self): return 2 def getTable(self,soup): - return soup.find(class_ = "ms-rteTable-default") + return soup.find(class_ = "ms-rteTable-GreyAlternating") def getColumns(self,columns): (date, title) = columns return (title, date, title, title, None) --- a/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py +++ b/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py @@ -7,7 +7,7 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getTable(self,soup): - return soup.find(class_="tborder") + return soup.find(id = "inner_content") def getColumnCount(self): return 2 def getColumns(self,columns): --- a/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py +++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py @@ -8,14 +8,40 @@ from datetime import * #http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "inner-column").table + def getRows(self,table): + return table.tbody.find_all('tr',recursive=False) def getColumnCount(self): - return 0 + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + def getDate(self, content, entry, doc): + i = 0 + date = "" + for string in content.stripped_strings: + if i ==1: + date = string + i = i+1 + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return + def getTitle(self, content, entry, doc): + i = 0 + title = "" + for string in content.stripped_strings: + if i < 2: + title = title + string + i = i+1 + doc.update({'title': title}) + #print title + return if __name__ == '__main__': -#http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx -#http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericHTMLDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericHTMLDisclogScraper) 
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) ScraperImplementation().doScrape() --- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py +++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py @@ -21,15 +21,11 @@ d.make_links_absolute(base_url = self.getURL()) for table in d('table').items(): title= table('thead').text() - print self.remove_control_chars(title) + print title (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text()) links = table('a').map(lambda i, e: pq(e).attr('href')) description = descA+" "+descB - try: - edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") - except ValueError: - edate = date.today().strftime("%Y-%m-%d") - pass + edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate dochash = scrape.mkhash(self.remove_control_chars(title)) doc = foidocsdb.get(dochash) --- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py +++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py @@ -18,10 +18,10 @@ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(htcontent) - for text in soup.find(class_ = "mainContent").stripped_strings: + for text in soup.find(id="divFullWidthColumn").stripped_strings: description = description + text.encode('ascii', 'ignore') - for atag in soup.find(id="SortingTable").find_all("a"): + for atag in soup.find(id="divFullWidthColumn").find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) --- a/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py +++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py @@ -7,11 +7,11 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getTable(self,soup): - return soup.find(id="int-content").table + return soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2").table def getColumnCount(self): - return 3 + return 4 def getColumns(self,columns): - (id, title,date) = columns + (blank,id, title,date) = columns return (id, date, title, title, None) if __name__ == '__main__': --- a/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py +++ b/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py @@ -10,7 +10,7 @@ (id, date, title, description, notes) = columns return (id, date, title, description, notes) def getTable(self,soup): - return soup.find(class_ = "simpletable") + return soup.find(class_ = "content") if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- a/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py +++ b/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py @@ -10,7 +10,7 @@ (id, date, title, description, notes) = columns return (id, date, title, description, notes) def getTable(self,soup): - return soup.find("table") + return soup.find(id = "content").table if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
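
Note on the datagov-export.py change: the exporter registers each dataset's agency as a CKAN group through ckanclient, and the added elif ckan.last_status == 409 branch treats an already-registered group as a non-fatal condition rather than raising LoaderError, so re-running the export does not abort on groups created by an earlier run. A minimal sketch of that pattern in the repository's Python 2 idiom (the endpoint, API key and group_entity payload are placeholders, and wrapping group_register_post directly in a CkanApiError handler is an assumption about ckanclient's behaviour, not a description of the script's exact control flow):

import ckanclient

# Placeholder endpoint and key; datagov-export.py sets its own at the top of the file.
ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
                             api_key='482a9dd2-a976-4adf-ac77-d71d92a98a52')

# Hypothetical group payload; the real script builds this from the dataset's agencyID.
group_entity = {'name': 'example-agency', 'title': 'Example Agency'}

try:
    ckan.group_register_post(group_entity)
except ckanclient.CkanApiError:
    if ckan.last_status == 409:
        # 409 Conflict: the group already exists, so carry on instead of failing the run.
        print "group already exists"
    else:
        raise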
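
Note on the genericScrapers.py getDate hunk: scraped date strings are trimmed to the text before any parenthesised trailer, have common disclosure-log typos corrected (Octber, Janurary, 1012), and are then parsed with dateutil's dayfirst fuzzy parser; the removed side of the hunk also logs unparseable dates to stderr and falls back to today's date. A standalone sketch of that flow, assuming python-dateutil is installed (normalise_date is a hypothetical helper name, not a function in the repository):

import sys
from datetime import date
from dateutil.parser import parse

def normalise_date(raw):
    # Keep only the text before any parenthesised note, as getDate does.
    (cleaned, _, _) = raw.strip().partition("(")
    # Correct typos seen in real disclosure logs before handing off to dateutil.
    cleaned = (cleaned.replace("Octber", "October")
                      .replace("Janurary", "January")
                      .replace("1012", "2012")).strip()
    try:
        return parse(cleaned, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
    except ValueError:
        # Unparseable dates are reported and replaced with today's date,
        # mirroring the fallback shown in the hunk.
        print >> sys.stderr, "ERROR date invalid %s" % cleaned
        return date.today().strftime("%Y-%m-%d")

print normalise_date("3 Janurary 2013 (amended)")  # 2013-01-03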