From: Alex Sadleir
Date: Thu, 11 Apr 2013 12:24:20 +0000
Subject: Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=5948d2a9216855b7a214dd4f5fcb82ed6af548d3
---
Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr

Conflicts:
	documents/genericScrapers.py
	documents/runScrapers.sh

Former-commit-id: a6f8697ed080934b51ab7b63a3d4428ff5ccdb2b
---

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -12,8 +12,8 @@
 #ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
 ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', api_key='482a9dd2-a976-4adf-ac77-d71d92a98a52')
-#couch = couchdb.Server('http://127.0.0.1:5984/')
-couch = couchdb.Server('http://192.168.1.113:5984/')
+couch = couchdb.Server('http://127.0.0.1:5984/')
+#couch = couchdb.Server('http://192.168.1.113:5984/')
 
 # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
 SYMBOLS = {
@@ -91,6 +91,7 @@
 
 def name_munge(input_name):
     return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
+    #[:100]
     #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
 
@@ -117,9 +118,9 @@
 if __name__ == "__main__":
 
     for doc in docsdb.view('app/datasets'):
         print doc.id
-        if doc.value['url'] != "http://data.gov.au/data/":
+        if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld":
             # Collect the package metadata.
-            pkg_name = doc.value['url'].replace("http://data.gov.au/dataset/",'').replace('/',''); _
+            pkg_name = doc.value['url'].replace("http://data.gov.au/dataset/",'').replace('/','');
             tags = []
             if doc.value['agencyID'] == "AGIMO":
                 if len(doc.value['metadata']["Keywords / Tags"]) > 0:
@@ -185,6 +186,8 @@
                     }
                     print group_entity
                     ckan.group_register_post(group_entity)
+                elif ckan.last_status == 409:
+                    print "group already exists"
                 else:
                     raise LoaderError('Unexpected status %s adding to group under \'%s\': %r' % (
                         ckan.last_status, pkg_name, e.args))

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -202,7 +202,7 @@
     def getDate(self, content, entry, doc):
         strdate = ''.join(content.stripped_strings).strip()
         (a, b, c) = strdate.partition("(")
-        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("Janrurary", "January").replace("1012","2012"))
+        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January"))
         print strdate
         try:
             edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")

--- a/documents/runScrapers.sh
+++ b/documents/runScrapers.sh
@@ -1,3 +1,5 @@
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd "$DIR"
 echo "" > /tmp/disclosr-error
 for f in scrapers/*.py; do
 	echo "Processing $f file..";

--- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
@@ -42,7 +42,6 @@
                 'data': {'request': '', 'session': '', 'more': ''}
 
             }
-
-        amonpy.exception(data)
+        #amonpy.exception(data)
         pass

--- a/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
+++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
@@ -8,42 +8,14 @@
 from datetime import *
 
 #http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getTable(self,soup):
-        return soup.find(class_ = "inner-column").table
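Note: the getDate() change in documents/genericScrapers.py above normalises known agency typos
("Octber", "Janrurary", "1012") before handing the string to dateutil. A minimal standalone
sketch of that approach, using a hypothetical sample date rather than real disclosure-log data:

    from dateutil.parser import parse

    strdate = "3 Janrurary 1012 (see note)"  # hypothetical sample string
    (a, b, c) = strdate.partition("(")       # keep only the part before any parenthesised trailer
    cleaned = a.replace("Octber", "October").replace("1012", "2012").replace("Janrurary", "January")
    print parse(cleaned, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")  # prints 2012-01-03
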
-    def getRows(self,table):
-        return table.tbody.find_all('tr',recursive=False)
+class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
     def getColumnCount(self):
-        return 3
-    def getColumns(self,columns):
-        (date, title, description) = columns
-        return (date, date, title, description, None)
-    def getDate(self, content, entry, doc):
-        i = 0
-        date = ""
-        for string in content.stripped_strings:
-            if i ==1:
-                date = string
-            i = i+1
-        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
-        print edate
-        doc.update({'date': edate})
-        return
-    def getTitle(self, content, entry, doc):
-        i = 0
-        title = ""
-        for string in content.stripped_strings:
-            if i < 2:
-                title = title + string
-            i = i+1
-        doc.update({'title': title})
-        #print title
-        return
+        return 0
 
 if __name__ == '__main__':
     #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx
     #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx
-    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
-    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericHTMLDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericHTMLDisclogScraper)
     ScraperImplementation().doScrape()

--- a/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py
+++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py
@@ -6,8 +6,6 @@
 
 #http://www.doughellmann.com/PyMOTW/abc/
 class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getTable(self,soup):
-        return soup.find(id = "content_div_50269").table
     def getColumns(self,columns):
         (id, date, title, description, notes) = columns
         return (id, date, title, description, notes)

--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
@@ -21,7 +21,7 @@
         d.make_links_absolute(base_url = self.getURL())
         for table in d('table').items():
             title= table('thead').text()
-            print title
+            print self.remove_control_chars(title)
             (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
             links = table('a').map(lambda i, e: pq(e).attr('href'))
             description = descA+" "+descB

--- /dev/null
+++ b/documents/scrapers/b0ca7fddcd1c965787daea47f2d32e0a.py
@@ -0,0 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    def getColumns(self,columns):
+        (id, date, title, description, notes) = columns
+        return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+
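Note: the new 409 handling in documents/datagov-export.py above makes group registration
idempotent. A sketch of that pattern in isolation, assuming (as the script's existing error
handling implies) that ckanclient raises CkanApiError on non-2xx responses and records the
status code in last_status; the endpoint, key and group name here are illustrative only:

    import ckanclient

    ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
                                 api_key='example-api-key')  # illustrative credentials
    group_entity = {'name': 'example-agency', 'title': 'Example Agency'}  # hypothetical group
    try:
        ckan.group_register_post(group_entity)
    except ckanclient.CkanApiError:
        if ckan.last_status == 409:
            print "group already exists"  # 409 Conflict: group exists, safe to continue
        else:
            raise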