From: Maxious
Date: Sun, 17 Mar 2013 07:22:38 +0000
Subject: ignore CSIRO disclog formatting
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=ce8a1e63f91a6cc81d644a787b0cd5d115370965
---
ignore CSIRO disclog formatting

Former-commit-id: 225487e3a636c4cf2758c6ad60437c748bc1df7c
---

--- a/documents/runScrapers.sh
+++ b/documents/runScrapers.sh
@@ -1,3 +1,5 @@
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
 echo "" > /tmp/disclosr-error
 for f in scrapers/*.py; do
     echo "Processing $f file..";
--- a/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
+++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
@@ -8,42 +8,14 @@
 from datetime import *

 #http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getTable(self,soup):
-        return soup.find(class_ = "inner-column").table
-    def getRows(self,table):
-        return table.tbody.find_all('tr',recursive=False)
+class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
     def getColumnCount(self):
-        return 3
-    def getColumns(self,columns):
-        (date, title, description) = columns
-        return (date, date, title, description, None)
-    def getDate(self, content, entry, doc):
-        i = 0
-        date = ""
-        for string in content.stripped_strings:
-            if i ==1:
-                date = string
-            i = i+1
-        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
-        print edate
-        doc.update({'date': edate})
-        return
-    def getTitle(self, content, entry, doc):
-        i = 0
-        title = ""
-        for string in content.stripped_strings:
-            if i < 2:
-                title = title + string
-            i = i+1
-        doc.update({'title': title})
-        #print title
-        return
+        return 0

 if __name__ == '__main__':
 #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx
 #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx
-    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
-    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericHTMLDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericHTMLDisclogScraper)
     ScraperImplementation().doScrape()
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
@@ -21,7 +21,7 @@
         d.make_links_absolute(base_url = self.getURL())
         for table in d('table').items():
             title= table('thead').text()
-            print title
+            print self.remove_control_chars(title)
             (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
             links = table('a').map(lambda i, e: pq(e).attr('href'))
             description = descA+" "+descB
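
Note (not part of the commit): the change above moves the CSIRO scraper from the column-oriented GenericOAICDisclogScraper to GenericHTMLDisclogScraper and reports a column count of 0, i.e. the disclosure-log page is captured as unstructured HTML rather than parsed row by row. Below is a minimal self-contained sketch of that fallback idea; the class and helper names are hypothetical stand-ins, not the real genericScrapers API.

    import re

    def remove_control_chars(text):
        # Drop non-printable characters, analogous to the cleanup the
        # 7c6adc1d... scraper now applies to table titles before printing.
        return "".join(c for c in text if ord(c) >= 32 or c in "\n\t")

    class HTMLDisclogSketch(object):
        """Hypothetical stand-in for genericScrapers.GenericHTMLDisclogScraper."""

        def getColumnCount(self):
            # 0 columns signals "no table structure to parse" in this sketch.
            return 0

        def doScrape(self, html):
            # Rather than extracting dates/titles from table rows, keep the
            # whole page as one cleaned text blob.
            text = re.sub(r"<[^>]+>", " ", html)  # crude tag stripping, sketch only
            return {"description": " ".join(remove_control_chars(text).split())}

    if __name__ == '__main__':
        page = "<div><h1>FOI \x0bdisclosure log</h1><p>2012-13 requests</p></div>"
        print(HTMLDisclogSketch().doScrape(page))

This trades structured date/title fields for robustness: a disclosure log whose HTML layout keeps changing (as the CSIRO one apparently did) can still be recorded as a single document without the scraper breaking on every redesign.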