--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py +++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py @@ -3,7 +3,7 @@ import genericScrapers import scrape from bs4 import BeautifulSoup -import codecs +import codecs #http://www.doughellmann.com/PyMOTW/abc/ class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getDescription(self,content, entry,doc): @@ -20,7 +20,7 @@ soup = BeautifulSoup(htcontent) for text in soup.find(id="divFullWidthColumn").stripped_strings: description = description + text.encode('ascii', 'ignore') - + for atag in soup.find(id="divFullWidthColumn").find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) @@ -76,11 +76,10 @@ if __name__ == '__main__': print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - #NewScraperImplementation().doScrape() + NewScraperImplementation().doScrape() print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) osi = OldScraperImplementation() osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI" osi.doScrape() -# old site too