--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py +++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py @@ -3,7 +3,7 @@ import genericScrapers import scrape from bs4 import BeautifulSoup -import codecs +import codecs #http://www.doughellmann.com/PyMOTW/abc/ class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getDescription(self,content, entry,doc): @@ -11,18 +11,18 @@ links = [] description = "" for atag in entry.find_all('a'): - if atag.has_key('href'): + if atag.has_attr('href'): link = scrape.fullurl(self.getURL(),atag['href']) (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) if htcontent != None: if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(htcontent) - for text in soup.find(id="divFullWidthColumn").stripped_strings: + for text in soup.find(class_ = "mainContent").stripped_strings: description = description + text.encode('ascii', 'ignore') - - for atag in soup.find(id="divFullWidthColumn").find_all("a"): - if atag.has_key('href'): + + for atag in soup.find(id="SortingTable").find_all("a"): + if atag.has_attr('href'): links.append(scrape.fullurl(link,atag['href'])) if links != []: @@ -43,7 +43,7 @@ links = [] description = "" for atag in entry.find_all('a'): - if atag.has_key('href'): + if atag.has_attr('href'): link = scrape.fullurl(self.getURL(),atag['href']) (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) if htcontent != None: @@ -53,7 +53,7 @@ for text in soup.find(id="content-item").stripped_strings: description = description + text + " \n" for atag in soup.find(id="content-item").find_all("a"): - if atag.has_key('href'): + if atag.has_attr('href'): links.append(scrape.fullurl(link,atag['href'])) if links != []: doc.update({'links': links}) @@ -76,11 +76,10 @@ if __name__ == '__main__': print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - #NewScraperImplementation().doScrape() + NewScraperImplementation().doScrape() print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) osi = OldScraperImplementation() osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI" osi.doScrape() -# old site too