more scrapers
[disclosr.git] / documents / scrapers / 8e874a2fde8aa0ccdc6d14573d766540.py
blob:a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py -> blob:b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
@@ -3,7 +3,7 @@
 import genericScrapers
 import scrape
 from bs4 import BeautifulSoup
-import codecs 
+import codecs
 #http://www.doughellmann.com/PyMOTW/abc/
 class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
         def getDescription(self,content, entry,doc):
@@ -18,10 +18,10 @@
                                         if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
                                         # http://www.crummy.com/software/BeautifulSoup/documentation.html
                                                 soup = BeautifulSoup(htcontent)
-                                                for text in soup.find(id="divFullWidthColumn").stripped_strings:
+                                                for text in soup.find(class_ = "mainContent").stripped_strings:
                                                     description = description + text.encode('ascii', 'ignore')
-                                                
-                                                for atag in soup.find(id="divFullWidthColumn").find_all("a"):
+
+                                                for atag in soup.find(id="SortingTable").find_all("a"):
                                                       	if atag.has_key('href'):
                                                               	links.append(scrape.fullurl(link,atag['href']))
 
@@ -76,11 +76,10 @@
 if __name__ == '__main__':
     print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
     print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
-    #NewScraperImplementation().doScrape()
+    NewScraperImplementation().doScrape()
     print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
     print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
     osi = OldScraperImplementation()
     osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
     osi.doScrape()
-# old site too