From: Maxious
Date: Fri, 30 Nov 2012 12:50:36 +0000
Subject: more scrapers
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=bffa93e6d665a9a84f42927ea7a61776fccc2bf9

---
more scrapers

Former-commit-id: a1b071fe60c375afcb0527b3c5a86844b8f0035b
---
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -12,6 +12,7 @@
 $disclogs = 0;
 $red = 0;
 $green = 0;
+$yellow = 0;
 $orange = 0;
 try {
     $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
@@ -46,8 +47,14 @@
             echo "";
             $green++;
         } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+            if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
+                echo "";
+                $yellow++;
+            } else {
+                echo file_get_contents("./scrapers/" . $row->id . '.txt');
                 echo "";
                 $orange++;
+            }
         } else {
             echo "";
             $red++;
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -9,6 +9,7 @@
 import dateutil
 from dateutil.parser import *
 from datetime import *
+import codecs
 
 class GenericDisclogScraper(object):
     __metaclass__ = abc.ABCMeta
@@ -120,7 +121,7 @@
             columns = row.find_all('td')
             if len(columns) == self.getColumnCount():
                 (id, date, title, description, notes) = self.getColumns(columns)
-                print ''.join(id.stripped_strings)
+                print ''.join(id.stripped_strings).encode('ascii', 'ignore')
                 if id.string == None:
                     hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
                 else:
--- a/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
+++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
@@ -1,2 +1,1 @@
-see parent dhs
-
+no disclog
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
@@ -1,1 +1,1 @@
-
+acma style
--- /dev/null
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
@@ -1,1 +1,80 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import codecs
+#http://www.doughellmann.com/PyMOTW/abc/
+class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    def getDescription(self,content, entry,doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(),atag['href'])
+                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                        soup = BeautifulSoup(htcontent)
+                        for text in soup.find(id="divFullWidthColumn").stripped_strings:
+                            description = description + text.encode('ascii', 'ignore')
+
+                        for atag in soup.find(id="divFullWidthColumn").find_all("a"):
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(link,atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        return 2
+    def getTable(self,soup):
+        return soup.find(id = "TwoColumnSorting")
+    def getColumns(self,columns):
+        (title, date) = columns
+        return (title, date, title, title, None)
+class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    def getDescription(self,content, entry,doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(),atag['href'])
+                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                        soup = BeautifulSoup(htcontent)
+                        for text in soup.find(id="content-item").stripped_strings:
+                            description = description + text + " \n"
+                        for atag in soup.find(id="content-item").find_all("a"):
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(link,atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        return 2
+    def getTable(self,soup):
+        return soup.find(class_ = "doc-list")
+    def getColumns(self,columns):
+        (date, title) = columns
+        return (title, date, title, title, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    #NewScraperImplementation().doScrape()
+    print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    osi = OldScraperImplementation()
+    osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
+    osi.doScrape()
+# old site too
+
--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
+++ /dev/null
@@ -1,49 +1,1 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getDescription(self,content, entry,doc):
-        link = None
-        links = []
-        description = ""
-        for atag in entry.find_all('a'):
-            if atag.has_key('href'):
-                link = scrape.fullurl(self.getURL(),atag['href'])
-                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
-                if htcontent != None:
-                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
-                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
-                        soup = BeautifulSoup(htcontent)
-                        for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'):
-                            if row != None:
-                                rowtitle = row.find('th').string
-                                description = description + "\n" + rowtitle + ": "
-                                for text in row.find('td').stripped_strings:
-                                    description = description + text
-                                for atag in row.find_all("a"):
-                                    if atag.has_key('href'):
-                                        links.append(scrape.fullurl(link,atag['href']))
-
-        if links != []:
-            doc.update({'links': links})
-        if description != "":
-            doc.update({'description': description})
-
-    def getColumnCount(self):
-        return 2
-    def getTable(self,soup):
-        return soup.find(class_ = "ms-rteTable-GreyAlternating")
-    def getColumns(self,columns):
-        (date, title) = columns
-        return (title, date, title, title, None)
-
-if __name__ == '__main__':
-    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
-    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
-    ScraperImplementation().doScrape()
-# old site too http://archive.treasury.gov.au/content/foi_publications.asp
-
--- a/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
+++ b/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
@@ -1,2 +1,1 @@
-# does not have any disclog entries or table
-
+no disclog
--- a/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
+++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
@@ -1,1 +1,1 @@
-no disclog yet
+no disclog
--- a/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
+++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
@@ -1,2 +1,1 @@
-no log
-
+no disclog
--- a/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
@@ -1,1 +1,1 @@
-
+acma style
--- a/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
+++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
@@ -1,2 +1,1 @@
-c'est ne pas une table
-
+no disclog
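
Note on the disclogsList.php change: the dashboard now distinguishes four states per agency. Yellow marks a .txt file whose content is exactly "no disclog" (the agency is confirmed to publish no disclosure log), orange marks any other placeholder note (such as "acma style"), and red marks agencies with no scraper file at all; the green branch sits above this hunk, and treating it as "a .py scraper exists" is an assumption here. A minimal sketch of that classification in Python rather than PHP, with the directory layout taken from the paths in this patch:

import os

SCRAPERS = "./documents/scrapers"  # assumed location, per the paths in this diff

def status(agency_id):
    # Mirrors the green/yellow/orange/red logic added to disclogsList.php.
    py = os.path.join(SCRAPERS, agency_id + ".py")
    txt = os.path.join(SCRAPERS, agency_id + ".txt")
    if os.path.exists(py):
        return "green"       # working scraper (assumed green condition)
    if os.path.exists(txt):
        if open(txt).read().strip() == "no disclog":
            return "yellow"  # confirmed: nothing to scrape
        return "orange"      # placeholder note, scraper still to be written
    return "red"             # agency not yet investigated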
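
Note on the genericScrapers.py change: under Python 2, print implicitly encodes a unicode string with stdout's encoding, and BeautifulSoup's stripped_strings yields unicode. When stdout has no UTF-8 encoding (typical when a scraper runs under cron or through a pipe), any non-ASCII character in a scraped title raises UnicodeEncodeError and kills the run; encoding explicitly with 'ignore' trades the odd dropped character for robustness. An illustrative snippet (Python 2, values invented):

# -*- coding: utf-8 -*-
title = u'Disclosure log \u2013 November'  # en dash, common in scraped HTML
# print title  # raises UnicodeEncodeError when sys.stdout.encoding is None or ascii
print title.encode('ascii', 'ignore')      # prints 'Disclosure log  November'; never raises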
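
Note on the new Treasury scraper: both implementations follow the hook contract of genericScrapers.GenericOAICDisclogScraper used throughout this repo. getTable locates the disclosure-log table in the parsed page, getColumnCount and getColumns map each row's cells onto the (id, date, title, description, notes) tuple the base class unpacks, and getDescription optionally fetches linked detail pages. A new scraper for a simple two-column log therefore reduces to the sketch below; the table id and column order are hypothetical:

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getTable(self, soup):
        return soup.find(id="disclog")  # hypothetical id; use the real page's table
    def getColumnCount(self):
        return 2
    def getColumns(self, columns):
        (date, title) = columns
        # (id, date, title, description, notes), as expected by the base class
        return (title, date, title, title, None)

if __name__ == '__main__':
    ScraperImplementation().doScrape()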