From: Maxious Date: Wed, 28 Nov 2012 03:43:08 +0000 Subject: more scrapers, multipage scrapers X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=0625db0f55b51fbbdcccffce12be3bacc0c8677e --- more scrapers, multipage scrapers Former-commit-id: d70c13dcae735d252b687b917ec530695be41419 --- --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -8,6 +8,11 @@ Agency NameDisclosure Log URL recorded?Do we monitor this URL?"; $agenciesdb = $server->get_db('disclosr-agencies'); $docsdb = $server->get_db('disclosr-documents'); +$agencies = 0; +$disclogs = 0; +$red = 0; +$green = 0; +$orange = 0; try { $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows; @@ -19,10 +24,11 @@ if ($ENV == "DEV") echo "
(" . $row->id . ")"; echo "\n"; - +$agencies++; echo ""; if (isset($row->value->FOIDocumentsURL)) { + $disclogs++; echo '' . $row->value->FOIDocumentsURL . ''; if ($ENV == "DEV") @@ -35,10 +41,13 @@ if (isset($row->value->FOIDocumentsURL)) { if (file_exists("./scrapers/" . $row->id . '.py')) { echo ""; + $green++; } else if (file_exists("./scrapers/" . $row->id . '.txt')) { - echo ""; + echo ""; + $orange++; } else { echo ""; + $red++; } } echo "\n"; @@ -48,5 +57,8 @@ setteErrorHandler($e); } echo ""; +echo $agencies." agencies ".(($disclogs/$agencies)*100)."% with disclosure logs, ".(($green/$disclogs)*100)."% with scrapers ".(($red/$disclogs)*100)."% without scrapers ".(($orange/$disclogs)*100)."% WIP scrapers "; + include_footer_documents(); ?> + --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -55,7 +55,7 @@ doc = foidocsdb.get(hash) #print doc if doc == None: - print "saving" + print "saving "+ hash edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, "date": edate,"title": entry.title} @@ -84,14 +84,27 @@ doc.update({'description': descriptiontxt}) return def getTitle(self, content, entry, doc): - doc.update({'title': content.string}) + doc.update({'title': (''.join(content.stripped_strings))}) return def getTable(self, soup): return soup.table + def getRows(self, table): + return table.find_all('tr') def getDate(self, content, entry, doc): - edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + date = ''.join(content.stripped_strings).strip() + date = str.replace("Octber","October",date) + print date + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate doc.update({'date': edate}) + return + def getLinks(self, content, entry, doc): + links = [] + for atag in entry.find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(content,atag['href'])) + if links != []: + doc.update({'links': links}) return def doScrape(self): @@ -102,31 +115,26 @@ # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(content) table = self.getTable(soup) - for row in table.find_all('tr'): + for row in self.getRows(table): columns = row.find_all('td') if len(columns) == self.getColumnCount(): - (id, date, description, title, notes) = self.getColumns(columns) - print id.string + (id, date, title, description, notes) = self.getColumns(columns) + print ''.join(id.stripped_strings) if id.string == None: - hash = scrape.mkhash(self.remove_control_chars(url+date.string)) + hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings)))) else: - hash = scrape.mkhash(self.remove_control_chars(url+id.string)) - links = [] - for atag in row.find_all("a"): - if atag.has_key('href'): - links.append(scrape.fullurl(url,atag['href'])) + hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings)))) doc = foidocsdb.get(hash) if doc == None: - print "saving" - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string} - if links != []: - doc.update({'links': links}) + print "saving " +hash + doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))} + self.getLinks(self.getURL(),row,doc) self.getTitle(title,row, doc) self.getDate(date,row, doc) self.getDescription(description,row, doc) if notes != None: - doc.update({ 'notes': notes.string}) + doc.update({ 'notes': (''.join(notes.stripped_strings))}) foidocsdb.save(doc) else: print "already saved "+hash --- a/documents/scrape.py +++ b/documents/scrape.py @@ -204,12 +204,12 @@ scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) if key == 'website' and False: scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) + agency['metadata']['lastScraped'] = time.time() if key.endswith('URL') and False: print key depth = 1 if 'scrapeDepth' in agency.keys(): depth = agency['scrapeDepth'] scrapeAndStore(docsdb, agency[key],depth,key,agency['_id']) - agency['metadata']['lastScraped'] = time.time() agencydb.save(agency) --- /dev/null +++ b/documents/scrapers/00a294de663db69062ca09aede7c0487.txt @@ -1,1 +1,2 @@ +multipage --- /dev/null +++ b/documents/scrapers/0324e4b1654fd6dd651307abcef67094.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description, notes,link) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/0603dfcc930a791efaa64f31ae5fceda.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="body-content").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/0e46f8bd1414b1fdd4f0543d54a97500.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "maincontentcontainer").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (date, disclogdate, title, description, notes) = columns + return (date, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/1097fa8afdcf5db89d212d0979226667.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/157cbe654bdaa0a48e6650152ae34489.py +++ b/documents/scrapers/157cbe654bdaa0a48e6650152ae34489.py @@ -10,7 +10,7 @@ return 5 def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getTable(self,soup): return soup.find_all('table')[4] --- /dev/null +++ b/documents/scrapers/1803322b27286950cab0c543168b5f21.txt @@ -1,1 +1,2 @@ +multipage log --- /dev/null +++ b/documents/scrapers/1ad74ca88932f90f0b92b69387171441.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(_class = "article-content").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, title, date, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/269680be088f3d8e663251655f3825b4.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (date, id, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/31b7c75cf484747b6b120680bddd33b0.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 2 + def getColumns(self,columns): + (title, date) = columns + return (date, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/329fccdca068b78ab7edd550e2957398.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 7 + def getTable(self,soup): + return soup.find(class_ = "foi-disclosure") + def getColumns(self,columns): + (disclogid, id, date, title, link, removedate, notes) = columns + return (id, date, title, title, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py @@ -1,1 +1,32 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + def getTitle(self, content, entry, doc): + i = 0 + title = "" + for string in content.stripped_strings: + if i < 2: + title = title + string + i = i+1 + doc.update({'title': title}) + print title + return + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/3b54190e3f409380e109fae29e1917aa.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description, link, deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py +++ b/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, description, title, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/3d426eb8c85c8f04b814eee597efd866.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "primary").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py @@ -1,1 +1,47 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "inner-column").table + def getRows(self,table): + return table.tbody.find_all('tr',recursive=False) + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + def getDate(self, content, entry, doc): + i = 0 + date = "" + for string in content.stripped_strings: + if i ==1: + date = string + i = i+1 + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return + def getTitle(self, content, entry, doc): + i = 0 + title = "" + for string in content.stripped_strings: + if i < 2: + title = title + string + i = i+1 + doc.update({'title': title}) + #print title + return + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/4934000fddd6a5b1094f398798341290.py @@ -1,1 +1,23 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + --- /dev/null +++ b/documents/scrapers/50601505ef69483121a6d130bb0515e4.txt @@ -1,1 +1,1 @@ - +apsc has ACMA style disclog --- /dev/null +++ b/documents/scrapers/53b14397c8f27c29ff07b6319f7a0ec5.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py @@ -1,1 +1,39 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "ms-rtestate-field").table + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + + def getLinks(self, content, entry, doc): + link = None + links = [] + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(),atag['href']) + (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(htcontent) + for atag in soup.find(class_ = "article-content").find_all('a'): + if atag.has_key('href'): + links.append(scrape.fullurl(link,atag['href'])) + + if links != []: + doc.update({'links': links}) + doc.update({'url': link}) + return + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/55b69726fde4b4898ecf6d7217d1d1d2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/5716ce0aacfe98f7d638b7a66b7f1040.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (date, id, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/5d05365e981d87e746b596d63e35b1dc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt @@ -1,1 +1,2 @@ +see parent dhs --- /dev/null +++ b/documents/scrapers/649b053f5e2884906ddc7174c2cd4b38.py @@ -1,1 +1,28 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + si.disclogURL = "http://www.fahcsia.gov.au/disclosure-log-2011-12-financial-year" + si.doScrape() + si.disclogURL = "http://www.fahcsia.gov.au/disclosure-log-2010-11-financial-year" + si.doScrape() + + --- /dev/null +++ b/documents/scrapers/6cf3870aedeeecfd6394b5c0abed4c55.py @@ -1,1 +1,23 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + --- /dev/null +++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "content_div_50269").table + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6fe3c812a99d486963133459b2768cf6.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/734f12db07e844b30cd11dc98500f2ce.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/77f02f713e3c37bff73882fb90828379.py @@ -1,1 +1,22 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find("table",width="571") +#findAll("table")[3] + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/795c33ed030293dbdb155c909ea47e37.py +++ b/documents/scrapers/795c33ed030293dbdb155c909ea47e37.py @@ -10,7 +10,7 @@ return 7 def getColumns(self,columns): (id, date, title, description, notes, deletedate, otherinfo) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) #def getTable(self,soup): # return soup.find(class_ = "box").table --- /dev/null +++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt --- a/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py +++ b/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/8ef0e5802f99800f514b3a148e013b75.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 4; + def getTable(self,soup): + return soup.find(class_ = "content").table + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/905a1c409b6afb1de0074b13a5559560.py @@ -1,1 +1,23 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + --- /dev/null +++ b/documents/scrapers/955dc4cb047b5439dfb65549ce2696a6.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/9f5cd66dea3e2ec958c17e28b27e60a7.txt @@ -1,1 +1,2 @@ +acma style --- /dev/null +++ b/documents/scrapers/a2d871c5d28de1dde8a3b66c4957e1a5.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ad033512610d8e36886ab6a795f26561.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "_ctl0__ctl0_MainContentPlaceHolder_MainContentPlaceHolder_ContentSpan").findAll("table")[3] + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b0a3281ba66efe173c5a33d5ef90ff76.txt @@ -1,1 +1,2 @@ +multipage immi --- /dev/null +++ b/documents/scrapers/b506b87c8ee9e3a7ea8007914078c741.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description,link,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt @@ -1,1 +1,1 @@ - +no disclog yet --- /dev/null +++ b/documents/scrapers/c1302c8d7cbbd911f0d4d8a4128f8079.txt @@ -1,1 +1,1 @@ - +uses RET disclog --- /dev/null +++ b/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "ms-rtestate-field").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/c414e65ed728d05307d5b27f13e195e1.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/c43ca6780764f4e61918e8836be74420.py +++ b/documents/scrapers/c43ca6780764f4e61918e8836be74420.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title,description,notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt @@ -1,1 +1,2 @@ +no log --- /dev/null +++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt --- /dev/null +++ b/documents/scrapers/dae7e934f1c341ccc9547a89a8af917e.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py +++ b/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py @@ -8,7 +8,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getTable(self,soup): return soup.find(class_ = "content") --- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py +++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) --- /dev/null +++ b/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "genericContent").table.tbody + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, date,title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py +++ b/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py @@ -8,7 +8,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getTable(self,soup): return soup.find(id = "content").table --- /dev/null +++ b/documents/scrapers/fa9b3badd6c686398c1c6982a4b02475.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/rtk.py +++ b/documents/scrapers/rtk.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) --- a/documents/viewDocument.php +++ b/documents/viewDocument.php @@ -3,7 +3,13 @@ include_once('../include/common.inc.php'); $hash = $_REQUEST['hash']; $docsdb = $server->get_db('disclosr-documents'); +try { $doc = object_to_array($docsdb->get($hash)); + +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} + if (!isset($doc['_attachments']) || count($doc['_attachments']) == 0) die ("no attachments"); $attachments = $doc['_attachments']; @@ -13,3 +19,4 @@ //echo $url; $request = Requests::get($url); echo ($request->body); + --- a/include/couchdb.inc.php +++ b/include/couchdb.inc.php @@ -10,6 +10,7 @@ if (php_uname('n') == "KYUUBEY") { $serverAddr = 'http://192.168.1.148:5984/'; + $serverAddr = 'http://127.0.0.1:5984/'; } else { $serverAddr = 'http://127.0.0.1:5984/'; }