From: Maxious Date: Wed, 28 Nov 2012 00:25:31 +0000 Subject: more scrapers X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=7edc396bc34a40ebdea01c8c2d633f6667d3b57b --- more scrapers Former-commit-id: f3be16c5f01755423b13d67b8fef1653538d6db2 --- --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -84,7 +84,7 @@ doc.update({'description': descriptiontxt}) return def getTitle(self, content, entry, doc): - doc.update({'title': content.string}) + doc.update({'title': (''.join(content.stripped_strings))}) return def getTable(self, soup): return soup.table @@ -123,13 +123,13 @@ if doc == None: print "saving " +hash - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string} + doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))} self.getLinks(self.getURL(),row,doc) self.getTitle(title,row, doc) self.getDate(date,row, doc) self.getDescription(description,row, doc) if notes != None: - doc.update({ 'notes': notes.string}) + doc.update({ 'notes': (''.join(notes.stripped_strings))}) foidocsdb.save(doc) else: print "already saved "+hash --- /dev/null +++ b/documents/scrapers/0324e4b1654fd6dd651307abcef67094.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description, notes,link) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/3b54190e3f409380e109fae29e1917aa.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description, link, deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/5716ce0aacfe98f7d638b7a66b7f1040.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (date, id, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/5d05365e981d87e746b596d63e35b1dc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "content_div_50269").table + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.txt +++ /dev/null @@ -1,19 +1,1 @@ -import sys,os -sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(id = "content_div_50269").table - def getColumns(self,columns): - (id, date, title, description, notes) = columns - return (id, date, title, description, notes) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() - --- /dev/null +++ b/documents/scrapers/77f02f713e3c37bff73882fb90828379.py @@ -1,1 +1,22 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find("table",width="571") +#findAll("table")[3] + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/9f5cd66dea3e2ec958c17e28b27e60a7.txt @@ -1,1 +1,2 @@ +acma style --- /dev/null +++ b/documents/scrapers/ad033512610d8e36886ab6a795f26561.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "_ctl0__ctl0_MainContentPlaceHolder_MainContentPlaceHolder_ContentSpan").findAll("table")[3] + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b506b87c8ee9e3a7ea8007914078c741.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description,link,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "ms-rtestate-field").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.txt +++ /dev/null @@ -1,20 +1,1 @@ -import sys,os -sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - - def getColumnCount(self): - return 4 - def getColumns(self,columns): - (id, date, title, description) = columns - return (id, date, title, description, None) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() - --- /dev/null +++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt @@ -1,1 +1,2 @@ +no log --- /dev/null +++ b/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "genericContent").table.tbody + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, date,title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.txt +++ /dev/null @@ -1,21 +1,1 @@ -import sys,os -sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(id = "genericContent").table.tbody - def getColumnCount(self): - return 5 - def getColumns(self,columns): - (id, date,title, description, notes) = columns - return (id, date, title, description, notes) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() - --- a/documents/viewDocument.php +++ b/documents/viewDocument.php @@ -3,7 +3,13 @@ include_once('../include/common.inc.php'); $hash = $_REQUEST['hash']; $docsdb = $server->get_db('disclosr-documents'); +try { $doc = object_to_array($docsdb->get($hash)); + +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} + if (!isset($doc['_attachments']) || count($doc['_attachments']) == 0) die ("no attachments"); $attachments = $doc['_attachments']; @@ -13,3 +19,4 @@ //echo $url; $request = Requests::get($url); echo ($request->body); +