From: Maxious Date: Thu, 29 Nov 2012 05:40:20 +0000 Subject: more scrapers X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=d9af0803ff8180d4c45467874b2b19ae3875abd8 --- more scrapers Former-commit-id: 012f6a5c1fa63f9ddaeb47cb13297668fa35b23e --- --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -19,8 +19,8 @@ if ($rows) { foreach ($rows as $row) { - - echo "" . $row->value->name . ""; +if (!isset($row->value->status) || $row->value->status != "suspended") { + echo "". $row->value->name . ""; if ($ENV == "DEV") echo "
(" . $row->id . ")"; echo "\n"; @@ -53,6 +53,7 @@ echo "\n"; } } +} } catch (SetteeRestClientException $e) { setteErrorHandler($e); } --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -92,7 +92,8 @@ return table.find_all('tr') def getDate(self, content, entry, doc): date = ''.join(content.stripped_strings).strip() - date = date.replace("Octber","October") + (a,b,c) = date.partition("(") + date = a.replace("Octber","October") print date edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate --- /dev/null +++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.txt @@ -1,1 +1,2 @@ +pdf --- /dev/null +++ b/documents/scrapers/0372b19123076338d483f624c433727b.txt @@ -1,1 +1,2 @@ +docx --- /dev/null +++ b/documents/scrapers/0ae822d1a748e60d90f0b79b97d5a3e5.txt @@ -1,1 +1,2 @@ +ACMA style --- /dev/null +++ b/documents/scrapers/0ced9dd2de36100c3cabdb7fd8e843a9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/2cac2cd1f42687db2d04fa20b5b6a538.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, title, date) = columns + return (id, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/3e2f110af49d62833a835bd257771ffb.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/4c57389dda9bd454bcb08bc1e5ed87bf.txt @@ -1,1 +1,2 @@ +parent --- /dev/null +++ b/documents/scrapers/4d2af2dcc72f1703bbf04b13b03720a8.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/525c3953187da08cd702359b2fc2997f.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/54cbb3439276062b7a9f007f9f69d1f6.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/655d4d67333536bda18d68265dfe7e80.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="node-30609") + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/656f7bb1884f4b9d31ebe2a5f5f58064.txt @@ -1,1 +1,2 @@ +list style --- /dev/null +++ b/documents/scrapers/65ec17101b00519e6d88c5a9f33c2c46.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, date, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/69d59284ef0ccd2677394d82d3292abc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "centercontent").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6ac74a939f420c6194ae29224809734a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/768bbbfb34115873af361af8519b38a9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/795e7a8afb39a420360aa207b0cb1306.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/7b39ce7f362a0af9a711eaf223943eea.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/7ec28d7d97fcf493b1350acd03e3642e.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/7f55a3c42ad7460254906aa043a6e324.py @@ -1,1 +1,24 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getTitle(self, content, entry, doc): + doc.update({'title': content.stripped_strings.next()}) + return + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, id, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.txt @@ -1,1 +1,2 @@ +pdf --- /dev/null +++ b/documents/scrapers/8aae1c28db7f3ce10f232a0137be6bb2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/9282306e244040c9e4ae5705f06f9548.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/93ce83e46f5c2c4ca1b7f199b59b4bd2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date,logdate, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/99328d76c8efb56ff3f1da79b9d1b17f.txt @@ -1,1 +1,2 @@ +acma style --- /dev/null +++ b/documents/scrapers/9961dc45e046288ad1431941653af20c.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a1ab9c80ab473958676c62c1a25dd502.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a43467fe82b840a353b380c4d7462a4c.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (blank,id, title,date) = columns + return (id, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b91f866928eb61959dbbab56313214fc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bc91b878e2317fa231cc2c512e2027f0.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/cca17a34bd490474a316fe0a1ca03c25.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/cde8eb4a2e40abb18d8b28d3b85bc9b0.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(summary="This table lists the schedule of upcoming courses.") + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ce34d1e9b55911e4272d2d388821f311.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/d1296c366287f7a9faedf235c7e6df01.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="main").table + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e64c71f4986f78675a252104c5a5f359.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt @@ -1,1 +1,2 @@ +c'est ne pas une table --- /dev/null +++ b/documents/scrapers/ee30aad97f0bb32e74c4587404b67ce4.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, title, date, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/f189459fc43f941e0d4ecfba52c666f3.txt @@ -1,1 +1,2 @@ +no disclog