more scrapers
Former-commit-id: f3be16c5f01755423b13d67b8fef1653538d6db2
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -84,7 +84,7 @@
doc.update({'description': descriptiontxt})
return
def getTitle(self, content, entry, doc):
- doc.update({'title': content.string})
+ # BeautifulSoup's .string is None when the cell has more than one child node; joining stripped_strings keeps all of the cell's text.
+ doc.update({'title': (''.join(content.stripped_strings))})
return
def getTable(self, soup):
return soup.table
@@ -123,13 +123,13 @@
if doc == None:
print "saving " +hash
- doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
+ doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))}
self.getLinks(self.getURL(),row,doc)
self.getTitle(title,row, doc)
self.getDate(date,row, doc)
self.getDescription(description,row, doc)
if notes != None:
- doc.update({ 'notes': notes.string})
+ doc.update({ 'notes': (''.join(notes.stripped_strings))})
foidocsdb.save(doc)
else:
print "already saved "+hash
--- /dev/null
+++ b/documents/scrapers/0324e4b1654fd6dd651307abcef67094.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a six-column table (the last cell is a link)."""
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 6."""
+        return 6
+
+    def getColumns(self, columns):
+        """Return (id, date, title, description, notes); the trailing link cell is dropped."""
+        doc_id, date, title, description, notes, _link = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/3b54190e3f409380e109fae29e1917aa.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a seven-column table."""
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 7."""
+        return 7
+
+    def getColumns(self, columns):
+        """Return (id, date, title, description, notes); the link and deldate cells are dropped."""
+        doc_id, date, title, description, _link, _deldate, notes = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/5716ce0aacfe98f7d638b7a66b7f1040.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a four-column table that lists the date before the id."""
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 4."""
+        return 4
+
+    def getColumns(self, columns):
+        """Return (id, date, title, description, None); this table has no notes cell."""
+        date, doc_id, title, description = columns
+        no_notes = None
+        return (doc_id, date, title, description, no_notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/5d05365e981d87e746b596d63e35b1dc.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper whose table sits inside a specific rich-HTML field."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with the rich-HTML field id."""
+        html_field = soup.find(id="ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField")
+        return html_field.table
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 5."""
+        return 5
+
+    def getColumns(self, columns):
+        """Return the five cells as (id, date, title, description, notes)."""
+        doc_id, date, title, description, notes = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper whose table sits inside the content_div_50269 element."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with id content_div_50269."""
+        content_div = soup.find(id="content_div_50269")
+        return content_div.table
+
+    def getColumns(self, columns):
+        """Return the five cells as (id, date, title, description, notes)."""
+        doc_id, date, title, description, notes = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- a/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.txt
+++ /dev/null
@@ -1,19 +1,1 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getTable(self,soup):
- return soup.find(id = "content_div_50269").table
- def getColumns(self,columns):
- (id, date, title, description, notes) = columns
- return (id, date, title, description, notes)
-
-if __name__ == '__main__':
- print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
- print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
- ScraperImplementation().doScrape()
-
--- /dev/null
+++ b/documents/scrapers/77f02f713e3c37bff73882fb90828379.py
@@ -1,1 +1,22 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a seven-column table identified by its width attribute."""
+
+    def getTable(self, soup):
+        """Return the first table whose width attribute is "571"."""
+        # Alternative selector the author left commented out: findAll("table")[3]
+        matching_table = soup.find("table", width="571")
+        return matching_table
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 7."""
+        return 7
+
+    def getColumns(self, columns):
+        """Return (id, date, title, description, notes); the link and deldate cells are dropped."""
+        doc_id, date, title, description, _link, _deldate, notes = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/9f5cd66dea3e2ec958c17e28b27e60a7.txt
@@ -1,1 +1,2 @@
+acma style
--- /dev/null
+++ b/documents/scrapers/ad033512610d8e36886ab6a795f26561.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper whose table is the fourth one inside the content span."""
+
+    def getTable(self, soup):
+        """Return the table at index 3 among those nested in the content span."""
+        content_span = soup.find(id="_ctl0__ctl0_MainContentPlaceHolder_MainContentPlaceHolder_ContentSpan")
+        nested_tables = content_span.findAll("table")
+        return nested_tables[3]
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 5."""
+        return 5
+
+    def getColumns(self, columns):
+        """Return the five cells as (id, date, title, description, notes)."""
+        doc_id, date, title, description, notes = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/b506b87c8ee9e3a7ea8007914078c741.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a six-column table (link cell before notes)."""
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 6."""
+        return 6
+
+    def getColumns(self, columns):
+        """Return (id, date, title, description, notes); the link cell is dropped."""
+        doc_id, date, title, description, _link, notes = columns
+        return (doc_id, date, title, description, notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a four-column table inside an ms-rtestate-field element."""
+
+    def getTable(self, soup):
+        """Return the first table inside the first element with class ms-rtestate-field."""
+        rich_text_field = soup.find(class_="ms-rtestate-field")
+        return rich_text_field.table
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 4."""
+        return 4
+
+    def getColumns(self, columns):
+        """Return (id, date, title, description, None); this table has no notes cell."""
+        doc_id, date, title, description = columns
+        no_notes = None
+        return (doc_id, date, title, description, no_notes)
+
+if __name__ == '__main__':
+    scraper_base = genericScrapers.GenericOAICDisclogScraper
+    # Print the class relationship, then scrape with a fresh instance.
+    print 'Subclass:', issubclass(ScraperImplementation, scraper_base)
+    print 'Instance:', isinstance(ScraperImplementation(), scraper_base)
+    ScraperImplementation().doScrape()
+
--- a/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.txt
+++ /dev/null
@@ -1,20 +1,1 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-
- def getColumnCount(self):
- return 4
- def getColumns(self,columns):
- (id, date, title, description) = columns
- return (id, date, title, description, None)
-
-if __name__ == '__main__':
- print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
- print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
- ScraperImplementation().doScrape()
-
--- /dev/null
+++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
@@ -1,1 +1,2 @@
+no log
--- /dev/null
+++ b/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper whose rows live in the tbody of the genericContent table."""
+
+    def getTable(self, soup):
+        """Return the tbody of the first table inside the element with id genericContent."""
+        return soup.find(id="genericContent").table.tbody
+
+    def getColumnCount(self):
+        """Return the column count for this disclosure log: 5.
+
+        This must equal the number of names getColumns unpacks. The commit
+        lowered it to 3 while getColumns still unpacks five cells, so a
+        three-cell row would raise ValueError during unpacking.
+        NOTE(review): if the live table really has three columns, change
+        getColumns instead and pad the missing fields with None.
+        """
+        return 5
+
+    def getColumns(self, columns):
+        """Return the five cells as (id, date, title, description, notes)."""
+        (id, date, title, description, notes) = columns
+        return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+
--- a/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.txt
+++ /dev/null
@@ -1,21 +1,1 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getTable(self,soup):
- return soup.find(id = "genericContent").table.tbody
- def getColumnCount(self):
- return 5
- def getColumns(self,columns):
- (id, date,title, description, notes) = columns
- return (id, date, title, description, notes)
-
-if __name__ == '__main__':
- print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
- print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
- ScraperImplementation().doScrape()
-
--- a/documents/viewDocument.php
+++ b/documents/viewDocument.php
@@ -3,7 +3,13 @@
include_once('../include/common.inc.php');
$hash = $_REQUEST['hash'];
$docsdb = $server->get_db('disclosr-documents');
+// Fetch the document by hash; a SetteeRestClientException from the lookup is passed to setteErrorHandler instead of going uncaught.
+try {
$doc = object_to_array($docsdb->get($hash));
+
+} catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
+}
+
if (!isset($doc['_attachments']) || count($doc['_attachments']) == 0) die ("no attachments");
$attachments = $doc['_attachments'];
@@ -13,3 +19,4 @@
//echo $url;
$request = Requests::get($url);
echo ($request->body);
+