more scrapers
Former-commit-id: 680ff767fd0ca810aa4b2cbe789740758373acf2
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -36,7 +36,7 @@
if (file_exists("./scrapers/" . $row->id . '.py')) {
echo "<font color='green'>✔</font>";
} else if (file_exists("./scrapers/" . $row->id . '.txt')) {
- echo "<font color='blue'><b>▬</b></font>";
+ echo "<font color='orange'><b>▬</b></font>";
} else {
echo "<font color='red'>✘</font>";
}
@@ -50,3 +50,4 @@
echo "</table>";
include_footer_documents();
?>
+
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -55,7 +55,7 @@
doc = foidocsdb.get(hash)
#print doc
if doc == None:
- print "saving"
+ print "saving "+ hash
edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
"date": edate,"title": entry.title}
@@ -89,9 +89,17 @@
def getTable(self, soup):
return soup.table
def getDate(self, content, entry, doc):
- edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+ edate = parse(''.join(content.stripped_strings).strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
doc.update({'date': edate})
+ return
+ def getLinks(self, content, entry, doc):
+ links = []
+ for atag in entry.find_all("a"):
+ if atag.has_key('href'):
+ links.append(scrape.fullurl(content,atag['href']))
+ if links != []:
+ doc.update({'links': links})
return
def doScrape(self):
@@ -106,22 +114,17 @@
columns = row.find_all('td')
if len(columns) == self.getColumnCount():
(id, date, description, title, notes) = self.getColumns(columns)
- print id.string
+ print ''.join(id.stripped_strings)
if id.string == None:
- hash = scrape.mkhash(self.remove_control_chars(url+date.string))
+ hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
else:
- hash = scrape.mkhash(self.remove_control_chars(url+id.string))
- links = []
- for atag in row.find_all("a"):
- if atag.has_key('href'):
- links.append(scrape.fullurl(url,atag['href']))
+ hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
doc = foidocsdb.get(hash)
if doc == None:
- print "saving"
+ print "saving " +hash
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
- if links != []:
- doc.update({'links': links})
+ self.getLinks(self.getURL(),row,doc)
self.getTitle(title,row, doc)
self.getDate(date,row, doc)
self.getDescription(description,row, doc)
--- /dev/null
+++ b/documents/scrapers/0603dfcc930a791efaa64f31ae5fceda.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id="body-content").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/0e46f8bd1414b1fdd4f0543d54a97500.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "maincontentcontainer").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (date, disclogdate, title, description, notes) = columns
+ return (date, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1097fa8afdcf5db89d212d0979226667.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, description, title, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1ad74ca88932f90f0b92b69387171441.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(_class = "article-content").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, title, date, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/269680be088f3d8e663251655f3825b4.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (date, id, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/31b7c75cf484747b6b120680bddd33b0.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 2
+ def getColumns(self,columns):
+ (title, date) = columns
+ return (date, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/329fccdca068b78ab7edd550e2957398.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 7
+ def getTable(self,soup):
+ return soup.find(class_ = "foi-disclosure")
+ def getColumns(self,columns):
+ (disclogid, id, date, title, link, removedate, notes) = columns
+ return (id, date, title, title, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/3d426eb8c85c8f04b814eee597efd866.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "primary").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/50601505ef69483121a6d130bb0515e4.txt
@@ -1,1 +1,1 @@
-
+apsc has ACMA style disclog
--- /dev/null
+++ b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py
@@ -1,1 +1,39 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(class_ = "ms-rtestate-field").table
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, description, title, notes)
+
+ def getLinks(self, content, entry, doc):
+ link = None
+ links = []
+ for atag in entry.find_all('a'):
+ if atag.has_key('href'):
+ link = scrape.fullurl(self.getURL(),atag['href'])
+ (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+ if htcontent != None:
+ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+ # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ soup = BeautifulSoup(htcontent)
+ for atag in soup.find(class_ = "article-content").find_all('a'):
+ if atag.has_key('href'):
+ links.append(scrape.fullurl(link,atag['href']))
+
+ if links != []:
+ doc.update({'links': links})
+ doc.update({'url': link})
+ return
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.txt
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "content_div_50269").table
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/6fe3c812a99d486963133459b2768cf6.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, description, title, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/734f12db07e844b30cd11dc98500f2ce.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
--- /dev/null
+++ b/documents/scrapers/8ef0e5802f99800f514b3a148e013b75.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 4;
+ def getTable(self,soup):
+ return soup.find(class_ = "content").table
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, description, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/955dc4cb047b5439dfb65549ce2696a6.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/a2d871c5d28de1dde8a3b66c4957e1a5.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 7
+ def getColumns(self,columns):
+ (id, date, title, description,link,deldate, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
@@ -1,1 +1,1 @@
-
+no disclog yet
--- /dev/null
+++ b/documents/scrapers/c1302c8d7cbbd911f0d4d8a4128f8079.txt
@@ -1,1 +1,1 @@
-
+uses RET disclog
--- /dev/null
+++ b/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.txt
@@ -1,1 +1,20 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/c414e65ed728d05307d5b27f13e195e1.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (date, title, description) = columns
+ return (date, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
--- /dev/null
+++ b/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.txt
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "genericContent").table.tbody
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date,title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/fa9b3badd6c686398c1c6982a4b02475.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+