More scrapers: Treasury disclog scraper plus a yellow "no disclog" status

disclogsList.php now counts and shows a yellow marker for agencies whose
scraper stub records "no disclog" (no disclosure log table exists at the
URL to scrape). genericScrapers.py encodes row ids to ASCII before
printing so non-ASCII ids cannot crash a scrape run. The Treasury scraper
is converted from a txt stub to a Python scraper that also covers the
archived site, and the remaining txt stubs are standardised on the
"no disclog" wording.
Former-commit-id: a1b071fe60c375afcb0527b3c5a86844b8f0035b
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -12,6 +12,7 @@
 $disclogs = 0;
 $red = 0;
 $green = 0;
+$yellow = 0;
 $orange = 0;
 try {
     $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
@@ -46,8 +47,15 @@
             echo "<font color='green'><abbr title='Yes'>✔</abbr></font>";
             $green++;
         } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
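+            // a stub file containing exactly "no disclog" marks an agency with no disclosure log table at the URL to scrape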
+            if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
+                echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
+                $yellow++;
+            } else {
+                echo file_get_contents("./scrapers/" . $row->id . '.txt');
                 echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
                 $orange++;
+            }
         } else {
             echo "<font color='red'><abbr title='No'>✘</abbr></font>";
             $red++;
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -9,6 +9,7 @@
 import dateutil
 from dateutil.parser import *
 from datetime import *
+import codecs
 
 class GenericDisclogScraper(object):
     __metaclass__ = abc.ABCMeta
@@ -120,7 +121,8 @@
                 columns = row.find_all('td')
                 if len(columns) == self.getColumnCount():
                     (id, date, title, description, notes) = self.getColumns(columns)
-                    print ''.join(id.stripped_strings)
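+                    # drop non-ASCII characters before printing so the status output cannot raise UnicodeEncodeError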
+                    print ''.join(id.stripped_strings).encode('ascii', 'ignore')
                     if id.string == None:
                         hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
                     else:
--- a/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
+++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
@@ -1,2 +1,1 @@
-see parent dhs
-
+no disclog
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
@@ -1,1 +1,1 @@
-
+acma style
--- /dev/null
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
@@ -0,0 +1,84 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import codecs
+#http://www.doughellmann.com/PyMOTW/abc/
+class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
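+    # current site: the listing table has id "TwoColumnSorting" and detail pages carry their content in the "divFullWidthColumn" div (see getTable/getDescription below)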
+    def getDescription(self, content, entry, doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(), atag['href'])
+                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                        soup = BeautifulSoup(htcontent)
+                        for text in soup.find(id="divFullWidthColumn").stripped_strings:
+                            description = description + text.encode('ascii', 'ignore')
+
+                        for atag in soup.find(id="divFullWidthColumn").find_all("a"):
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(link, atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        return 2
+    def getTable(self, soup):
+        return soup.find(id="TwoColumnSorting")
+    def getColumns(self, columns):
+        (title, date) = columns
+        return (title, date, title, title, None)
+class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
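+    # old archived site (archive.treasury.gov.au): the listing table has class "doc-list" and detail pages use the "content-item" div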
+    def getDescription(self, content, entry, doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(), atag['href'])
+                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                        soup = BeautifulSoup(htcontent)
+                        for text in soup.find(id="content-item").stripped_strings:
+                            description = description + text + " \n"
+                        for atag in soup.find(id="content-item").find_all("a"):
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(link, atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        return 2
+    def getTable(self, soup):
+        return soup.find(class_="doc-list")
+    def getColumns(self, columns):
+        (date, title) = columns
+        return (title, date, title, title, None)
+
+if __name__ == '__main__':
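+    # sanity checks in the style of the PyMOTW abc example linked above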
+    print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    #NewScraperImplementation().doScrape()
+    print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
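+    # only the archived site is scraped for now; point the scraper at the old Treasury disclosure log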
+    osi = OldScraperImplementation()
+    osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
+    osi.doScrape()
+# old site too
+
--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getDescription(self,content, entry,doc):
-        link = None
-        links = []
-        description = ""
-        for atag in entry.find_all('a'):
-            if atag.has_key('href'):
-                link = scrape.fullurl(self.getURL(),atag['href'])
-                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
-                if htcontent != None:
-                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
-                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
-                        soup = BeautifulSoup(htcontent)
-                        for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'):
-                            if row != None:
-                                rowtitle = row.find('th').string
-                                description = description + "\n" + rowtitle + ": "
-                                for text in row.find('td').stripped_strings:
-                                    description = description + text
-                                for atag in row.find_all("a"):
-                                    if atag.has_key('href'):
-                                        links.append(scrape.fullurl(link,atag['href']))
-
-        if links != []:
-            doc.update({'links': links})
-        if description != "":
-            doc.update({ 'description': description})
-
-    def getColumnCount(self):
-        return 2
-    def getTable(self,soup):
-        return soup.find(class_ = "ms-rteTable-GreyAlternating")
-    def getColumns(self,columns):
-        (date, title) = columns
-        return (title, date, title, title, None)
-
-if __name__ == '__main__':
-    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
-    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
-    ScraperImplementation().doScrape()
-# old site too http://archive.treasury.gov.au/content/foi_publications.asp
-
--- a/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
+++ b/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
@@ -1,2 +1,1 @@
-# does not have any disclog entries or table
-
+no disclog
--- a/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
+++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
@@ -1,1 +1,1 @@
-no disclog yet
+no disclog
--- a/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
+++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
@@ -1,2 +1,1 @@
-no log
-
+no disclog
--- a/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
@@ -1,1 +1,1 @@
-
+acma style
--- a/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
+++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
@@ -1,2 +1,1 @@
-c'est ne pas une table
-
+no disclog