more scrapers
Former-commit-id: 012f6a5c1fa63f9ddaeb47cb13297668fa35b23e
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,4 +31,7 @@
[submodule "documents/lib/parsedatetime"]
path = documents/lib/parsedatetime
url = git://github.com/bear/parsedatetime.git
+[submodule "lib/FeedWriter"]
+ path = lib/FeedWriter
+ url = https://github.com/mibe/FeedWriter
--- /dev/null
+++ b/documents/.gitignore
@@ -1,1 +1,2 @@
+*.pyc
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -19,8 +19,8 @@
if ($rows) {
foreach ($rows as $row) {
-
- echo "<tr><td><b>" . $row->value->name . "</b>";
+if (!isset($row->value->status) || $row->value->status != "suspended") {
+ echo "<tr><td><a href='" . $row->value->website ."'><b>". $row->value->name . "</b></a>";
if ($ENV == "DEV")
echo "<br>(" . $row->id . ")";
echo "</td>\n";
@@ -35,29 +35,31 @@
echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
. 'view local copy</a>)</small>';
} else {
- echo "<font color='red'>✘</font>";
+ echo "<font color='red'><abbr title='No'>✘</abbr></font>";
}
echo "</td>\n<td>";
if (isset($row->value->FOIDocumentsURL)) {
if (file_exists("./scrapers/" . $row->id . '.py')) {
- echo "<font color='green'>✔</font>";
+ echo "<font color='green'><abbr title='Yes'>✔</abbr></font>";
$green++;
} else if (file_exists("./scrapers/" . $row->id . '.txt')) {
- echo "<font color='orange'><b>▬</b></font>";
+ echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
$orange++;
} else {
- echo "<font color='red'>✘</font>";
+ echo "<font color='red'><abbr title='No'>✘</abbr></font>";
$red++;
}
}
echo "</td></tr>\n";
}
}
+}
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
}
echo "</table>";
-echo $agencies." agencies ".(($disclogs/$agencies)*100)."% with disclosure logs, ".(($green/$disclogs)*100)."% with scrapers ".(($red/$disclogs)*100)."% without scrapers ".(($orange/$disclogs)*100)."% WIP scrapers ";
+echo $agencies." agencies, ".round(($disclogs/$agencies)*100)."% with disclosure logs; "
+.round(($green/$disclogs)*100)."% logs with scrapers ".round(($red/$disclogs)*100)."% logs without scrapers ".round(($orange/$disclogs)*100)."% logs Work-In-Progress scrapers ";
include_footer_documents();
?>
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -92,7 +92,9 @@
return table.find_all('tr')
def getDate(self, content, entry, doc):
date = ''.join(content.stripped_strings).strip()
- date = str.replace("Octber","October",date)
+ (a,b,c) = date.partition("(")
+ date = a.replace("Octber","October")
+ print date
edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
doc.update({'date': edate})
--- a/documents/index.php
+++ b/documents/index.php
@@ -20,7 +20,7 @@
if ($rows) {
foreach ($rows as $row) {
-displayLogEntry($row,$idtoname);
+echo displayLogEntry($row,$idtoname);
}
}
} catch (SetteeRestClientException $e) {
--- a/documents/robots.txt
+++ b/documents/robots.txt
@@ -2,4 +2,5 @@
# http://code.google.com/web/controlcrawlindex/
User-agent: *
-
+Disallow: /admin/
+Sitemap: http://disclosurelo.gs/sitemap.xml.php
--- a/documents/rss.xml.php
+++ b/documents/rss.xml.php
@@ -3,28 +3,38 @@
// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class
-include("lib/FeedWriter.php");
+include("../lib/FeedWriter/FeedTypes.php");
+include_once('../include/common.inc.php');
//Creating an instance of FeedWriter class.
-$TestFeed = new FeedWriter(RSS2);
+$TestFeed = new RSS2FeedWriter();
//Setting the channel elements
//Use wrapper functions for common channelelements
$TestFeed->setTitle('Last Modified - All');
-$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
+$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php');
$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
+ $TestFeed->setChannelElement('language', 'en-us');
+ $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
//Retriving informations from database
-$rows = $db->get_view("app", "byLastModified")->rows;
+$idtoname = Array();
+$agenciesdb = $server->get_db('disclosr-agencies');
+foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
+ $idtoname[$row->id] = trim($row->value->name);
+}
+$foidocsdb = $server->get_db('disclosr-foidocuments');
+$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00'), true)->rows;
//print_r($rows);
foreach ($rows as $row) {
//Create an empty FeedItem
$newItem = $TestFeed->createNewItem();
//Add elements to the feed item
- $newItem->setTitle($row['name']);
- $newItem->setLink($row['id']);
- $newItem->setDate(date("c", $row['metadata']['lastModified']));
- $newItem->setDescription($row['name']);
+ $newItem->setTitle($row->value->title);
+ $newItem->setLink("view.php?id=".$row->value->docID);
+ $newItem->setDate(date("c", strtotime($row->value->date)));
+ $newItem->setDescription(displayLogEntry($row,$idtoname));
+ $newItem->addElement('guid', $row->value->_id,array('isPermaLink'=>'true'));
//Now add the feed item
$TestFeed->addItem($newItem);
}
//OK. Everything is done. Now genarate the feed.
-$TestFeed->genarateFeed();
+$TestFeed->generateFeed();
?>
--- /dev/null
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.txt
@@ -1,1 +1,2 @@
+pdf
--- /dev/null
+++ b/documents/scrapers/00a294de663db69062ca09aede7c0487.txt
@@ -1,1 +1,2 @@
+multipage
--- /dev/null
+++ b/documents/scrapers/0372b19123076338d483f624c433727b.txt
@@ -1,1 +1,2 @@
+docx
--- /dev/null
+++ b/documents/scrapers/0ae822d1a748e60d90f0b79b97d5a3e5.txt
@@ -1,1 +1,2 @@
+ACMA style
--- /dev/null
+++ b/documents/scrapers/0ced9dd2de36100c3cabdb7fd8e843a9.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1803322b27286950cab0c543168b5f21.txt
@@ -1,1 +1,2 @@
+multipage log
--- /dev/null
+++ b/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "inner_content")
+ def getColumnCount(self):
+ return 2
+ def getColumns(self,columns):
+ (date, title) = columns
+ return (date, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/2cac2cd1f42687db2d04fa20b5b6a538.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (id, title, date) = columns
+ return (id, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/31685505438d393f45a90f442b8fa27f.txt
@@ -1,1 +1,2 @@
+pdf
--- /dev/null
+++ b/documents/scrapers/3e2f110af49d62833a835bd257771ffb.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/41a166419503bb50e410c58be54c102f.txt
@@ -1,1 +1,1 @@
-
+aspx
--- /dev/null
+++ b/documents/scrapers/4934000fddd6a5b1094f398798341290.py
@@ -1,1 +1,23 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import dateutil
+from dateutil.parser import *
+from datetime import *
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ si = ScraperImplementation()
+ si.doScrape()
+
--- /dev/null
+++ b/documents/scrapers/4c57389dda9bd454bcb08bc1e5ed87bf.txt
@@ -1,1 +1,2 @@
+parent
--- /dev/null
+++ b/documents/scrapers/4d2af2dcc72f1703bbf04b13b03720a8.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/525c3953187da08cd702359b2fc2997f.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/54cbb3439276062b7a9f007f9f69d1f6.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/601aedeef4344638d635bdd761e9fdba.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (date, title, description,notes) = columns
+ return (title, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/649b053f5e2884906ddc7174c2cd4b38.py
@@ -1,1 +1,28 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import dateutil
+from dateutil.parser import *
+from datetime import *
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ si = ScraperImplementation()
+ si.doScrape()
+ si.disclogURL = "http://www.fahcsia.gov.au/disclosure-log-2011-12-financial-year"
+ si.doScrape()
+ si.disclogURL = "http://www.fahcsia.gov.au/disclosure-log-2010-11-financial-year"
+ si.doScrape()
+
+
--- /dev/null
+++ b/documents/scrapers/655d4d67333536bda18d68265dfe7e80.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id="node-30609")
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/656f7bb1884f4b9d31ebe2a5f5f58064.txt
@@ -1,1 +1,2 @@
+list style
--- /dev/null
+++ b/documents/scrapers/65ec17101b00519e6d88c5a9f33c2c46.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (id, date, description) = columns
+ return (id, date, description, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/69d59284ef0ccd2677394d82d3292abc.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "centercontent").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/6ac74a939f420c6194ae29224809734a.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.txt
@@ -1,1 +1,2 @@
+PDF
--- /dev/null
+++ b/documents/scrapers/6cf3870aedeeecfd6394b5c0abed4c55.py
@@ -1,1 +1,23 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import dateutil
+from dateutil.parser import *
+from datetime import *
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ si = ScraperImplementation()
+ si.doScrape()
+
--- /dev/null
+++ b/documents/scrapers/72a295f10734d64e8185f651fd2b39ea.txt
@@ -1,1 +1,2 @@
+weird div based log with tables of links
--- /dev/null
+++ b/documents/scrapers/75d8f1c605ef9da0c2590264b7aa046b.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "content-middle").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/768bbbfb34115873af361af8519b38a9.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/795e7a8afb39a420360aa207b0cb1306.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/7b39ce7f362a0af9a711eaf223943eea.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/7ec28d7d97fcf493b1350acd03e3642e.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (date, title, description) = columns
+ return (date, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/7f55a3c42ad7460254906aa043a6e324.py
@@ -1,1 +1,24 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getTitle(self, content, entry, doc):
+ doc.update({'title': content.stripped_strings.next()})
+ return
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (date, id, description) = columns
+ return (id, date, description, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.txt
@@ -1,1 +1,2 @@
+pdf
--- /dev/null
+++ b/documents/scrapers/8796220032faf94501bd366763263685.txt
@@ -1,1 +1,2 @@
+multiple pages
--- /dev/null
+++ b/documents/scrapers/8aae1c28db7f3ce10f232a0137be6bb2.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
@@ -1,1 +1,49 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getDescription(self,content, entry,doc):
+ link = None
+ links = []
+ description = ""
+ for atag in entry.find_all('a'):
+ if atag.has_key('href'):
+ link = scrape.fullurl(self.getURL(),atag['href'])
+ (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+ if htcontent != None:
+ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+ # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ soup = BeautifulSoup(htcontent)
+ for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'):
+ if row != None:
+ rowtitle = row.find('th').string
+ description = description + "\n" + rowtitle + ": "
+ for text in row.find('td').stripped_strings:
+ description = description + text
+ for atag in row.find_all("a"):
+ if atag.has_key('href'):
+ links.append(scrape.fullurl(link,atag['href']))
+
+ if links != []:
+ doc.update({'links': links})
+ if description != "":
+ doc.update({ 'description': description})
+
+ def getColumnCount(self):
+ return 2
+ def getTable(self,soup):
+ return soup.find(class_ = "ms-rteTable-GreyAlternating")
+ def getColumns(self,columns):
+ (date, title) = columns
+ return (title, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+# old site too http://archive.treasury.gov.au/content/foi_publications.asp
+
--- /dev/null
+++ b/documents/scrapers/905a1c409b6afb1de0074b13a5559560.py
@@ -1,1 +1,23 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import dateutil
+from dateutil.parser import *
+from datetime import *
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ si = ScraperImplementation()
+ si.doScrape()
+
--- /dev/null
+++ b/documents/scrapers/9282306e244040c9e4ae5705f06f9548.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/93ce83e46f5c2c4ca1b7f199b59b4bd2.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (id, date,logdate, description) = columns
+ return (id, date, description, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/99328d76c8efb56ff3f1da79b9d1b17f.txt
@@ -1,1 +1,2 @@
+acma style
--- /dev/null
+++ b/documents/scrapers/9961dc45e046288ad1431941653af20c.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/a1ab9c80ab473958676c62c1a25dd502.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/a43467fe82b840a353b380c4d7462a4c.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (date, title, description) = columns
+ return (date, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2").table
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (blank,id, title,date) = columns
+ return (id, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/b0a3281ba66efe173c5a33d5ef90ff76.txt
@@ -1,1 +1,2 @@
+multipage immi
--- /dev/null
+++ b/documents/scrapers/b0fb402314e685238537105ee0e70c84.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/b7770c4584332cff42bb6abb3326e564.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a five-cell row layout inside a rich-HTML field.
+
+    Overrides hooks of genericScrapers.GenericOAICDisclogScraper; how the base
+    class invokes them is defined in genericScrapers, not here.
+    """
+
+    def getTable(self, soup):
+        """Return the first <table> nested in the page's content container.
+
+        soup is presumably the parsed BeautifulSoup page -- confirm in genericScrapers.
+        """
+        container = soup.find(id="ctl00_PlaceHolderMain_Content__ControlWrapper_RichHtmlField")
+        return container.table
+
+    def getColumnCount(self):
+        """Return 5, the number of cells getColumns() unpacks."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        ref, when, heading, summary, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/b91f866928eb61959dbbab56313214fc.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a five-cell row layout.
+
+    getTable is not overridden here, so whatever table lookup the base class
+    provides is used (see genericScrapers).
+    """
+
+    def getColumnCount(self):
+        """Return 5, the number of cells getColumns() unpacks."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        ref, when, heading, summary, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/bc91b878e2317fa231cc2c512e2027f0.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a four-cell row layout without a notes cell.
+
+    getTable is not overridden here, so whatever table lookup the base class
+    provides is used (see genericScrapers).
+    """
+
+    def getColumnCount(self):
+        """Return 4, the number of cells getColumns() unpacks."""
+        return 4
+
+    def getColumns(self, columns):
+        """Turn (id, date, title, description) into a 5-tuple ending in None."""
+        ref, when, heading, summary = columns
+        return (ref, when, heading, summary, None)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/cca17a34bd490474a316fe0a1ca03c25.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a five-cell row layout inside a rich-HTML field.
+
+    Overrides hooks of genericScrapers.GenericOAICDisclogScraper; how the base
+    class invokes them is defined in genericScrapers, not here.
+    """
+
+    def getTable(self, soup):
+        """Return the first <table> nested in the page's content container.
+
+        soup is presumably the parsed BeautifulSoup page -- confirm in genericScrapers.
+        """
+        container = soup.find(id="ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField")
+        return container.table
+
+    def getColumnCount(self):
+        """Return 5, the number of cells getColumns() unpacks."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        ref, when, heading, summary, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/cde8eb4a2e40abb18d8b28d3b85bc9b0.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a seven-cell row layout, of which five cells are kept.
+
+    Overrides hooks of genericScrapers.GenericOAICDisclogScraper; how the base
+    class invokes them is defined in genericScrapers, not here.
+    """
+
+    def getTable(self, soup):
+        """Return the first element whose summary attribute matches the text below.
+
+        The match itself is returned; no nested .table lookup is done.
+        soup is presumably the parsed BeautifulSoup page -- confirm in genericScrapers.
+        """
+        logTable = soup.find(summary="This table lists the schedule of upcoming courses.")
+        return logTable
+
+    def getColumnCount(self):
+        """Return 7, the number of cells getColumns() unpacks."""
+        return 7
+
+    def getColumns(self, columns):
+        """Keep (id, date, title, description, notes); the link and deldate cells are dropped."""
+        ref, when, heading, summary, _link, _deldate, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/ce34d1e9b55911e4272d2d388821f311.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a five-cell row layout.
+
+    getTable is not overridden here, so whatever table lookup the base class
+    provides is used (see genericScrapers).
+    """
+
+    def getColumnCount(self):
+        """Return 5, the number of cells getColumns() unpacks."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        ref, when, heading, summary, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/d1296c366287f7a9faedf235c7e6df01.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a seven-cell row layout, of which five cells are kept.
+
+    Overrides hooks of genericScrapers.GenericOAICDisclogScraper; how the base
+    class invokes them is defined in genericScrapers, not here.
+    """
+
+    def getTable(self, soup):
+        """Return the first <table> nested in the element with id "main".
+
+        soup is presumably the parsed BeautifulSoup page -- confirm in genericScrapers.
+        """
+        container = soup.find(id="main")
+        return container.table
+
+    def getColumnCount(self):
+        """Return 7, the number of cells getColumns() unpacks."""
+        return 7
+
+    def getColumns(self, columns):
+        """Keep (id, date, title, description, notes); the link and deldate cells are dropped."""
+        ref, when, heading, summary, _link, _deldate, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/e64c71f4986f78675a252104c5a5f359.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a five-cell row layout.
+
+    getTable is not overridden here, so whatever table lookup the base class
+    provides is used (see genericScrapers).
+    """
+
+    def getColumnCount(self):
+        """Return 5, the number of cells getColumns() unpacks."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        ref, when, heading, summary, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
@@ -1,1 +1,2 @@
+ce n'est pas une table
--- /dev/null
+++ b/documents/scrapers/e90b1b7cbb83e3eed0b5f849c7e3af79.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a two-cell (date, title) row layout.
+
+    Overrides hooks of genericScrapers.GenericOAICDisclogScraper; how the base
+    class invokes them is defined in genericScrapers, not here.
+    """
+
+    def getTable(self, soup):
+        """Return the element with id "inner_content" itself (no nested .table lookup).
+
+        soup is presumably the parsed BeautifulSoup page -- confirm in genericScrapers.
+        """
+        content = soup.find(id="inner_content")
+        return content
+
+    def getColumnCount(self):
+        """Return 2, the number of cells getColumns() unpacks."""
+        return 2
+
+    def getColumns(self, columns):
+        """Turn (date, title) into the 5-tuple (date, date, title, title, None).
+
+        The date fills the first two slots, the title the next two, and the
+        last slot is always None.
+        """
+        when, heading = columns
+        return (when, when, heading, heading, None)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/ee30aad97f0bb32e74c4587404b67ce4.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a four-cell row layout where title precedes date.
+
+    getTable is not overridden here, so whatever table lookup the base class
+    provides is used (see genericScrapers).
+    """
+
+    def getColumnCount(self):
+        """Return 4, the number of cells getColumns() unpacks."""
+        return 4
+
+    def getColumns(self, columns):
+        """Reorder (id, title, date, description) to (id, date, title, description, None)."""
+        ref, heading, when, summary = columns
+        return (ref, when, heading, summary, None)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/f189459fc43f941e0d4ecfba52c666f3.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/f5ce2d1651739704634eb8ca4b2b46d3.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclosure-log scraper for a seven-cell row layout, of which five cells are kept.
+
+    Overrides hooks of genericScrapers.GenericOAICDisclogScraper; how the base
+    class invokes them is defined in genericScrapers, not here.
+    """
+
+    def getTable(self, soup):
+        """Return the first <table> nested in the page's content container.
+
+        soup is presumably the parsed BeautifulSoup page -- confirm in genericScrapers.
+        """
+        container = soup.find(id="ctl00_PlaceHolderMain_PublishingPageContent__ControlWrapper_RichHtmlField")
+        return container.table
+
+    def getColumnCount(self):
+        """Return 7, the number of cells getColumns() unpacks."""
+        return 7
+
+    def getColumns(self, columns):
+        """Keep (id, date, title, description, notes); the link and deldate cells are dropped."""
+        ref, when, heading, summary, _link, _deldate, remarks = columns
+        return (ref, when, heading, summary, remarks)
+
+if __name__ == '__main__':
+    # Script entry point: print two sanity checks against the base class, then scrape.
+    baseScraper = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, baseScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), baseScraper)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/sitemap.xml.php
@@ -1,1 +1,25 @@
+<?php
+/**
+ * XML sitemap for the documents site.
+ *
+ * Emits one <url> for index.php, one for every other page script found in the
+ * current directory, and one view.php URL per row of the "app"/"all" view of
+ * the disclosr-foidocuments database.
+ *
+ * $server, local_url() and setteErrorHandler() are presumably provided by
+ * ../include/common.inc.php -- they are not defined in this file.
+ */
+include ('../include/common.inc.php');
+header("Content-Type: text/xml");
+echo "<?xml version='1.0' encoding='UTF-8'?>";
+echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
+
+// Sitemap <loc> values must be entity-escaped, so escape the base URL once up front.
+$baseUrl = htmlspecialchars(local_url(), ENT_QUOTES, 'UTF-8');
+echo " <url><loc>" . $baseUrl . "index.php</loc><priority>1.0</priority></url>\n";
+
+foreach (scandir("./") as $file) {
+    // Only real page scripts: the name must END in .php (not merely contain it,
+    // which would admit e.g. "page.php.bak"), and *.inc.php files are include
+    // fragments rather than pages a crawler should visit.
+    $isPage = preg_match('/\.php$/', $file) && !preg_match('/\.inc\.php$/', $file);
+    if ($isPage && $file != "index.php" && $file != "sitemap.xml.php") {
+        echo " <url><loc>" . $baseUrl . htmlspecialchars($file, ENT_QUOTES, 'UTF-8')
+            . "</loc><priority>0.6</priority></url>\n";
+    }
+}
+
+$db = $server->get_db('disclosr-foidocuments');
+try {
+    $rows = $db->get_view("app", "all")->rows;
+    foreach ($rows as $row) {
+        // rawurlencode keeps the id safe both as a query value and as XML text.
+        echo '<url><loc>' . $baseUrl . 'view.php?id=' . rawurlencode($row->value->_id)
+            . "</loc><priority>0.3</priority></url>\n";
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+echo '</urlset>';
+
--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -130,21 +130,23 @@
}
function displayLogEntry($row, $idtoname) {
- echo "<div><h2>".$row->value->date.": ".$row->value->title." (".$idtoname[$row->value->agencyID].")</h2> <p>".str_replace("\n","<br>",$row->value->description);
+    // Builds and RETURNS the HTML fragment for one disclosure-log entry
+    // (the previous version echoed it directly).
+    //   $row      object whose ->value carries date, title, agencyID, description,
+    //             url and docID, plus optional notes and links
+    //   $idtoname array mapping agency id => display name
+    // Every stored value is escaped with htmlspecialchars() before being placed
+    // in markup, so text in an entry cannot inject tags or break out of the
+    // single-quoted href attributes. Description newlines still become <br>.
+    $agencyID = $row->value->agencyID;
+    // An agency missing from the lookup renders as an empty name instead of
+    // raising an undefined-index notice.
+    $agencyName = isset($idtoname[$agencyID]) ? $idtoname[$agencyID] : "";
+    $description = htmlspecialchars((string) $row->value->description, ENT_QUOTES, 'UTF-8');
+    $result = "<div><h2>" . htmlspecialchars((string) $row->value->date, ENT_QUOTES, 'UTF-8')
+        . ": " . htmlspecialchars((string) $row->value->title, ENT_QUOTES, 'UTF-8')
+        . " (" . htmlspecialchars((string) $agencyName, ENT_QUOTES, 'UTF-8') . ")</h2> <p>"
+        . str_replace("\n", "<br>", $description);
if (isset($row->value->notes)) {
-echo " <br>Note: ".$row->value->notes;
+    $result .= " <br>Note: " . htmlspecialchars((string) $row->value->notes, ENT_QUOTES, 'UTF-8');
}
-echo "</p>";
+$result .= "</p>";
if (isset($row->value->links)){
-echo "<h3>Links/Documents</h3><ul>";
+    $result .= "<h3>Links/Documents</h3><ul>";
foreach ($row->value->links as $link) {
- echo "<li><a href='$link'>".$link."</a></li>";
+        $safeLink = htmlspecialchars((string) $link, ENT_QUOTES, 'UTF-8');
+        $result .= "<li><a href='" . $safeLink . "'>" . $safeLink . "</a></li>";
}
- echo "</ul>";
+    $result .= "</ul>";
}
- echo "<small><A href='".$row->value->url."'>View original source...</a> ID: ".$row->value->docID."</small>";
-echo"</div>";
+$result .= "<small><A href='" . htmlspecialchars((string) $row->value->url, ENT_QUOTES, 'UTF-8')
+    . "'>View original source...</a> ID: "
+    . htmlspecialchars((string) $row->value->docID, ENT_QUOTES, 'UTF-8') . "</small>";
+$result .= "</div>";
+return $result;
}
--- /dev/null
+++ b/documents/view.php
@@ -1,1 +1,27 @@
+<?php
+/**
+ * Shows a single FOI disclosure-log entry, selected by the "id" request parameter.
+ *
+ * $server and setteErrorHandler() are presumably provided by
+ * ../include/common.inc.php; the header/footer helpers and displayLogEntry()
+ * by template.inc.php -- none of them are defined in this file.
+ */
+include('template.inc.php');
+include_header_documents("");
+include_once('../include/common.inc.php');
+
+try {
+    // Agency id => trimmed display name, used by displayLogEntry() for the heading.
+    // Queried inside the try so a database failure reaches setteErrorHandler().
+    $agenciesdb = $server->get_db('disclosr-agencies');
+    $idtoname = Array();
+    foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
+        $idtoname[$row->id] = trim($row->value->name);
+    }
+
+    $foidocsdb = $server->get_db('disclosr-foidocuments');
+    // Only query when a usable id was actually supplied; a missing or array-valued
+    // parameter previously caused a notice and a lookup with a bogus key.
+    if (isset($_REQUEST['id']) && is_string($_REQUEST['id']) && $_REQUEST['id'] !== '') {
+        // displayLogEntry() expects a view-row-like object with the document under ->value.
+        $obj = new stdClass();
+        $obj->value = $foidocsdb->get($_REQUEST['id']);
+        echo displayLogEntry($obj, $idtoname);
+    } else {
+        echo "<p>No document id was specified.</p>";
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+include_footer_documents();
+
--- a/lib/FeedItem.php
+++ /dev/null
@@ -1,168 +1,1 @@
-<?php
- /**
- * Univarsel Feed Writer
- *
- * FeedItem class - Used as feed element in FeedWriter class
- *
- * @package UnivarselFeedWriter
- * @author Anis uddin Ahmad <anisniit@gmail.com>
- * @link http://www.ajaxray.com/projects/rss
- */
- class FeedItem
- {
- private $elements = array(); //Collection of feed elements
- private $version;
-
- /**
- * Constructor
- *
- * @param contant (RSS1/RSS2/ATOM) RSS2 is default.
- */
- function __construct($version = RSS2)
- {
- $this->version = $version;
- }
-
- /**
- * Add an element to elements array
- *
- * @access public
- * @param srting The tag name of an element
- * @param srting The content of tag
- * @param array Attributes(if any) in 'attrName' => 'attrValue' format
- * @return void
- */
- public function addElement($elementName, $content, $attributes = null)
- {
- $this->elements[$elementName]['name'] = $elementName;
- $this->elements[$elementName]['content'] = $content;
- $this->elements[$elementName]['attributes'] = $attributes;
- }
-
- /**
- * Set multiple feed elements from an array.
- * Elements which have attributes cannot be added by this method
- *
- * @access public
- * @param array array of elements in 'tagName' => 'tagContent' format.
- * @return void
- */
- public function addElementArray($elementArray)
- {
- if(! is_array($elementArray)) return;
- foreach ($elementArray as $elementName => $content)
- {
- $this->addElement($elementName, $content);
- }
- }
-
- /**
- * Return the collection of elements in this feed item
- *
- * @access public
- * @return array
- */
- public function getElements()
- {
- return $this->elements;
- }
-
- // Wrapper functions ------------------------------------------------------
-
- /**
- * Set the 'dscription' element of feed item
- *
- * @access public
- * @param string The content of 'description' element
- * @return void
- */
- public function setDescription($description)
- {
- $tag = ($this->version == ATOM)? 'summary' : 'description';
- $this->addElement($tag, $description);
- }
-
- /**
- * @desc Set the 'title' element of feed item
- * @access public
- * @param string The content of 'title' element
- * @return void
- */
- public function setTitle($title)
- {
- $this->addElement('title', $title);
- }
-
- /**
- * Set the 'date' element of feed item
- *
- * @access public
- * @param string The content of 'date' element
- * @return void
- */
- public function setDate($date)
- {
- if(! is_numeric($date))
- {
- $date = strtotime($date);
- }
-
- if($this->version == ATOM)
- {
- $tag = 'updated';
- $value = date(DATE_ATOM, $date);
- }
- elseif($this->version == RSS2)
- {
- $tag = 'pubDate';
- $value = date(DATE_RSS, $date);
- }
- else
- {
- $tag = 'dc:date';
- $value = date("Y-m-d", $date);
- }
-
- $this->addElement($tag, $value);
- }
-
- /**
- * Set the 'link' element of feed item
- *
- * @access public
- * @param string The content of 'link' element
- * @return void
- */
- public function setLink($link)
- {
- if($this->version == RSS2 || $this->version == RSS1)
- {
- $this->addElement('link', $link);
- }
- else
- {
- $this->addElement('link','',array('href'=>$link));
- $this->addElement('id', FeedWriter::uuid($link,'urn:uuid:'));
- }
-
- }
-
- /**
- * Set the 'encloser' element of feed item
- * For RSS 2.0 only
- *
- * @access public
- * @param string The url attribute of encloser tag
- * @param string The length attribute of encloser tag
- * @param string The type attribute of encloser tag
- * @return void
- */
- public function setEncloser($url, $length, $type)
- {
- $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);
- $this->addElement('enclosure','',$attributes);
- }
-
- } // end of class FeedItem
-?>
--- a/lib/FeedWriter.php
+++ /dev/null
@@ -1,435 +1,1 @@
-<?php
-// RSS 0.90 Officially obsoleted by 1.0
-// RSS 0.91, 0.92, 0.93 and 0.94 Officially obsoleted by 2.0
-// So, define constants for RSS 1.0, RSS 2.0 and ATOM
-
- define('RSS1', 'RSS 1.0', true);
- define('RSS2', 'RSS 2.0', true);
- define('ATOM', 'ATOM', true);
-
- /**
- * Univarsel Feed Writer class
- *
- * Genarate RSS 1.0, RSS2.0 and ATOM Feed
- *
- * @package UnivarselFeedWriter
- * @author Anis uddin Ahmad <anisniit@gmail.com>
- * @link http://www.ajaxray.com/projects/rss
- */
- class FeedWriter
- {
- private $channels = array(); // Collection of channel elements
- private $items = array(); // Collection of items as object of FeedItem class.
- private $data = array(); // Store some other version wise data
- private $CDATAEncoding = array(); // The tag names which have to encoded as CDATA
-
- private $version = null;
-
- /**
- * Constructor
- *
- * @param constant the version constant (RSS1/RSS2/ATOM).
- */
- function __construct($version = RSS2)
- {
- $this->version = $version;
-
- // Setting default value for assential channel elements
- $this->channels['title'] = $version . ' Feed';
- $this->channels['link'] = 'http://www.ajaxray.com/blog';
-
- //Tag names to encode in CDATA
- $this->CDATAEncoding = array('description', 'content:encoded', 'summary');
- }
-
- // Start # public functions ---------------------------------------------
-
- /**
- * Set a channel element
- * @access public
- * @param srting name of the channel tag
- * @param string content of the channel tag
- * @return void
- */
- public function setChannelElement($elementName, $content)
- {
- $this->channels[$elementName] = $content ;
- }
-
- /**
- * Set multiple channel elements from an array. Array elements
- * should be 'channelName' => 'channelContent' format.
- *
- * @access public
- * @param array array of channels
- * @return void
- */
- public function setChannelElementsFromArray($elementArray)
- {
- if(! is_array($elementArray)) return;
- foreach ($elementArray as $elementName => $content)
- {
- $this->setChannelElement($elementName, $content);
- }
- }
-
- /**
- * Genarate the actual RSS/ATOM file
- *
- * @access public
- * @return void
- */
- public function genarateFeed()
- {
- header("Content-type: text/xml");
-
- $this->printHead();
- $this->printChannels();
- $this->printItems();
- $this->printTale();
- }
-
- /**
- * Create a new FeedItem.
- *
- * @access public
- * @return object instance of FeedItem class
- */
- public function createNewItem()
- {
- $Item = new FeedItem($this->version);
- return $Item;
- }
-
- /**
- * Add a FeedItem to the main class
- *
- * @access public
- * @param object instance of FeedItem class
- * @return void
- */
- public function addItem($feedItem)
- {
- $this->items[] = $feedItem;
- }
-
-
- // Wrapper functions -------------------------------------------------------------------
-
- /**
- * Set the 'title' channel element
- *
- * @access public
- * @param srting value of 'title' channel tag
- * @return void
- */
- public function setTitle($title)
- {
- $this->setChannelElement('title', $title);
- }
-
- /**
- * Set the 'description' channel element
- *
- * @access public
- * @param srting value of 'description' channel tag
- * @return void
- */
- public function setDescription($desciption)
- {
- $this->setChannelElement('description', $desciption);
- }
-
- /**
- * Set the 'link' channel element
- *
- * @access public
- * @param srting value of 'link' channel tag
- * @return void
- */
- public function setLink($link)
- {
- $this->setChannelElement('link', $link);
- }
-
- /**
- * Set the 'image' channel element
- *
- * @access public
- * @param srting title of image
- * @param srting link url of the imahe
- * @param srting path url of the image
- * @return void
- */
- public function setImage($title, $link, $url)
- {
- $this->setChannelElement('image', array('title'=>$title, 'link'=>$link, 'url'=>$url));
- }
-
- /**
- * Set the 'about' channel element. Only for RSS 1.0
- *
- * @access public
- * @param srting value of 'about' channel tag
- * @return void
- */
- public function setChannelAbout($url)
- {
- $this->data['ChannelAbout'] = $url;
- }
-
- /**
- * Genarates an UUID
- * @author Anis uddin Ahmad <admin@ajaxray.com>
- * @param string an optional prefix
- * @return string the formated uuid
- */
- public function uuid($key = null, $prefix = '')
- {
- $key = ($key == null)? uniqid(rand()) : $key;
- $chars = md5($key);
- $uuid = substr($chars,0,8) . '-';
- $uuid .= substr($chars,8,4) . '-';
- $uuid .= substr($chars,12,4) . '-';
- $uuid .= substr($chars,16,4) . '-';
- $uuid .= substr($chars,20,12);
-
- return $prefix . $uuid;
- }
- // End # public functions ----------------------------------------------
-
- // Start # private functions ----------------------------------------------
-
- /**
- * Prints the xml and rss namespace
- *
- * @access private
- * @return void
- */
- private function printHead()
- {
- $out = '<?xml version="1.0" encoding="utf-8"?>' . "\n";
-
- if($this->version == RSS2)
- {
- $out .= '<rss version="2.0"
- xmlns:content="http://purl.org/rss/1.0/modules/content/"
- xmlns:wfw="http://wellformedweb.org/CommentAPI/"
- >' . PHP_EOL;
- }
- elseif($this->version == RSS1)
- {
- $out .= '<rdf:RDF
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns="http://purl.org/rss/1.0/"
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- >' . PHP_EOL;;
- }
- else if($this->version == ATOM)
- {
- $out .= '<feed xmlns="http://www.w3.org/2005/Atom">' . PHP_EOL;;
- }
- echo $out;
- }
-
- /**
- * Closes the open tags at the end of file
- *
- * @access private
- * @return void
- */
- private function printTale()
- {
- if($this->version == RSS2)
- {
- echo '</channel>' . PHP_EOL . '</rss>';
- }
- elseif($this->version == RSS1)
- {
- echo '</rdf:RDF>';
- }
- else if($this->version == ATOM)
- {
- echo '</feed>';
- }
-
- }
-
- /**
- * Creates a single node as xml format
- *
- * @access private
- * @param srting name of the tag
- * @param mixed tag value as string or array of nested tags in 'tagName' => 'tagValue' format
- * @param array Attributes(if any) in 'attrName' => 'attrValue' format
- * @return string formatted xml tag
- */
- private function makeNode($tagName, $tagContent, $attributes = null)
- {
- $nodeText = '';
- $attrText = '';
-
- if(is_array($attributes))
- {
- foreach ($attributes as $key => $value)
- {
- $attrText .= " $key=\"$value\" ";
- }
- }
-
- if(is_array($tagContent) && $this->version == RSS1)
- {
- $attrText = ' rdf:parseType="Resource"';
- }
-
-
- $attrText .= (in_array($tagName, $this->CDATAEncoding) && $this->version == ATOM)? ' type="html" ' : '';
- $nodeText .= (in_array($tagName, $this->CDATAEncoding))? "<{$tagName}{$attrText}><![CDATA[" : "<{$tagName}{$attrText}>";
-
- if(is_array($tagContent))
- {
- foreach ($tagContent as $key => $value)
- {
- $nodeText .= $this->makeNode($key, $value);
- }
- }
- else
- {
- $nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
- }
-
- $nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
-
- return $nodeText . PHP_EOL;
- }
-
- /**
- * @desc Print channels
- * @access private
- * @return void
- */
- private function printChannels()
- {
- //Start channel tag
- switch ($this->version)
- {
- case RSS2:
- echo '<channel>' . PHP_EOL;
- break;
- case RSS1:
- echo (isset($this->data['ChannelAbout']))? "<channel rdf:about=\"{$this->data['ChannelAbout']}\">" : "<channel rdf:about=\"{$this->channels['link']}\">";
- break;
- }
-
- //Print Items of channel
- foreach ($this->channels as $key => $value)
- {
- if($this->version == ATOM && $key == 'link')
- {
- // ATOM prints link element as href attribute
- echo $this->makeNode($key,'',array('href'=>$value));
- //Add the id for ATOM
- echo $this->makeNode('id',$this->uuid($value,'urn:uuid:'));
- }
- else
- {
- echo $this->makeNode($key, $value);
- }
-
- }
-
- //RSS 1.0 have special tag <rdf:Seq> with channel
- if($this->version == RSS1)
- {
- echo "<items>" . PHP_EOL . "<rdf:Seq>" . PHP_EOL;
- foreach ($this->items as $item)
- {
- $thisItems = $item->getElements();
- echo "<rdf:li resource=\"{$thisItems['link']['content']}\"/>" . PHP_EOL;
- }
- echo "</rdf:Seq>" . PHP_EOL . "</items>" . PHP_EOL . "</channel>" . PHP_EOL;
- }
- }
-
- /**
- * Prints formatted feed items
- *
- * @access private
- * @return void
- */
- private function printItems()
- {
- foreach ($this->items as $item)
- {
- $thisItems = $item->getElements();
-
- //the argument is printed as rdf:about attribute of item in rss 1.0
- echo $this->startItem($thisItems['link']['content']);
-
- foreach ($thisItems as $feedItem )
- {
- echo $this->makeNode($feedItem['name'], $feedItem['content'], $feedItem['attributes']);
- }
- echo $this->endItem();
- }
- }
-
- /**
- * Make the starting tag of channels
- *
- * @access private
- * @param srting The vale of about tag which is used for only RSS 1.0
- * @return void
- */
- private function startItem($about = false)
- {
- if($this->version == RSS2)
- {
- echo '<item>' . PHP_EOL;
- }
- elseif($this->version == RSS1)
- {
- if($about)
- {
- echo "<item rdf:about=\"$about\">" . PHP_EOL;
- }
- else
- {
- die('link element is not set .\n It\'s required for RSS 1.0 to be used as about attribute of item');
- }
- }
- else if($this->version == ATOM)
- {
- echo "<entry>" . PHP_EOL;
- }
- }
-
- /**
- * Closes feed item tag
- *
- * @access private
- * @return void
- */
- private function endItem()
- {
- if($this->version == RSS2 || $this->version == RSS1)
- {
- echo '</item>' . PHP_EOL;
- }
- else if($this->version == ATOM)
- {
- echo "</entry>" . PHP_EOL;
- }
- }
-
-
-
- // End # private functions ----------------------------------------------
-
- } // end of class FeedWriter
-
-// autoload classes
-function __autoload($class_name)
-{
- require_once $class_name . '.php';
-}
+