scraper and sort order updatyes
scraper and sort order updatyes


Former-commit-id: c8bfc5c3ecbee616fa6dd8bfdd147bedf4d64646

  for line in `curl "http://localhost:5984/disclosr-foidocuments/_design/app/_view/byAgencyID?reduce=false&keys=%5B\"5716ce0aacfe98f7d638b7a66b7f1040\"%5D&limit=600" | xargs -L1`; do
  # echo $line
  id=`echo $line | grep -Po '_id:.*?[^\\\],' | perl -pe 's/_id://; s/^//; s/,$//'`
  rev=`echo $line | grep -Po 'rev:.*?[^\\\],'| perl -pe 's/rev://; s/^//; s/,$//'`
  if [ -n "$id" ]; then
  echo "curl -X DELETE http://localhost:5984/disclosr-foidocuments/$id?rev=$rev"
  curl -X DELETE http://localhost:5984/disclosr-foidocuments/$id?rev=$rev
  fi
  done;
 
<?php <?php
   
include('template.inc.php'); include('template.inc.php');
include_header_documents("About"); include_header_documents("About");
include_once('../include/common.inc.php'); include_once('../include/common.inc.php');
?> ?>
<h1>About</h1> <h1>About</h1>
  Written and managed by Alex Sadleir (maxious [at] lambdacomplex.org)
<?php <?php
include_footer_documents(); include_footer_documents();
?> ?>
   
<?php <?php
include('template.inc.php'); include('template.inc.php');
include_once('../include/common.inc.php'); include_once('../include/common.inc.php');
$agenciesdb = $server->get_db('disclosr-agencies'); $agenciesdb = $server->get_db('disclosr-agencies');
   
$idtoname = Array(); $idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
$idtoname[$row->id] = trim($row->value->name); $idtoname[$row->id] = trim($row->value->name);
} }
$foidocsdb = $server->get_db('disclosr-foidocuments'); $foidocsdb = $server->get_db('disclosr-foidocuments');
   
include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency')); include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency'));
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
?> ?>
<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act
in one place! in one place!
</div> </div>
<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a> <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
<br> <br>
<?php <?php
try { try {
if (isset($_REQUEST['id'])) { if (isset($_REQUEST['id'])) {
$rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows; $rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows;
foreach ($rows as $row) { foreach ($rows as $row) {
//print_r($rows); //print_r($rows);
echo displayLogEntry($row, $idtoname); echo displayLogEntry($row, $idtoname);
if (!isset($startkey)) if (!isset($startkey))
$startkey = $row->key; $startkey = $row->key;
$endkey = $row->key; $endkey = $row->key;
} }
} else { } else {
$rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows; $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;
if ($rows) { if ($rows) {
  function cmp($a, $b)
  {
  global $idtoname;
  return strcmp($idtoname[$a->key], $idtoname[$b->key]);
  }
  usort($rows, "cmp");
foreach ($rows as $row) { foreach ($rows as $row) {
echo '<a href="agency.php?id=' . $row->key . '">' . $idtoname[$row->key] . " (" . $row->value . " records)</a> <br>\n"; echo '<a href="agency.php?id=' . $row->key . '">' . $idtoname[$row->key] . " (" . $row->value . " records)</a> <br>\n";
} }
} }
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>"; echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>";
include_footer_documents(); include_footer_documents();
?> ?>
   
<?php <?php
include('template.inc.php'); include('template.inc.php');
include_header_documents("Charts"); include_header_documents("Charts");
include_once('../include/common.inc.php'); include_once('../include/common.inc.php');
$agenciesdb = $server->get_db('disclosr-agencies'); $agenciesdb = $server->get_db('disclosr-agencies');
   
$idtoname = Array(); $idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
$idtoname[$row->id] = trim($row->value->name); $idtoname[$row->id] = trim($row->value->name);
} }
$foidocsdb = $server->get_db('disclosr-foidocuments'); $foidocsdb = $server->get_db('disclosr-foidocuments');
   
?> ?>
<div class="foundation-header"> <div class="foundation-header">
<h1><a href="about.php">Charts</a></h1> <h1><a href="about.php">Charts</a></h1>
<h4 class="subheader"></h4> <h4 class="subheader"></h4>
</div> </div>
<div id="bydate" style="width:1000px;height:300px;"></div> <div id="bydate" style="width:1000px;height:300px;"></div>
<div id="byagency" style="width:1000px;height:1400px;"></div> <div id="byagency" style="width:1000px;height:1400px;"></div>
<script id="source"> <script id="source">
window.onload = function () { window.onload = function () {
$(document).ready(function () { $(document).ready(function () {
var var
d1 = [], d1 = [],
options1, options1,
o1; o1;
   
<?php <?php
try { try {
$rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows; $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
   
   
$dataValues = Array(); $dataValues = Array();
foreach ($rows as $row) { foreach ($rows as $row) {
$dataValues[$row->key] = $row->value; $dataValues[$row->key] = $row->value;
} }
$i = 0; $i = 0;
ksort($dataValues); ksort($dataValues);
foreach ($dataValues as $key => $value) { foreach ($dataValues as $key => $value) {
$date = date_create_from_format('Y-m-d', $key); $date = date_create_from_format('Y-m-d', $key);
if (date_format($date, 'U') != "") { if (date_format($date, 'U') != "") {
echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL; echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
// echo " emplabels.push('$key');" . PHP_EOL; // echo " emplabels.push('$key');" . PHP_EOL;
$i++; $i++;
} }
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
?> ?>
   
   
options1 = { options1 = {
xaxis: { xaxis: {
mode: 'time', mode: 'time',
labelsAngle: 45 labelsAngle: 45
}, },
selection: { selection: {
mode: 'x' mode: 'x'
}, },
HtmlText: false, HtmlText: false,
title: 'Disclosure Log entries added by Date' title: 'Disclosure Log entries added by Date'
}; };
   
// Draw graph with default options, overwriting with passed options // Draw graph with default options, overwriting with passed options
function drawGraph(opts) { function drawGraph(opts) {
   
// Clone the options, so the 'options' variable always keeps intact. // Clone the options, so the 'options' variable always keeps intact.
o1 = Flotr._.extend(Flotr._.clone(options1), opts || {}); o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
   
// Return a new graph. // Return a new graph.
return Flotr.draw( return Flotr.draw(
document.getElementById("bydate"), document.getElementById("bydate"),
[ d1 ], [ d1 ],
o1 o1
); );
} }
   
graph = drawGraph(); graph = drawGraph();
   
Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) { Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
// Draw selected area // Draw selected area
graph = drawGraph({ graph = drawGraph({
xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 }, xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
yaxis: { min: area.y1, max: area.y2 } yaxis: { min: area.y1, max: area.y2 }
}); });
}); });
   
// When graph is clicked, draw the graph with default area. // When graph is clicked, draw the graph with default area.
Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () { Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
graph = drawGraph(); graph = drawGraph();
}); });
   
}); });
}; };
   
var d2 = []; var d2 = [];
var agencylabels = []; var agencylabels = [];
function agencytrackformatter(obj) { function agencytrackformatter(obj) {
   
return agencylabels[Math.floor(obj.y)] + " = " + obj.x; return agencylabels[Math.floor(obj.y)] + " = " + obj.x;
   
} }
function agencytickformatter(val, axis) { function agencytickformatter(val, axis) {
if (agencylabels[Math.floor(val)]) { if (agencylabels[Math.floor(val)]) {
return (agencylabels[Math.floor(val)]) ; return (agencylabels[Math.floor(val)]) ;
   
} else { } else {
return ""; return "";
} }
} }
<?php <?php
try { try {
$rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows; $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
  function cmp($a, $b)
  {
  return $a->value > $b->value;
  }
  usort($rows, "cmp");
   
$dataValues = Array(); $dataValues = Array();
$i = 0; $i = 0;
foreach ($rows as $row) { foreach ($rows as $row) {
echo " d2.push([ $row->value,$i]);" . PHP_EOL; echo " d2.push([ $row->value,$i]);" . PHP_EOL;
echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL; echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
   
$i++; $i++;
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
?> ?>
// Draw the graph // Draw the graph
Flotr.draw( Flotr.draw(
document.getElementById("byagency"), document.getElementById("byagency"),
[d2], [d2],
{ {
title: "Disclosure Log entries by Agency", title: "Disclosure Log entries by Agency",
bars: { bars: {
show: true, show: true,
horizontal: true, horizontal: true,
shadowSize: 0, shadowSize: 0,
barWidth: 0.5 barWidth: 0.5
}, },
mouse: { mouse: {
track: true, track: true,
relative: true, relative: true,
trackFormatter: agencytrackformatter trackFormatter: agencytrackformatter
}, },
yaxis: { yaxis: {
minorTickFreq: 1, minorTickFreq: 1,
noTicks: agencylabels.length, noTicks: agencylabels.length,
showMinorLabels: true, showMinorLabels: true,
tickFormatter: agencytickformatter tickFormatter: agencytickformatter
}, },
xaxis: { xaxis: {
min: 0, min: 0,
autoscaleMargin: 1 autoscaleMargin: 1
}, },
legend: { legend: {
show: false show: false
} }
} }
); );
</script> </script>
   
<?php <?php
include_footer_documents(); include_footer_documents();
?> ?>
   
   
<?php <?php
include('template.inc.php'); include('template.inc.php');
include_header_documents(""); include_header_documents("");
include_once('../include/common.inc.php'); include_once('../include/common.inc.php');
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
$enddocid = (isset($_REQUEST['end_docid']) ? $_REQUEST['end_docid'] : null); $enddocid = (isset($_REQUEST['end_docid']) ? $_REQUEST['end_docid'] : null);
?> ?>
<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
one place! one place!
</div> </div>
<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a> <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
<br> <br>
<?php <?php
$agenciesdb = $server->get_db('disclosr-agencies'); $agenciesdb = $server->get_db('disclosr-agencies');
   
$idtoname = Array(); $idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
$idtoname[$row->id] = trim($row->value->name); $idtoname[$row->id] = trim($row->value->name);
} }
$foidocsdb = $server->get_db('disclosr-foidocuments'); $foidocsdb = $server->get_db('disclosr-foidocuments');
  //print_r($foidocsdb);
try { try {
$rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20, null, $enddocid)->rows; $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20, null, $enddocid)->rows;
if ($rows) { if ($rows) {
foreach ($rows as $key => $row) { foreach ($rows as $key => $row) {
echo displayLogEntry($row, $idtoname); echo displayLogEntry($row, $idtoname);
if (!isset($startkey)) if (!isset($startkey))
$startkey = $row->key; $startkey = $row->key;
$endkey = $row->key; $endkey = $row->key;
$enddocid = $row->value->_id; $enddocid = $row->value->_id;
} }
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey&amp;end_docid=$enddocid' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>"; echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey&amp;end_docid=$enddocid' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>";
include_footer_documents(); include_footer_documents();
?> ?>
   
<?php <?php
   
// Agency X updated Y, new files, diff of plain text/link text, // Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all // feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class // This is a minimum example of using the Universal Feed Generator Class
include("../lib/FeedWriter/FeedTypes.php"); include("../lib/FeedWriter/FeedTypes.php");
include_once('../include/common.inc.php'); include_once('../include/common.inc.php');
//Creating an instance of FeedWriter class. //Creating an instance of FeedWriter class.
$TestFeed = new RSS2FeedWriter(); $TestFeed = new RSS2FeedWriter();
//Setting the channel elements //Setting the channel elements
////Retriving informations from database ////Retriving informations from database
$idtoname = Array(); $idtoname = Array();
$agenciesdb = $server->get_db('disclosr-agencies'); $agenciesdb = $server->get_db('disclosr-agencies');
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
$idtoname[$row->id] = trim($row->value->name); $idtoname[$row->id] = trim($row->value->name);
} }
$foidocsdb = $server->get_db('disclosr-foidocuments'); $foidocsdb = $server->get_db('disclosr-foidocuments');
if (isset($_REQUEST['id'])) { if (isset($_REQUEST['id'])) {
$rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows; $rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows;
$title = $idtoname[$_REQUEST['id']]; $title = $idtoname[$_REQUEST['id']];
} else { } else {
$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00', 50), true)->rows; $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00', 50), true)->rows;
$title = 'All Agencies'; $title = 'All Agencies';
} }
//Use wrapper functions for common channelelements //Use wrapper functions for common channelelements
$TestFeed->setTitle('disclosurelo.gs Newest Entries - ' . $title); $TestFeed->setTitle('disclosurelo.gs Newest Entries - ' . $title);
$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php' . (isset($_REQUEST['id']) ? '?id=' . $_REQUEST['id'] : '')); $TestFeed->setLink('http://disclosurelo.gs/rss.xml.php' . (isset($_REQUEST['id']) ? '?id=' . $_REQUEST['id'] : ''));
$TestFeed->setDescription('disclosurelo.gs Newest Entries - ' . $title); $TestFeed->setDescription('disclosurelo.gs Newest Entries - ' . $title);
$TestFeed->setChannelElement('language', 'en-us'); $TestFeed->setChannelElement('language', 'en-us');
$TestFeed->setChannelElement('pubDate', date(DATE_RSS, time())); $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
   
   
//print_r($rows); //print_r($rows);
  $i =0;
foreach ($rows as $row) { foreach ($rows as $row) {
//Create an empty FeedItem //Create an empty FeedItem
$newItem = $TestFeed->createNewItem(); $newItem = $TestFeed->createNewItem();
//Add elements to the feed item //Add elements to the feed item
$newItem->setTitle($row->value->title); $newItem->setTitle(preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $row->value->title));
$newItem->setLink("http://disclosurelo.gs/view.php?id=" . $row->value->_id); $newItem->setLink("http://disclosurelo.gs/view.php?id=" . $row->value->_id);
$newItem->setDate(strtotime($row->value->date)); $newItem->setDate(strtotime($row->value->date));
$newItem->setDescription(displayLogEntry($row, $idtoname)); $newItem->setDescription(displayLogEntry($row, $idtoname));
$newItem->setAuthor($idtoname[$row->value->agencyID]); $newItem->setAuthor($idtoname[$row->value->agencyID]);
$newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=" . $row->value->_id, array('isPermaLink' => 'true')); $newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=" . $row->value->_id, array('isPermaLink' => 'true'));
//Now add the feed item //Now add the feed item
$TestFeed->addItem($newItem); $TestFeed->addItem($newItem);
  $i++;
  if ($i > 50) break;
} }
//OK. Everything is done. Now genarate the feed. //OK. Everything is done. Now genarate the feed.
$TestFeed->generateFeed(); $TestFeed->generateFeed();
?> ?>
   
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
import scrape import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
   
#http://www.doughellmann.com/PyMOTW/abc/ #http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
#def getTable(self,soup): def getTable(self,soup):
# return soup.find(id = "cphMain_C001_Col01").table return soup.findAll('table')[1]
def getColumnCount(self): def getColumnCount(self):
return 5 return 5
def getColumns(self,columns): def getColumns(self,columns):
(id, date, title, description,notes) = columns (id, date, title, description,notes) = columns
return (id, date, title, description, notes) return (id, date, title, description, notes)
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
import scrape import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import date from datetime import date
   
#http://www.doughellmann.com/PyMOTW/abc/ #http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
def getTable(self,soup): def getTable(self,soup):
return soup.find(id= "ctl00_MSO_ContentDiv").table return soup.find(class_ = "rgMasterTable")
   
def getColumns(self,columns): def getColumns(self,columns):
(id, title, description, notes) = columns (id, title, description, notes) = columns
return (id, title, title, description, notes) return (id, title, title, description, notes)
def getDate(self, content, entry, doc): def getDate(self, content, entry, doc):
edate = date.today().strftime("%Y-%m-%d") edate = date.today().strftime("%Y-%m-%d")
doc.update({'date': edate}) doc.update({'date': edate})
return return
def getColumnCount(self): def getColumnCount(self):
return 4 return 4
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
import scrape import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
   
#http://www.doughellmann.com/PyMOTW/abc/ #http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
#def getTable(self,soup): def getTable(self,soup):
# return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table return soup.find(id = "main").table
def getColumnCount(self): def getColumnCount(self):
return 4 return 4
def getColumns(self,columns): def getColumns(self,columns):
(date, title, description,notes) = columns (date, title, description,notes) = columns
return (title, date, title, description, notes) return (title, date, title, description, notes)
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
#RSS feed not detailed #RSS feed not detailed
   
#http://www.doughellmann.com/PyMOTW/abc/ #http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
  def getTable(self,soup):
  return soup.find(id = "page_content").table
def getColumns(self,columns): def getColumns(self,columns):
(id, date, title, description, notes) = columns (id, date, title, description, notes) = columns
return (id, date, title, description, notes) return (id, date, title, description, notes)
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
   
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
import scrape import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
   
#http://www.doughellmann.com/PyMOTW/abc/ #http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
def getTable(self,soup): # def getTable(self,soup):
return soup.find(id = "_ctl0__ctl0_MainContentPlaceHolder_MainContentPlaceHolder_ContentSpan").findAll("table")[3] # return soup.find(_class = "content").table
def getColumnCount(self): def getColumnCount(self):
return 5 return 5
def getColumns(self,columns): def getColumns(self,columns):
(id, date, title, description,notes) = columns (id, date, title, description,notes) = columns
return (id, date, title, description, notes) return (id, date, title, description, notes)
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
#RSS feed not detailed import dateutil
  from dateutil.parser import *
  from datetime import *
  import scrape
  from bs4 import BeautifulSoup
  class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
   
#http://www.doughellmann.com/PyMOTW/abc/ def __init__(self):
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): super(ScraperImplementation, self).__init__()
def getColumns(self,columns): def getTable(self, soup):
(id, date, title, description, notes) = columns return soup.find(id='content')
return (id, date, title, description, notes)  
  def getDescription(self,content, entry,doc):
  link = None
  links = []
  description = ""
  for atag in entry.find_all('a'):
  if atag.has_attr('href'):
  link = scrape.fullurl(self.getURL(), atag['href'])
  (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
  if htcontent != None:
  if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
  soup = BeautifulSoup(htcontent)
  row = soup.find(id="foidetails")
  if row == None:
  row = soup.find(id="content").table
  if row == None:
  row = soup.find(id="content")
  description = ''.join(row.stripped_strings)
  for atag in row.find_all("a"):
  if atag.has_attr('href'):
  links.append(scrape.fullurl(link, atag['href']))
   
  if links != []:
  doc.update({'links': links})
  if description != "":
  doc.update({ 'description': description})
   
  def getColumnCount(self):
  return 3
   
  def getColumns(self, columns):
  (id, title, date) = columns
  return (id, date, title, title, None)
   
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
<?php <?php
   
function include_header_documents($title) function include_header_documents($title)
{ {
header('X-UA-Compatible: IE=edge,chrome=1'); header('X-UA-Compatible: IE=edge,chrome=1');
?> ?>
<!doctype html> <!doctype html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ --> <!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7]> <!--[if lt IE 7]>
<html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]> <!--[if IE 7]>
<html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]> <!--[if IE 8]>
<html class="no-js lt-ie9" lang="en"> <![endif]--> <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline --> <!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--> <!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
   
<title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title> <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
<meta name="description" content=""> <meta name="description" content="">
   
<!-- Mobile viewport optimized: h5bp.com/viewport --> <!-- Mobile viewport optimized: h5bp.com/viewport -->
<meta name="viewport" content="width=device-width"> <meta name="viewport" content="width=device-width">
<link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php"/> <link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php"/>
<!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons --> <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
<meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8"/> <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8"/>
   
<!-- Le styles --> <!-- Le styles -->
<link href="css/bootstrap.min.css" rel="stylesheet"> <link href="css/bootstrap.min.css" rel="stylesheet">
<style type="text/css"> <style type="text/css">
body { body {
padding-top: 60px; padding-top: 60px;
padding-bottom: 40px; padding-bottom: 40px;
} }
   
.sidebar-nav { .sidebar-nav {
padding: 9px 0; padding: 9px 0;
} }
</style> </style>
<link href="css/bootstrap-responsive.min.css" rel="stylesheet"> <link href="css/bootstrap-responsive.min.css" rel="stylesheet">
   
<!-- HTML5 shim, for IE6-8 support of HTML5 elements --> <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
<!--[if lt IE 9]> <!--[if lt IE 9]>
<script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script> <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
<![endif]--> <![endif]-->
<!-- More ideas for your <head> here: h5bp.com/d/head-Tips --> <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
   
<!-- All JavaScript at the bottom, except this Modernizr build. <!-- All JavaScript at the bottom, except this Modernizr build.
Modernizr enables HTML5 elements & feature detects for optimal performance. Modernizr enables HTML5 elements & feature detects for optimal performance.
Create your own custom Modernizr build: www.modernizr.com/download/ Create your own custom Modernizr build: www.modernizr.com/download/
<script src="js/libs/modernizr-2.5.3.min.js"></script>--> <script src="js/libs/modernizr-2.5.3.min.js"></script>-->
<script src="js/jquery.js"></script> <script src="js/jquery.js"></script>
<script type="text/javascript" src="js/flotr2.min.js"></script> <script type="text/javascript" src="js/flotr2.min.js"></script>
   
</head> </head>
<body> <body>
<div class="navbar navbar-inverse navbar-fixed-top"> <div class="navbar navbar-inverse navbar-fixed-top">
<div class="navbar-inner"> <div class="navbar-inner">
<div class="container-fluid"> <div class="container-fluid">
<!--<a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> <!--<a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span> <span class="icon-bar"></span>
<span class="icon-bar"></span> <span class="icon-bar"></span>
<span class="icon-bar"></span> <span class="icon-bar"></span>
</a> --> </a> -->
<a class="brand" href="#">Australian Disclosure Logs</a> <a class="brand" href="#">Australian Disclosure Logs</a>
   
<div class="nav-collapse collapse"> <div class="nav-collapse collapse">
<p class="navbar-text pull-right"> <p class="navbar-text pull-right">
<small> <small>
Subsites on: Subsites on:
</small> </small>
<a href="http://orgs.disclosurelo.gs">Government Agencies</a> <a href="http://orgs.disclosurelo.gs">Government Agencies</a>
• <a href="http://lobbyists.disclosurelo.gs">Political Lobbyists</a> • <a href="http://lobbyists.disclosurelo.gs">Political Lobbyists</a>
• <a href="http://contracts.disclosurelo.gs">Government Contracts and Spending</a> • <a href="http://contracts.disclosurelo.gs">Government Contracts and Spending</a>
   
</p> </p>
<ul class="nav"> <ul class="nav">
<li><a href="agency.php">By Agency</a></li> <li><a href="agency.php">By Agency</a></li>
<li><a href="date.php">By Date</a></li> <!-- <li><a href="date.php">By Date</a></li> -->
<li><a href="disclogsList.php">List of Disclosure Logs</a></li> <li><a href="disclogsList.php">List of Disclosure Logs</a></li>
<li><a href="charts.php">Charts</a></li> <li><a href="charts.php">Charts</a></li>
<li><a href="about.php">About</a></li> <li><a href="about.php">About</a></li>
   
</ul> </ul>
</div> </div>
<!--/.nav-collapse --> <!--/.nav-collapse -->
</div> </div>
</div> </div>
</div> </div>
<div class="container"> <div class="container">
<?php <?php
} }
   
function include_footer_documents() function include_footer_documents()
{ {
global $ENV; global $ENV;
?> ?>
</div> <!-- /container --> </div> <!-- /container -->
<hr> <hr>
   
<footer> <footer>
<p>Not affiliated with or endorsed by any government agency.</p> <p>Not affiliated with or endorsed by any government agency.</p>
</footer> </footer>
<?php <?php
if ($ENV != "DEV") { if ($ENV != "DEV") {
echo "<script type='text/javascript'> echo "<script type='text/javascript'>
   
var _gaq = _gaq || []; var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-12341040-4']); _gaq.push(['_setAccount', 'UA-12341040-4']);
_gaq.push(['_setDomainName', 'disclosurelo.gs']); _gaq.push(['_setDomainName', 'disclosurelo.gs']);
_gaq.push(['_setAllowLinker', true]); _gaq.push(['_setAllowLinker', true]);
_gaq.push(['_trackPageview']); _gaq.push(['_trackPageview']);
   
(function() { (function() {
var ga = document.createElement('script'); var ga = document.createElement('script');
ga.type = 'text/javascript'; ga.type = 'text/javascript';
ga.async = true; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(ga, s); s.parentNode.insertBefore(ga, s);
})(); })();
   
</script>"; </script>";
} }
?> ?>
<!-- Le javascript <!-- Le javascript
================================================== --> ================================================== -->
<!-- Placed at the end of the document so the pages load faster --> <!-- Placed at the end of the document so the pages load faster -->
<!-- <!--
<script src="js/bootstrap-transition.js"></script> <script src="js/bootstrap-transition.js"></script>
<script src="js/bootstrap-alert.js"></script> <script src="js/bootstrap-alert.js"></script>
<script src="js/bootstrap-modal.js"></script> <script src="js/bootstrap-modal.js"></script>
<script src="js/bootstrap-dropdown.js"></script> <script src="js/bootstrap-dropdown.js"></script>
<script src="js/bootstrap-scrollspy.js"></script> <script src="js/bootstrap-scrollspy.js"></script>
<script src="js/bootstrap-tab.js"></script> <script src="js/bootstrap-tab.js"></script>
<script src="js/bootstrap-tooltip.js"></script> <script src="js/bootstrap-tooltip.js"></script>
<script src="js/bootstrap-popover.js"></script> <script src="js/bootstrap-popover.js"></script>
<script src="js/bootstrap-button.js"></script> <script src="js/bootstrap-button.js"></script>
<script src="js/bootstrap-collapse.js"></script> <script src="js/bootstrap-collapse.js"></script>
<script src="js/bootstrap-carousel.js"></script> <script src="js/bootstrap-carousel.js"></script>
<script src="js/bootstrap-typeahead.js"></script>--> <script src="js/bootstrap-typeahead.js"></script>-->
   
   
</body> </body>
</html> </html>
<?php <?php
} }
   
function truncate($string, $length, $stopanywhere = false) function truncate($string, $length, $stopanywhere = false)
{ {
//truncates a string to a certain char length, stopping on a word if not specified otherwise. //truncates a string to a certain char length, stopping on a word if not specified otherwise.
if (strlen($string) > $length) { if (strlen($string) > $length) {
//limit hit! //limit hit!
$string = substr($string, 0, ($length - 3)); $string = substr($string, 0, ($length - 3));
if ($stopanywhere) { if ($stopanywhere) {
//stop anywhere //stop anywhere
$string .= '...'; $string .= '...';
} else { } else {
//stop on a word. //stop on a word.
$string = substr($string, 0, strrpos($string, ' ')) . '...'; $string = substr($string, 0, strrpos($string, ' ')) . '...';
} }
} }
return $string; return $string;
} }
   
function displayLogEntry($row, $idtoname) function displayLogEntry($row, $idtoname)
{ {
$result = ""; $result = "";
$result .= '<div itemscope itemtype="http://schema.org/Article">'; $result .= '<div itemscope itemtype="http://schema.org/Article">';
$result .= '<h2><a href="http://disclosurelo.gs/view.php?id=' . $row->value->_id . '"> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>"; $result .= '<h2><a href="http://disclosurelo.gs/view.php?id=' . $row->value->_id . '"> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>";
$result .= ' (<span itemprop="author publisher creator">' . $idtoname[$row->value->agencyID] . '</span>)</a></h2>'; $result .= ' (<span itemprop="author publisher creator">' . $idtoname[$row->value->agencyID] . '</span>)</a></h2>';
$result .= "<p itemprop='description articleBody text'> Title: " . $row->value->title . "<br/>"; $result .= "<p itemprop='description articleBody text'> Title: " . $row->value->title . "<br/>";
if (isset($row->value->description)) { if (isset($row->value->description)) {
$result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "", trim($row->value->description))); $result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "", trim($row->value->description)));
} }
if (isset($row->value->notes)) { if (isset($row->value->notes)) {
$result .= " <br>Note: " . $row->value->notes; $result .= " <br>Note: " . $row->value->notes;
} }
$result .= "</p>"; $result .= "</p>";
   
if (isset($row->value->links)) { if (isset($row->value->links)) {
$result .= '<h3>Links/Documents</h3><ul itemprop="associatedMedia">'; $result .= '<h3>Links/Documents</h3><ul itemprop="associatedMedia">';
foreach ($row->value->links as $link) { foreach ($row->value->links as $link) {
$result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href="' . htmlspecialchars($link) . '" itemprop="url contentURL">' . htmlspecialchars($link) . "</a></li>"; $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href="' . htmlspecialchars($link) . '" itemprop="url contentURL">' . htmlspecialchars($link) . "</a></li>";
} }
   
$result .= "</ul>"; $result .= "</ul>";
} }
$result .= "<small><A itemprop='url' href='" . $row->value->url . "'>View original source...</a> ID: " . strip_tags($row->value->docID) . "</small>"; $result .= "<small><A itemprop='url' href='" . $row->value->url . "'>View original source...</a> ID: " . strip_tags($row->value->docID) . "</small>";
$result .= "</div>\n"; $result .= "</div>\n";
return $result; return $result;
} }