# Delete the FOI documents that the byAgencyID view returns for one agency.
#
# The view rows are fetched from the local CouchDB and passed through
# `xargs -L1`, which re-emits them with the JSON quoting removed. The loop
# then walks the whitespace-separated words of that output, pulls the
# `_id:` and `rev:` values out of each word, and issues one DELETE per
# document revision.
for line in $(curl "http://localhost:5984/disclosr-foidocuments/_design/app/_view/byAgencyID?reduce=false&keys=%5B\"5716ce0aacfe98f7d638b7a66b7f1040\"%5D&limit=600" | xargs -L1); do
    # echo "$line"
    # Lazy match up to the first comma that is not preceded by a backslash.
    # (Inside $(...) the backslashes reach grep unchanged, so the character
    # class is written as [^\\] here.)
    id=$(echo "$line" | grep -Po '_id:.*?[^\\],' | perl -pe 's/_id://; s/,$//')
    rev=$(echo "$line" | grep -Po 'rev:.*?[^\\],' | perl -pe 's/rev://; s/,$//')
    # A DELETE needs both the id and the revision; skip words that lack either.
    if [ -n "$id" ] && [ -n "$rev" ]; then
        echo "curl -X DELETE http://localhost:5984/disclosr-foidocuments/$id?rev=$rev"
        # Quoted so the '?' is never treated as a glob and the URL stays one word.
        curl -X DELETE "http://localhost:5984/disclosr-foidocuments/$id?rev=$rev"
    fi
done
<?php
// "About" page: emits the documents header titled "About", a static credit
// line, then the documents footer. The header/footer helpers are not defined
// here (presumably they come from template.inc.php).
include 'template.inc.php';
include_header_documents('About');
include_once '../include/common.inc.php';
?>
<h1>About</h1>
Written and managed by Alex Sadleir (maxious [at] lambdacomplex.org)
<?php include_footer_documents(); ?>
<?php
// Agency browser page.
//   ?id=<agency id>  -> prints every disclosure-log entry returned by the
//                       byAgencyID view for that agency.
//   (no id)          -> prints one link per agency that has entries, sorted
//                       by agency name, with the record count.
include('template.inc.php');
include_once('../include/common.inc.php');

// Build a lookup of agency document id => trimmed agency name.
$agenciesdb = $server->get_db('disclosr-agencies');
$idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');

// Only accept a plain string id; anything else (missing, array-valued) is
// treated as "no agency selected".
$agencyid = (isset($_REQUEST['id']) && is_string($_REQUEST['id'])) ? $_REQUEST['id'] : null;

// Use the agency name as the page title only when the id is actually known;
// otherwise fall back to the generic title instead of indexing a missing key.
$pagetitle = ($agencyid !== null && isset($idtoname[$agencyid])) ? $idtoname[$agencyid] : 'Entries by Agency';
include_header_documents($pagetitle);

$endkey = (isset($_REQUEST['end_key']) && is_string($_REQUEST['end_key'])) ? $_REQUEST['end_key'] : '9999-99-99';
?>
<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act
    in one place!
</div>
<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
<br>
<?php
try {
    if ($agencyid !== null) {
        $rows = $foidocsdb->get_view("app", "byAgencyID", $agencyid, false, false, false)->rows;
        foreach ($rows as $row) {
            //print_r($rows);
            echo displayLogEntry($row, $idtoname);
            // Remember the first and the most recent key seen.
            if (!isset($startkey)) {
                $startkey = $row->key;
            }
            $endkey = $row->key;
        }
    } else {
        $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;
        if ($rows) {
            // Name lookup that tolerates agency ids missing from $idtoname.
            $nameof = function ($row) use ($idtoname) {
                return isset($idtoname[$row->key]) ? $idtoname[$row->key] : '';
            };
            // Sort by agency name; a closure avoids declaring a global cmp()
            // function and the `global $idtoname` it needed.
            usort($rows, function ($a, $b) use ($nameof) {
                return strcmp($nameof($a), $nameof($b));
            });
            foreach ($rows as $row) {
                // The key goes into a URL and the name into HTML text, so
                // encode each for its context.
                echo '<a href="agency.php?id=' . urlencode($row->key) . '">'
                    . htmlspecialchars($nameof($row), ENT_QUOTES, 'UTF-8')
                    . " (" . $row->value . " records)</a> <br>\n";
            }
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
// $endkey may come straight from the request, so it must never reach the
// page unencoded. urlencode() output contains no HTML-special characters.
$endkeyparam = urlencode(is_scalar($endkey) ? (string) $endkey : json_encode($endkey));
echo "<a class='btn btn-large btn-primary' href='?end_key=$endkeyparam' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>";
include_footer_documents();
?>
<?php
// Charts page setup: emit the documents header titled "Charts", build the
// agency id => name lookup from the byCanonicalName view, and open the FOI
// documents database used by the chart sections further down the page.
include 'template.inc.php';
include_header_documents('Charts');
include_once '../include/common.inc.php';

$agenciesdb = $server->get_db('disclosr-agencies');
$idtoname = array();
$agencyrows = $agenciesdb->get_view('app', 'byCanonicalName')->rows;
foreach ($agencyrows as $row) {
    // Names are stored trimmed so they can be used directly as labels.
    $idtoname[$row->id] = trim($row->value->name);
}

$foidocsdb = $server->get_db('disclosr-foidocuments');
?>
<div class="foundation-header"> | <div class="foundation-header"> |
<h1><a href="about.php">Charts</a></h1> | <h1><a href="about.php">Charts</a></h1> |
<h4 class="subheader"></h4> | <h4 class="subheader"></h4> |
</div> | </div> |
<div id="bydate" style="width:1000px;height:300px;"></div> | <div id="bydate" style="width:1000px;height:300px;"></div> |
<div id="byagency" style="width:1000px;height:1400px;"></div> | <div id="byagency" style="width:1000px;height:1400px;"></div> |
<script id="source">
    // Chart 1 ("bydate"): disclosure-log entries per date, drawn with Flotr.
    // The data points are generated server-side by the embedded PHP below.
    window.onload = function () {
        $(document).ready(function () {
            var
                d1 = [],
                options1,
                o1,
                graph; // declared locally so it is no longer an implicit global
<?php
    try {
        $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true", null, false, false, true)->rows;
        // Re-key by date so the points can be emitted in key order.
        $dataValues = Array();
        foreach ($rows as $row) {
            $dataValues[$row->key] = $row->value;
        }
        ksort($dataValues);
        foreach ($dataValues as $key => $value) {
            // The leading '!' resets every field that the format does not
            // parse, so each point lands on midnight rather than inheriting
            // the clock time of the current request.
            $date = date_create_from_format('!Y-m-d', $key);
            // Keys that are not valid Y-m-d dates yield false; skip them
            // instead of handing false to date_format().
            if ($date !== false) {
                // Unix seconds with "000" appended => milliseconds for the JS time axis.
                echo "       d1.push([" . date_format($date, 'U') . "000, $value]);" . PHP_EOL;
            }
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
?>
            options1 = {
                xaxis: {
                    mode: 'time',
                    labelsAngle: 45
                },
                selection: {
                    mode: 'x'
                },
                HtmlText: false,
                title: 'Disclosure Log entries added by Date'
            };

            // Draw graph with default options, overwriting with passed options
            function drawGraph(opts) {
                // Clone the options, so the 'options' variable always keeps intact.
                o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
                // Return a new graph.
                return Flotr.draw(
                    document.getElementById("bydate"),
                    [ d1 ],
                    o1
                );
            }

            graph = drawGraph();

            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
                // Draw selected area
                graph = drawGraph({
                    xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
                    yaxis: { min: area.y1, max: area.y2 }
                });
            });

            // When graph is clicked, draw the graph with default area.
            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
                graph = drawGraph();
            });
        });
    };
var d2 = [];
var agencylabels = [];

// Hover text for a bar: the label stored at the bar's (floored) y index,
// followed by " = " and the bar's x value.
function agencytrackformatter(obj) {
    var label = agencylabels[Math.floor(obj.y)];
    return label + " = " + obj.x;
}

// Axis tick text: the label stored at the floored tick value, or an empty
// string when no label exists at that index.
function agencytickformatter(val, axis) {
    var label = agencylabels[Math.floor(val)];
    return label ? label : "";
}
<?php
    // Emit the data for the by-agency bar chart: one d2 point [count, index]
    // and one agencylabels entry per agency, ordered by ascending count.
    try {
        $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;
        // usort() expects a negative / zero / positive integer. The previous
        // comparator returned a boolean, which can never signal "less than".
        usort($rows, function ($a, $b) {
            if ($a->value == $b->value) {
                return 0;
            }
            return ($a->value < $b->value) ? -1 : 1;
        });
        $i = 0;
        foreach ($rows as $row) {
            // Fall back to the raw key when the agency id has no known name.
            $label = isset($idtoname[$row->key]) ? $idtoname[$row->key] : $row->key;
            echo "       d2.push([ $row->value,$i]);" . PHP_EOL;
            // json_encode() produces a correctly escaped JS literal (quotes,
            // backslashes, newlines, '<' and '>'), so names no longer need
            // their apostrophes stripped. The label stays wrapped in a
            // one-element array, as before.
            echo "       agencylabels.push(" . json_encode(array($label), JSON_HEX_TAG) . ");" . PHP_EOL;
            $i++;
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
?>
// Draw the graph
// Chart 2 ("byagency"): one horizontal bar per agency. Each d2 point is
// [record count, row index], so the count runs along the x axis and the
// row index selects the agency label on the y axis.
Flotr.draw(
    document.getElementById("byagency"),
    [d2],
    {
        title: "Disclosure Log entries by Agency",
        bars: {
            show: true,
            horizontal: true,
            shadowSize: 0,
            barWidth: 0.5
        },
        mouse: {
            track: true,
            relative: true,
            // Hover text is "<agency label> = <count>".
            trackFormatter: agencytrackformatter
        },
        yaxis: {
            minorTickFreq: 1,
            // Request as many ticks as there are agency labels.
            noTicks: agencylabels.length,
            showMinorLabels: true,
            // Replaces numeric tick values with the agency label at that index.
            tickFormatter: agencytickformatter
        },
        xaxis: {
            min: 0,
            autoscaleMargin: 1
        },
        legend: {
            show: false
        }
    }
);
</script>
<?php include_footer_documents(); // close the page with the documents footer ?>
import sys | import sys |
import os | import os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
from time import mktime | from time import mktime |
import feedparser | import feedparser |
import abc | import abc |
import unicodedata | import unicodedata |
import re | import re |
import dateutil | import dateutil |
from dateutil.parser import * | from dateutil.parser import * |
from datetime import * | from datetime import * |
import codecs | import codecs |
import difflib | import difflib |
from StringIO import StringIO | from StringIO import StringIO |
from pdfminer.pdfparser import PDFDocument, PDFParser | from pdfminer.pdfparser import PDFDocument, PDFParser |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf |
from pdfminer.pdfdevice import PDFDevice, TagExtractor | from pdfminer.pdfdevice import PDFDevice, TagExtractor |
from pdfminer.converter import TextConverter | from pdfminer.converter import TextConverter |
from pdfminer.cmapdb import CMapDB | from pdfminer.cmapdb import CMapDB |
from pdfminer.layout import LAParams | from pdfminer.layout import LAParams |
class GenericDisclogScraper(object):
    """Abstract base for disclosure-log scrapers.

    Subclasses implement doScrape(). The base class lazily resolves and
    caches the agency id and the disclosure-log URL.
    """
    __metaclass__ = abc.ABCMeta
    # Cached agency id; filled in by getAgencyID() on first use.
    agencyID = None
    # Cached disclosure-log URL; filled in by getURL() on first use.
    disclogURL = None

    def remove_control_chars(self, input):
        """Return input keeping only characters with code points 32..126."""
        return "".join(ch for ch in input if 32 <= ord(ch) < 127)

    def getAgencyID(self):
        """Return the disclosr agency id.

        When no id has been set, it is derived from the running script's
        file name (sys.argv[0]) with ".py" removed, then cached.
        """
        if self.agencyID is not None:
            return self.agencyID
        script_name = os.path.basename(sys.argv[0])
        self.agencyID = script_name.replace(".py", "")
        return self.agencyID

    def getURL(self):
        """Return the disclosure-log URL.

        When no URL has been set, it is read from the 'FOIDocumentsURL'
        field of the agency record in scrape.agencydb, then cached.
        """
        if self.disclogURL is not None:
            return self.disclogURL
        agency_record = scrape.agencydb.get(self.getAgencyID())
        self.disclogURL = agency_record['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """Run the scrape; concrete scrapers override this."""
        return
class GenericHTMLDisclogScraper(GenericDisclogScraper): | class GenericHTMLDisclogScraper(GenericDisclogScraper): |
def doScrape(self): | def doScrape(self): |
foidocsdb = scrape.couch['disclosr-foidocuments'] | foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb, | (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb, |
self.getURL(), "foidocuments", self.getAgencyID()) | self.getURL(), "foidocuments", self.getAgencyID()) |
content = rcontent | content = rcontent |
dochash = scrape.mkhash(content) | dochash = scrape.mkhash(content) |
doc = foidocsdb.get(dochash) | doc = foidocsdb.get(dochash) |
if doc is None: | if doc is None: |
print "saving " + dochash | print "saving " + dochash |
description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries" | description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries" |
last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL()) | last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL()) |
if last_attach != None: | if last_attach != None: |
html_diff = difflib.HtmlDiff() | html_diff = difflib.HtmlDiff() |
diff = html_diff.make_table(last_attach.read().split('\n'), | diff = html_diff.make_table(last_attach.read().split('\n'), |
content.split('\n')) | content.split('\n')) |
edate = date.today().strftime("%Y-%m-%d") | edate = date.today().strftime("%Y-%m-%d") |
doc = {'_id': dochash, 'agencyID': self.getAgencyID() | doc = {'_id': dochash, 'agencyID': self.getAgencyID() |
, 'url': self.getURL(), 'docID': dochash, | , 'url': self.getURL(), 'docID': dochash, |