Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr
Conflicts:
documents/genericScrapers.py
Former-commit-id: 492c708ed8d0d1b30bb7c8f672b9e101a7d44f89
--- a/documents/404.html
+++ b/documents/404.html
@@ -1,44 +1,156 @@
<!doctype html>
<html lang="en">
<head>
- <meta charset="utf-8">
- <title>Page Not Found :(</title>
- <style>
- ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
- ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
- html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
- html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
- body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
- h1 { margin: 0 10px; font-size: 50px; text-align: center; }
- h1 span { color: #bbb; }
- h3 { margin: 1.5em 0 0.5em; }
- p { margin: 1em 0; }
- ul { padding: 0 0 0 40px; margin: 1em 0; }
- .container { max-width: 380px; _width: 380px; margin: 0 auto; }
- /* google search */
- #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
- #goog-fixurl form { margin: 0; }
- #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
- #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
- #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
- #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
- #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
- input::-moz-focus-inner { padding: 0; border: 0; }
- </style>
+ <meta charset="utf-8">
+ <title>Page Not Found :(</title>
+ <style>
+ ::-moz-selection {
+ background: #fe57a1;
+ color: #fff;
+ text-shadow: none;
+ }
+
+ ::selection {
+ background: #fe57a1;
+ color: #fff;
+ text-shadow: none;
+ }
+
+ html {
+ padding: 30px 10px;
+ font-size: 20px;
+ line-height: 1.4;
+ color: #737373;
+ background: #f0f0f0;
+ -webkit-text-size-adjust: 100%;
+ -ms-text-size-adjust: 100%;
+ }
+
+ html, input {
+ font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+ }
+
+ body {
+ max-width: 500px;
+ _width: 500px;
+ padding: 30px 20px 50px;
+ border: 1px solid #b3b3b3;
+ border-radius: 4px;
+ margin: 0 auto;
+ box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff;
+ background: #fcfcfc;
+ }
+
+ h1 {
+ margin: 0 10px;
+ font-size: 50px;
+ text-align: center;
+ }
+
+ h1 span {
+ color: #bbb;
+ }
+
+ h3 {
+ margin: 1.5em 0 0.5em;
+ }
+
+ p {
+ margin: 1em 0;
+ }
+
+ ul {
+ padding: 0 0 0 40px;
+ margin: 1em 0;
+ }
+
+ .container {
+ max-width: 380px;
+ _width: 380px;
+ margin: 0 auto;
+ }
+
+ /* google search */
+ #goog-fixurl ul {
+ list-style: none;
+ padding: 0;
+ margin: 0;
+ }
+
+ #goog-fixurl form {
+ margin: 0;
+ }
+
+ #goog-wm-qt, #goog-wm-sb {
+ border: 1px solid #bbb;
+ font-size: 16px;
+ line-height: normal;
+ vertical-align: top;
+ color: #444;
+ border-radius: 2px;
+ }
+
+ #goog-wm-qt {
+ width: 220px;
+ height: 20px;
+ padding: 5px;
+ margin: 5px 10px 0 0;
+ box-shadow: inset 0 1px 1px #ccc;
+ }
+
+ #goog-wm-sb {
+ display: inline-block;
+ height: 32px;
+ padding: 0 10px;
+ margin: 5px 0 0;
+ white-space: nowrap;
+ cursor: pointer;
+ background-color: #f5f5f5;
+ background-image: -webkit-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ background-image: -moz-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ background-image: -ms-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ background-image: -o-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ -webkit-appearance: none;
+ -moz-appearance: none;
+ appearance: none;
+ *overflow: visible;
+ *display: inline;
+ *zoom: 1;
+ }
+
+ #goog-wm-sb:hover, #goog-wm-sb:focus {
+ border-color: #aaa;
+ box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1);
+ background-color: #f8f8f8;
+ }
+
+ #goog-wm-qt:focus, #goog-wm-sb:focus {
+ border-color: #105cb6;
+ outline: 0;
+ color: #222;
+ }
+
+ input::-moz-focus-inner {
+ padding: 0;
+ border: 0;
+ }
+ </style>
</head>
<body>
- <div class="container">
+<div class="container">
<h1>Not found <span>:(</span></h1>
+
<p>Sorry, but the page you were trying to view does not exist.</p>
+
<p>It looks like this was the result of either:</p>
<ul>
- <li>a mistyped address</li>
- <li>an out-of-date link</li>
+ <li>a mistyped address</li>
+ <li>an out-of-date link</li>
</ul>
<script>
- var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+ var GOOG_FIXURL_LANG = (navigator.language || '').slice(0, 2), GOOG_FIXURL_SITE = location.host;
</script>
<script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
- </div>
+</div>
--- a/documents/agency.php
+++ b/documents/agency.php
@@ -12,8 +12,11 @@
include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency'));
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+ <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act
+ in one place!
+ </div>
+ <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+ <br>
<?php
try {
if ($_REQUEST['id']) {
--- a/documents/charts.php
+++ b/documents/charts.php
@@ -18,144 +18,145 @@
<div id="bydate" style="width:1000px;height:300px;"></div>
<div id="byagency" style="width:1200px;height:300px;"></div>
<script id="source">
- window.onload = function() {
- $(document).ready(function() {
- var
- d1 = [],
- options1,
- o1;
+ window.onload = function () {
+ $(document).ready(function () {
+ var
+ d1 = [],
+ options1,
+ o1;
-<?php
- try {
- $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
+ <?php
+ try {
+ $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
- $dataValues = Array();
- foreach ($rows as $row) {
- $dataValues[$row->key] = $row->value;
- }
- $i = 0;
- ksort($dataValues);
- foreach ($dataValues as $key => $value) {
-$date = date_create_from_format('Y-m-d', $key);
-if (date_format($date, 'U') != "") {
- echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
-// echo " emplabels.push('$key');" . PHP_EOL;
- $i++;
-}
- }
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
- ?>
+ $dataValues = Array();
+ foreach ($rows as $row) {
+ $dataValues[$row->key] = $row->value;
+ }
+ $i = 0;
+ ksort($dataValues);
+ foreach ($dataValues as $key => $value) {
+ $date = date_create_from_format('Y-m-d', $key);
+ if (date_format($date, 'U') != "") {
+ echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
+ // echo " emplabels.push('$key');" . PHP_EOL;
+ $i++;
+ }
+ }
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
+ }
+ ?>
-
- options1 = {
- xaxis : {
- mode : 'time',
- labelsAngle : 45
- },
- selection : {
- mode : 'x'
- },
- HtmlText : false,
- title : 'Time'
- };
-
- // Draw graph with default options, overwriting with passed options
- function drawGraph (opts) {
+ options1 = {
+ xaxis: {
+ mode: 'time',
+ labelsAngle: 45
+ },
+ selection: {
+ mode: 'x'
+ },
+ HtmlText: false,
+ title: 'Time'
+ };
- // Clone the options, so the 'options' variable always keeps intact.
- o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
+ // Draw graph with default options, overwriting with passed options
+ function drawGraph(opts) {
- // Return a new graph.
- return Flotr.draw(
- document.getElementById("bydate"),
- [ d1 ],
- o1
- );
- }
+ // Clone the options, so the 'options' variable always keeps intact.
+ o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
- graph = drawGraph();
-
- Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function(area){
- // Draw selected area
- graph = drawGraph({
- xaxis : { min : area.x1, max : area.x2, mode : 'time', labelsAngle : 45 },
- yaxis : { min : area.y1, max : area.y2 }
- });
- });
-
- // When graph is clicked, draw the graph with default area.
- Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () { graph = drawGraph(); });
+ // Return a new graph.
+ return Flotr.draw(
+ document.getElementById("bydate"),
+ [ d1 ],
+ o1
+ );
+ }
+
+ graph = drawGraph();
+
+ Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
+ // Draw selected area
+ graph = drawGraph({
+ xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
+ yaxis: { min: area.y1, max: area.y2 }
+ });
+ });
+
+ // When graph is clicked, draw the graph with default area.
+ Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
+ graph = drawGraph();
+ });
});
-};
+ };
-var d2 = [];
-var agencylabels = [];
-function agencytrackformatter(obj) {
-
- return agencylabels[Math.floor(obj.x)] +" = "+obj.y;
-
- }
- function agencytickformatter(val, axis) {
- if (agencylabels[Math.floor(val)]) {
- return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">'+(agencylabels[Math.floor(val)])+"</b>";
-
- } else {
- return "";
- }
- }
-<?php
- try {
- $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
+ var d2 = [];
+ var agencylabels = [];
+ function agencytrackformatter(obj) {
+
+ return agencylabels[Math.floor(obj.x)] + " = " + obj.y;
+
+ }
+ function agencytickformatter(val, axis) {
+ if (agencylabels[Math.floor(val)]) {
+ return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">' + (agencylabels[Math.floor(val)]) + "</b>";
+
+ } else {
+ return "";
+ }
+ }
+ <?php
+ try {
+ $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
- $dataValues = Array();
- $i = 0;
- foreach ($rows as $row) {
- echo " d2.push([".$i.", $row->value]);" . PHP_EOL;
- echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
-
- $i++;
+ $dataValues = Array();
+ $i = 0;
+ foreach ($rows as $row) {
+ echo " d2.push([".$i.", $row->value]);" . PHP_EOL;
+ echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
+
+ $i++;
+ }
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
}
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
- ?>
- // Draw the graph
- Flotr.draw(
- document.getElementById("byagency"),
- [d2],
- {
- bars : {
- show : true,
- horizontal : false,
- shadowSize : 0,
- barWidth : 0.5
- },
-mouse : {
- track : true,
- relative : true,
- trackFormatter: agencytrackformatter
- },
- yaxis : {
- min : 0,
- autoscaleMargin : 1
- },
- xaxis: {
- minorTickFreq: 1,
- noTicks: agencylabels.length,
- showMinorLabels: true,
- tickFormatter: agencytickformatter
- },
- legend: {
- show: false
- }
- }
- );
+ ?>
+ // Draw the graph
+ Flotr.draw(
+ document.getElementById("byagency"),
+ [d2],
+ {
+ bars: {
+ show: true,
+ horizontal: false,
+ shadowSize: 0,
+ barWidth: 0.5
+ },
+ mouse: {
+ track: true,
+ relative: true,
+ trackFormatter: agencytrackformatter
+ },
+ yaxis: {
+ min: 0,
+ autoscaleMargin: 1
+ },
+ xaxis: {
+ minorTickFreq: 1,
+ noTicks: agencylabels.length,
+ showMinorLabels: true,
+ tickFormatter: agencytickformatter
+ },
+ legend: {
+ show: false
+ }
+ }
+ );
</script>
<?php
--- a/documents/crossdomain.xml
+++ b/documents/crossdomain.xml
@@ -3,24 +3,23 @@
<cross-domain-policy>
-<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+ <!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
-<!-- Most restrictive policy: -->
- <site-control permitted-cross-domain-policies="none"/>
+ <!-- Most restrictive policy: -->
+ <site-control permitted-cross-domain-policies="none"/>
-
-<!-- Least restrictive policy: -->
-<!--
- <site-control permitted-cross-domain-policies="all"/>
- <allow-access-from domain="*" to-ports="*" secure="false"/>
- <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
--->
-<!--
- If you host a crossdomain.xml file with allow-access-from domain="*"
- and don’t understand all of the points described here, you probably
- have a nasty security vulnerability. ~ simon willison
--->
+ <!-- Least restrictive policy: -->
+ <!--
+ <site-control permitted-cross-domain-policies="all"/>
+ <allow-access-from domain="*" to-ports="*" secure="false"/>
+ <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+ -->
+ <!--
+ If you host a crossdomain.xml file with allow-access-from domain="*"
+ and don’t understand all of the points described here, you probably
+ have a nasty security vulnerability. ~ simon willison
+ -->
</cross-domain-policy>
--- /dev/null
+++ b/documents/datagov.py
@@ -1,1 +1,48 @@
+import sys, os
+import scrape
+from bs4 import BeautifulSoup
+
+
+listurl = "http://data.gov.au/data/"
+(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
+ listurl, "data", "AGIMO")
+soup = BeautifulSoup(datasetlisthtml)
+for atag in soup.find_all(class_='result-title'):
+ if atag.has_key('href'):
+ url = scrape.fullurl(listurl, atag['href'])
+ (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+ url, "data", "AGIMO")
+ hash = scrape.mkhash(scrape.canonurl(url))
+ doc = scrape.docsdb.get(hash)
+ if "metadata" not in doc.keys():
+ doc['metadata'] = {}
+ soup = BeautifulSoup(html)
+ for metatag in soup.find_all('meta'):
+ if metatag.has_key('name'):
+ doc['metadata'][metatag['name']] = metatag['content']
+ for list in soup.find_all('dl'):
+ last_title = ""
+ for child in list.children:
+ if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+ if child.name == 'dt' and child.string != None:
+ last_title = child.string.strip()
+ if child.name == 'dd':
+ #print last_title
+ if last_title == "Download":
+ for item in child.find_all("li"):
+ link = item.find("a")
+ format = item.find(property="dc:format")
+ linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
+ "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+ doc['metadata'][last_title] = linkobj
+
+ else:
+ atags = child.find_all('a')
+ if len(atags) < 2:
+ [s.extract() for s in child(class_='viewAll')]
+ doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+ else:
+ doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
+ print doc['metadata']
+ sys.exit("ggg")
--- a/documents/date.php
+++ b/documents/date.php
@@ -5,8 +5,11 @@
include_once('../include/common.inc.php');
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
+ one place!
+</div>
+<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+<br>
<?php
/*$agenciesdb = $server->get_db('disclosr-agencies');
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -34,10 +34,10 @@
if (isset($row->value->FOIDocumentsURL)) {
$disclogs++;
echo '<a href="' . $row->value->FOIDocumentsURL . '">'
- . $row->value->FOIDocumentsURL . '</a>';
+ . $row->value->FOIDocumentsURL . '</a>';
if ($ENV == "DEV")
echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
- . 'view local copy</a>)</small>';
+ . 'view local copy</a>)</small>';
} else {
echo "<font color='red'><abbr title='No'>✘</abbr></font>";
}
@@ -49,11 +49,11 @@
} else if (file_exists("./scrapers/" . $row->id . '.txt')) {
if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
- $yellow++;
+ $yellow++;
} else {
echo file_get_contents("./scrapers/" . $row->id . '.txt');
- echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
- $orange++;
+ echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
+ $orange++;
}
} else {
echo "<font color='red'><abbr title='No'>✘</abbr></font>";
@@ -69,7 +69,7 @@
}
echo "</table>";
echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; "
- . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
+ . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
include_footer_documents();
?>
--- a/documents/exportAll.csv.php
+++ b/documents/exportAll.csv.php
@@ -39,7 +39,7 @@
if (is_array($agencyArray[$fieldName])) {
$row[] = implode(";", $agencyArray[$fieldName]);
} else {
- $row[] = str_replace(Array("\n", '"', "\t"),"",$agencyArray[$fieldName]);
+ $row[] = str_replace(Array("\n", '"', "\t"), "", $agencyArray[$fieldName]);
}
} else {
$row[] = "";
--- /dev/null
+++ b/documents/gazette.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,5 +1,6 @@
import sys
import os
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
@@ -51,12 +52,12 @@
""" do the scraping """
return
+
class GenericHTMLDisclogScraper(GenericDisclogScraper):
-
def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
- self.getURL(), "foidocuments", self.getAgencyID())
+ self.getURL(), "foidocuments", self.getAgencyID())
content = rcontent
dochash = scrape.mkhash(content)
doc = foidocsdb.get(dochash)
@@ -66,33 +67,32 @@
last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
if last_attach != None:
html_diff = difflib.HtmlDiff()
- description = description + "\nChanges: "
- description = description + html_diff.make_table(last_attach.read().split('\n'),
- content.split('\n'))
+ diff = html_diff.make_table(last_attach.read().split('\n'),
+ content.split('\n'))
edate = date.today().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
, 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description), "diff": diff}
foidocsdb.save(doc)
else:
print "already saved"
+
class GenericPDFDisclogScraper(GenericDisclogScraper):
-
def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
- self.getURL(), "foidocuments", self.getAgencyID())
+ self.getURL(), "foidocuments", self.getAgencyID())
laparams = LAParams()
rsrcmgr = PDFResourceManager(caching=True)
outfp = StringIO()
device = TextConverter(rsrcmgr, outfp, codec='utf-8',
- laparams=laparams)
+ laparams=laparams)
fp = StringIO()
fp.write(content)
process_pdf(rsrcmgr, device, fp, set(), caching=True,
- check_extractable=True)
+ check_extractable=True)
description = outfp.getvalue()
fp.close()
device.close()
@@ -104,18 +104,17 @@
edate = date.today().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
, 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
foidocsdb.save(doc)
else:
print "already saved"
class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb
- , self.getURL(), "foidocuments", self.getAgencyID())
+ , self.getURL(), "foidocuments", self.getAgencyID())
mydoc = zipfile.ZipFile(file)
xmlcontent = mydoc.read('word/document.xml')
document = etree.fromstring(xmlcontent)
@@ -125,7 +124,7 @@
newparatextlist = []
for paratext in paratextlist:
newparatextlist.append(paratext.encode("utf-8"))
-        ## Print our document's text with two newlines under each paragraph
+            ## Print our document's text with two newlines under each paragraph
description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
dochash = scrape.mkhash(description)
doc = foidocsdb.get(dochash)
@@ -134,42 +133,42 @@
print "saving " + dochash
edate = time().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
- , 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ , 'url': self.getURL(), 'docID': dochash,
+ "date": edate, "title": "Disclosure Log Updated", "description": description}
foidocsdb.save(doc)
else:
print "already saved"
class GenericRSSDisclogScraper(GenericDisclogScraper):
-
- def doScrape(self):
- foidocsdb = scrape.couch['disclosr-foidocuments']
- (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
- self.getURL(), "foidocuments", self.getAgencyID())
- feed = feedparser.parse(content)
- for entry in feed.entries:
- #print entry
- print entry.id
- dochash = scrape.mkhash(entry.id)
- doc = foidocsdb.get(dochash)
- #print doc
- if doc is None:
- print "saving " + dochash
- edate = datetime.fromtimestamp(
- mktime(entry.published_parsed)).strftime("%Y-%m-%d")
- doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
- 'url': entry.link, 'docID': entry.id,
- "date": edate, "title": entry.title}
- self.getDescription(entry, entry, doc)
- foidocsdb.save(doc)
- else:
- print "already saved"
-
- def getDescription(self, content, entry, doc):
- """ get description from rss entry"""
- doc.update({'description': content.summary})
- return
+ def doScrape(self):
+ foidocsdb = scrape.couch['disclosr-foidocuments']
+ (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+ self.getURL(), "foidocuments", self.getAgencyID())
+ feed = feedparser.parse(content)
+ for entry in feed.entries:
+ #print entry
+ print entry.id
+ dochash = scrape.mkhash(entry.id)
+ doc = foidocsdb.get(dochash)
+ #print doc
+ if doc is None:
+ print "saving " + dochash
+ edate = datetime.fromtimestamp(
+ mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+ doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+ 'url': entry.link, 'docID': entry.id,
+ "date": edate, "title": entry.title}
+ self.getDescription(entry, entry, doc)
+ foidocsdb.save(doc)
+ else:
+ print "already saved"
+
+ def getDescription(self, content, entry, doc):
+ """ get description from rss entry"""
+ doc.update({'description': content.summary})
+
+ return
class GenericOAICDisclogScraper(GenericDisclogScraper):
@@ -187,7 +186,7 @@
""" get description from rss entry"""
descriptiontxt = ""
for string in content.stripped_strings:
- descriptiontxt = descriptiontxt + " \n" + string
+ descriptiontxt = descriptiontxt + " \n" + string
doc.update({'description': descriptiontxt})
def getTitle(self, content, entry, doc):
@@ -202,7 +201,7 @@
def getDate(self, content, entry, doc):
date = ''.join(content.stripped_strings).strip()
(a, b, c) = date.partition("(")
- date = self.remove_control_chars(a.replace("Octber", "October"))
+ date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
print date
edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
@@ -215,7 +214,7 @@
if atag.has_key('href'):
links.append(scrape.fullurl(content, atag['href']))
if links != []:
- doc.update({'links': links})
+ doc.update({'links': links})
return
def doScrape(self):
@@ -232,7 +231,7 @@
columns = row.find_all('td')
if len(columns) is self.getColumnCount():
(id, date, title,
- description, notes) = self.getColumns(columns)
+ description, notes) = self.getColumns(columns)
print self.remove_control_chars(
''.join(id.stripped_strings))
if id.string is None:
@@ -248,27 +247,29 @@
if doc is None:
print "saving " + dochash
doc = {'_id': dochash,
- 'agencyID': self.getAgencyID(),
- 'url': self.getURL(),
- 'docID': (''.join(id.stripped_strings))}
+ 'agencyID': self.getAgencyID(),
+ 'url': self.getURL(),
+ 'docID': (''.join(id.stripped_strings))}
self.getLinks(self.getURL(), row, doc)
self.getTitle(title, row, doc)
self.getDate(date, row, doc)
self.getDescription(description, row, doc)
if notes is not None:
- doc.update({ 'notes': (
+ doc.update({'notes': (
''.join(notes.stripped_strings))})
- badtitles = ['-','Summary of FOI Request'
- , 'FOI request(in summary form)'
- , 'Summary of FOI request received by the ASC',
-'Summary of FOI request received by agency/minister',
-'Description of Documents Requested','FOI request',
-'Description of FOI Request','Summary of request','Description','Summary',
-'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of FOI Request',"FOI request",'Results 1 to 67 of 67']
+ badtitles = ['-', 'Summary of FOI Request'
+ , 'FOI request(in summary form)'
+ , 'Summary of FOI request received by the ASC',
+ 'Summary of FOI request received by agency/minister',
+ 'Description of Documents Requested', 'FOI request',
+ 'Description of FOI Request', 'Summary of request', 'Description', 'Summary',
+ 'Summary of FOIrequest received by agency/minister',
+ 'Summary of FOI request received', 'Description of FOI Request',
+ "FOI request", 'Results 1 to 67 of 67']
if doc['title'] not in badtitles\
and doc['description'] != '':
- print "saving"
- foidocsdb.save(doc)
+ print "saving"
+ foidocsdb.save(doc)
else:
print "already saved " + dochash
--- a/documents/index.php
+++ b/documents/index.php
@@ -5,8 +5,11 @@
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
$enddocid = (isset($_REQUEST['end_docid']) ? $_REQUEST['end_docid'] : null);
?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
+ one place!
+</div>
+<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+<br>
<?php
$agenciesdb = $server->get_db('disclosr-agencies');
@@ -16,7 +19,7 @@
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
- $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20,null, $enddocid)->rows;
+ $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20, null, $enddocid)->rows;
if ($rows) {
foreach ($rows as $key => $row) {
echo displayLogEntry($row, $idtoname);
--- a/documents/redirect.php
+++ b/documents/redirect.php
@@ -1,18 +1,18 @@
<?php
-$subdomain = str_replace('disclo.gs','',$_SERVER['SERVER_NAME']);
+$subdomain = str_replace('disclo.gs', '', $_SERVER['SERVER_NAME']);
$script = $_SERVER['REQUEST_URI'];
if ($script == '/google676a414ad086cefb.html') {
- echo 'google-site-verification: google676a414ad086cefb.html';
- exit();
+ echo 'google-site-verification: google676a414ad086cefb.html';
+ exit();
}
if ($script == '/googlebcce906c6b666bb8.html') {
- echo 'google-site-verification: googlebcce906c6b666bb8.html';
- exit();
+ echo 'google-site-verification: googlebcce906c6b666bb8.html';
+ exit();
}
header('HTTP/1.1 301 Moved Permanently');
-header('Location: http://'.$subdomain.'disclosurelo.gs'.$script);
+header('Location: http://' . $subdomain . 'disclosurelo.gs' . $script);
exit();
?>
--- a/documents/rss.xml.php
+++ b/documents/rss.xml.php
@@ -23,9 +23,9 @@
$title = 'All Agencies';
}
//Use wrapper functions for common channel elements
-$TestFeed->setTitle('disclosurelo.gs Newest Entries - '.$title);
-$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php'.(isset($_REQUEST['id'])? '?id='.$_REQUEST['id'] : ''));
-$TestFeed->setDescription('disclosurelo.gs Newest Entries - '.$title);
+$TestFeed->setTitle('disclosurelo.gs Newest Entries - ' . $title);
+$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php' . (isset($_REQUEST['id']) ? '?id=' . $_REQUEST['id'] : ''));
+$TestFeed->setDescription('disclosurelo.gs Newest Entries - ' . $title);
$TestFeed->setChannelElement('language', 'en-us');
$TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -16,6 +16,7 @@
def mkhash(input):
return hashlib.md5(input).hexdigest().encode("utf-8")
+
def canonurl(url):
r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
if the URL looks invalid.
@@ -67,10 +68,11 @@
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
return url[:4096]
-def fullurl(url,href):
- href = href.replace(" ","%20")
- href = re.sub('#.*$','',href)
- return urljoin(url,href)
+
+def fullurl(url, href):
+ href = href.replace(" ", "%20")
+ href = re.sub('#.*$', '', href)
+ return urljoin(url, href)
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
@@ -79,37 +81,39 @@
addinfourl.code = code
return addinfourl
-def getLastAttachment(docsdb,url):
+
+def getLastAttachment(docsdb, url):
hash = mkhash(url)
doc = docsdb.get(hash)
if doc != None:
last_attachment_fname = doc["_attachments"].keys()[-1]
- last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+ last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
return last_attachment
else:
return None
+
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
url = canonurl(url)
hash = mkhash(url)
req = urllib2.Request(url)
- print "Fetching %s (%s)" % (url,hash)
+ print "Fetching %s (%s)" % (url, hash)
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
print "Not a valid HTTP url"
- return (None,None,None)
+ return (None, None, None)
doc = docsdb.get(hash)
if doc == None:
- doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName, 'type': 'website'}
+ doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
else:
- if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
- print "Uh oh, trying to scrape URL again too soon!"+hash
+ if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000):
+ print "Uh oh, trying to scrape URL again too soon!" + hash
last_attachment_fname = doc["_attachments"].keys()[-1]
- last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+ last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
content = last_attachment
- return (doc['url'],doc['mime_type'],content.read())
+ return (doc['url'], doc['mime_type'], content.read())
if scrape_again == False:
print "Not scraping this URL again as requested"
- return (doc['url'],doc['mime_type'],content.read())
+ return (doc['url'], doc['mime_type'], content.read())
req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
#if there is a previous version stored in couchdb, load caching helper tags
@@ -120,9 +124,7 @@
opener = urllib2.build_opener(NotModifiedHandler())
try:
- #default_timeout = 12
- #socket.setdefaulttimeout(default_timeout)
- url_handle = opener.open(req,None,3)
+ url_handle = opener.open(req, None, 20)
doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
headers = url_handle.info() # the addinfourls have the .info() too
doc['etag'] = headers.getheader("ETag")
@@ -135,75 +137,76 @@
doc['file_size'] = headers.getheader("Content-Length")
content_type = headers.getheader("Content-Type")
if content_type != None:
- doc['mime_type'] = content_type.split(";")[0]
+ doc['mime_type'] = content_type.split(";")[0]
else:
- (type,encoding) = mimetypes.guess_type(url)
- doc['mime_type'] = type
+ (type, encoding) = mimetypes.guess_type(url)
+ doc['mime_type'] = type
if hasattr(url_handle, 'code'):
if url_handle.code == 304:
- print "the web page has not been modified"+hash
+ print "the web page has not been modified" + hash
last_attachment_fname = doc["_attachments"].keys()[-1]
- last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+ last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
content = last_attachment
- return (doc['url'],doc['mime_type'],content.read())
+ return (doc['url'], doc['mime_type'], content.read())
else:
print "new webpage loaded"
content = url_handle.read()
docsdb.save(doc)
doc = docsdb.get(hash) # need to get a _rev
- docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
+ docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
return (doc['url'], doc['mime_type'], content)
#store as attachment epoch-filename
except (urllib2.URLError, socket.timeout) as e:
- print "error!"
- error = ""
- if hasattr(e, 'reason'):
- error = "error %s in downloading %s" % (str(e.reason), url)
- elif hasattr(e, 'code'):
- error = "error %s in downloading %s" % (e.code, url)
- print error
- doc['error'] = error
- docsdb.save(doc)
- return (None,None,None)
-
+ print "error!"
+ error = ""
+ if hasattr(e, 'reason'):
+ error = "error %s in downloading %s" % (str(e.reason), url)
+ elif hasattr(e, 'code'):
+ error = "error %s in downloading %s" % (e.code, url)
+ print error
+ doc['error'] = error
+ docsdb.save(doc)
+ return (None, None, None)
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
- (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+ (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID)
badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
- if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
- # http://www.crummy.com/software/BeautifulSoup/documentation.html
- soup = BeautifulSoup(content)
- navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
- for nav in navIDs:
- print "Removing element", nav['id']
+ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+ # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ soup = BeautifulSoup(content)
+ navIDs = soup.findAll(
+ id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
+ for nav in navIDs:
+ print "Removing element", nav['id']
+ nav.extract()
+ navClasses = soup.findAll(
+ attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
+ for nav in navClasses:
+ print "Removing element", nav['class']
nav.extract()
- navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
- for nav in navClasses:
- print "Removing element", nav['class']
- nav.extract()
- links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
- linkurls = set([])
- for link in links:
- if link.has_key("href"):
- if link['href'].startswith("http"):
- # lets not do external links for now
- # linkurls.add(link['href'])
- None
- if link['href'].startswith("mailto"):
- # not http
- None
- if link['href'].startswith("javascript"):
- # not http
- None
- else:
- # remove anchors and spaces in urls
- linkurls.add(fullurl(url,link['href']))
- for linkurl in linkurls:
- #print linkurl
- scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
+ links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
+ linkurls = set([])
+ for link in links:
+ if link.has_key("href"):
+ if link['href'].startswith("http"):
+ # lets not do external links for now
+ # linkurls.add(link['href'])
+ None
+ if link['href'].startswith("mailto"):
+ # not http
+ None
+ if link['href'].startswith("javascript"):
+ # not http
+ None
+ else:
+ # remove anchors and spaces in urls
+ linkurls.add(fullurl(url, link['href']))
+ for linkurl in linkurls:
+ #print linkurl
+ scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
#couch = couchdb.Server('http://192.168.1.148:5984/')
couch = couchdb.Server('http://192.168.1.113:5984/')
@@ -218,17 +221,17 @@
print agency['name']
for key in agency.keys():
if key == "FOIDocumentsURL" and "status" not in agency.keys() and False:
- scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+ scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
if key == 'website' and True:
- scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
- if "metadata" not in agency.keys():
- agency['metadata'] = {}
+ scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
+ if "metadata" not in agency.keys():
+ agency['metadata'] = {}
agency['metadata']['lastScraped'] = time.time()
if key.endswith('URL') and False:
print key
depth = 1
if 'scrapeDepth' in agency.keys():
depth = agency['scrapeDepth']
- scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
+ scrapeAndStore(docsdb, agency[key], depth, key, agency['_id'])
agencydb.save(agency)
--- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -6,8 +6,6 @@
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getTable(self,soup):
- return soup.find(_class = "article-content").table
def getColumnCount(self):
return 5
def getColumns(self,columns):
--- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
+++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
@@ -21,6 +21,7 @@
if i < 2:
title = title + string
i = i+1
+ title = self.remove_control_chars(title)
doc.update({'title': title})
print title
return
--- a/documents/search.php
+++ b/documents/search.php
@@ -2,23 +2,23 @@
include_once('include/common.inc.php');
include_header('Search');
?>
-<div class="foundation-header">
- <h1><a href="search.php">Search</a></h1>
-</div>
-<form>
- <input type="text" name="q" value="<?php if (isset($_REQUEST['q']))echo $_REQUEST['q'];?>"/>
- <input type="submit"/>
-</form>
+ <div class="foundation-header">
+ <h1><a href="search.php">Search</a></h1>
+ </div>
+ <form>
+ <input type="text" name="q" value="<?php if (isset($_REQUEST['q'])) echo $_REQUEST['q'];?>"/>
+ <input type="submit"/>
+ </form>
<?php
if (isset($_REQUEST['q'])) {
- $request = Requests::get($serverAddr."disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=".$_REQUEST['q']);
+ $request = Requests::get($serverAddr . "disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=" . $_REQUEST['q']);
$results = json_decode($request->body);
$db = $server->get_db('disclosr-documents');
foreach ($results->rows as $result) {
//print_r($result);
- //$row = $db->get($result->id);
- echo $result->doc->_id." ".$result->doc->url."<br>".PHP_EOL;
+ //$row = $db->get($result->id);
+ echo $result->doc->_id . " " . $result->doc->url . "<br>" . PHP_EOL;
}
}
include_footer();
--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -1,101 +1,109 @@
<?php
-function include_header_documents($title) {
+function include_header_documents($title)
+{
header('X-UA-Compatible: IE=edge,chrome=1');
?>
<!doctype html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
- <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
- <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
- <!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
+ <!--[if lt IE 7]>
+ <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
+ <!--[if IE 7]>
+ <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
+ <!--[if IE 8]>
+ <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
- <head>
- <meta charset="utf-8">
+ <head>
+ <meta charset="utf-8">
- <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
- <meta name="description" content="">
+ <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
+ <meta name="description" content="">
- <!-- Mobile viewport optimized: h5bp.com/viewport -->
- <meta name="viewport" content="width=device-width">
- <link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php" />
- <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
- <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />
+ <!-- Mobile viewport optimized: h5bp.com/viewport -->
+ <meta name="viewport" content="width=device-width">
+ <link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php"/>
+ <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
+ <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8"/>
- <!-- Le styles -->
- <link href="css/bootstrap.min.css" rel="stylesheet">
- <style type="text/css">
- body {
- padding-top: 60px;
- padding-bottom: 40px;
- }
- .sidebar-nav {
- padding: 9px 0;
- }
- </style>
- <link href="css/bootstrap-responsive.min.css" rel="stylesheet">
-
- <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
- <!--[if lt IE 9]>
- <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
- <![endif]-->
- <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
-
- <!-- All JavaScript at the bottom, except this Modernizr build.
- Modernizr enables HTML5 elements & feature detects for optimal performance.
- Create your own custom Modernizr build: www.modernizr.com/download/
- <script src="js/libs/modernizr-2.5.3.min.js"></script>-->
- <script src="js/jquery.js"></script>
- <script type="text/javascript" src="js/flotr2.min.js"></script>
-
- </head>
- <body>
- <div class="navbar navbar-inverse navbar-fixed-top">
- <div class="navbar-inner">
- <div class="container-fluid">
- <a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
- <span class="icon-bar"></span>
- <span class="icon-bar"></span>
- <span class="icon-bar"></span>
- </a>
- <a class="brand" href="#">Australian Disclosure Logs</a>
- <div class="nav-collapse collapse">
- <p class="navbar-text pull-right">
- <small>
- Subsites on:
- </small>
- <a href="http://orgs.disclosurelo.gs">Government Agencies</a>
- • <a href="http://lobbyists.disclosurelo.gs">Political Lobbyists</a>
- • <a href="http://contracts.disclosurelo.gs">Government Contracts and Spending</a>
-
- </p>
- <ul class="nav">
- <li><a href="agency.php">By Agency</a></li>
- <li><a href="date.php">By Date</a></li>
- <li><a href="disclogsList.php">List of Disclosure Logs</a></li>
- <li><a href="about.php">About</a></li>
-
- </ul>
- </div><!--/.nav-collapse -->
- </div>
- </div>
- </div>
- <div class="container">
- <?php
+ <!-- Le styles -->
+ <link href="css/bootstrap.min.css" rel="stylesheet">
+ <style type="text/css">
+ body {
+ padding-top: 60px;
+ padding-bottom: 40px;
}
- function include_footer_documents() {
- global $ENV;
- ?>
- </div> <!-- /container -->
- <hr>
+ .sidebar-nav {
+ padding: 9px 0;
+ }
+ </style>
+ <link href="css/bootstrap-responsive.min.css" rel="stylesheet">
- <footer>
- <p>Not affiliated with or endorsed by any government agency.</p>
- </footer>
- <?php
- if ($ENV != "DEV") {
- echo "<script type='text/javascript'>
+ <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
+ <!--[if lt IE 9]>
+ <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
+ <![endif]-->
+ <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
+
+ <!-- All JavaScript at the bottom, except this Modernizr build.
+ Modernizr enables HTML5 elements & feature detects for optimal performance.
+ Create your own custom Modernizr build: www.modernizr.com/download/
+ <script src="js/libs/modernizr-2.5.3.min.js"></script>-->
+ <script src="js/jquery.js"></script>
+ <script type="text/javascript" src="js/flotr2.min.js"></script>
+
+ </head>
+ <body>
+ <div class="navbar navbar-inverse navbar-fixed-top">
+ <div class="navbar-inner">
+ <div class="container-fluid">
+ <a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+ <span class="icon-bar"></span>
+ <span class="icon-bar"></span>
+ <span class="icon-bar"></span>
+ </a>
+ <a class="brand" href="#">Australian Disclosure Logs</a>
+
+ <div class="nav-collapse collapse">
+ <p class="navbar-text pull-right">
+ <small>
+ Subsites on:
+ </small>
+ <a href="http://orgs.disclosurelo.gs">Government Agencies</a>
+ • <a href="http://lobbyists.disclosurelo.gs">Political Lobbyists</a>
+ • <a href="http://contracts.disclosurelo.gs">Government Contracts and Spending</a>
+
+ </p>
+ <ul class="nav">
+ <li><a href="agency.php">By Agency</a></li>
+ <li><a href="date.php">By Date</a></li>
+ <li><a href="disclogsList.php">List of Disclosure Logs</a></li>
+ <li><a href="about.php">About</a></li>
+
+ </ul>
+ </div>
+ <!--/.nav-collapse -->
+ </div>
+ </div>
+ </div>
+ <div class="container">
+<?php
+}
+
+function include_footer_documents()
+{
+ global $ENV;
+ ?>
+ </div> <!-- /container -->
+ <hr>
+
+ <footer>
+ <p>Not affiliated with or endorsed by any government agency.</p>
+ </footer>
+ <?php
+ if ($ENV != "DEV") {
+ echo "<script type='text/javascript'>
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-12341040-4']);
@@ -113,32 +121,33 @@
})();
</script>";
- }
- ?>
- <!-- Le javascript
- ================================================== -->
- <!-- Placed at the end of the document so the pages load faster -->
- <!--
- <script src="js/bootstrap-transition.js"></script>
- <script src="js/bootstrap-alert.js"></script>
- <script src="js/bootstrap-modal.js"></script>
- <script src="js/bootstrap-dropdown.js"></script>
- <script src="js/bootstrap-scrollspy.js"></script>
- <script src="js/bootstrap-tab.js"></script>
- <script src="js/bootstrap-tooltip.js"></script>
- <script src="js/bootstrap-popover.js"></script>
- <script src="js/bootstrap-button.js"></script>
- <script src="js/bootstrap-collapse.js"></script>
- <script src="js/bootstrap-carousel.js"></script>
- <script src="js/bootstrap-typeahead.js"></script>-->
+ }
+ ?>
+ <!-- Le javascript
+ ================================================== -->
+ <!-- Placed at the end of the document so the pages load faster -->
+ <!--
+ <script src="js/bootstrap-transition.js"></script>
+ <script src="js/bootstrap-alert.js"></script>
+ <script src="js/bootstrap-modal.js"></script>
+ <script src="js/bootstrap-dropdown.js"></script>
+ <script src="js/bootstrap-scrollspy.js"></script>
+ <script src="js/bootstrap-tab.js"></script>
+ <script src="js/bootstrap-tooltip.js"></script>
+ <script src="js/bootstrap-popover.js"></script>
+ <script src="js/bootstrap-button.js"></script>
+ <script src="js/bootstrap-collapse.js"></script>
+ <script src="js/bootstrap-carousel.js"></script>
+ <script src="js/bootstrap-typeahead.js"></script>-->
- </body>
+ </body>
</html>
- <?php
+<?php
}
-function truncate($string, $length, $stopanywhere = false) {
+function truncate($string, $length, $stopanywhere = false)
+{
//truncates a string to a certain char length, stopping on a word if not specified otherwise.
if (strlen($string) > $length) {
//limit hit!
@@ -154,14 +163,15 @@
return $string;
}
-function displayLogEntry($row, $idtoname) {
+function displayLogEntry($row, $idtoname)
+{
$result = "";
$result .= '<div itemscope itemtype="http://schema.org/Article">';
- $result .= '<h2><a href="http://disclosurelo.gs/view.php?id='.$row->value->_id.'"> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>";
+ $result .= '<h2><a href="http://disclosurelo.gs/view.php?id=' . $row->value->_id . '"> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>";
$result .= ' (<span itemprop="author publisher creator">' . $idtoname[$row->value->agencyID] . '</span>)</a></h2>';
$result .= "<p itemprop='description articleBody text'> Title: " . $row->value->title . "<br/>";
if (isset($row->value->description)) {
- $result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "",trim($row->value->description)));
+ $result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "", trim($row->value->description)));
}
if (isset($row->value->notes)) {
$result .= " <br>Note: " . $row->value->notes;
@@ -171,7 +181,7 @@
if (isset($row->value->links)) {
$result .= '<h3>Links/Documents</h3><ul itemprop="associatedMedia">';
foreach ($row->value->links as $link) {
- $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href="' . htmlspecialchars ($link) . '" itemprop="url contentURL">' . htmlspecialchars ( $link) . "</a></li>";
+ $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href="' . htmlspecialchars($link) . '" itemprop="url contentURL">' . htmlspecialchars($link) . "</a></li>";
}
$result .= "</ul>";
--- a/documents/view.php
+++ b/documents/view.php
@@ -14,11 +14,11 @@
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
- $obj = new stdClass();
+ $obj = new stdClass();
$obj->value = $foidocsdb->get($_REQUEST['id']);
include_header_documents($obj->value->title);
-echo displayLogEntry($obj,$idtoname);
+ echo displayLogEntry($obj, $idtoname);
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
--- a/documents/viewDocument.php
+++ b/documents/viewDocument.php
@@ -4,7 +4,7 @@
$hash = $_REQUEST['hash'];
$docsdb = $server->get_db('disclosr-documents');
try {
-$doc = object_to_array($docsdb->get($hash));
+ $doc = object_to_array($docsdb->get($hash));
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
@@ -15,7 +15,7 @@
$attachments = $doc['_attachments'];
$attachment_filenames = array_keys($attachments);
//print_r($attachments);
-$url = $serverAddr.'disclosr-documents/'.$hash.'/'.urlencode($attachment_filenames[0]);
+$url = $serverAddr . 'disclosr-documents/' . $hash . '/' . urlencode($attachment_filenames[0]);
//echo $url;
$request = Requests::get($url);
echo ($request->body);