Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr

Conflicts:
documents/genericScrapers.py

Former-commit-id: 492c708ed8d0d1b30bb7c8f672b9e101a7d44f89

--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -40,7 +40,8 @@
 $obj->views->byURL->map = "function(doc) {\n  emit(doc.url, doc);\n}";
 $obj->views->agency->map = "function(doc) {\n  emit(doc.agencyID, doc);\n}";
 $obj->views->byWebServer->map = "function(doc) {\n  emit(doc.web_server, doc);\n}";
-$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n  emit(doc._id, doc._attachments);\n}\n}";
+$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n  emit(doc._id, doc._attachments);\n}\n}";
+$docdb->save($obj, true);
 
 
 

--- a/admin/validation.py
+++ b/admin/validation.py
@@ -5,13 +5,13 @@
 import re
 from tidylib import tidy_document
 
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
 
 # select database
 docsdb = couch['disclosr-documents']
 
 def f(x):
-	invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized")
+	invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized|noscript|audio")
 	valid = re.compile(r"line")
 	return (not invalid.search(x)) and valid.search(x) and x != ''
 

--- a/documents/404.html
+++ b/documents/404.html
@@ -1,44 +1,156 @@
 <!doctype html>
 <html lang="en">
 <head>
-  <meta charset="utf-8">
-  <title>Page Not Found :(</title>
-  <style>
-    ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
-    ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
-    html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
-    html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
-    body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
-    h1 { margin: 0 10px; font-size: 50px; text-align: center; }
-    h1 span { color: #bbb; }
-    h3 { margin: 1.5em 0 0.5em; }
-    p { margin: 1em 0; }
-    ul { padding: 0 0 0 40px; margin: 1em 0; }
-    .container { max-width: 380px; _width: 380px; margin: 0 auto; }
-    /* google search */
-    #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
-    #goog-fixurl form { margin: 0; }
-    #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
-    #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
-    #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
-    #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
-    #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
-    input::-moz-focus-inner { padding: 0; border: 0; }
-  </style>
+    <meta charset="utf-8">
+    <title>Page Not Found :(</title>
+    <style>
+        ::-moz-selection {
+            background: #fe57a1;
+            color: #fff;
+            text-shadow: none;
+        }
+
+        ::selection {
+            background: #fe57a1;
+            color: #fff;
+            text-shadow: none;
+        }
+
+        html {
+            padding: 30px 10px;
+            font-size: 20px;
+            line-height: 1.4;
+            color: #737373;
+            background: #f0f0f0;
+            -webkit-text-size-adjust: 100%;
+            -ms-text-size-adjust: 100%;
+        }
+
+        html, input {
+            font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+        }
+
+        body {
+            max-width: 500px;
+            _width: 500px;
+            padding: 30px 20px 50px;
+            border: 1px solid #b3b3b3;
+            border-radius: 4px;
+            margin: 0 auto;
+            box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff;
+            background: #fcfcfc;
+        }
+
+        h1 {
+            margin: 0 10px;
+            font-size: 50px;
+            text-align: center;
+        }
+
+        h1 span {
+            color: #bbb;
+        }
+
+        h3 {
+            margin: 1.5em 0 0.5em;
+        }
+
+        p {
+            margin: 1em 0;
+        }
+
+        ul {
+            padding: 0 0 0 40px;
+            margin: 1em 0;
+        }
+
+        .container {
+            max-width: 380px;
+            _width: 380px;
+            margin: 0 auto;
+        }
+
+            /* google search */
+        #goog-fixurl ul {
+            list-style: none;
+            padding: 0;
+            margin: 0;
+        }
+
+        #goog-fixurl form {
+            margin: 0;
+        }
+
+        #goog-wm-qt, #goog-wm-sb {
+            border: 1px solid #bbb;
+            font-size: 16px;
+            line-height: normal;
+            vertical-align: top;
+            color: #444;
+            border-radius: 2px;
+        }
+
+        #goog-wm-qt {
+            width: 220px;
+            height: 20px;
+            padding: 5px;
+            margin: 5px 10px 0 0;
+            box-shadow: inset 0 1px 1px #ccc;
+        }
+
+        #goog-wm-sb {
+            display: inline-block;
+            height: 32px;
+            padding: 0 10px;
+            margin: 5px 0 0;
+            white-space: nowrap;
+            cursor: pointer;
+            background-color: #f5f5f5;
+            background-image: -webkit-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            background-image: -moz-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            background-image: -ms-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            background-image: -o-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            -webkit-appearance: none;
+            -moz-appearance: none;
+            appearance: none;
+            *overflow: visible;
+            *display: inline;
+            *zoom: 1;
+        }
+
+        #goog-wm-sb:hover, #goog-wm-sb:focus {
+            border-color: #aaa;
+            box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1);
+            background-color: #f8f8f8;
+        }
+
+        #goog-wm-qt:focus, #goog-wm-sb:focus {
+            border-color: #105cb6;
+            outline: 0;
+            color: #222;
+        }
+
+        input::-moz-focus-inner {
+            padding: 0;
+            border: 0;
+        }
+    </style>
 </head>
 <body>
-  <div class="container">
+<div class="container">
     <h1>Not found <span>:(</span></h1>
+
     <p>Sorry, but the page you were trying to view does not exist.</p>
+
     <p>It looks like this was the result of either:</p>
     <ul>
-      <li>a mistyped address</li>
-      <li>an out-of-date link</li>
+        <li>a mistyped address</li>
+        <li>an out-of-date link</li>
     </ul>
     <script>
-      var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+        var GOOG_FIXURL_LANG = (navigator.language || '').slice(0, 2), GOOG_FIXURL_SITE = location.host;
     </script>
     <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
-  </div>
+</div>
 
 

--- a/documents/agency.php
+++ b/documents/agency.php
@@ -12,8 +12,11 @@
 include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency'));
 $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
 ?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+    <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act
+        in one place!
+    </div>
+    <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+    <br>
 <?php
 try {
     if ($_REQUEST['id']) {

--- a/documents/charts.php
+++ b/documents/charts.php
@@ -18,144 +18,145 @@
 <div id="bydate" style="width:1000px;height:300px;"></div>
 <div id="byagency" style="width:1200px;height:300px;"></div>
 <script id="source">
-    window.onload = function() {
-        $(document).ready(function() {
-  var
-    d1    = [],
-    options1,
-     o1;
+    window.onload = function () {
+        $(document).ready(function () {
+            var
+                d1 = [],
+                options1,
+                o1;
 
-<?php
-    try {
-        $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
+            <?php
+                try {
+                    $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
 
 
-        $dataValues = Array();
-        foreach ($rows as $row) {
-            $dataValues[$row->key] = $row->value;
-        }
-        $i = 0;
-        ksort($dataValues);
-        foreach ($dataValues as $key => $value) {
-$date = date_create_from_format('Y-m-d', $key);
-if (date_format($date, 'U') != "") {
-            echo "       d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
-//            echo "        emplabels.push('$key');" . PHP_EOL;
-            $i++;
-}
-        }
-    } catch (SetteeRestClientException $e) {
-        setteErrorHandler($e);
-    }
-    ?>
+                    $dataValues = Array();
+                    foreach ($rows as $row) {
+                        $dataValues[$row->key] = $row->value;
+                    }
+                    $i = 0;
+                    ksort($dataValues);
+                    foreach ($dataValues as $key => $value) {
+            $date = date_create_from_format('Y-m-d', $key);
+            if (date_format($date, 'U') != "") {
+                        echo "       d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
+            //            echo "        emplabels.push('$key');" . PHP_EOL;
+                        $i++;
+            }
+                    }
+                } catch (SetteeRestClientException $e) {
+                    setteErrorHandler($e);
+                }
+                ?>
 
 
-        
-  options1 = {
-    xaxis : {
-      mode : 'time', 
-      labelsAngle : 45
-    },
-    selection : {
-      mode : 'x'
-    },
-    HtmlText : false,
-    title : 'Time'
-  };
-        
-  // Draw graph with default options, overwriting with passed options
-  function drawGraph (opts) {
+            options1 = {
+                xaxis: {
+                    mode: 'time',
+                    labelsAngle: 45
+                },
+                selection: {
+                    mode: 'x'
+                },
+                HtmlText: false,
+                title: 'Time'
+            };
 
-    // Clone the options, so the 'options' variable always keeps intact.
-    o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
+            // Draw graph with default options, overwriting with passed options
+            function drawGraph(opts) {
 
-    // Return a new graph.
-    return Flotr.draw(
-      document.getElementById("bydate"),
-      [ d1 ],
-      o1
-    );
-  }
+                // Clone the options, so the 'options' variable always keeps intact.
+                o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
 
-  graph = drawGraph();      
-        
-  Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function(area){
-    // Draw selected area
-    graph = drawGraph({
-      xaxis : { min : area.x1, max : area.x2, mode : 'time', labelsAngle : 45 },
-      yaxis : { min : area.y1, max : area.y2 }
-    });
-  });
-        
-  // When graph is clicked, draw the graph with default area.
-  Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () { graph = drawGraph(); });
+                // Return a new graph.
+                return Flotr.draw(
+                    document.getElementById("bydate"),
+                    [ d1 ],
+                    o1
+                );
+            }
+
+            graph = drawGraph();
+
+            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
+                // Draw selected area
+                graph = drawGraph({
+                    xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
+                    yaxis: { min: area.y1, max: area.y2 }
+                });
+            });
+
+            // When graph is clicked, draw the graph with default area.
+            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
+                graph = drawGraph();
+            });
 
         });
-}; 
+    };
 
-var d2 = [];
-var agencylabels = [];
-function agencytrackformatter(obj) {
-                   
-                        return agencylabels[Math.floor(obj.x)] +" = "+obj.y;
-                     
-                }
-                function agencytickformatter(val, axis) {
-                    if (agencylabels[Math.floor(val)]) {
-                        return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">'+(agencylabels[Math.floor(val)])+"</b>";
-                     
-                    } else {
-                        return "";
-                    }
-                }
-<?php
-    try {
-        $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
+    var d2 = [];
+    var agencylabels = [];
+    function agencytrackformatter(obj) {
+
+        return agencylabels[Math.floor(obj.x)] + " = " + obj.y;
+
+    }
+    function agencytickformatter(val, axis) {
+        if (agencylabels[Math.floor(val)]) {
+            return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">' + (agencylabels[Math.floor(val)]) + "</b>";
+
+        } else {
+            return "";
+        }
+    }
+    <?php
+        try {
+            $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
 
 
-        $dataValues = Array();
-        $i = 0;
-        foreach ($rows as $row) {
-            echo "       d2.push([".$i.", $row->value]);" . PHP_EOL;
-            echo "       agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
-            
-            $i++;
+            $dataValues = Array();
+            $i = 0;
+            foreach ($rows as $row) {
+                echo "       d2.push([".$i.", $row->value]);" . PHP_EOL;
+                echo "       agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
+
+                $i++;
+            }
+        } catch (SetteeRestClientException $e) {
+            setteErrorHandler($e);
         }
-    } catch (SetteeRestClientException $e) {
-        setteErrorHandler($e);
-    }
-    ?>
-  // Draw the graph
-  Flotr.draw(
-   document.getElementById("byagency"),
-    [d2],
-    {
-      bars : {
-        show : true,
-        horizontal : false,
-        shadowSize : 0,
-        barWidth : 0.5
-      },
-mouse : {
-                        track : true,
-                        relative : true,
-                    trackFormatter: agencytrackformatter
-                    },
-      yaxis : {
-        min : 0,
-        autoscaleMargin : 1
-      },
-      xaxis: {
-                    minorTickFreq: 1,
-                    noTicks: agencylabels.length,
-                    showMinorLabels: true,
-                        tickFormatter: agencytickformatter
-                    },
-                    legend: {
-                        show: false
-                    }
-    }
-  );
+        ?>
+    // Draw the graph
+    Flotr.draw(
+        document.getElementById("byagency"),
+        [d2],
+        {
+            bars: {
+                show: true,
+                horizontal: false,
+                shadowSize: 0,
+                barWidth: 0.5
+            },
+            mouse: {
+                track: true,
+                relative: true,
+                trackFormatter: agencytrackformatter
+            },
+            yaxis: {
+                min: 0,
+                autoscaleMargin: 1
+            },
+            xaxis: {
+                minorTickFreq: 1,
+                noTicks: agencylabels.length,
+                showMinorLabels: true,
+                tickFormatter: agencytickformatter
+            },
+            legend: {
+                show: false
+            }
+        }
+    );
 </script>
 
 <?php

--- a/documents/crossdomain.xml
+++ b/documents/crossdomain.xml
@@ -3,24 +3,23 @@
 <cross-domain-policy>
 
 
-<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+    <!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
 
-<!-- Most restrictive policy: -->
-	<site-control permitted-cross-domain-policies="none"/>
+    <!-- Most restrictive policy: -->
+    <site-control permitted-cross-domain-policies="none"/>
 
 
-
-<!-- Least restrictive policy: -->
-<!--
-	<site-control permitted-cross-domain-policies="all"/>
-	<allow-access-from domain="*" to-ports="*" secure="false"/>
-	<allow-http-request-headers-from domain="*" headers="*" secure="false"/>
--->
-<!--
-  If you host a crossdomain.xml file with allow-access-from domain="*"
-  and don’t understand all of the points described here, you probably
-  have a nasty security vulnerability. ~ simon willison
--->
+    <!-- Least restrictive policy: -->
+    <!--
+        <site-control permitted-cross-domain-policies="all"/>
+        <allow-access-from domain="*" to-ports="*" secure="false"/>
+        <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+    -->
+    <!--
+      If you host a crossdomain.xml file with allow-access-from domain="*"
+      and don’t understand all of the points described here, you probably
+      have a nasty security vulnerability. ~ simon willison
+    -->
 
 </cross-domain-policy>
 

--- /dev/null
+++ b/documents/datagov.py
@@ -1,1 +1,48 @@
+import sys, os
 
+import scrape
+from bs4 import BeautifulSoup
+
+
+listurl = "http://data.gov.au/data/"
+(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
+    listurl, "data", "AGIMO")
+soup = BeautifulSoup(datasetlisthtml)
+for atag in soup.find_all(class_='result-title'):
+    if atag.has_key('href'):
+        url = scrape.fullurl(listurl, atag['href'])
+        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+            url, "data", "AGIMO")
+        hash = scrape.mkhash(scrape.canonurl(url))
+        doc = scrape.docsdb.get(hash)
+        if "metadata" not in doc.keys():
+            doc['metadata'] = {}
+        soup = BeautifulSoup(html)
+        for metatag in soup.find_all('meta'):
+            if metatag.has_key('name'):
+                doc['metadata'][metatag['name']] = metatag['content']
+        for list in soup.find_all('dl'):
+            last_title = ""
+            for child in list.children:
+                if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+                    if child.name == 'dt' and child.string != None:
+                        last_title = child.string.strip()
+                    if child.name == 'dd':
+                        #print last_title
+                        if last_title == "Download":
+                            for item in child.find_all("li"):
+                                link = item.find("a")
+                                format = item.find(property="dc:format")
+                                linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
+                                    "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                doc['metadata'][last_title] = linkobj
+
+                        else:
+                            atags = child.find_all('a')
+                            if len(atags) < 2:
+                                [s.extract() for s in child(class_='viewAll')]
+                                doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+                            else:
+                                doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
+        print doc['metadata']
+        sys.exit("ggg")

--- a/documents/date.php
+++ b/documents/date.php
@@ -5,8 +5,11 @@
 include_once('../include/common.inc.php');
 $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
 ?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
+    one place!
+</div>
+<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+<br>
 <?php
 /*$agenciesdb = $server->get_db('disclosr-agencies');
 

--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -34,10 +34,10 @@
                 if (isset($row->value->FOIDocumentsURL)) {
                     $disclogs++;
                     echo '<a href="' . $row->value->FOIDocumentsURL . '">'
-                    . $row->value->FOIDocumentsURL . '</a>';
+                        . $row->value->FOIDocumentsURL . '</a>';
                     if ($ENV == "DEV")
                         echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
-                        . 'view local copy</a>)</small>';
+                            . 'view local copy</a>)</small>';
                 } else {
                     echo "<font color='red'><abbr title='No'>✘</abbr></font>";
                 }
@@ -49,11 +49,11 @@
                     } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
                         if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
                             echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
-                        $yellow++;
+                            $yellow++;
                         } else {
                             echo file_get_contents("./scrapers/" . $row->id . '.txt');
-                        echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
-                        $orange++;
+                            echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
+                            $orange++;
                         }
                     } else {
                         echo "<font color='red'><abbr title='No'>✘</abbr></font>";
@@ -69,7 +69,7 @@
 }
 echo "</table>";
 echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; "
- . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
+    . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
 
 include_footer_documents();
 ?>

--- a/documents/exportAll.csv.php
+++ b/documents/exportAll.csv.php
@@ -39,7 +39,7 @@
                         if (is_array($agencyArray[$fieldName])) {
                             $row[] = implode(";", $agencyArray[$fieldName]);
                         } else {
-                            $row[] = str_replace(Array("\n", '"', "\t"),"",$agencyArray[$fieldName]);
+                            $row[] = str_replace(Array("\n", '"', "\t"), "", $agencyArray[$fieldName]);
                         }
                     } else {
                         $row[] = "";

--- /dev/null
+++ b/documents/gazette.py

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,5 +1,6 @@
 import sys
 import os
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
@@ -51,12 +52,12 @@
         """ do the scraping """
         return
 
+
 class GenericHTMLDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
-             self.getURL(), "foidocuments", self.getAgencyID())
+            self.getURL(), "foidocuments", self.getAgencyID())
         content = rcontent
         dochash = scrape.mkhash(content)
         doc = foidocsdb.get(dochash)
@@ -66,33 +67,32 @@
             last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
             if last_attach != None:
                 html_diff = difflib.HtmlDiff()
-                description = description + "\nChanges: "
-                description = description + html_diff.make_table(last_attach.read().split('\n'),
-                           content.split('\n'))
+                diff = html_diff.make_table(last_attach.read().split('\n'),
+                    content.split('\n'))
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description), "diff": diff}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
+
 class GenericPDFDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-             self.getURL(), "foidocuments", self.getAgencyID())
+            self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
         outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
-             laparams=laparams)
+            laparams=laparams)
         fp = StringIO()
         fp.write(content)
 
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
-             check_extractable=True)
+            check_extractable=True)
         description = outfp.getvalue()
         fp.close()
         device.close()
@@ -104,18 +104,17 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 
 class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
-        , self.getURL(), "foidocuments", self.getAgencyID())
+            , self.getURL(), "foidocuments", self.getAgencyID())
         mydoc = zipfile.ZipFile(file)
         xmlcontent = mydoc.read('word/document.xml')
         document = etree.fromstring(xmlcontent)
@@ -125,7 +124,7 @@
         newparatextlist = []
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
-        ## Print our documnts test with two newlines under each paragraph
+            ## Print our documents text with two newlines under each paragraph
         description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
@@ -134,42 +133,42 @@
             print "saving " + dochash
             edate = time().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-            , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+                , 'url': self.getURL(), 'docID': dochash,
+                   "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
-
-        def doScrape(self):
-            foidocsdb = scrape.couch['disclosr-foidocuments']
-            (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-                 self.getURL(), "foidocuments", self.getAgencyID())
-            feed = feedparser.parse(content)
-            for entry in feed.entries:
-                #print entry
-                print entry.id
-                dochash = scrape.mkhash(entry.id)
-                doc = foidocsdb.get(dochash)
-                #print doc
-                if doc is None:
-                    print "saving " + dochash
-                    edate = datetime.fromtimestamp(
-                        mktime(entry.published_parsed)).strftime("%Y-%m-%d")
-                    doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
-                        'url': entry.link, 'docID': entry.id,
-                        "date": edate, "title": entry.title}
-                    self.getDescription(entry, entry, doc)
-                    foidocsdb.save(doc)
-                else:
-                    print "already saved"
-
-            def getDescription(self, content, entry, doc):
-                    """ get description from rss entry"""
-                    doc.update({'description': content.summary})
-            return
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        feed = feedparser.parse(content)
+        for entry in feed.entries:
+            #print entry
+            print entry.id
+            dochash = scrape.mkhash(entry.id)
+            doc = foidocsdb.get(dochash)
+            #print doc
+            if doc is None:
+                print "saving " + dochash
+                edate = datetime.fromtimestamp(
+                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                       'url': entry.link, 'docID': entry.id,
+                       "date": edate, "title": entry.title}
+                self.getDescription(entry, entry, doc)
+                foidocsdb.save(doc)
+            else:
+                print "already saved"
+
+        def getDescription(self, content, entry, doc):
+            """ get description from rss entry"""
+            doc.update({'description': content.summary})
+
+        return
 
 
 class GenericOAICDisclogScraper(GenericDisclogScraper):
@@ -187,7 +186,7 @@
         """ get description from rss entry"""
         descriptiontxt = ""
         for string in content.stripped_strings:
-                    descriptiontxt = descriptiontxt + " \n" + string
+            descriptiontxt = descriptiontxt + " \n" + string
         doc.update({'description': descriptiontxt})
 
     def getTitle(self, content, entry, doc):
@@ -202,7 +201,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate
@@ -215,7 +214,7 @@
             if atag.has_key('href'):
                 links.append(scrape.fullurl(content, atag['href']))
         if links != []:
-                    doc.update({'links': links})
+            doc.update({'links': links})
         return
 
     def doScrape(self):
@@ -232,7 +231,7 @@
                     columns = row.find_all('td')
                     if len(columns) is self.getColumnCount():
                         (id, date, title,
-                        description, notes) = self.getColumns(columns)
+                         description, notes) = self.getColumns(columns)
                         print self.remove_control_chars(
                             ''.join(id.stripped_strings))
                         if id.string is None:
@@ -248,27 +247,29 @@
                         if doc is None:
                             print "saving " + dochash
                             doc = {'_id': dochash,
-                            'agencyID': self.getAgencyID(),
-                            'url': self.getURL(),
-                            'docID': (''.join(id.stripped_strings))}
+                                   'agencyID': self.getAgencyID(),
+                                   'url': self.getURL(),
+                                   'docID': (''.join(id.stripped_strings))}
                             self.getLinks(self.getURL(), row, doc)
                             self.getTitle(title, row, doc)
                             self.getDate(date, row, doc)
                             self.getDescription(description, row, doc)
                             if notes is not None:
-                                doc.update({ 'notes': (
+                                doc.update({'notes': (
                                     ''.join(notes.stripped_strings))})
-                            badtitles = ['-','Summary of FOI Request'
-                            , 'FOI request(in summary form)'
-                            , 'Summary of FOI request received by the ASC',
-'Summary of FOI request received by agency/minister',
-'Description of Documents Requested','FOI request',
-'Description of FOI Request','Summary of request','Description','Summary',
-'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of    FOI Request',"FOI request",'Results 1 to 67 of 67']
+                            badtitles = ['-', 'Summary of FOI Request'
+                                , 'FOI request(in summary form)'
+                                , 'Summary of FOI request received by the ASC',
+                                         'Summary of FOI request received by agency/minister',
+                                         'Description of Documents Requested', 'FOI request',
+                                         'Description of FOI Request', 'Summary of request', 'Description', 'Summary',
+                                         'Summary of FOIrequest received by agency/minister',
+                                         'Summary of FOI request received', 'Description of    FOI Request',
+                                         "FOI request", 'Results 1 to 67 of 67']
                             if doc['title'] not in badtitles\
                             and doc['description'] != '':
-                                                            print "saving"
-                                                            foidocsdb.save(doc)
+                                print "saving"
+                                foidocsdb.save(doc)
                         else:
                             print "already saved " + dochash
 

--- a/documents/index.php
+++ b/documents/index.php
@@ -5,8 +5,11 @@
 $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
 $enddocid = (isset($_REQUEST['end_docid']) ? $_REQUEST['end_docid'] : null);
 ?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
+    one place!
+</div>
+<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+<br>
 <?php
 $agenciesdb = $server->get_db('disclosr-agencies');
 
@@ -16,7 +19,7 @@
 }
 $foidocsdb = $server->get_db('disclosr-foidocuments');
 try {
-    $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20,null, $enddocid)->rows;
+    $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20, null, $enddocid)->rows;
     if ($rows) {
         foreach ($rows as $key => $row) {
             echo displayLogEntry($row, $idtoname);

--- a/documents/redirect.php
+++ b/documents/redirect.php
@@ -1,18 +1,18 @@
 <?php
-$subdomain = str_replace('disclo.gs','',$_SERVER['SERVER_NAME']);
+$subdomain = str_replace('disclo.gs', '', $_SERVER['SERVER_NAME']);
 $script = $_SERVER['REQUEST_URI'];
 
 if ($script == '/google676a414ad086cefb.html') {
-	echo 'google-site-verification: google676a414ad086cefb.html';
-	exit();
+    echo 'google-site-verification: google676a414ad086cefb.html';
+    exit();
 }
 if ($script == '/googlebcce906c6b666bb8.html') {
-        echo 'google-site-verification: googlebcce906c6b666bb8.html';
-        exit();
+    echo 'google-site-verification: googlebcce906c6b666bb8.html';
+    exit();
 }
 
 header('HTTP/1.1 301 Moved Permanently');
-header('Location: http://'.$subdomain.'disclosurelo.gs'.$script);
+header('Location: http://' . $subdomain . 'disclosurelo.gs' . $script);
 exit();
 ?>
 

--- a/documents/rss.xml.php
+++ b/documents/rss.xml.php
@@ -23,9 +23,9 @@
     $title = 'All Agencies';
 }
 //Use wrapper functions for common channelelements
-$TestFeed->setTitle('disclosurelo.gs Newest Entries - '.$title);
-$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php'.(isset($_REQUEST['id'])? '?id='.$_REQUEST['id'] : ''));
-$TestFeed->setDescription('disclosurelo.gs Newest Entries - '.$title);
+$TestFeed->setTitle('disclosurelo.gs Newest Entries - ' . $title);
+$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php' . (isset($_REQUEST['id']) ? '?id=' . $_REQUEST['id'] : ''));
+$TestFeed->setDescription('disclosurelo.gs Newest Entries - ' . $title);
 $TestFeed->setChannelElement('language', 'en-us');
 $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
 

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -10,9 +10,12 @@
 import mimetypes
 import urllib
 import urlparse
+import socket
+
 
 def mkhash(input):
     return hashlib.md5(input).hexdigest().encode("utf-8")
+
 
 def canonurl(url):
     r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
@@ -65,10 +68,11 @@
     url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
     return url[:4096]
 
-def fullurl(url,href):
-    href = href.replace(" ","%20")
-    href = re.sub('#.*$','',href)
-    return urljoin(url,href)
+
+def fullurl(url, href):
+    href = href.replace(" ", "%20")
+    href = re.sub('#.*$', '', href)
+    return urljoin(url, href)
 
 #http://diveintopython.org/http_web_services/etags.html
 class NotModifiedHandler(urllib2.BaseHandler):
@@ -77,37 +81,39 @@
         addinfourl.code = code
         return addinfourl
 
-def getLastAttachment(docsdb,url):
+
+def getLastAttachment(docsdb, url):
     hash = mkhash(url)
     doc = docsdb.get(hash)
     if doc != None:
         last_attachment_fname = doc["_attachments"].keys()[-1]
-        last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+        last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
         return last_attachment
     else:
         return None
+
 
 def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
     url = canonurl(url)
     hash = mkhash(url)
     req = urllib2.Request(url)
-    print "Fetching %s (%s)" % (url,hash)
+    print "Fetching %s (%s)" % (url, hash)
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
         print "Not a valid HTTP url"
-        return (None,None,None)
+        return (None, None, None)
     doc = docsdb.get(hash)
     if doc == None:
-        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
     else:
-        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
-            print "Uh oh, trying to scrape URL again too soon!"+hash
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000):
+            print "Uh oh, trying to scrape URL again too soon!" + hash
             last_attachment_fname = doc["_attachments"].keys()[-1]
-            last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
             content = last_attachment
-            return (doc['url'],doc['mime_type'],content.read())
+            return (doc['url'], doc['mime_type'], content.read())
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (doc['url'],doc['mime_type'],content.read())
+            return (doc['url'], doc['mime_type'], content.read())
 
     req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
@@ -118,7 +124,7 @@
 
     opener = urllib2.build_opener(NotModifiedHandler())
     try:
-        url_handle = opener.open(req)
+        url_handle = opener.open(req, None, 20)
         doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
         headers = url_handle.info() # the addinfourls have the .info() too
         doc['etag'] = headers.getheader("ETag")
@@ -131,97 +137,101 @@
         doc['file_size'] = headers.getheader("Content-Length")
         content_type = headers.getheader("Content-Type")
         if content_type != None:
-             doc['mime_type'] = content_type.split(";")[0]
+            doc['mime_type'] = content_type.split(";")[0]
         else:
-             (type,encoding) = mimetypes.guess_type(url)
-             doc['mime_type'] = type
+            (type, encoding) = mimetypes.guess_type(url)
+            doc['mime_type'] = type
         if hasattr(url_handle, 'code'):
             if url_handle.code == 304:
-                print "the web page has not been modified"+hash
+                print "the web page has not been modified" + hash
                 last_attachment_fname = doc["_attachments"].keys()[-1]
-                last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+                last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
                 content = last_attachment
-                return (doc['url'],doc['mime_type'],content.read())
+                return (doc['url'], doc['mime_type'], content.read())
             else:
                 print "new webpage loaded"
                 content = url_handle.read()
                 docsdb.save(doc)
                 doc = docsdb.get(hash) # need to get a _rev
-                docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
+                docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
                 return (doc['url'], doc['mime_type'], content)
                 #store as attachment epoch-filename
 
-    except urllib2.URLError as e:
-            print "error!"
-            error = ""
-            if hasattr(e, 'reason'):
-                error = "error %s in downloading %s" % (str(e.reason), url)
-            elif hasattr(e, 'code'):
-                error = "error %s in downloading %s" % (e.code, url)
-            print error
-            doc['error'] = error
-            docsdb.save(doc)
-            return (None,None,None)
-
+    except (urllib2.URLError, socket.timeout) as e:
+        print "error!"
+        error = ""
+        if hasattr(e, 'reason'):
+            error = "error %s in downloading %s" % (str(e.reason), url)
+        elif hasattr(e, 'code'):
+            error = "error %s in downloading %s" % (e.code, url)
+        print error
+        doc['error'] = error
+        docsdb.save(doc)
+        return (None, None, None)
 
 
 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID)
     badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
     if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
-        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
-                # http://www.crummy.com/software/BeautifulSoup/documentation.html
-                soup = BeautifulSoup(content)
-                navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
-                for nav in navIDs:
-                    print "Removing element", nav['id']
+        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+            # http://www.crummy.com/software/BeautifulSoup/documentation.html
+            soup = BeautifulSoup(content)
+            navIDs = soup.findAll(
+                id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
+            for nav in navIDs:
+                print "Removing element", nav['id']
+                nav.extract()
+                navClasses = soup.findAll(
+                    attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
+                for nav in navClasses:
+                    print "Removing element", nav['class']
                     nav.extract()
-                    navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
-                    for nav in navClasses:
-                        print "Removing element", nav['class']
-                        nav.extract()
-                    links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
-                    linkurls = set([])
-                    for link in links:
-                        if link.has_key("href"):
-                            if link['href'].startswith("http"):
-                                # lets not do external links for now
-                                # linkurls.add(link['href'])
-                                None
-                            if link['href'].startswith("mailto"):
-                                # not http
-                                None
-                            if link['href'].startswith("javascript"):
-                                # not http
-                                None
-                            else:
-                                # remove anchors and spaces in urls
-                                linkurls.add(fullurl(url,link['href']))
-                    for linkurl in linkurls:
-                               #print linkurl
-                               scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
+                links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
+                linkurls = set([])
+                for link in links:
+                    if link.has_key("href"):
+                        if link['href'].startswith("http"):
+                            # lets not do external links for now
+                            # linkurls.add(link['href'])
+                            None
+                        if link['href'].startswith("mailto"):
+                            # not http
+                            None
+                        if link['href'].startswith("javascript"):
+                            # not http
+                            None
+                        else:
+                            # remove anchors and spaces in urls
+                            linkurls.add(fullurl(url, link['href']))
+                for linkurl in linkurls:
+                    #print linkurl
+                    scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
+#couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']
 
 if __name__ == "__main__":
-    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+    for row in agencydb.view('app/all'): #not recently scraped agencies view?
         agency = agencydb.get(row.id)
         print agency['name']
         for key in agency.keys():
-            if key == "FOIDocumentsURL" and "status" not in agency.keys:
-                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-            if key == 'website' and False:
-                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+            if key == "FOIDocumentsURL" and "status" not in agency.keys() and False:
+                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
+            if key == 'website' and True:
+                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
+                if "metadata" not in agency.keys():
+                    agency['metadata'] = {}
                 agency['metadata']['lastScraped'] = time.time()
             if key.endswith('URL') and False:
                 print key
                 depth = 1
                 if 'scrapeDepth' in agency.keys():
                     depth = agency['scrapeDepth']
-                scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
+                scrapeAndStore(docsdb, agency[key], depth, key, agency['_id'])
         agencydb.save(agency)
 

--- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -6,8 +6,6 @@
 
 #http://www.doughellmann.com/PyMOTW/abc/
 class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-        def getTable(self,soup):
-                return soup.find(_class = "article-content").table
         def getColumnCount(self):
                 return 5
         def getColumns(self,columns):

--- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
+++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
@@ -21,6 +21,7 @@
     			if i < 2:
 				title = title + string
 			i = i+1
+		title = self.remove_control_chars(title)
                 doc.update({'title': title})
 		print title
                 return

--- a/documents/search.php
+++ b/documents/search.php
@@ -2,23 +2,23 @@
 include_once('include/common.inc.php');
 include_header('Search');
 ?>
-<div class="foundation-header">
-    <h1><a href="search.php">Search</a></h1>
-</div>
-<form>
-    <input type="text" name="q" value="<?php if (isset($_REQUEST['q']))echo $_REQUEST['q'];?>"/>
-    <input type="submit"/>
-</form>
+    <div class="foundation-header">
+        <h1><a href="search.php">Search</a></h1>
+    </div>
+    <form>
+        <input type="text" name="q" value="<?php if (isset($_REQUEST['q'])) echo $_REQUEST['q'];?>"/>
+        <input type="submit"/>
+    </form>
 
 <?php
 if (isset($_REQUEST['q'])) {
-    $request = Requests::get($serverAddr."disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=".$_REQUEST['q']);
+    $request = Requests::get($serverAddr . "disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=" . $_REQUEST['q']);
     $results = json_decode($request->body);
     $db = $server->get_db('disclosr-documents');
     foreach ($results->rows as $result) {
         //print_r($result);
-         //$row = $db->get($result->id);
-        echo $result->doc->_id." ".$result->doc->url."<br>".PHP_EOL;
+        //$row = $db->get($result->id);
+        echo $result->doc->_id . " " . $result->doc->url . "<br>" . PHP_EOL;
     }
 }
 include_footer();

--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -1,101 +1,109 @@
 <?php
 
-function include_header_documents($title) {
+function include_header_documents($title)
+{
     header('X-UA-Compatible: IE=edge,chrome=1');
     ?>
     <!doctype html>
     <!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
-    <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
-    <!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
-    <!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
+    <!--[if lt IE 7]>
+    <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
+    <!--[if IE 7]>
+    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
+    <!--[if IE 8]>
+    <html class="no-js lt-ie9" lang="en"> <![endif]-->
     <!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
     <!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
-        <head>
-            <meta charset="utf-8">
+    <head>
+        <meta charset="utf-8">
 
-            <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
-            <meta name="description" content="">
+        <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
+        <meta name="description" content="">
 
-            <!-- Mobile viewport optimized: h5bp.com/viewport -->
-            <meta name="viewport" content="width=device-width">
-            <link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php" />
-            <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
-            <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />
+        <!-- Mobile viewport optimized: h5bp.com/viewport -->
+        <meta name="viewport" content="width=device-width">
+        <link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php"/>
+        <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
+        <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8"/>
 
-            <!-- Le styles -->
-            <link href="css/bootstrap.min.css" rel="stylesheet">
-            <style type="text/css">
-                body {
-                    padding-top: 60px;
-                    padding-bottom: 40px;
-                }
-                .sidebar-nav {
-                    padding: 9px 0;
-                }
-            </style>
-            <link href="css/bootstrap-responsive.min.css" rel="stylesheet">
-
-            <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
-            <!--[if lt IE 9]>
-              <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
-            <![endif]-->
-            <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
-
-            <!-- All JavaScript at the bottom, except this Modernizr build.
-                 Modernizr enables HTML5 elements & feature detects for optimal performance.
-                 Create your own custom Modernizr build: www.modernizr.com/download/ 
-            <script src="js/libs/modernizr-2.5.3.min.js"></script>-->
-            <script src="js/jquery.js"></script>
-            <script type="text/javascript" src="js/flotr2.min.js"></script>
-
-        </head>
-        <body>
-            <div class="navbar navbar-inverse navbar-fixed-top">
-                <div class="navbar-inner">
-                    <div class="container-fluid">
-                        <a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
-                            <span class="icon-bar"></span>
-                            <span class="icon-bar"></span>
-                            <span class="icon-bar"></span>
-                        </a>
-                        <a class="brand" href="#">Australian Disclosure Logs</a>
-                        <div class="nav-collapse collapse">
-                            <p class="navbar-text pull-right">
-                                <small>
-                                Subsites on: 
-       </small> 
-                                <a href="http://orgs.disclosurelo.gs">Government Agencies</a>
-                                • <a href="http://lobbyists.disclosurelo.gs">Political Lobbyists</a>
-                                • <a href="http://contracts.disclosurelo.gs">Government Contracts and Spending</a>
-
-                            </p>
-                            <ul class="nav">
-                                <li><a href="agency.php">By Agency</a></li>
-                                <li><a href="date.php">By Date</a></li>
-                                <li><a href="disclogsList.php">List of Disclosure Logs</a></li>
-                                <li><a href="about.php">About</a></li>
-
-                            </ul>
-                        </div><!--/.nav-collapse -->
-                    </div>
-                </div>
-            </div>
-            <div class="container">
-                <?php
+        <!-- Le styles -->
+        <link href="css/bootstrap.min.css" rel="stylesheet">
+        <style type="text/css">
+            body {
+                padding-top: 60px;
+                padding-bottom: 40px;
             }
 
-            function include_footer_documents() {
-                global $ENV;
-                ?>
-            </div> <!-- /container -->
-            <hr>
+            .sidebar-nav {
+                padding: 9px 0;
+            }
+        </style>
+        <link href="css/bootstrap-responsive.min.css" rel="stylesheet">
 
-            <footer>
-                <p>Not affiliated with or endorsed by any government agency.</p>
-            </footer>
-              <?php
-            if ($ENV != "DEV") {
-                echo "<script type='text/javascript'>
+        <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
+        <!--[if lt IE 9]>
+        <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
+        <![endif]-->
+        <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
+
+        <!-- All JavaScript at the bottom, except this Modernizr build.
+             Modernizr enables HTML5 elements & feature detects for optimal performance.
+             Create your own custom Modernizr build: www.modernizr.com/download/
+        <script src="js/libs/modernizr-2.5.3.min.js"></script>-->
+        <script src="js/jquery.js"></script>
+        <script type="text/javascript" src="js/flotr2.min.js"></script>
+
+    </head>
+    <body>
+    <div class="navbar navbar-inverse navbar-fixed-top">
+        <div class="navbar-inner">
+            <div class="container-fluid">
+                <a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+                    <span class="icon-bar"></span>
+                    <span class="icon-bar"></span>
+                    <span class="icon-bar"></span>
+                </a>
+                <a class="brand" href="#">Australian Disclosure Logs</a>
+
+                <div class="nav-collapse collapse">
+                    <p class="navbar-text pull-right">
+                        <small>
+                            Subsites on:
+                        </small>
+                        <a href="http://orgs.disclosurelo.gs">Government Agencies</a>
+                        • <a href="http://lobbyists.disclosurelo.gs">Political Lobbyists</a>
+                        • <a href="http://contracts.disclosurelo.gs">Government Contracts and Spending</a>
+
+                    </p>
+                    <ul class="nav">
+                        <li><a href="agency.php">By Agency</a></li>
+                        <li><a href="date.php">By Date</a></li>
+                        <li><a href="disclogsList.php">List of Disclosure Logs</a></li>
+                        <li><a href="about.php">About</a></li>
+
+                    </ul>
+                </div>
+                <!--/.nav-collapse -->
+            </div>
+        </div>
+    </div>
+    <div class="container">
+<?php
+}
+
+function include_footer_documents()
+{
+    global $ENV;
+    ?>
+    </div> <!-- /container -->
+    <hr>
+
+    <footer>
+        <p>Not affiliated with or endorsed by any government agency.</p>
+    </footer>
+    <?php
+    if ($ENV != "DEV") {
+        echo "<script type='text/javascript'>
 
                 var _gaq = _gaq || [];
                 _gaq.push(['_setAccount', 'UA-12341040-4']);
@@ -113,32 +121,33 @@
                 })();
 
             </script>";
-            }
-            ?>
-            <!-- Le javascript
-            ================================================== -->
-            <!-- Placed at the end of the document so the pages load faster -->
-            <!--
-                <script src="js/bootstrap-transition.js"></script>
-                <script src="js/bootstrap-alert.js"></script>
-                <script src="js/bootstrap-modal.js"></script>
-                <script src="js/bootstrap-dropdown.js"></script>
-                <script src="js/bootstrap-scrollspy.js"></script>
-                <script src="js/bootstrap-tab.js"></script>
-                <script src="js/bootstrap-tooltip.js"></script>
-                <script src="js/bootstrap-popover.js"></script>
-                <script src="js/bootstrap-button.js"></script>
-                <script src="js/bootstrap-collapse.js"></script>
-                <script src="js/bootstrap-carousel.js"></script>
-                <script src="js/bootstrap-typeahead.js"></script>-->
+    }
+    ?>
+    <!-- Le javascript
+    ================================================== -->
+    <!-- Placed at the end of the document so the pages load faster -->
+    <!--
+        <script src="js/bootstrap-transition.js"></script>
+        <script src="js/bootstrap-alert.js"></script>
+        <script src="js/bootstrap-modal.js"></script>
+        <script src="js/bootstrap-dropdown.js"></script>
+        <script src="js/bootstrap-scrollspy.js"></script>
+        <script src="js/bootstrap-tab.js"></script>
+        <script src="js/bootstrap-tooltip.js"></script>
+        <script src="js/bootstrap-popover.js"></script>
+        <script src="js/bootstrap-button.js"></script>
+        <script src="js/bootstrap-collapse.js"></script>
+        <script src="js/bootstrap-carousel.js"></script>
+        <script src="js/bootstrap-typeahead.js"></script>-->
 
 
-        </body>
+    </body>
     </html>
-    <?php
+<?php
 }
 
-function truncate($string, $length, $stopanywhere = false) {
+function truncate($string, $length, $stopanywhere = false)
+{
     //truncates a string to a certain char length, stopping on a word if not specified otherwise.
     if (strlen($string) > $length) {
         //limit hit!
@@ -154,14 +163,15 @@
     return $string;
 }
 
-function displayLogEntry($row, $idtoname) {
+function displayLogEntry($row, $idtoname)
+{
     $result = "";
     $result .= '<div itemscope itemtype="http://schema.org/Article">';
-    $result .= '<h2><a href="http://disclosurelo.gs/view.php?id='.$row->value->_id.'"> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>";
+    $result .= '<h2><a href="http://disclosurelo.gs/view.php?id=' . $row->value->_id . '"> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>";
     $result .= ' (<span itemprop="author publisher creator">' . $idtoname[$row->value->agencyID] . '</span>)</a></h2>';
     $result .= "<p itemprop='description articleBody text'> Title: " . $row->value->title . "<br/>";
     if (isset($row->value->description)) {
-        $result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "",trim($row->value->description)));
+        $result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "", trim($row->value->description)));
     }
     if (isset($row->value->notes)) {
         $result .= " <br>Note: " . $row->value->notes;
@@ -171,7 +181,7 @@
     if (isset($row->value->links)) {
         $result .= '<h3>Links/Documents</h3><ul itemprop="associatedMedia">';
         foreach ($row->value->links as $link) {
-            $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href="' . htmlspecialchars ($link) . '" itemprop="url contentURL">' . htmlspecialchars ( $link) . "</a></li>";
+            $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href="' . htmlspecialchars($link) . '" itemprop="url contentURL">' . htmlspecialchars($link) . "</a></li>";
         }
 
         $result .= "</ul>";

--- a/documents/view.php
+++ b/documents/view.php
@@ -14,11 +14,11 @@
 }
 $foidocsdb = $server->get_db('disclosr-foidocuments');
 try {
-  $obj = new stdClass();
+    $obj = new stdClass();
     $obj->value = $foidocsdb->get($_REQUEST['id']);
     include_header_documents($obj->value->title);
 
-echo displayLogEntry($obj,$idtoname);
+    echo displayLogEntry($obj, $idtoname);
 
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);

--- a/documents/viewDocument.php
+++ b/documents/viewDocument.php
@@ -4,7 +4,7 @@
 $hash = $_REQUEST['hash'];
 $docsdb = $server->get_db('disclosr-documents');
 try {
-$doc = object_to_array($docsdb->get($hash));
+    $doc = object_to_array($docsdb->get($hash));
 
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);
@@ -15,7 +15,7 @@
 $attachments = $doc['_attachments'];
 $attachment_filenames = array_keys($attachments);
 //print_r($attachments);
-$url = $serverAddr.'disclosr-documents/'.$hash.'/'.urlencode($attachment_filenames[0]);
+$url = $serverAddr . 'disclosr-documents/' . $hash . '/' . urlencode($attachment_filenames[0]);
 //echo $url;
 $request = Requests::get($url);
 echo ($request->body);