beginning datagov scraper
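Add documents/datagov.py, a first-pass scraper for data.gov.au dataset metadata, and reformat the HTML, PHP, JavaScript, XML and Python sources under documents/. GenericHTMLDisclogScraper now stores the HTML diff in a separate "diff" field instead of appending it to the description.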


Former-commit-id: a8775a64a3cdda480e4433742ed7ea6ca6a437ef

--- a/documents/404.html
+++ b/documents/404.html
@@ -1,44 +1,156 @@
 <!doctype html>
 <html lang="en">
 <head>
-  <meta charset="utf-8">
-  <title>Page Not Found :(</title>
-  <style>
-    ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
-    ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
-    html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
-    html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
-    body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
-    h1 { margin: 0 10px; font-size: 50px; text-align: center; }
-    h1 span { color: #bbb; }
-    h3 { margin: 1.5em 0 0.5em; }
-    p { margin: 1em 0; }
-    ul { padding: 0 0 0 40px; margin: 1em 0; }
-    .container { max-width: 380px; _width: 380px; margin: 0 auto; }
-    /* google search */
-    #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
-    #goog-fixurl form { margin: 0; }
-    #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
-    #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
-    #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
-    #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
-    #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
-    input::-moz-focus-inner { padding: 0; border: 0; }
-  </style>
+    <meta charset="utf-8">
+    <title>Page Not Found :(</title>
+    <style>
+        ::-moz-selection {
+            background: #fe57a1;
+            color: #fff;
+            text-shadow: none;
+        }
+
+        ::selection {
+            background: #fe57a1;
+            color: #fff;
+            text-shadow: none;
+        }
+
+        html {
+            padding: 30px 10px;
+            font-size: 20px;
+            line-height: 1.4;
+            color: #737373;
+            background: #f0f0f0;
+            -webkit-text-size-adjust: 100%;
+            -ms-text-size-adjust: 100%;
+        }
+
+        html, input {
+            font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+        }
+
+        body {
+            max-width: 500px;
+            _width: 500px;
+            padding: 30px 20px 50px;
+            border: 1px solid #b3b3b3;
+            border-radius: 4px;
+            margin: 0 auto;
+            box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff;
+            background: #fcfcfc;
+        }
+
+        h1 {
+            margin: 0 10px;
+            font-size: 50px;
+            text-align: center;
+        }
+
+        h1 span {
+            color: #bbb;
+        }
+
+        h3 {
+            margin: 1.5em 0 0.5em;
+        }
+
+        p {
+            margin: 1em 0;
+        }
+
+        ul {
+            padding: 0 0 0 40px;
+            margin: 1em 0;
+        }
+
+        .container {
+            max-width: 380px;
+            _width: 380px;
+            margin: 0 auto;
+        }
+
+            /* google search */
+        #goog-fixurl ul {
+            list-style: none;
+            padding: 0;
+            margin: 0;
+        }
+
+        #goog-fixurl form {
+            margin: 0;
+        }
+
+        #goog-wm-qt, #goog-wm-sb {
+            border: 1px solid #bbb;
+            font-size: 16px;
+            line-height: normal;
+            vertical-align: top;
+            color: #444;
+            border-radius: 2px;
+        }
+
+        #goog-wm-qt {
+            width: 220px;
+            height: 20px;
+            padding: 5px;
+            margin: 5px 10px 0 0;
+            box-shadow: inset 0 1px 1px #ccc;
+        }
+
+        #goog-wm-sb {
+            display: inline-block;
+            height: 32px;
+            padding: 0 10px;
+            margin: 5px 0 0;
+            white-space: nowrap;
+            cursor: pointer;
+            background-color: #f5f5f5;
+            background-image: -webkit-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            background-image: -moz-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            background-image: -ms-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            background-image: -o-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+            -webkit-appearance: none;
+            -moz-appearance: none;
+            appearance: none;
+            *overflow: visible;
+            *display: inline;
+            *zoom: 1;
+        }
+
+        #goog-wm-sb:hover, #goog-wm-sb:focus {
+            border-color: #aaa;
+            box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1);
+            background-color: #f8f8f8;
+        }
+
+        #goog-wm-qt:focus, #goog-wm-sb:focus {
+            border-color: #105cb6;
+            outline: 0;
+            color: #222;
+        }
+
+        input::-moz-focus-inner {
+            padding: 0;
+            border: 0;
+        }
+    </style>
 </head>
 <body>
-  <div class="container">
+<div class="container">
     <h1>Not found <span>:(</span></h1>
+
     <p>Sorry, but the page you were trying to view does not exist.</p>
+
     <p>It looks like this was the result of either:</p>
     <ul>
-      <li>a mistyped address</li>
-      <li>an out-of-date link</li>
+        <li>a mistyped address</li>
+        <li>an out-of-date link</li>
     </ul>
     <script>
-      var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+        var GOOG_FIXURL_LANG = (navigator.language || '').slice(0, 2), GOOG_FIXURL_SITE = location.host;
     </script>
     <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
-  </div>
+</div>
 
 

--- a/documents/agency.php
+++ b/documents/agency.php
@@ -12,8 +12,11 @@
 include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency'));
 $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
 ?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+    <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act
+        in one place!
+    </div>
+    <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+    <br>
 <?php
 try {
     if ($_REQUEST['id']) {

--- a/documents/charts.php
+++ b/documents/charts.php
@@ -18,144 +18,145 @@
 <div id="bydate" style="width:1000px;height:300px;"></div>
 <div id="byagency" style="width:1200px;height:300px;"></div>
 <script id="source">
-    window.onload = function() {
-        $(document).ready(function() {
-  var
-    d1    = [],
-    options1,
-     o1;
+    window.onload = function () {
+        $(document).ready(function () {
+            var
+                d1 = [],
+                options1,
+                o1;
 
-<?php
-    try {
-        $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
+            <?php
+                try {
+                    $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
 
 
-        $dataValues = Array();
-        foreach ($rows as $row) {
-            $dataValues[$row->key] = $row->value;
-        }
-        $i = 0;
-        ksort($dataValues);
-        foreach ($dataValues as $key => $value) {
-$date = date_create_from_format('Y-m-d', $key);
-if (date_format($date, 'U') != "") {
-            echo "       d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
-//            echo "        emplabels.push('$key');" . PHP_EOL;
-            $i++;
-}
-        }
-    } catch (SetteeRestClientException $e) {
-        setteErrorHandler($e);
-    }
-    ?>
+                    $dataValues = Array();
+                    foreach ($rows as $row) {
+                        $dataValues[$row->key] = $row->value;
+                    }
+                    $i = 0;
+                    ksort($dataValues);
+                    foreach ($dataValues as $key => $value) {
+            $date = date_create_from_format('Y-m-d', $key);
+            if (date_format($date, 'U') != "") {
+                        echo "       d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
+            //            echo "        emplabels.push('$key');" . PHP_EOL;
+                        $i++;
+            }
+                    }
+                } catch (SetteeRestClientException $e) {
+                    setteErrorHandler($e);
+                }
+                ?>
 
 
-        
-  options1 = {
-    xaxis : {
-      mode : 'time', 
-      labelsAngle : 45
-    },
-    selection : {
-      mode : 'x'
-    },
-    HtmlText : false,
-    title : 'Time'
-  };
-        
-  // Draw graph with default options, overwriting with passed options
-  function drawGraph (opts) {
+            options1 = {
+                xaxis: {
+                    mode: 'time',
+                    labelsAngle: 45
+                },
+                selection: {
+                    mode: 'x'
+                },
+                HtmlText: false,
+                title: 'Time'
+            };
 
-    // Clone the options, so the 'options' variable always keeps intact.
-    o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
+            // Draw graph with default options, overwriting with passed options
+            function drawGraph(opts) {
 
-    // Return a new graph.
-    return Flotr.draw(
-      document.getElementById("bydate"),
-      [ d1 ],
-      o1
-    );
-  }
+                // Clone the options, so the 'options' variable always keeps intact.
+                o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
 
-  graph = drawGraph();      
-        
-  Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function(area){
-    // Draw selected area
-    graph = drawGraph({
-      xaxis : { min : area.x1, max : area.x2, mode : 'time', labelsAngle : 45 },
-      yaxis : { min : area.y1, max : area.y2 }
-    });
-  });
-        
-  // When graph is clicked, draw the graph with default area.
-  Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () { graph = drawGraph(); });
+                // Return a new graph.
+                return Flotr.draw(
+                    document.getElementById("bydate"),
+                    [ d1 ],
+                    o1
+                );
+            }
+
+            graph = drawGraph();
+
+            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
+                // Draw selected area
+                graph = drawGraph({
+                    xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
+                    yaxis: { min: area.y1, max: area.y2 }
+                });
+            });
+
+            // When graph is clicked, draw the graph with default area.
+            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
+                graph = drawGraph();
+            });
 
         });
-}; 
+    };
 
-var d2 = [];
-var agencylabels = [];
-function agencytrackformatter(obj) {
-                   
-                        return agencylabels[Math.floor(obj.x)] +" = "+obj.y;
-                     
-                }
-                function agencytickformatter(val, axis) {
-                    if (agencylabels[Math.floor(val)]) {
-                        return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">'+(agencylabels[Math.floor(val)])+"</b>";
-                     
-                    } else {
-                        return "";
-                    }
-                }
-<?php
-    try {
-        $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
+    var d2 = [];
+    var agencylabels = [];
+    function agencytrackformatter(obj) {
+
+        return agencylabels[Math.floor(obj.x)] + " = " + obj.y;
+
+    }
+    function agencytickformatter(val, axis) {
+        if (agencylabels[Math.floor(val)]) {
+            return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">' + (agencylabels[Math.floor(val)]) + "</b>";
+
+        } else {
+            return "";
+        }
+    }
+    <?php
+        try {
+            $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
 
 
-        $dataValues = Array();
-        $i = 0;
-        foreach ($rows as $row) {
-            echo "       d2.push([".$i.", $row->value]);" . PHP_EOL;
-            echo "       agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
-            
-            $i++;
+            $dataValues = Array();
+            $i = 0;
+            foreach ($rows as $row) {
+                echo "       d2.push([".$i.", $row->value]);" . PHP_EOL;
+                echo "       agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
+
+                $i++;
+            }
+        } catch (SetteeRestClientException $e) {
+            setteErrorHandler($e);
         }
-    } catch (SetteeRestClientException $e) {
-        setteErrorHandler($e);
-    }
-    ?>
-  // Draw the graph
-  Flotr.draw(
-   document.getElementById("byagency"),
-    [d2],
-    {
-      bars : {
-        show : true,
-        horizontal : false,
-        shadowSize : 0,
-        barWidth : 0.5
-      },
-mouse : {
-                        track : true,
-                        relative : true,
-                    trackFormatter: agencytrackformatter
-                    },
-      yaxis : {
-        min : 0,
-        autoscaleMargin : 1
-      },
-      xaxis: {
-                    minorTickFreq: 1,
-                    noTicks: agencylabels.length,
-                    showMinorLabels: true,
-                        tickFormatter: agencytickformatter
-                    },
-                    legend: {
-                        show: false
-                    }
-    }
-  );
+        ?>
+    // Draw the graph
+    Flotr.draw(
+        document.getElementById("byagency"),
+        [d2],
+        {
+            bars: {
+                show: true,
+                horizontal: false,
+                shadowSize: 0,
+                barWidth: 0.5
+            },
+            mouse: {
+                track: true,
+                relative: true,
+                trackFormatter: agencytrackformatter
+            },
+            yaxis: {
+                min: 0,
+                autoscaleMargin: 1
+            },
+            xaxis: {
+                minorTickFreq: 1,
+                noTicks: agencylabels.length,
+                showMinorLabels: true,
+                tickFormatter: agencytickformatter
+            },
+            legend: {
+                show: false
+            }
+        }
+    );
 </script>
 
 <?php

--- a/documents/crossdomain.xml
+++ b/documents/crossdomain.xml
@@ -3,24 +3,23 @@
 <cross-domain-policy>
 
 
-<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+    <!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
 
-<!-- Most restrictive policy: -->
-	<site-control permitted-cross-domain-policies="none"/>
+    <!-- Most restrictive policy: -->
+    <site-control permitted-cross-domain-policies="none"/>
 
 
-
-<!-- Least restrictive policy: -->
-<!--
-	<site-control permitted-cross-domain-policies="all"/>
-	<allow-access-from domain="*" to-ports="*" secure="false"/>
-	<allow-http-request-headers-from domain="*" headers="*" secure="false"/>
--->
-<!--
-  If you host a crossdomain.xml file with allow-access-from domain="*"
-  and don’t understand all of the points described here, you probably
-  have a nasty security vulnerability. ~ simon willison
--->
+    <!-- Least restrictive policy: -->
+    <!--
+        <site-control permitted-cross-domain-policies="all"/>
+        <allow-access-from domain="*" to-ports="*" secure="false"/>
+        <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+    -->
+    <!--
+      If you host a crossdomain.xml file with allow-access-from domain="*"
+      and don’t understand all of the points described here, you probably
+      have a nasty security vulnerability. ~ simon willison
+    -->
 
 </cross-domain-policy>
 

--- /dev/null
+++ b/documents/datagov.py
@@ -1,1 +1,48 @@
+import sys, os
 
+import scrape
+from bs4 import BeautifulSoup
+
+
+listurl = "http://data.gov.au/data/"
+(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
+    listurl, "data", "AGIMO")
+soup = BeautifulSoup(datasetlisthtml)
+for atag in soup.find_all(class_='result-title'):
+    if atag.has_key('href'):
+        url = scrape.fullurl(listurl, atag['href'])
+        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+            url, "data", "AGIMO")
+        hash = scrape.mkhash(scrape.canonurl(url))
+        doc = scrape.docsdb.get(hash)
+        if "metadata" not in doc.keys():
+            doc['metadata'] = {}
+        soup = BeautifulSoup(html)
+        for metatag in soup.find_all('meta'):
+            if metatag.has_key('name'):
+                doc['metadata'][metatag['name']] = metatag['content']
+        for list in soup.find_all('dl'):
+            last_title = ""
+            for child in list.children:
+                if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+                    if child.name == 'dt' and child.string != None:
+                        last_title = child.string.strip()
+                    if child.name == 'dd':
+                        #print last_title
+                        if last_title == "Download":
+                            for item in child.find_all("li"):
+                                link = item.find("a")
+                                format = item.find(property="dc:format")
+                                linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
+                                    "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                doc['metadata'][last_title] = linkobj
+
+                        else:
+                            atags = child.find_all('a')
+                            if len(atags) < 2:
+                                [s.extract() for s in child(class_='viewAll')]
+                                doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+                            else:
+                                doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
+        print doc['metadata']
+        sys.exit("ggg")

--- a/documents/date.php
+++ b/documents/date.php
@@ -5,8 +5,11 @@
 include_once('../include/common.inc.php');
 $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
 ?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
+    one place!
+</div>
+<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+<br>
 <?php
 /*$agenciesdb = $server->get_db('disclosr-agencies');
 

--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -34,10 +34,10 @@
                 if (isset($row->value->FOIDocumentsURL)) {
                     $disclogs++;
                     echo '<a href="' . $row->value->FOIDocumentsURL . '">'
-                    . $row->value->FOIDocumentsURL . '</a>';
+                        . $row->value->FOIDocumentsURL . '</a>';
                     if ($ENV == "DEV")
                         echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
-                        . 'view local copy</a>)</small>';
+                            . 'view local copy</a>)</small>';
                 } else {
                     echo "<font color='red'><abbr title='No'>✘</abbr></font>";
                 }
@@ -49,11 +49,11 @@
                     } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
                         if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
                             echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
-                        $yellow++;
+                            $yellow++;
                         } else {
                             echo file_get_contents("./scrapers/" . $row->id . '.txt');
-                        echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
-                        $orange++;
+                            echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
+                            $orange++;
                         }
                     } else {
                         echo "<font color='red'><abbr title='No'>✘</abbr></font>";
@@ -69,7 +69,7 @@
 }
 echo "</table>";
 echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; "
- . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
+    . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
 
 include_footer_documents();
 ?>

--- a/documents/exportAll.csv.php
+++ b/documents/exportAll.csv.php
@@ -39,7 +39,7 @@
                         if (is_array($agencyArray[$fieldName])) {
                             $row[] = implode(";", $agencyArray[$fieldName]);
                         } else {
-                            $row[] = str_replace(Array("\n", '"', "\t"),"",$agencyArray[$fieldName]);
+                            $row[] = str_replace(Array("\n", '"', "\t"), "", $agencyArray[$fieldName]);
                         }
                     } else {
                         $row[] = "";

--- /dev/null
+++ b/documents/gazette.py

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,5 +1,6 @@
 import sys
 import os
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
@@ -51,12 +52,12 @@
         """ do the scraping """
         return
 
+
 class GenericHTMLDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
-             self.getURL(), "foidocuments", self.getAgencyID())
+            self.getURL(), "foidocuments", self.getAgencyID())
         content = rcontent
         dochash = scrape.mkhash(content)
         doc = foidocsdb.get(dochash)
@@ -66,33 +67,32 @@
             last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
             if last_attach != None:
                 html_diff = difflib.HtmlDiff()
-                description = description + "\nChanges: "
-                description = description + html_diff.make_table(last_attach.read().split('\n'),
-                           content.split('\n'))
+                diff = html_diff.make_table(last_attach.read().split('\n'),
+                    content.split('\n'))
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-            , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+                , 'url': self.getURL(), 'docID': dochash,
+                   "date": edate, "title": "Disclosure Log Updated", "description": description, "diff": diff}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
+
 class GenericPDFDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-             self.getURL(), "foidocuments", self.getAgencyID())
+            self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
         outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
-             laparams=laparams)
+            laparams=laparams)
         fp = StringIO()
         fp.write(content)
 
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
-             check_extractable=True)
+            check_extractable=True)
         description = outfp.getvalue()
         fp.close()
         device.close()
@@ -103,19 +103,18 @@
             print "saving " + dochash
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-            , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+                , 'url': self.getURL(), 'docID': dochash,
+                   "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 
 class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
-        , self.getURL(), "foidocuments", self.getAgencyID())
+            , self.getURL(), "foidocuments", self.getAgencyID())
         mydoc = zipfile.ZipFile(file)
         xmlcontent = mydoc.read('word/document.xml')
         document = etree.fromstring(xmlcontent)
@@ -125,7 +124,7 @@
         newparatextlist = []
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
-        ## Print our documnts test with two newlines under each paragraph
+            ## Print our document's text with two newlines under each paragraph
         description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
@@ -134,42 +133,42 @@
             print "saving " + dochash
             edate = time().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-            , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+                , 'url': self.getURL(), 'docID': dochash,
+                   "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
-
-        def doScrape(self):
-            foidocsdb = scrape.couch['disclosr-foidocuments']
-            (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-                 self.getURL(), "foidocuments", self.getAgencyID())
-            feed = feedparser.parse(content)
-            for entry in feed.entries:
-                #print entry
-                print entry.id
-                dochash = scrape.mkhash(entry.id)
-                doc = foidocsdb.get(dochash)
-                #print doc
-                if doc is None:
-                    print "saving " + dochash
-                    edate = datetime.fromtimestamp(
-                        mktime(entry.published_parsed)).strftime("%Y-%m-%d")
-