--- a/.gitmodules
+++ b/.gitmodules
@@ -25,4 +25,7 @@
 [submodule "lib/querypath"]
 	path = lib/querypath
 	url = https://github.com/technosophos/querypath.git
+[submodule "lib/amon-php"]
+	path = lib/amon-php
+	url = https://github.com/martinrusev/amon-php.git
 


--- a/about.php
+++ b/about.php
@@ -10,8 +10,10 @@
 Disclo.gs is a project to monitor Australian Federal Government agencies 
 compliance with their <a href="http://www.oaic.gov.au/publications/other_operational/foi_policy_frequently_asked_questions.html#_Toc291837571">"proactive disclosure requirements" to make a transparency league table as suggested by gov2 taskforce http://gov2.net.au/blog/2009/09/19/a-league-ladder-of-psi-openness/</a>.
 <h2> Attributions </h2>
-National Archives of Australia, Australian Governments’ Interactive Functions Thesaurus, 2nd edition, September 2005, published at http://www.naa.gov.au/recordkeeping/thesaurus/index.htm.
-
+National Archives of Australia, Australian Governments’ Interactive Functions Thesaurus, 2nd edition, September 2005, published at http://www.naa.gov.au/recordkeeping/thesaurus/index.htm <br/>
+data.gov.au http://data.gov.au/dataset/directory-gov-au-full-data-export/ <br/>
+directory.gov.au <br/>
+australia.gov.au http://australia.gov.au/about/copyright <br/>
 <h2> Open everything </h2>
 All documents released CC-BY 3 AU
 Open source git @


--- a/alaveteli/exportAgencies.csv.php
+++ b/alaveteli/exportAgencies.csv.php
@@ -1,20 +1,6 @@
 <?php
 
 include_once("../include/common.inc.php");
-
-function shortName($name) {
-    $name = trim($name);
-    if (strstr($name,"Minister ") || strstr($name,"Treasurer") || strstr($name,"Parliamentary Secretary")) {
-        $badWords = Array ("Assisting the Prime Minister on","Assisting on"," the "," of "," for "," on "," and "," to ",","," ","'","`");
-        return str_replace($badWords,"",$name);
-    }
-            
-    else {
-    $out = Array();
-    preg_match_all('/[A-Z]/', $name, $out);
-    return implode("", $out[0]);
-    }
-}
 
 setlocale(LC_CTYPE, 'C');
 
@@ -63,45 +49,36 @@
                 $row = Array();
                 $row["#id"] = $agency->id;
                 $row["name"] = trim($agency->value->name);
-                if (isset($agency->value->foiEmail)) {
-                    $row["request_email"] = $agency->value->foiEmail;
-                } else {
-                    if ($agency->value->orgType == "FMA-DepartmentOfState") {
-                        $row["request_email"] = "foi@" . GetDomain($agency->value->website);
-                    } else {
-                        $row["request_email"] = $foiEmail[$agency->value->parentOrg];
+                $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
+                $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
+                $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");
+
+                $otherBodies = Array();
+                if (isset($agency->value->foiBodies)) {
+                    $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
+                }
+                if (isset($agency->value->positions)) {
+                    $positions = Array();
+                    foreach ($agency->value->positions as $position) {
+                        $positions[] = "Office of the ".$position;
                     }
+                    $otherBodies = array_merge($otherBodies, $positions);
                 }
-                if (isset($agency->value->shortName)) {
-                    $row["short_name"] = $agency->value->shortName;
-                } else {
-                    $row["short_name"] = shortName($agency->value->name);
+                sort($otherBodies);
+                if (count($otherBodies) > 0) {
+                    $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
                 }
-                $row["notes"] = "";
+
                 $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
                 $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
                 if ($agency->value->orgType == "FMA-DepartmentOfState") {
-                    $row["tag_string"] = $tag[$agency->value->_id] . " " . $agency->value->orgType;
+                    $row["tag_string"] = $tag[$agency->value->_id];
                 } else {
-                    $row["tag_string"] = $tag[$agency->value->parentOrg] . " " . $agency->value->orgType;
+                    $row["tag_string"] = $tag[$agency->value->parentOrg];
                 }
-
+                $row["tag_string"] .= " " . $agency->value->orgType;
+                $row["tag_string"] .= " federal";
                 fputcsv($fp, array_values($row));
-
-                if (isset($agency->value->foiBodies)) {
-                    foreach ($agency->value->foiBodies as $foiBody) {
-                        $row['name'] = iconv("UTF-8", "ASCII//TRANSLIT",$foiBody);
-                        $row["short_name"] = shortName($foiBody);
-                        fputcsv($fp, array_values($row));
-                    }
-                }
-                if (isset($agency->value->positions)) {
-                    foreach ($agency->value->positions as $position) {
-                        $row['name'] = iconv("UTF-8", "ASCII//TRANSLIT",$position);
-                        $row["short_name"] = shortName($position);
-                        fputcsv($fp, array_values($row));
-                    }
-                }
             }
         }
     } catch (SetteeRestClientException $e) {


--- /dev/null
+++ b/documents/404.html
@@ -1,1 +1,44 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <title>Page Not Found :(</title>
+  <style>
+    ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
+    ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
+    html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
+    html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
+    body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
+    h1 { margin: 0 10px; font-size: 50px; text-align: center; }
+    h1 span { color: #bbb; }
+    h3 { margin: 1.5em 0 0.5em; }
+    p { margin: 1em 0; }
+    ul { padding: 0 0 0 40px; margin: 1em 0; }
+    .container { max-width: 380px; _width: 380px; margin: 0 auto; }
+    /* google search */
+    #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
+    #goog-fixurl form { margin: 0; }
+    #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
+    #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
+    #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
+    #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
+    #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
+    input::-moz-focus-inner { padding: 0; border: 0; }
+  </style>
+</head>
+<body>
+  <div class="container">
+    <h1>Not found <span>:(</span></h1>
+    <p>Sorry, but the page you were trying to view does not exist.</p>
+    <p>It looks like this was the result of either:</p>
+    <ul>
+      <li>a mistyped address</li>
+      <li>an out-of-date link</li>
+    </ul>
+    <script>
+      var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+    </script>
+    <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
+  </div>
 
+


--- /dev/null
+++ b/documents/checkScrapers.php
@@ -1,1 +1,58 @@
+<?php
 
+/*
+ * Lists every agency with its disclosure-log URL and whether a scraper exists for it.
+ *
+ * For each agency that has a FOIDocumentsURL the last column shows a green tick when
+ * ./scrapers/<agency id>.py exists, "pass" when only ./scrapers/<agency id>.txt exists,
+ * and a red cross otherwise. Agencies without a disclosure-log URL get a red cross in
+ * the URL column and an empty scraper column.
+ */
+include_once('../include/common.inc.php');
+// NOTE(review): this title does not describe a scraper report; it looks copied from another page. Confirm.
+include_header('Webserver and Accessiblity');
+
+echo "<table>
+    <tr><th>name</th><th>disclog</th><th>scraper?</th></tr>";
+$agenciesdb = $server->get_db('disclosr-agencies');
+$docsdb = $server->get_db('disclosr-documents');
+try {
+    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
+
+    if ($rows) {
+        foreach ($rows as $row) {
+            $hasDisclog = isset($row->value->FOIDocumentsURL);
+
+            // Agency names, ids and URLs come from the database, so escape them before emitting HTML.
+            echo "<tr><td>" . htmlspecialchars($row->value->name, ENT_QUOTES, 'UTF-8')
+                . " (" . htmlspecialchars($row->id, ENT_QUOTES, 'UTF-8') . ")</td>\n";
+
+            echo "<td>";
+            if ($hasDisclog) {
+                // The hash parameter is the MD5 of the raw (unescaped) URL; md5() returns hex, so it needs no escaping.
+                echo '<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+                    . htmlspecialchars($row->value->FOIDocumentsURL, ENT_QUOTES, 'UTF-8') . '</a>';
+            } else {
+                echo "<font color='red'>✘</font>";
+            }
+
+            echo "</td>\n<td>";
+            if ($hasDisclog) {
+                if (file_exists("./scrapers/" . $row->id . '.py')) {
+                    echo "<font color='green'>✔</font>";
+                } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+                    echo "pass";
+                } else {
+                    echo "<font color='red'>✘</font>";
+                }
+            }
+            echo "</td></tr>\n";
+        }
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+// Close the table opened above (the closing tag was previously never emitted).
+echo "</table>";
+include_footer();
+?>


--- /dev/null
+++ b/documents/crossdomain.xml
@@ -1,1 +1,26 @@
+<?xml version="1.0"?>
+<!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd">
+<cross-domain-policy>
 
+
+<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+
+<!-- Most restrictive policy: -->
+	<site-control permitted-cross-domain-policies="none"/>
+
+
+
+<!-- Least restrictive policy: -->
+<!--
+	<site-control permitted-cross-domain-policies="all"/>
+	<allow-access-from domain="*" to-ports="*" secure="false"/>
+	<allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+-->
+<!--
+  If you host a crossdomain.xml file with allow-access-from domain="*"
+  and don’t understand all of the points described here, you probably
+  have a nasty security vulnerability. ~ simon willison
+-->
+
+</cross-domain-policy>
+


--- /dev/null
+++ b/documents/genericScrapers.py
@@ -1,1 +1,61 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import scrape
 
+from bs4 import BeautifulSoup
+import abc
+
+class GenericOAICDisclogScraper(object):
+	__metaclass__ = abc.ABCMeta
+	@abc.abstractmethod
+	def getAgencyID(self):
+		""" disclosr agency id """
+		return
+
+	@abc.abstractmethod
+	def getURL(self):
+		""" disclog URL"""
+		return
+
+	@abc.abstractmethod
+	def getColumns(self,columns):
+		""" rearranges columns if required """
+		return
+
+	def doScrape(self):
+		foidocsdb = scrape.couch['disclosr-foidocuments']
+		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+		if content != None:
+			if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+			# http://www.crummy.com/software/BeautifulSoup/documentation.html
+				soup = BeautifulSoup(content)
+				for row in soup.table.find_all('tr'):
+					columns = row.find_all('td')
+					if len(columns) == 5:
+						(id, date, description, title, notes) = self.getColumns(columns)
+						print id.string
+						hash = scrape.mkhash(url+id.string)
+						links = []
+						for atag in row.find_all("a"):
+							if atag.has_key('href'):
+								links.append(scrape.fullurl(url,atag['href']))
+						doc = foidocsdb.get(hash)
+						descriptiontxt = ""
+						for string in description.stripped_strings:
+							descriptiontxt = descriptiontxt + string
+							
+						if doc == None:
+							print "saving"
+							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
+			 				 "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
+							foidocsdb.save(doc)
+						else:
+							print "already saved"
+					
+					elif len(row.find_all('th')) == 5:
+						print "header row"
+					
+					else:
+						print "ERROR number of columns incorrect"
+						print row
+


--- /dev/null
+++ b/documents/google676a414ad086cefb.html
@@ -1,1 +1,2 @@
+google-site-verification: google676a414ad086cefb.html
 


--- /dev/null
+++ b/documents/googlebcce906c6b666bb8.html
@@ -1,1 +1,2 @@
+google-site-verification: googlebcce906c6b666bb8.html
 


--- /dev/null
+++ b/documents/humans.txt
@@ -1,1 +1,44 @@
+/* the humans responsible & colophon */
+/* humanstxt.org */
 
+
+/* TEAM */
+  <your title>: <your name>
+  Site:
+  Twitter:
+  Location:
+
+/* THANKS */
+  Names (& URL):
+
+/* SITE */
+  Standards: HTML5, CSS3
+  Components: Modernizr, jQuery
+  Software:
+
+
+
+                               -o/-
+                               +oo//-
+                              :ooo+//:
+                             -ooooo///-
+                             /oooooo//:
+                            :ooooooo+//-
+                           -+oooooooo///-
+           -://////////////+oooooooooo++////////////::
+            :+ooooooooooooooooooooooooooooooooooooo+:::-
+              -/+ooooooooooooooooooooooooooooooo+/::////:-
+                -:+oooooooooooooooooooooooooooo/::///////:-
+                  --/+ooooooooooooooooooooo+::://////:-
+                     -:+ooooooooooooooooo+:://////:--
+                       /ooooooooooooooooo+//////:-
+                      -ooooooooooooooooooo////-
+                      /ooooooooo+oooooooooo//:
+                     :ooooooo+/::/+oooooooo+//-
+                    -oooooo/::///////+oooooo///-
+                    /ooo+::://////:---:/+oooo//:
+                   -o+/::///////:-      -:/+o+//-
+                   :-:///////:-            -:/://
+                     -////:-                 --//:
+                       --                       -:
+


--- /dev/null
+++ b/documents/index.php
@@ -1,1 +1,94 @@
+<!doctype html>
+<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
+<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
+<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
+<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
 
+  <!-- Use the .htaccess and remove these lines to avoid edge case issues.
+       More info: h5bp.com/i/378 -->
+  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+
+  <title>disclosure logs</title>
+  <meta name="description" content="">
+
+  <!-- Mobile viewport optimized: h5bp.com/viewport -->
+  <meta name="viewport" content="width=device-width">
+
+  <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
+<meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />
+  <link rel="stylesheet" href="css/style.css">
+
+  <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
+
+  <!-- All JavaScript at the bottom, except this Modernizr build.
+       Modernizr enables HTML5 elements & feature detects for optimal performance.
+       Create your own custom Modernizr build: www.modernizr.com/download/ -->
+  <script src="js/libs/modernizr-2.5.3.min.js"></script>
+</head>
+<body>
+  <!-- Prompt IE 6 users to install Chrome Frame. Remove this if you support IE 6.
+       chromium.org/developers/how-tos/chrome-frame-getting-started -->
+  <!--[if lt IE 7]><p class=chromeframe>Your browser is <em>ancient!</em> <a href="http://browsehappy.com/">Upgrade to a different browser</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to experience this site.</p><![endif]-->
+  <header>
+<center><h1>disclosurelo.gs</h1></center>
+
+  </header>
+  <div role="main">
+<dl>
+  <dt>Disclosure Log</dt>
+    <dd></dd>
+</dl>
+<?php /* The Host header is client-supplied, so escape it once before echoing it into the href attributes below. */ $host = htmlspecialchars($_SERVER['HTTP_HOST'], ENT_QUOTES, 'UTF-8'); ?>
+<a href="http://information.<?php echo $host; ?>">information.disclo.gs - documents</a><br>
+<a href="http://orgs.<?php echo $host; ?>">orgs.disclo.gs - structure</a><br>
+<a href="http://lobbyists.<?php echo $host; ?>">lobbyists.disclo.gs - lobbylens</a><br>
+<a href="http://contracts.<?php echo $host; ?>">contracts.disclo.gs - contractdashboard</a><br>
+
+  </div>
+  <footer>
+
+  </footer>
+
+
+  <!-- JavaScript at the bottom for fast page loading -->
+
+  <!-- Grab Google CDN's jQuery, with a protocol relative URL; fall back to local if offline -->
+  <script src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
+  <script>window.jQuery || document.write('<script src="js/libs/jquery-1.7.1.min.js"><\/script>')</script>
+
+  <!-- scripts concatenated and minified via build script -->
+  <script src="js/plugins.js"></script>
+  <script src="js/script.js"></script>
+  <!-- end scripts -->
+
+  <!-- Asynchronous Google Analytics snippet. Change UA-XXXXX-X to be your site's ID.
+       mathiasbynens.be/notes/async-analytics-snippet -->
+  <script>
+    var _gaq=[['_setAccount','UA-XXXXX-X'],['_trackPageview']];
+    (function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
+    g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
+    s.parentNode.insertBefore(g,s)}(document,'script'));
+  </script>
+</body>
+</html>
+
+<!-- Design notes for the planned page content (kept out of the rendered page):
+australian disclosure logs
+
+are you looking for more information about:
+contracts
+gov orgs
+lobbyists
+
+1/1/11 title (Dept dfggdfgdf)
+description:
+source link:
+documents:
+    #1 title link
+
+
+rss feed here -->


--- /dev/null
+++ b/documents/readme.md
@@ -1,1 +1,56 @@
+# [HTML5 Boilerplate](http://html5boilerplate.com)
 
+HTML5 Boilerplate is a professional front-end template that helps you build fast, robust, adaptable, and future-proof websites. Spend more time developing and less time reinventing the wheel.
+
+This project is the product of many years of iterative development and combined community knowledge. It does not impose a specific development philosophy or framework, so you're free to architect your code in the way that you want.
+
+
+## Quick start
+
+Clone the git repo - `git clone git://github.com/h5bp/html5-boilerplate.git` - or [download it](https://github.com/h5bp/html5-boilerplate/zipball/master)
+
+
+## Features
+
+* HTML5 ready. Use the new elements with confidence.
+* Cross-browser compatible (Chrome, Opera, Safari, Firefox 3.6+, IE6+).
+* Designed with progressive enhancement in mind.
+* CSS normalizations and common bug fixes.
+* IE-specific classes for easier cross-browser control.
+* A default print stylesheet, performance optimized.
+* Mobile browser optimizations.
+* Protection against any stray `console.log` causing JavaScript errors in IE6/7.
+* The latest jQuery via CDN, with a local fallback.
+* A custom Modernizr build for feature detection.
+* An optimized Google Analytics snippet.
+* Apache server caching, compression, and other configuration defaults for Grade-A performance.
+* Cross-domain Ajax and Flash.
+* "Delete-key friendly." Easy to strip out parts you don't need.
+* Extensive inline and accompanying documentation.
+
+
+## Contributing
+
+Anyone and everyone is welcome to [contribute](https://github.com/h5bp/html5-boilerplate/wiki/contribute). Hundreds of developers have helped make the HTML5 Boilerplate what it is today.
+
+
+## Project information
+
+* Source: http://github.com/h5bp/html5-boilerplate
+* Web: http://html5boilerplate.com
+* Docs: http://html5boilerplate.com/docs
+* Twitter: http://twitter.com/h5bp
+
+
+## License
+
+### Major components:
+
+* jQuery: MIT/GPL license
+* Modernizr: MIT/BSD license
+* Normalize.css: Public Domain
+
+### Everything else:
+
+The Unlicense (aka: public domain)
+


--- /dev/null
+++ b/documents/redirect.php
@@ -1,1 +1,19 @@
+<?php
+// Old-domain entry point: answers Google site-verification probes, otherwise 301s to disclosurelo.gs.
+$hostPrefix = str_replace('disclo.gs', '', $_SERVER['SERVER_NAME']);
+$requestUri = $_SERVER['REQUEST_URI'];
 
+// Verification files requested at the web root are answered inline instead of being redirected.
+foreach (array('google676a414ad086cefb.html', 'googlebcce906c6b666bb8.html') as $verificationFile) {
+    if ($requestUri == '/' . $verificationFile) {
+        echo 'google-site-verification: ' . $verificationFile;
+        exit();
+    }
+}
+
+header('HTTP/1.1 301 Moved Permanently');
+header('Location: http://' . $hostPrefix . 'disclosurelo.gs' . $requestUri);
+exit();
+?>
+
+


--- /dev/null
+++ b/documents/robots.txt
@@ -1,1 +1,5 @@
+# www.robotstxt.org/
+# http://code.google.com/web/controlcrawlindex/
 
+User-agent: *
+


--- /dev/null
+++ b/documents/rss.xml.php
@@ -1,1 +1,30 @@
+<?php
 
+// Planned feed contents: "Agency X updated Y", new files, diff of plain text/link text,
+// either as a feed for a single agency or for all agencies.
+// Adapted from the minimal usage example of the Universal Feed Generator (FeedWriter) class.
+include("lib/FeedWriter.php");
+// Create the feed writer; RSS2 is an unquoted constant, presumably defined by FeedWriter.php — TODO confirm.
+$TestFeed = new FeedWriter(RSS2);
+// Set the channel elements,
+// using the wrapper methods for the common channel elements.
+$TestFeed->setTitle('Last Modified - All');
+$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
+$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
+// Fetch the rows of the "byLastModified" view. NOTE(review): $db is never assigned in this file and nothing visible here includes code that would define it — confirm before relying on this script.
+$rows = $db->get_view("app", "byLastModified")->rows;
+//print_r($rows);
+foreach ($rows as $row) {
+    // Create an empty feed item.
+    $newItem = $TestFeed->createNewItem();
+    // Fill in the item. NOTE(review): rows are indexed as arrays here; confirm that matches what get_view() returns.
+    $newItem->setTitle($row['name']);
+    $newItem->setLink($row['id']);
+    $newItem->setDate(date("c", $row['metadata']['lastModified']));
+    $newItem->setDescription($row['name']);
+    // Add the finished item to the feed.
+    $TestFeed->addItem($newItem);
+}
+// Everything is set up; emit the feed (the method name is spelled "genarateFeed" in the call below).
+$TestFeed->genarateFeed();
+?>


--- /dev/null
+++ b/documents/run.bat
@@ -1,1 +1,2 @@
-
+python scrape.py
+pause


--- /dev/null
+++ b/documents/runScrapers.php


--- /dev/null
+++ b/documents/scrape.py
@@ -1,1 +1,214 @@
-
+#http://packages.python.org/CouchDB/client.html
+import couchdb
+import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
+import hashlib
+from urlparse import urljoin
+import time
+import os
+import mimetypes
+import re
+import urllib
+import urlparse
+
+def mkhash(input):  # hex MD5 digest of `input`, returned after .encode("utf-8")
+	return hashlib.md5(input).hexdigest().encode("utf-8")
+
+def canonurl(url):
+	r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
+	if the URL looks invalid.
+	>>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
+	'http://xn--hgi.ws/'
+	"""
+	# strip spaces at the ends and ensure it's prefixed with 'scheme://'
+	url = url.strip()
+	if not url:
+		return ''
+	if not urlparse.urlsplit(url).scheme:
+		url = 'http://' + url
+
+	# turn it into Unicode
+	#try:
+	#    url = unicode(url, 'utf-8')
+	#except UnicodeDecodeError:
+	#    return ''  # bad UTF-8 chars in URL
+
+	# parse the URL into its components
+	parsed = urlparse.urlsplit(url)
+	scheme, netloc, path, query, fragment = parsed
+
+	# ensure scheme is a letter followed by letters, digits, and '+-.' chars
+	if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
+		return ''
+	scheme = str(scheme)
+
+	# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
+	match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
+	if not match:
+		return ''
+	domain, port = match.groups()
+	netloc = domain + (port if port else '')
+	netloc = netloc.encode('idna')
+
+	# ensure path is valid and convert Unicode chars to %-encoded
+	if not path:
+		path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
+	path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
+
+	# ensure query is valid