Add disclo.gs homepage


Former-commit-id: 6abf389463f189798b499509f3dc589f78c6eacf

--- /dev/null
+++ b/admin/importRTKbodies.php
@@ -1,1 +1,56 @@
+<?php
 
+// Import Right To Know (righttoknow.org.au) body URLs and map them onto
+// disclosr agency documents.
+require_once '../include/common.inc.php';
+
+$db = $server->get_db('disclosr-agencies');
+// Build a lookup of trimmed agency name -> couchdb document id from the byName view.
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$accounts = Array();
+foreach ($rows as $row) {
+    $nametoid[trim($row->key)] = $row->value;
+}
+
+/**
+ * Fetch a CSV of agency names and Right To Know URL slugs, and record an
+ * RTK body URL for each agency found in the global $nametoid map.
+ *
+ * Results are accumulated into the global $accounts array, keyed by agency
+ * document id.
+ *
+ * @param string $url          CSV source URL
+ * @param string $nameField    CSV column header holding the agency name
+ * @param string $accountField CSV column header holding the RTK URL slug
+ * @param string $filter       unused; defaulted so existing 3-argument call
+ *                             sites do not trigger a missing-argument error
+ */
+function extractCSVAccounts($url, $nameField, $accountField, $filter = "") {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    echo $url;
+    $Data = str_getcsv($request->body, "\n"); // parse the rows
+    $headers = Array();
+    foreach ($Data as $num => $line) {
+        $Row = str_getcsv($line, ",");
+        if ($num == 0) {
+            // first row is the header line; remember column positions
+            $headers = $Row;
+            print_r($headers);
+        } else {
+            // look the column indexes up once per row instead of per access
+            $nameIdx = array_search($nameField, $headers);
+            $accountIdx = array_search($accountField, $headers);
+            if (isset($Row[$nameIdx])) {
+                $agencyName = $Row[$nameIdx];
+                if (!in_array(trim($agencyName), array_keys($nametoid))) {
+                    echo "$agencyName missing" . PHP_EOL;
+                } else {
+                    echo $Row[$nameIdx] . PHP_EOL;
+                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/' . $Row[$accountIdx];
+                }
+            } else {
+                echo "error finding any agency" . $line . PHP_EOL;
+            }
+        }
+    }
+}
+
+// Import RTK URLs for all authorities, then dump the collected map for review.
+extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name");
+print_r($accounts);
+// The couchdb write-back below is currently disabled.
+/* foreach ($accounts as $id => $accountTypes) {
+    echo $id . "<br>" . PHP_EOL;
+    $doc = object_to_array($db->get($id));
+    // print_r($doc);
+
+    foreach ($accountTypes as $accountType => $accounts) {
+        if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
+            $doc["has" . $accountType] = Array();
+        }
+        $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
+    }
+    $db->save($doc);
+}*/
+?>
+

--- /dev/null
+++ b/documents/404.html
@@ -1,1 +1,44 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <title>Page Not Found :(</title>
+  <style>
+    ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
+    ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
+    html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
+    html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
+    body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
+    h1 { margin: 0 10px; font-size: 50px; text-align: center; }
+    h1 span { color: #bbb; }
+    h3 { margin: 1.5em 0 0.5em; }
+    p { margin: 1em 0; }
+    ul { padding: 0 0 0 40px; margin: 1em 0; }
+    .container { max-width: 380px; _width: 380px; margin: 0 auto; }
+    /* google search */
+    #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
+    #goog-fixurl form { margin: 0; }
+    #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
+    #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
+    #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
+    #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
+    #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
+    input::-moz-focus-inner { padding: 0; border: 0; }
+  </style>
+</head>
+<body>
+  <div class="container">
+    <h1>Not found <span>:(</span></h1>
+    <p>Sorry, but the page you were trying to view does not exist.</p>
+    <p>It looks like this was the result of either:</p>
+    <ul>
+      <li>a mistyped address</li>
+      <li>an out-of-date link</li>
+    </ul>
+    <script>
+      var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+    </script>
+    <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
+  </div>
+  <!-- close the elements opened above; these closing tags were missing -->
+</body>
+</html>
 
+

 Binary files /dev/null and b/documents/apple-touch-icon-114x114-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon-57x57-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon-72x72-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon.png differ
--- /dev/null
+++ b/documents/checkScrapers.php
@@ -1,1 +1,42 @@
+<?php
 
+// Admin status page: for each agency, show whether a disclosure-log URL is
+// recorded and whether a matching scraper file exists on disk.
+include_once('../include/common.inc.php');
+include_header('Webserver and Accessibility');
+
+echo "<table>
+    <tr><th>name</th><th>disclog</th><th>scraper?</th></tr>";
+$agenciesdb = $server->get_db('disclosr-agencies');
+$docsdb = $server->get_db('disclosr-documents');
+try {
+    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
+
+    if ($rows) {
+        foreach ($rows as $row) {
+            echo "<tr><td>" . $row->value->name . " (" . $row->id . ")</td>\n";
+            echo "<td>";
+            if (isset($row->value->FOIDocumentsURL)) {
+                // link to the cached copy of the agency's disclosure log
+                echo '<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+                    . $row->value->FOIDocumentsURL . '</a>';
+            } else {
+                echo "<font color='red'>✘</font>";
+            }
+            echo "</td>\n<td>";
+            if (isset($row->value->FOIDocumentsURL)) {
+                if (file_exists("./scrapers/" . $row->id . '.py')) {
+                    // python scraper implemented for this agency
+                    echo "<font color='green'>✔</font>";
+                } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+                    // a .txt file appears to mark agencies handled without a
+                    // python scraper — TODO confirm the convention
+                    echo "pass";
+                } else {
+                    echo "<font color='red'>✘</font>";
+                }
+            }
+            echo "</td></tr>\n";
+        }
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+include_footer();
+?>

--- /dev/null
+++ b/documents/crossdomain.xml
@@ -1,1 +1,26 @@
+<?xml version="1.0"?>
+<!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd">
+<cross-domain-policy>
 
+
+<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+
+<!-- Most restrictive policy: -->
+	<site-control permitted-cross-domain-policies="none"/>
+
+
+
+<!-- Least restrictive policy: -->
+<!--
+	<site-control permitted-cross-domain-policies="all"/>
+	<allow-access-from domain="*" to-ports="*" secure="false"/>
+	<allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+-->
+<!--
+  If you host a crossdomain.xml file with allow-access-from domain="*"
+  and don’t understand all of the points described here, you probably
+  have a nasty security vulnerability. ~ simon willison
+-->
+
+</cross-domain-policy>
+

 Binary files /dev/null and b/documents/favicon.ico differ
--- /dev/null
+++ b/documents/genericScrapers.py
@@ -1,1 +1,61 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import scrape
 
+from bs4 import BeautifulSoup
+import abc
+
+class GenericOAICDisclogScraper(object):
+	"""Abstract base class for per-agency disclosure-log scrapers.
+
+	Subclasses supply the agency id, the disclosure-log URL and a column
+	ordering; doScrape() fetches the page via scrape.fetchURL, walks the
+	first HTML table and saves one couchdb document per 5-column row.
+	"""
+	__metaclass__ = abc.ABCMeta
+	@abc.abstractmethod
+	def getAgencyID(self):
+		""" disclosr agency id """
+		return
+
+	@abc.abstractmethod
+	def getURL(self):
+		""" disclog URL"""
+		return
+
+	@abc.abstractmethod
+	def getColumns(self,columns):
+		""" rearranges columns if required """
+		return
+
+	def doScrape(self):
+		foidocsdb = scrape.couch['disclosr-foidocuments']
+		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+		if content != None:
+			if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+			# http://www.crummy.com/software/BeautifulSoup/documentation.html
+				soup = BeautifulSoup(content)
+				for row in soup.table.find_all('tr'):
+					columns = row.find_all('td')
+					if len(columns) == 5:
+						(id, date, description, title, notes) = self.getColumns(columns)
+						print id.string
+						# document id is a hash of the page URL plus the row's id cell
+						hash = scrape.mkhash(url+id.string)
+						links = []
+						for atag in row.find_all("a"):
+							# NOTE(review): Tag.has_key is deprecated in bs4;
+							# has_attr('href') is the supported spelling
+							if atag.has_key('href'):
+								links.append(scrape.fullurl(url,atag['href']))
+						doc = foidocsdb.get(hash)
+						# flatten the description cell to plain text
+						descriptiontxt = ""
+						for string in description.stripped_strings:
+							descriptiontxt = descriptiontxt + string
+							
+						if doc == None:
+							print "saving"
+							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
+			 				 "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
+							foidocsdb.save(doc)
+						else:
+							print "already saved"
+					
+					elif len(row.find_all('th')) == 5:
+						print "header row"
+					
+					else:
+						print "ERROR number of columns incorrect"
+						print row
+

--- /dev/null
+++ b/documents/google676a414ad086cefb.html
@@ -1,1 +1,2 @@
+google-site-verification: google676a414ad086cefb.html
 

--- /dev/null
+++ b/documents/googlebcce906c6b666bb8.html
@@ -1,1 +1,2 @@
+google-site-verification: googlebcce906c6b666bb8.html
 

--- /dev/null
+++ b/documents/humans.txt
@@ -1,1 +1,44 @@
+/* the humans responsible & colophon */
+/* humanstxt.org */
 
+
+/* TEAM */
+  <your title>: <your name>
+  Site:
+  Twitter:
+  Location:
+
+/* THANKS */
+  Names (& URL):
+
+/* SITE */
+  Standards: HTML5, CSS3
+  Components: Modernizr, jQuery
+  Software:
+
+
+
+                               -o/-
+                               +oo//-
+                              :ooo+//:
+                             -ooooo///-
+                             /oooooo//:
+                            :ooooooo+//-
+                           -+oooooooo///-
+           -://////////////+oooooooooo++////////////::
+            :+ooooooooooooooooooooooooooooooooooooo+:::-
+              -/+ooooooooooooooooooooooooooooooo+/::////:-
+                -:+oooooooooooooooooooooooooooo/::///////:-
+                  --/+ooooooooooooooooooooo+::://////:-
+                     -:+ooooooooooooooooo+:://////:--
+                       /ooooooooooooooooo+//////:-
+                      -ooooooooooooooooooo////-
+                      /ooooooooo+oooooooooo//:
+                     :ooooooo+/::/+oooooooo+//-
+                    -oooooo/::///////+oooooo///-
+                    /ooo+::://////:---:/+oooo//:
+                   -o+/::///////:-      -:/+o+//-
+                   :-:///////:-            -:/://
+                     -////:-                 --//:
+                       --                       -:
+

--- /dev/null
+++ b/documents/index.php
@@ -1,1 +1,94 @@
+<!doctype html>
+<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
+<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
+<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
+<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
 
+  <!-- Use the .htaccess and remove these lines to avoid edge case issues.
+       More info: h5bp.com/i/378 -->
+  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+
+  <title>disclosure logs</title>
+  <meta name="description" content="">
+
+  <!-- Mobile viewport optimized: h5bp.com/viewport -->
+  <meta name="viewport" content="width=device-width">
+
+  <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
+<meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />
+  <link rel="stylesheet" href="css/style.css">
+
+  <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
+
+  <!-- All JavaScript at the bottom, except this Modernizr build.
+       Modernizr enables HTML5 elements & feature detects for optimal performance.
+       Create your own custom Modernizr build: www.modernizr.com/download/ -->
+  <script src="js/libs/modernizr-2.5.3.min.js"></script>
+</head>
+<body>
+  <!-- Prompt IE 6 users to install Chrome Frame. Remove this if you support IE 6.
+       chromium.org/developers/how-tos/chrome-frame-getting-started -->
+  <!--[if lt IE 7]><p class=chromeframe>Your browser is <em>ancient!</em> <a href="http://browsehappy.com/">Upgrade to a different browser</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to experience this site.</p><![endif]-->
+  <header>
+<center><h1>disclosurelo.gs</h1></center>
+
+  </header>
+  <div role="main">
+<dl>
+  <dt>Disclosure Log</dt>
+    <dd></dd>
+</dl>
+<!-- HTTP_HOST is a client-supplied header; escape it before echoing into markup -->
+<a href="http://information.<?php echo htmlspecialchars($_SERVER['HTTP_HOST'], ENT_QUOTES, 'UTF-8');?>">information.disclo.gs - documents</a><br>
+<a href="http://orgs.<?php echo htmlspecialchars($_SERVER['HTTP_HOST'], ENT_QUOTES, 'UTF-8');?>">orgs.disclo.gs - structure</a><br>
+<a href="http://lobbyists.<?php echo htmlspecialchars($_SERVER['HTTP_HOST'], ENT_QUOTES, 'UTF-8');?>">lobbyists.disclo.gs - lobbylens</a><br>
+<a href="http://contracts.<?php echo htmlspecialchars($_SERVER['HTTP_HOST'], ENT_QUOTES, 'UTF-8');?>">contracts.disclo.gs - contractdashboard</a><br>
+<!-- stray premature </body> removed; the document is closed after the scripts below -->
+
+  </div>
+  <footer>
+
+  </footer>
+
+
+  <!-- JavaScript at the bottom for fast page loading -->
+
+  <!-- Grab Google CDN's jQuery, with a protocol relative URL; fall back to local if offline -->
+  <script src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
+  <script>window.jQuery || document.write('<script src="js/libs/jquery-1.7.1.min.js"><\/script>')</script>
+
+  <!-- scripts concatenated and minified via build script -->
+  <script src="js/plugins.js"></script>
+  <script src="js/script.js"></script>
+  <!-- end scripts -->
+
+  <!-- Asynchronous Google Analytics snippet. Change UA-XXXXX-X to be your site's ID.
+       mathiasbynens.be/notes/async-analytics-snippet -->
+  <script>
+    var _gaq=[['_setAccount','UA-XXXXX-X'],['_trackPageview']];
+    (function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
+    g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
+    s.parentNode.insertBefore(g,s)}(document,'script'));
+  </script>
+</body>
+</html>
+
+
+australian disclosure logs
+
+are you looking for more information about:
+contracts
+gov orgs
+lobbyists
+
+1/1/11 title (Dept dfggdfgdf)
+description:
+source link:
+documents:
+    #1 title link
+
+
+rss feed here

--- /dev/null
+++ b/documents/readme.md
@@ -1,1 +1,56 @@
+# [HTML5 Boilerplate](http://html5boilerplate.com)
 
+HTML5 Boilerplate is a professional front-end template that helps you build fast, robust, adaptable, and future-proof websites. Spend more time developing and less time reinventing the wheel.
+
+This project is the product of many years of iterative development and combined community knowledge. It does not impose a specific development philosophy or framework, so you're free to architect your code in the way that you want.
+
+
+## Quick start
+
+Clone the git repo - `git clone git://github.com/h5bp/html5-boilerplate.git` - or [download it](https://github.com/h5bp/html5-boilerplate/zipball/master)
+
+
+## Features
+
+* HTML5 ready. Use the new elements with confidence.
+* Cross-browser compatible (Chrome, Opera, Safari, Firefox 3.6+, IE6+).
+* Designed with progressive enhancement in mind.
+* CSS normalizations and common bug fixes.
+* IE-specific classes for easier cross-browser control.
+* A default print stylesheet, performance optimized.
+* Mobile browser optimizations.
+* Protection against any stray `console.log` causing JavaScript errors in IE6/7.
+* The latest jQuery via CDN, with a local fallback.
+* A custom Modernizr build for feature detection.
+* An optimized Google Analytics snippet.
+* Apache server caching, compression, and other configuration defaults for Grade-A performance.
+* Cross-domain Ajax and Flash.
+* "Delete-key friendly." Easy to strip out parts you don't need.
+* Extensive inline and accompanying documentation.
+
+
+## Contributing
+
+Anyone and everyone is welcome to [contribute](https://github.com/h5bp/html5-boilerplate/wiki/contribute). Hundreds of developers have helped make the HTML5 Boilerplate what it is today.
+
+
+## Project information
+
+* Source: http://github.com/h5bp/html5-boilerplate
+* Web: http://html5boilerplate.com
+* Docs: http://html5boilerplate.com/docs
+* Twitter: http://twitter.com/h5bp
+
+
+## License
+
+### Major components:
+
+* jQuery: MIT/GPL license
+* Modernizr: MIT/BSD license
+* Normalize.css: Public Domain
+
+### Everything else:
+
+The Unlicense (aka: public domain)
+

--- /dev/null
+++ b/documents/redirect.php
@@ -1,1 +1,19 @@
+<?php
+// Redirect legacy disclo.gs URLs to the same path on disclosurelo.gs,
+// preserving any subdomain prefix (e.g. orgs.disclo.gs -> orgs.disclosurelo.gs;
+// the str_replace leaves the trailing dot on the prefix).
+$subdomain = str_replace('disclo.gs','',$_SERVER['SERVER_NAME']);
+$script = $_SERVER['REQUEST_URI'];
 
+// Serve the Google site-verification responses directly instead of redirecting.
+if ($script == '/google676a414ad086cefb.html') {
+	echo 'google-site-verification: google676a414ad086cefb.html';
+	exit();
+}
+if ($script == '/googlebcce906c6b666bb8.html') {
+        echo 'google-site-verification: googlebcce906c6b666bb8.html';
+        exit();
+}
+
+// Permanent redirect to the new domain, keeping the original request path.
+header('HTTP/1.1 301 Moved Permanently');
+header('Location: http://'.$subdomain.'disclosurelo.gs'.$script);
+exit();
+?>
+
+

--- /dev/null
+++ b/documents/robots.txt
@@ -1,1 +1,5 @@
+# www.robotstxt.org/
+# http://code.google.com/web/controlcrawlindex/
 
+User-agent: *
+

--- /dev/null
+++ b/documents/rss.xml.php
@@ -1,1 +1,30 @@
+<?php
 
+// Agency X updated Y,  new files, diff of plain text/link text,
+// feed for just one agency or all
+// This is a minimum example of using the Universal Feed Generator Class
+include("lib/FeedWriter.php");
+//Creating an instance of FeedWriter class.
+$TestFeed = new FeedWriter(RSS2);
+//Setting the channel elements
+//Use wrapper functions for common channelelements
+$TestFeed->setTitle('Last Modified - All');
+$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
+$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
+//Retriving informations from database
+// NOTE(review): $db is never initialised in this file — common.inc.php is not
+// included — so this call will fail at runtime; verify the intended db setup.
+$rows = $db->get_view("app", "byLastModified")->rows;
+//print_r($rows);
+foreach ($rows as $row) {
+    //Create an empty FeedItem
+    $newItem = $TestFeed->createNewItem();
+    //Add elements to the feed item
+    // NOTE(review): rows are accessed as objects elsewhere ($row->value);
+    // confirm the couch client returns arrays here as assumed.
+    $newItem->setTitle($row['name']);
+    $newItem->setLink($row['id']);
+    $newItem->setDate(date("c", $row['metadata']['lastModified']));
+    $newItem->setDescription($row['name']);
+    //Now add the feed item
+    $TestFeed->addItem($newItem);
+}
+//OK. Everything is done. Now genarate the feed.
+$TestFeed->genarateFeed();
+?>

file:b/documents/run.bat (new)
--- /dev/null
+++ b/documents/run.bat
@@ -1,1 +1,2 @@
-
+python scrape.py
+pause

--- /dev/null
+++ b/documents/runScrapers.php

--- /dev/null
+++ b/documents/scrape.py
@@ -1,1 +1,214 @@
-
+#http://packages.python.org/CouchDB/client.html
+import couchdb
+import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
+import hashlib
+from urlparse import urljoin
+import time
+import os
+import mimetypes
+import re
+import urllib
+import urlparse
+
+def mkhash(input):
+	# Stable document id for a URL: its md5 hex digest.
+	# NOTE(review): hexdigest() already returns an ascii str in python 2, so the
+	# trailing .encode("utf-8") is a no-op; the encode was presumably meant for
+	# the input — TODO confirm.
+	return hashlib.md5(input).hexdigest().encode("utf-8")
+
+def canonurl(url):
+	r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
+	if the URL looks invalid.
+	>>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
+	'http://xn--hgi.ws/'
+	"""
+	# strip spaces at the ends and ensure it's prefixed with 'scheme://'
+	url = url.strip()
+	if not url:
+		return ''
+	if not urlparse.urlsplit(url).scheme:
+		url = 'http://' + url
+
+	# turn it into Unicode
+	#try:
+	#    url = unicode(url, 'utf-8')
+	#except UnicodeDecodeError:
+	#    return ''  # bad UTF-8 chars in URL
+
+	# parse the URL into its components
+	parsed = urlparse.urlsplit(url)
+	scheme, netloc, path, query, fragment = parsed
+
+	# ensure scheme is a letter followed by letters, digits, and '+-.' chars
+	if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
+		return ''
+	scheme = str(scheme)
+
+	# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
+	match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
+	if not match:
+		return ''
+	domain, port = match.groups()
+	netloc = domain + (port if port else '')
+	netloc = netloc.encode('idna')
+
+	# ensure path is valid and convert Unicode chars to %-encoded
+	if not path:
+		path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
+	path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
+
+	# ensure query is valid
+	query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
+
+	# ensure fragment is valid
+	fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
+
+	# piece it all back together, truncating it to a maximum of 4KB
+	url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+	return url[:4096]
+
+def fullurl(url,href):
+	"""Resolve href against the page url, %-encoding spaces and dropping any fragment."""
+	href = href.replace(" ","%20")
+	href = re.sub('#.*$','',href)
+	return urljoin(url,href)
+
+#http://diveintopython.org/http_web_services/etags.html
+class NotModifiedHandler(urllib2.BaseHandler):  
+	# Turn HTTP 304 responses into a normal addinfourl result (with .code set)
+	# instead of raising, so fetchURL can detect "not modified" via the code.
+	def http_error_304(self, req, fp, code, message, headers):
+		addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
+		addinfourl.code = code
+		return addinfourl
+
+def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
+	"""Fetch url with etag/last-modified caching backed by couchdb docsdb.
+
+	The response body is stored as a timestamped attachment on the URL's
+	document. Returns (url, mime_type, content), or (None, None, None) on
+	invalid URL, 304 Not Modified, download error, or scrape_again=False.
+	"""
+	url = canonurl(url)
+	hash = mkhash(url)
+	# NOTE(review): the Request object is constructed before the scheme check
+	# below, so a mailto:/javascript: URL is only rejected after construction.
+	req = urllib2.Request(url)
+	print "Fetching %s" % url
+	if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
+		print "Not a valid HTTP url"
+		return (None,None,None)
+	doc = docsdb.get(hash) 
+	if doc == None:
+		doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+	else:
+		# NOTE(review): time.time() is in seconds, so 60*24*14*1000 is roughly
+		# 233 days; presumably 14 days (60*60*24*14) was intended — TODO confirm.
+		if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
+			print "Uh oh, trying to scrape URL again too soon!"
+			# reuse the most recently stored attachment instead of re-fetching
+			last_attachment_fname = doc["_attachments"].keys()[-1]
+			last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+			return (doc['url'],doc['mime_type'],last_attachment.read())
+		if scrape_again == False:
+			print "Not scraping this URL again as requested"
+			return (None,None,None)
+
+	time.sleep(3) # wait 3 seconds to give webserver time to recover
+	
+	req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
+	#if there is a previous version stored in couchdb, load caching helper tags
+	if doc.has_key('etag'):
+		req.add_header("If-None-Match", doc['etag'])
+	if doc.has_key('last_modified'):
+		req.add_header("If-Modified-Since", doc['last_modified'])
+	 
+	opener = urllib2.build_opener(NotModifiedHandler())
+	try:
+		url_handle = opener.open(req)
+		doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
+		headers = url_handle.info() # the addinfourls have the .info() too
+		doc['etag'] = headers.getheader("ETag")
+		doc['last_modified'] = headers.getheader("Last-Modified") 
+		doc['date'] = headers.getheader("Date") 
+		doc['page_scraped'] = time.time() 
+		doc['web_server'] = headers.getheader("Server") 
+		doc['via'] = headers.getheader("Via") 
+		doc['powered_by'] = headers.getheader("X-Powered-By") 
+		doc['file_size'] = headers.getheader("Content-Length") 
+		content_type = headers.getheader("Content-Type")
+		if content_type != None:
+			 doc['mime_type'] = content_type.split(";")[0]
+		else:
+			 # no Content-Type header: guess the type from the URL's extension
+			 (type,encoding) = mimetypes.guess_type(url)
+			 doc['mime_type'] = type
+		if hasattr(url_handle, 'code'):
+			if url_handle.code == 304:
+				print "the web page has not been modified"
+				return (None,None,None)
+			else: 
+				content = url_handle.read()
+				docsdb.save(doc)
+				doc = docsdb.get(hash) # need to get a _rev
+				docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) 
+				return (doc['url'], doc['mime_type'], content)
+				#store as attachment epoch-filename
+				
+	except urllib2.URLError as e:
+			error = ""
+			if hasattr(e, 'reason'):
+				error = "error %s in downloading %s" % (str(e.reason), url)
+			elif hasattr(e, 'code'):
+				error = "error %s in downloading %s" % (e.code, url)
+			print error
+			doc['error'] = error
+			docsdb.save(doc)
+			return (None,None,None)
+
+