Add disclo.gs homepage
Former-commit-id: 6abf389463f189798b499509f3dc589f78c6eacf
--- /dev/null
+++ b/admin/importRTKbodies.php
@@ -1,1 +1,56 @@
+<?php
+// Import Right To Know (righttoknow.org.au) body listings and attach their
+// RTK URLs to the matching disclosr agency records (matched by name).
+require_once '../include/common.inc.php';
+
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$accounts = Array();
+foreach ($rows as $row) {
+    $nametoid[trim($row->key)] = $row->value;
+}
+
+/**
+ * Fetch a CSV listing of bodies and record a Right To Know URL per agency.
+ *
+ * @param string $url          CSV source URL
+ * @param string $nameField    CSV header of the agency-name column
+ * @param string $accountField CSV header of the RTK URL-name column
+ * @param mixed  $filter       reserved for future filtering; currently unused.
+ *                             Defaulted to null because the call below only
+ *                             passes three arguments (was a fatal
+ *                             "too few arguments" error in PHP 7+).
+ */
+function extractCSVAccounts($url, $nameField, $accountField, $filter = null) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    echo $url;
+    $Data = str_getcsv($request->body, "\n"); //parse the rows
+    $headers = Array();
+    foreach ($Data as $num => $line) {
+        $Row = str_getcsv($line, ",");
+        if ($num == 0) {
+            // First row carries the column headers used for lookups below.
+            $headers = $Row;
+            print_r($headers);
+        } else {
+            if (isset($Row[array_search($nameField, $headers)])) {
+                $agencyName = $Row[array_search($nameField, $headers)];
+                if (!in_array(trim($agencyName), array_keys($nametoid))) {
+                    echo "$agencyName missing" . PHP_EOL;
+                } else {
+                    echo $Row[array_search($nameField, $headers)] . PHP_EOL;
+                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/'.$Row[array_search($accountField, $headers)];
+                }
+            } else {
+                echo "error finding any agency" . $line . PHP_EOL;
+            }
+        }
+    }
+}
+
+extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name");
+print_r($accounts);
+/* foreach ($accounts as $id => $accountTypes) {
+    echo $id . "<br>" . PHP_EOL;
+    $doc = object_to_array($db->get($id));
+    // print_r($doc);
+
+    foreach ($accountTypes as $accountType => $accounts) {
+        if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
+            $doc["has" . $accountType] = Array();
+        }
+        $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
+    }
+    $db->save($doc);
+}*/
+?>
+
--- /dev/null
+++ b/documents/404.html
@@ -1,1 +1,44 @@
+<!doctype html>
+<html lang="en">
+<head>
+ <meta charset="utf-8">
+ <title>Page Not Found :(</title>
+ <style>
+ ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
+ ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
+ html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
+ html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
+ body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
+ h1 { margin: 0 10px; font-size: 50px; text-align: center; }
+ h1 span { color: #bbb; }
+ h3 { margin: 1.5em 0 0.5em; }
+ p { margin: 1em 0; }
+ ul { padding: 0 0 0 40px; margin: 1em 0; }
+ .container { max-width: 380px; _width: 380px; margin: 0 auto; }
+ /* google search */
+ #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
+ #goog-fixurl form { margin: 0; }
+ #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
+ #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
+ #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
+ #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
+ #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
+ input::-moz-focus-inner { padding: 0; border: 0; }
+ </style>
+</head>
+<body>
+ <div class="container">
+ <h1>Not found <span>:(</span></h1>
+ <p>Sorry, but the page you were trying to view does not exist.</p>
+ <p>It looks like this was the result of either:</p>
+ <ul>
+ <li>a mistyped address</li>
+ <li>an out-of-date link</li>
+ </ul>
+    <script>
+      var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+    </script>
+    <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
+  </div>
+<!-- close the document: body/html were previously left unclosed -->
+</body>
+</html>
Binary files /dev/null and b/documents/apple-touch-icon-114x114-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon-57x57-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon-72x72-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon.png differ
--- /dev/null
+++ b/documents/checkScrapers.php
@@ -1,1 +1,42 @@
+<?php
+// Status report: for each agency show its disclosure-log URL and whether a
+// scraper (scrapers/<id>.py) or placeholder (scrapers/<id>.txt) exists.
+include_once('../include/common.inc.php');
+include_header('Webserver and Accessibility');
+
+echo "<table>
+    <tr><th>name</th><th>disclog</th><th>scraper?</th></tr>";
+$agenciesdb = $server->get_db('disclosr-agencies');
+$docsdb = $server->get_db('disclosr-documents');
+try {
+    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
+
+    if ($rows) {
+        foreach ($rows as $row) {
+
+            echo "<tr><td>" . $row->value->name . " (".$row->id.")</td>\n";
+            echo "<td>";
+            if (isset($row->value->FOIDocumentsURL)) {
+                // Link to the locally cached copy keyed by URL hash.
+                echo '<a href="viewDocument.php?hash='.md5($row->value->FOIDocumentsURL).'">'
+                .$row->value->FOIDocumentsURL.'</a>';
+            } else {
+                echo "<font color='red'>✘</font>";
+            }
+            echo "</td>\n<td>";
+            if (isset($row->value->FOIDocumentsURL)) {
+                if (file_exists("./scrapers/".$row->id.'.py')) {
+                    echo "<font color='green'>✔</font>";
+                } else if (file_exists("./scrapers/".$row->id.'.txt')){
+                    // .txt placeholder marks an agency deliberately skipped.
+                    echo "pass";
+                } else {
+                    echo "<font color='red'>✘</font>";
+                }
+            }
+            echo "</td></tr>\n";
+        }
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+echo "</table>"; // table was previously left unclosed before the footer
+include_footer();
+?>
--- /dev/null
+++ b/documents/crossdomain.xml
@@ -1,1 +1,26 @@
+<?xml version="1.0"?>
+<!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd">
+<cross-domain-policy>
+
+<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+
+<!-- Most restrictive policy: -->
+ <site-control permitted-cross-domain-policies="none"/>
+
+
+
+<!-- Least restrictive policy: -->
+<!--
+ <site-control permitted-cross-domain-policies="all"/>
+ <allow-access-from domain="*" to-ports="*" secure="false"/>
+ <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+-->
+<!--
+ If you host a crossdomain.xml file with allow-access-from domain="*"
+ and don’t understand all of the points described here, you probably
+ have a nasty security vulnerability. ~ simon willison
+-->
+
+</cross-domain-policy>
+
Binary files /dev/null and b/documents/favicon.ico differ
--- /dev/null
+++ b/documents/genericScrapers.py
@@ -1,1 +1,61 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import scrape
+from bs4 import BeautifulSoup
+import abc
+
+class GenericOAICDisclogScraper(object):
+    """Abstract base for OAIC-style disclosure-log scrapers.
+
+    Subclasses supply the agency id, the disclog URL and a column mapping;
+    doScrape() fetches the page via the shared scrape module, walks the first
+    table and saves one couchdb document per 5-column row into
+    disclosr-foidocuments.
+    """
+    __metaclass__ = abc.ABCMeta
+    @abc.abstractmethod
+    def getAgencyID(self):
+        """ disclosr agency id """
+        return
+
+    @abc.abstractmethod
+    def getURL(self):
+        """ disclog URL"""
+        return
+
+    @abc.abstractmethod
+    def getColumns(self,columns):
+        """ rearranges columns if required """
+        return
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        # Fetch (possibly cached) page content through the shared helper.
+        (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+        if content != None:
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+                # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                soup = BeautifulSoup(content)
+                # NOTE(review): only the first <table> on the page is scanned.
+                for row in soup.table.find_all('tr'):
+                    columns = row.find_all('td')
+                    if len(columns) == 5:
+                        (id, date, description, title, notes) = self.getColumns(columns)
+                        print id.string
+                        # Document id is md5(page url + row id) -- see scrape.mkhash.
+                        hash = scrape.mkhash(url+id.string)
+                        links = []
+                        for atag in row.find_all("a"):
+                            # NOTE(review): Tag.has_key is deprecated in bs4 --
+                            # confirm against the installed BeautifulSoup version.
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(url,atag['href']))
+                        doc = foidocsdb.get(hash)
+                        # Flatten the description cell's text nodes into one string.
+                        descriptiontxt = ""
+                        for string in description.stripped_strings:
+                            descriptiontxt = descriptiontxt + string
+
+                        if doc == None:
+                            print "saving"
+                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
+                                "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
+                            foidocsdb.save(doc)
+                        else:
+                            print "already saved"
+
+                    elif len(row.find_all('th')) == 5:
+                        print "header row"
+
+                    else:
+                        print "ERROR number of columns incorrect"
+                        print row
+
--- /dev/null
+++ b/documents/google676a414ad086cefb.html
@@ -1,1 +1,2 @@
+google-site-verification: google676a414ad086cefb.html
--- /dev/null
+++ b/documents/googlebcce906c6b666bb8.html
@@ -1,1 +1,2 @@
+google-site-verification: googlebcce906c6b666bb8.html
--- /dev/null
+++ b/documents/humans.txt
@@ -1,1 +1,44 @@
+/* the humans responsible & colophon */
+/* humanstxt.org */
+
+/* TEAM */
+ <your title>: <your name>
+ Site:
+ Twitter:
+ Location:
+
+/* THANKS */
+ Names (& URL):
+
+/* SITE */
+ Standards: HTML5, CSS3
+ Components: Modernizr, jQuery
+ Software:
+
+
+
+ -o/-
+ +oo//-
+ :ooo+//:
+ -ooooo///-
+ /oooooo//:
+ :ooooooo+//-
+ -+oooooooo///-
+ -://////////////+oooooooooo++////////////::
+ :+ooooooooooooooooooooooooooooooooooooo+:::-
+ -/+ooooooooooooooooooooooooooooooo+/::////:-
+ -:+oooooooooooooooooooooooooooo/::///////:-
+ --/+ooooooooooooooooooooo+::://////:-
+ -:+ooooooooooooooooo+:://////:--
+ /ooooooooooooooooo+//////:-
+ -ooooooooooooooooooo////-
+ /ooooooooo+oooooooooo//:
+ :ooooooo+/::/+oooooooo+//-
+ -oooooo/::///////+oooooo///-
+ /ooo+::://////:---:/+oooo//:
+ -o+/::///////:- -:/+o+//-
+ :-:///////:- -:/://
+ -////:- --//:
+ -- -:
+
--- /dev/null
+++ b/documents/index.php
@@ -1,1 +1,94 @@
+<!doctype html>
+<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
+<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
+<head>
+ <meta charset="utf-8">
+ <!-- Use the .htaccess and remove these lines to avoid edge case issues.
+ More info: h5bp.com/i/378 -->
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+
+ <title>disclosure logs</title>
+ <meta name="description" content="">
+
+ <!-- Mobile viewport optimized: h5bp.com/viewport -->
+ <meta name="viewport" content="width=device-width">
+
+ <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
+<meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />
+ <link rel="stylesheet" href="css/style.css">
+
+ <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
+
+ <!-- All JavaScript at the bottom, except this Modernizr build.
+ Modernizr enables HTML5 elements & feature detects for optimal performance.
+ Create your own custom Modernizr build: www.modernizr.com/download/ -->
+ <script src="js/libs/modernizr-2.5.3.min.js"></script>
+</head>
+<body>
+ <!-- Prompt IE 6 users to install Chrome Frame. Remove this if you support IE 6.
+ chromium.org/developers/how-tos/chrome-frame-getting-started -->
+ <!--[if lt IE 7]><p class=chromeframe>Your browser is <em>ancient!</em> <a href="http://browsehappy.com/">Upgrade to a different browser</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to experience this site.</p><![endif]-->
+  <header>
+<center><h1>disclosurelo.gs</h1></center>
+
+  </header>
+  <div role="main">
+<dl>
+  <dt>Disclosure Log</dt>
+  <dd></dd>
+</dl>
+<!-- NOTE(review): HTTP_HOST is client-supplied (Host header); confirm these
+     links cannot be abused if this page is ever cached or proxied. -->
+<a href="http://information.<?php echo $_SERVER['HTTP_HOST'];?>">information.disclo.gs - documents</a><br>
+<a href="http://orgs.<?php echo $_SERVER['HTTP_HOST'];?>">orgs.disclo.gs - structure</a><br>
+<a href="http://lobbyists.<?php echo $_SERVER['HTTP_HOST'];?>">lobbyists.disclo.gs - lobbylens</a><br>
+<a href="http://contracts.<?php echo $_SERVER['HTTP_HOST'];?>">contracts.disclo.gs - contractdashboard</a><br>
+  </div>
+  <footer>
+
+  </footer>
+
+
+  <!-- JavaScript at the bottom for fast page loading -->
+
+  <!-- Grab Google CDN's jQuery, with a protocol relative URL; fall back to local if offline -->
+  <script src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
+  <script>window.jQuery || document.write('<script src="js/libs/jquery-1.7.1.min.js"><\/script>')</script>
+
+  <!-- scripts concatenated and minified via build script -->
+  <script src="js/plugins.js"></script>
+  <script src="js/script.js"></script>
+  <!-- end scripts -->
+
+  <!-- Asynchronous Google Analytics snippet. Change UA-XXXXX-X to be your site's ID.
+       mathiasbynens.be/notes/async-analytics-snippet -->
+  <script>
+    var _gaq=[['_setAccount','UA-XXXXX-X'],['_trackPageview']];
+    (function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
+    g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
+    s.parentNode.insertBefore(g,s)}(document,'script'));
+  </script>
+</body>
+</html>
+<!-- draft content notes (previously rendered as stray text after </html>):
+australian disclosure logs
+
+are you looking for more information about:
+contracts
+gov orgs
+lobbyists
+
+1/1/11 title (Dept dfggdfgdf)
+description:
+source link:
+documents:
+    #1 title link
+
+
+rss feed here
+-->
+++ b/documents/readme.md
@@ -1,1 +1,56 @@
+# [HTML5 Boilerplate](http://html5boilerplate.com)
+HTML5 Boilerplate is a professional front-end template that helps you build fast, robust, adaptable, and future-proof websites. Spend more time developing and less time reinventing the wheel.
+
+This project is the product of many years of iterative development and combined community knowledge. It does not impose a specific development philosophy or framework, so you're free to architect your code in the way that you want.
+
+
+## Quick start
+
+Clone the git repo - `git clone git://github.com/h5bp/html5-boilerplate.git` - or [download it](https://github.com/h5bp/html5-boilerplate/zipball/master)
+
+
+## Features
+
+* HTML5 ready. Use the new elements with confidence.
+* Cross-browser compatible (Chrome, Opera, Safari, Firefox 3.6+, IE6+).
+* Designed with progressive enhancement in mind.
+* CSS normalizations and common bug fixes.
+* IE-specific classes for easier cross-browser control.
+* A default print stylesheet, performance optimized.
+* Mobile browser optimizations.
+* Protection against any stray `console.log` causing JavaScript errors in IE6/7.
+* The latest jQuery via CDN, with a local fallback.
+* A custom Modernizr build for feature detection.
+* An optimized Google Analytics snippet.
+* Apache server caching, compression, and other configuration defaults for Grade-A performance.
+* Cross-domain Ajax and Flash.
+* "Delete-key friendly." Easy to strip out parts you don't need.
+* Extensive inline and accompanying documentation.
+
+
+## Contributing
+
+Anyone and everyone is welcome to [contribute](https://github.com/h5bp/html5-boilerplate/wiki/contribute). Hundreds of developers have helped make the HTML5 Boilerplate what it is today.
+
+
+## Project information
+
+* Source: http://github.com/h5bp/html5-boilerplate
+* Web: http://html5boilerplate.com
+* Docs: http://html5boilerplate.com/docs
+* Twitter: http://twitter.com/h5bp
+
+
+## License
+
+### Major components:
+
+* jQuery: MIT/GPL license
+* Modernizr: MIT/BSD license
+* Normalize.css: Public Domain
+
+### Everything else:
+
+The Unlicense (aka: public domain)
+
--- /dev/null
+++ b/documents/redirect.php
@@ -1,1 +1,19 @@
+<?php
+// Legacy-domain redirector: sends requests for *.disclo.gs to the matching
+// *.disclosurelo.gs host, preserving the subdomain and request path.
+$subdomain = str_replace('disclo.gs','',$_SERVER['SERVER_NAME']);
+$script = $_SERVER['REQUEST_URI'];
+// Serve Google site-verification tokens directly rather than redirecting,
+// so verification still works on the old domain.
+if ($script == '/google676a414ad086cefb.html') {
+    echo 'google-site-verification: google676a414ad086cefb.html';
+    exit();
+}
+if ($script == '/googlebcce906c6b666bb8.html') {
+    echo 'google-site-verification: googlebcce906c6b666bb8.html';
+    exit();
+}
+
+// NOTE(review): SERVER_NAME/REQUEST_URI flow straight into the Location
+// header; presumably safe behind this vhost, but confirm the deployed
+// webserver rejects header-injection via these values.
+header('HTTP/1.1 301 Moved Permanently');
+header('Location: http://'.$subdomain.'disclosurelo.gs'.$script);
+exit();
+?>
+
+
--- /dev/null
+++ b/documents/robots.txt
@@ -1,1 +1,5 @@
+# www.robotstxt.org/
+# http://code.google.com/web/controlcrawlindex/
+User-agent: *
+
--- /dev/null
+++ b/documents/rss.xml.php
@@ -1,1 +1,30 @@
+<?php
+// RSS 2.0 feed of agencies ordered by last-modified time.
+// Agency X updated Y, new files, diff of plain text/link text,
+// feed for just one agency or all
+// This is a minimum example of using the Universal Feed Generator Class
+include_once('../include/common.inc.php'); // defines $server (was missing: $db below was undefined)
+include("lib/FeedWriter.php");
+//Creating an instance of FeedWriter class.
+$TestFeed = new FeedWriter(RSS2);
+//Setting the channel elements
+//Use wrapper functions for common channel elements
+$TestFeed->setTitle('Last Modified - All');
+$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
+$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
+//Retrieving information from the database
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byLastModified")->rows;
+//print_r($rows);
+foreach ($rows as $row) {
+    //Create an empty FeedItem
+    $newItem = $TestFeed->createNewItem();
+    //Add elements to the feed item
+    // NOTE(review): other pages read view rows as objects ($row->value->...);
+    // confirm array-style access ($row['name']) is correct for this view.
+    $newItem->setTitle($row['name']);
+    $newItem->setLink($row['id']);
+    $newItem->setDate(date("c", $row['metadata']['lastModified']));
+    $newItem->setDescription($row['name']);
+    //Now add the feed item
+    $TestFeed->addItem($newItem);
+}
+//OK. Everything is done. Now generate the feed.
+$TestFeed->genarateFeed();
+?>
--- /dev/null
+++ b/documents/run.bat
@@ -1,1 +1,2 @@
-
+python scrape.py
+pause
--- /dev/null
+++ b/documents/runScrapers.php
--- /dev/null
+++ b/documents/scrape.py
@@ -1,1 +1,214 @@
-
+#http://packages.python.org/CouchDB/client.html
+import couchdb
+import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
+import hashlib
+from urlparse import urljoin
+import time
+import os
+import mimetypes
+import re
+import urllib
+import urlparse
+
+def mkhash(input):
+    # md5 hex digest of the input; used as the couchdb document _id for a URL.
+    # NOTE(review): .encode("utf-8") on an ascii hex digest is effectively a
+    # no-op in Python 2 -- presumably type normalisation; confirm.
+    return hashlib.md5(input).hexdigest().encode("utf-8")
+
+def canonurl(url):
+ r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
+ if the URL looks invalid.
+ >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
+ 'http://xn--hgi.ws/'
+ """
+ # strip spaces at the ends and ensure it's prefixed with 'scheme://'
+ url = url.strip()
+ if not url:
+ return ''
+ if not urlparse.urlsplit(url).scheme:
+ url = 'http://' + url
+
+ # turn it into Unicode
+ #try:
+ # url = unicode(url, 'utf-8')
+ #except UnicodeDecodeError:
+ # return '' # bad UTF-8 chars in URL
+
+ # parse the URL into its components
+ parsed = urlparse.urlsplit(url)
+ scheme, netloc, path, query, fragment = parsed
+
+ # ensure scheme is a letter followed by letters, digits, and '+-.' chars
+ if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
+ return ''
+ scheme = str(scheme)
+
+ # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
+ match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
+ if not match:
+ return ''
+ domain, port = match.groups()
+ netloc = domain + (port if port else '')
+ netloc = netloc.encode('idna')
+
+ # ensure path is valid and convert Unicode chars to %-encoded
+ if not path:
+ path = '/' # eg: 'http://google.com' -> 'http://google.com/'
+ path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
+
+ # ensure query is valid
+ query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
+
+ # ensure fragment is valid
+ fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
+
+ # piece it all back together, truncating it to a maximum of 4KB
+ url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+ return url[:4096]
+
+def fullurl(url,href):
+    # Resolve href against the page url, %-encoding spaces and dropping any
+    # fragment (#...) so equivalent links produce identical hashes upstream.
+    href = href.replace(" ","%20")
+    href = re.sub('#.*$','',href)
+    return urljoin(url,href)
+
+#http://diveintopython.org/http_web_services/etags.html
+class NotModifiedHandler(urllib2.BaseHandler):
+    # Surfaces HTTP 304 responses as a normal result object (with .code set)
+    # instead of raising, so fetchURL can detect "not modified".
+    def http_error_304(self, req, fp, code, message, headers):
+        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
+        addinfourl.code = code
+        return addinfourl
+
+def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
+    """Fetch url with ETag/Last-Modified caching via couchdb; store the body
+    as a timestamped attachment and return (url, mime_type, content).
+    Returns (None, None, None) for invalid URLs, 304s and download errors."""
+    url = canonurl(url)
+    hash = mkhash(url)
+    req = urllib2.Request(url)
+    print "Fetching %s" % url
+    # NOTE(review): this validity check runs after urllib2.Request(url) has
+    # already been constructed -- an empty url may raise before reaching it.
+    if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
+        print "Not a valid HTTP url"
+        return (None,None,None)
+    doc = docsdb.get(hash)
+    if doc == None:
+        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+    else:
+        # NOTE(review): time.time() is in seconds; 60*24*14*1000 is
+        # ~20,160,000 s (~233 days), not 14 days (that would be 60*60*24*14).
+        # Confirm the intended re-scrape interval.
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
+            print "Uh oh, trying to scrape URL again too soon!"
+            # Reuse the most recently stored attachment instead of re-fetching.
+            last_attachment_fname = doc["_attachments"].keys()[-1]
+            last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+            return (doc['url'],doc['mime_type'],last_attachment.read())
+        if scrape_again == False:
+            print "Not scraping this URL again as requested"
+            return (None,None,None)
+
+    time.sleep(3) # wait 3 seconds to give webserver time to recover
+
+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
+    #if there is a previous version stored in couchdb, load caching helper tags
+    if doc.has_key('etag'):
+        req.add_header("If-None-Match", doc['etag'])
+    if doc.has_key('last_modified'):
+        req.add_header("If-Modified-Since", doc['last_modified'])
+
+    opener = urllib2.build_opener(NotModifiedHandler())
+    try:
+        url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
+        headers = url_handle.info() # the addinfourls have the .info() too
+        doc['etag'] = headers.getheader("ETag")
+        doc['last_modified'] = headers.getheader("Last-Modified")
+        doc['date'] = headers.getheader("Date")
+        doc['page_scraped'] = time.time()
+        doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
+        doc['powered_by'] = headers.getheader("X-Powered-By")
+        doc['file_size'] = headers.getheader("Content-Length")
+        content_type = headers.getheader("Content-Type")
+        if content_type != None:
+            doc['mime_type'] = content_type.split(";")[0]
+        else:
+            # No Content-Type header: guess from the URL's file extension.
+            (type,encoding) = mimetypes.guess_type(url)
+            doc['mime_type'] = type
+        if hasattr(url_handle, 'code'):
+            if url_handle.code == 304:
+                print "the web page has not been modified"
+                return (None,None,None)
+            else:
+                content = url_handle.read()
+                docsdb.save(doc)
+                doc = docsdb.get(hash) # need to get a _rev
+                docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
+                return (doc['url'], doc['mime_type'], content)
+        #store as attachment epoch-filename
+
+    except urllib2.URLError as e:
+        error = ""
+        if hasattr(e, 'reason'):
+            error = "error %s in downloading %s" % (str(e.reason), url)
+        elif hasattr(e, 'code'):
+            error = "error %s in downloading %s" % (e.code, url)
+        print error
+        doc['error'] = error
+        docsdb.save(doc)
+        return (None,None,None)
+
+