Add disclo.gs homepage
Former-commit-id: 6abf389463f189798b499509f3dc589f78c6eacf
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,4 +25,7 @@
[submodule "lib/querypath"]
path = lib/querypath
url = https://github.com/technosophos/querypath.git
+[submodule "lib/amon-php"]
+ path = lib/amon-php
+ url = https://github.com/martinrusev/amon-php.git
--- a/about.php
+++ b/about.php
@@ -10,8 +10,10 @@
Disclo.gs is a project to monitor Australian Federal Government agencies
compliance with their <a href="http://www.oaic.gov.au/publications/other_operational/foi_policy_frequently_asked_questions.html#_Toc291837571">"proactive disclosure requirements" to make a transparency league table as suggested by gov2 taskforce http://gov2.net.au/blog/2009/09/19/a-league-ladder-of-psi-openness/</a>.
<h2> Attributions </h2>
-National Archives of Australia, Australian Governments’ Interactive Functions Thesaurus, 2nd edition, September 2005, published at http://www.naa.gov.au/recordkeeping/thesaurus/index.htm.
-
+National Archives of Australia, Australian Governments’ Interactive Functions Thesaurus, 2nd edition, September 2005, published at http://www.naa.gov.au/recordkeeping/thesaurus/index.htm <br/>
+data.gov.au http://data.gov.au/dataset/directory-gov-au-full-data-export/ <br/>
+directory.gov.au <br/>
+australia.gov.au http://australia.gov.au/about/copyright <br/>
<h2> Open everything </h2>
All documents released CC-BY 3 AU
Open source git @
--- a/alaveteli/exportAgencies.csv.php
+++ b/alaveteli/exportAgencies.csv.php
@@ -1,20 +1,6 @@
<?php
include_once("../include/common.inc.php");
-
-function shortName($name) {
- $name = trim($name);
- if (strstr($name,"Minister ") || strstr($name,"Treasurer") || strstr($name,"Parliamentary Secretary")) {
- $badWords = Array ("Assisting the Prime Minister on","Assisting on"," the "," of "," for "," on "," and "," to ",","," ","'","`");
- return str_replace($badWords,"",$name);
- }
-
- else {
- $out = Array();
- preg_match_all('/[A-Z]/', $name, $out);
- return implode("", $out[0]);
- }
-}
setlocale(LC_CTYPE, 'C');
@@ -63,45 +49,36 @@
$row = Array();
$row["#id"] = $agency->id;
$row["name"] = trim($agency->value->name);
- if (isset($agency->value->foiEmail)) {
- $row["request_email"] = $agency->value->foiEmail;
- } else {
- if ($agency->value->orgType == "FMA-DepartmentOfState") {
- $row["request_email"] = "foi@" . GetDomain($agency->value->website);
- } else {
- $row["request_email"] = $foiEmail[$agency->value->parentOrg];
+ $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
+ $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
+ $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");
+
+ $otherBodies = Array();
+ if (isset($agency->value->foiBodies)) {
+ $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
+ }
+ if (isset($agency->value->positions)) {
+ $positions = Array();
+ foreach ($agency->value->positions as $position) {
+ $positions[] = "Office of the ".$position;
}
+ $otherBodies = array_merge($otherBodies, $positions);
}
- if (isset($agency->value->shortName)) {
- $row["short_name"] = $agency->value->shortName;
- } else {
- $row["short_name"] = shortName($agency->value->name);
+ sort($otherBodies);
+ if (count($otherBodies) > 0) {
+ $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
}
- $row["notes"] = "";
+
$row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
$row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
if ($agency->value->orgType == "FMA-DepartmentOfState") {
- $row["tag_string"] = $tag[$agency->value->_id] . " " . $agency->value->orgType;
+ $row["tag_string"] = $tag[$agency->value->_id];
} else {
- $row["tag_string"] = $tag[$agency->value->parentOrg] . " " . $agency->value->orgType;
+ $row["tag_string"] = $tag[$agency->value->parentOrg];
}
-
+ $row["tag_string"] .= " " . $agency->value->orgType;
+ $row["tag_string"] .= " federal";
fputcsv($fp, array_values($row));
-
- if (isset($agency->value->foiBodies)) {
- foreach ($agency->value->foiBodies as $foiBody) {
- $row['name'] = iconv("UTF-8", "ASCII//TRANSLIT",$foiBody);
- $row["short_name"] = shortName($foiBody);
- fputcsv($fp, array_values($row));
- }
- }
- if (isset($agency->value->positions)) {
- foreach ($agency->value->positions as $position) {
- $row['name'] = iconv("UTF-8", "ASCII//TRANSLIT",$position);
- $row["short_name"] = shortName($position);
- fputcsv($fp, array_values($row));
- }
- }
}
}
} catch (SetteeRestClientException $e) {
--- /dev/null
+++ b/documents/404.html
@@ -1,1 +1,44 @@
+<!doctype html>
+<html lang="en">
+<head>
+ <meta charset="utf-8">
+ <title>Page Not Found :(</title>
+ <style>
+ ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
+ ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
+ html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
+ html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
+ body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
+ h1 { margin: 0 10px; font-size: 50px; text-align: center; }
+ h1 span { color: #bbb; }
+ h3 { margin: 1.5em 0 0.5em; }
+ p { margin: 1em 0; }
+ ul { padding: 0 0 0 40px; margin: 1em 0; }
+ .container { max-width: 380px; _width: 380px; margin: 0 auto; }
+ /* google search */
+ #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
+ #goog-fixurl form { margin: 0; }
+ #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
+ #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
+ #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
+ #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
+ #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
+ input::-moz-focus-inner { padding: 0; border: 0; }
+ </style>
+</head>
+<body>
+ <div class="container">
+ <h1>Not found <span>:(</span></h1>
+ <p>Sorry, but the page you were trying to view does not exist.</p>
+ <p>It looks like this was the result of either:</p>
+ <ul>
+ <li>a mistyped address</li>
+ <li>an out-of-date link</li>
+ </ul>
+ <script>
+ var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+ </script>
+ <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
+ </div>
+
Binary files /dev/null and b/documents/apple-touch-icon-114x114-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon-57x57-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon-72x72-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon-precomposed.png differ
Binary files /dev/null and b/documents/apple-touch-icon.png differ
--- /dev/null
+++ b/documents/checkScrapers.php
@@ -1,1 +1,42 @@
+<?php
+/*
+ * Scraper coverage report.
+ *
+ * Prints one table row per agency returned by the "byCanonicalName" view:
+ * the agency name and id, its disclosure-log URL (FOIDocumentsURL) if it has
+ * one, and whether ./scrapers/ holds a ".py" scraper or a ".txt" marker file
+ * named after the agency id.
+ */
+include_once('../include/common.inc.php');
+include_header('Webserver and Accessibility');
+
+echo "<table>
+    <tr><th>name</th><th>disclog</th><th>scraper?</th></tr>";
+$agenciesdb = $server->get_db('disclosr-agencies');
+$docsdb = $server->get_db('disclosr-documents');
+try {
+    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
+
+    if ($rows) {
+        foreach ($rows as $row) {
+            // These values come from the agencies database; escape them so
+            // stray markup characters cannot break (or inject into) the page.
+            $agencyName = htmlspecialchars($row->value->name, ENT_QUOTES, 'UTF-8');
+            $agencyId = htmlspecialchars($row->id, ENT_QUOTES, 'UTF-8');
+            $hasDisclog = isset($row->value->FOIDocumentsURL);
+
+            echo "<tr><td>" . $agencyName . " (" . $agencyId . ")</td>\n";
+            echo "<td>";
+            if ($hasDisclog) {
+                $disclogURL = $row->value->FOIDocumentsURL;
+                // The link passes the MD5 hash of the URL to viewDocument.php.
+                echo '<a href="viewDocument.php?hash=' . md5($disclogURL) . '">'
+                    . htmlspecialchars($disclogURL, ENT_QUOTES, 'UTF-8') . '</a>';
+            } else {
+                echo "<font color='red'>✘</font>";
+            }
+            echo "</td>\n<td>";
+            // The scraper column is only filled in for agencies that have a disclosure log.
+            if ($hasDisclog) {
+                if (file_exists("./scrapers/" . $row->id . '.py')) {
+                    echo "<font color='green'>✔</font>";
+                } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+                    echo "pass";
+                } else {
+                    echo "<font color='red'>✘</font>";
+                }
+            }
+            echo "</td></tr>\n";
+        }
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+// Close the table opened above (also after a failed view lookup) so the
+// footer is not emitted inside an unterminated <table>.
+echo "</table>\n";
+include_footer();
+?>
--- /dev/null
+++ b/documents/crossdomain.xml
@@ -1,1 +1,26 @@
+<?xml version="1.0"?>
+<!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd">
+<cross-domain-policy>
+
+<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+
+<!-- Most restrictive policy: -->
+ <site-control permitted-cross-domain-policies="none"/>
+
+
+
+<!-- Least restrictive policy: -->
+<!--
+ <site-control permitted-cross-domain-policies="all"/>
+ <allow-access-from domain="*" to-ports="*" secure="false"/>
+ <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+-->
+<!--
+ If you host a crossdomain.xml file with allow-access-from domain="*"
+ and don’t understand all of the points described here, you probably
+ have a nasty security vulnerability. ~ simon willison
+-->
+
+</cross-domain-policy>
+
Binary files /dev/null and b/documents/favicon.ico differ
--- /dev/null
+++ b/documents/genericScrapers.py
@@ -1,1 +1,61 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import scrape
+from bs4 import BeautifulSoup
+import abc
+
+class GenericOAICDisclogScraper(object):
+    """Base class for scraping a disclosure log published as a five-column HTML table.
+
+    Subclasses implement getAgencyID(), getURL() and getColumns(); doScrape()
+    fetches the page and saves one document per table row not yet in the
+    'disclosr-foidocuments' database.
+    """
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractmethod
+    def getAgencyID(self):
+        """ disclosr agency id """
+        return
+
+    @abc.abstractmethod
+    def getURL(self):
+        """ disclog URL"""
+        return
+
+    @abc.abstractmethod
+    def getColumns(self, columns):
+        """ rearranges columns if required """
+        return
+
+    def doScrape(self):
+        """Fetch getURL() and save every five-cell row of the first table as a document.
+
+        Rows are keyed by scrape.mkhash(url + document id); rows whose key is
+        already in the database are left untouched. Nothing is done when the
+        fetch yields no content, a non-HTML/XML mime type, or a page with no
+        <table> element.
+        """
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+        if content is None:
+            return
+        if mime_type not in ("text/html", "application/xhtml+xml", "application/xml"):
+            return
+
+        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+        soup = BeautifulSoup(content)
+        if soup.table is None:
+            # Previously this raised AttributeError on pages without any table.
+            print "ERROR no table found"
+            return
+
+        for row in soup.table.find_all('tr'):
+            columns = row.find_all('td')
+            if len(columns) == 5:
+                (docid, date, description, title, notes) = self.getColumns(columns)
+                # NOTE(review): .string is None for a cell holding more than one
+                # child node, which would make url + docid.string raise -- confirm
+                # the id cells are always plain text.
+                print docid.string
+                dochash = scrape.mkhash(url + docid.string)
+
+                # Collect every link in the row, resolved against the page URL.
+                links = []
+                for atag in row.find_all("a"):
+                    href = atag.get('href')
+                    if href is not None:
+                        links.append(scrape.fullurl(url, href))
+
+                descriptiontxt = "".join(description.stripped_strings)
+
+                doc = foidocsdb.get(dochash)
+                if doc is None:
+                    print "saving"
+                    doc = {'_id': dochash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': docid.string,
+                           "date": date.string, "description": descriptiontxt, "title": title.string, "notes": notes.string}
+                    foidocsdb.save(doc)
+                else:
+                    print "already saved"
+
+            elif len(row.find_all('th')) == 5:
+                print "header row"
+
+            else:
+                print "ERROR number of columns incorrect"
+                print row
+
--- /dev/null
+++ b/documents/google676a414ad086cefb.html
@@ -1,1 +1,2 @@
+google-site-verification: google676a414ad086cefb.html
--- /dev/null
+++ b/documents/googlebcce906c6b666bb8.html
@@ -1,1 +1,2 @@
+google-site-verification: googlebcce906c6b666bb8.html
--- /dev/null
+++ b/documents/humans.txt
@@ -1,1 +1,44 @@
+/* the humans responsible & colophon */
+/* humanstxt.org */
+
+/* TEAM */
+ <your title>: <your name>
+ Site:
+ Twitter:
+ Location:
+
+/* THANKS */
+ Names (& URL):
+
+/* SITE */
+ Standards: HTML5, CSS3
+ Components: Modernizr, jQuery
+ Software:
+
+
+
+ -o/-
+ +oo//-
+ :ooo+//:
+ -ooooo///-
+ /oooooo//:
+ :ooooooo+//-
+ -+oooooooo///-
+ -://////////////+oooooooooo++////////////::
+ :+ooooooooooooooooooooooooooooooooooooo+:::-
+ -/+ooooooooooooooooooooooooooooooo+/::////:-
+ -:+oooooooooooooooooooooooooooo/::///////:-
+ --/+ooooooooooooooooooooo+::://////:-
+ -:+ooooooooooooooooo+:://////:--
+ /ooooooooooooooooo+//////:-
+ -ooooooooooooooooooo////-
+ /ooooooooo+oooooooooo//:
+ :ooooooo+/::/+oooooooo+//-
+ -oooooo/::///////+oooooo///-
+ /ooo+::://////:---:/+oooo//:
+ -o+/::///////:- -:/+o+//-
+ :-:///////:- -:/://
+ -////:- --//:
+ -- -:
+
--- /dev/null
+++ b/documents/index.php
@@ -1,1 +1,94 @@
+<!doctype html>
+<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
+<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
+<head>
+ <meta charset="utf-8">
+ <!-- Use the .htaccess and remove these lines to avoid edge case issues.
+ More info: h5bp.com/i/378 -->
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+
+ <title>disclosure logs</title>
+ <meta name="description" content="">
+
+ <!-- Mobile viewport optimized: h5bp.com/viewport -->
+ <meta name="viewport" content="width=device-width">
+
+ <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
+<meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />
+ <link rel="stylesheet" href="css/style.css">
+
+ <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->
+
+ <!-- All JavaScript at the bottom, except this Modernizr build.
+ Modernizr enables HTML5 elements & feature detects for optimal performance.
+ Create your own custom Modernizr build: www.modernizr.com/download/ -->
+ <script src="js/libs/modernizr-2.5.3.min.js"></script>
+</head>
+<body>
+ <!-- Prompt IE 6 users to install Chrome Frame. Remove this if you support IE 6.
+ chromium.org/developers/how-tos/chrome-frame-getting-started -->
+ <!--[if lt IE 7]><p class=chromeframe>Your browser is <em>ancient!</em> <a href="http://browsehappy.com/">Upgrade to a different browser</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to experience this site.</p><![endif]-->
+ <header>
+<center><h1>disclosurelo.gs</h1></center>
+
+ </header>
+  <div role="main">
+<?php
+// HTTP_HOST is copied from the request's Host header, so escape it before
+// embedding it in the href attributes below.
+$host = htmlspecialchars($_SERVER['HTTP_HOST'], ENT_QUOTES, 'UTF-8');
+?>
+<dl>
+  <dt>Disclosure Log</dt>
+  <dd></dd>
+</dl>
+<a href="http://information.<?php echo $host; ?>">information.disclo.gs - documents</a><br>
+<a href="http://orgs.<?php echo $host; ?>">orgs.disclo.gs - structure</a><br>
+<a href="http://lobbyists.<?php echo $host; ?>">lobbyists.disclo.gs - lobbylens</a><br>
+<a href="http://contracts.<?php echo $host; ?>">contracts.disclo.gs - contractdashboard</a><br>
+
+  </div>
+ <footer>
+
+ </footer>
+
+
+ <!-- JavaScript at the bottom for fast page loading -->
+
+ <!-- Grab Google CDN's jQuery, with a protocol relative URL; fall back to local if offline -->
+ <script src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
+ <script>window.jQuery || document.write('<script src="js/libs/jquery-1.7.1.min.js"><\/script>')</script>
+
+ <!-- scripts concatenated and minified via build script -->
+ <script src="js/plugins.js"></script>
+ <script src="js/script.js"></script>
+ <!-- end scripts -->
+
+ <!-- Asynchronous Google Analytics snippet. Change UA-XXXXX-X to be your site's ID.
+ mathiasbynens.be/notes/async-analytics-snippet -->
+ <script>
+ var _gaq=[['_setAccount','UA-XXXXX-X'],['_trackPageview']];
+ (function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
+ g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
+ s.parentNode.insertBefore(g,s)}(document,'script'));
+ </script>
+</body>
+</html>
+
+
+<!-- Draft outline of the page content (kept in a comment so it is not rendered after </html>):
+australian disclosure logs
+
+are you looking for more information about:
+contracts
+gov orgs
+lobbyists
+
+1/1/11 title (Dept dfggdfgdf)
+description:
+source link:
+documents:
+ #1 title link
+
+
+rss feed here
+-->
--- /dev/null
+++ b/documents/readme.md
@@ -1,1 +1,56 @@
+# [HTML5 Boilerplate](http://html5boilerplate.com)
+HTML5 Boilerplate is a professional front-end template that helps you build fast, robust, adaptable, and future-proof websites. Spend more time developing and less time reinventing the wheel.
+
+This project is the product of many years of iterative development and combined community knowledge. It does not impose a specific development philosophy or framework, so you're free to architect your code in the way that you want.
+
+
+## Quick start
+
+Clone the git repo - `git clone git://github.com/h5bp/html5-boilerplate.git` - or [download it](https://github.com/h5bp/html5-boilerplate/zipball/master)
+
+
+## Features
+
+* HTML5 ready. Use the new elements with confidence.
+* Cross-browser compatible (Chrome, Opera, Safari, Firefox 3.6+, IE6+).
+* Designed with progressive enhancement in mind.
+* CSS normalizations and common bug fixes.
+* IE-specific classes for easier cross-browser control.
+* A default print stylesheet, performance optimized.
+* Mobile browser optimizations.
+* Protection against any stray `console.log` causing JavaScript errors in IE6/7.
+* The latest jQuery via CDN, with a local fallback.
+* A custom Modernizr build for feature detection.
+* An optimized Google Analytics snippet.
+* Apache server caching, compression, and other configuration defaults for Grade-A performance.
+* Cross-domain Ajax and Flash.
+* "Delete-key friendly." Easy to strip out parts you don't need.
+* Extensive inline and accompanying documentation.
+
+
+## Contributing
+
+Anyone and everyone is welcome to [contribute](https://github.com/h5bp/html5-boilerplate/wiki/contribute). Hundreds of developers have helped make the HTML5 Boilerplate what it is today.
+
+
+## Project information
+
+* Source: http://github.com/h5bp/html5-boilerplate
+* Web: http://html5boilerplate.com
+* Docs: http://html5boilerplate.com/docs
+* Twitter: http://twitter.com/h5bp
+
+
+## License
+
+### Major components:
+
+* jQuery: MIT/GPL license
+* Modernizr: MIT/BSD license
+* Normalize.css: Public Domain
+
+### Everything else:
+
+The Unlicense (aka: public domain)
+
--- /dev/null
+++ b/documents/redirect.php
@@ -1,1 +1,19 @@
+<?php
+// Answers the two Google site-verification paths directly; every other
+// request is permanently redirected to the same URI on disclosurelo.gs.
+$requestUri = $_SERVER['REQUEST_URI'];
+// The server name with 'disclo.gs' removed is used as the host prefix.
+$hostPrefix = str_replace('disclo.gs', '', $_SERVER['SERVER_NAME']);
+
+// Verification paths and the exact response body each one returns.
+$verificationReplies = Array(
+    '/google676a414ad086cefb.html' => 'google-site-verification: google676a414ad086cefb.html',
+    '/googlebcce906c6b666bb8.html' => 'google-site-verification: googlebcce906c6b666bb8.html'
+);
+foreach ($verificationReplies as $verificationPath => $reply) {
+    if ($requestUri == $verificationPath) {
+        echo $reply;
+        exit();
+    }
+}
+
+header('HTTP/1.1 301 Moved Permanently');
+header('Location: http://' . $hostPrefix . 'disclosurelo.gs' . $requestUri);
+exit();
+?>
+
+
--- /dev/null
+++ b/documents/robots.txt
@@ -1,1 +1,5 @@
+# www.robotstxt.org/
+# http://code.google.com/web/controlcrawlindex/
+User-agent: *
+
--- /dev/null
+++ b/documents/rss.xml.php
@@ -1,1 +1,30 @@
+<?php
+// Planned feed content: "Agency X updated Y", new files, diff of plain text/link text,
+// with a feed for just one agency or for all agencies.
+// This is a minimal example of using the Universal Feed Generator class.
+include("lib/FeedWriter.php");
+// Create an instance of the FeedWriter class.
+$TestFeed = new FeedWriter(RSS2);
+// Set the channel elements.
+// Use the wrapper functions for common channel elements.
+$TestFeed->setTitle('Last Modified - All');
+$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
+$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
+// Retrieve rows from the database. NOTE(review): $db is never assigned in this file -- confirm where it is defined.
+$rows = $db->get_view("app", "byLastModified")->rows;
+//print_r($rows);
+foreach ($rows as $row) {
+ // Create an empty FeedItem.
+ $newItem = $TestFeed->createNewItem();
+ // Add elements to the feed item. NOTE(review): rows are indexed as arrays here -- confirm get_view() does not return objects.
+ $newItem->setTitle($row['name']);
+ $newItem->setLink($row['id']);
+ $newItem->setDate(date("c", $row['metadata']['lastModified']));
+ $newItem->setDescription($row['name']);
+ // Now add the feed item to the feed.
+ $TestFeed->addItem($newItem);
+}
+// Everything is done; now generate the feed.
+$TestFeed->genarateFeed();
+?>
--- /dev/null
+++ b/documents/run.bat
@@ -1,1 +1,2 @@
-
+python scrape.py
+pause
--- /dev/null
+++ b/documents/runScrapers.php
--- /dev/null
+++ b/documents/scrape.py
@@ -1,1 +1,214 @@
-
+#http://packages.python.org/CouchDB/client.html
+import couchdb
+import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
+import hashlib
+from urlparse import urljoin
+import time
+import os
+import mimetypes
+import re
+import urllib
+import urlparse
+
+def mkhash(input):
+    """Return the hexadecimal MD5 digest of `input`, encoded as UTF-8."""
+    digest = hashlib.md5(input)
+    return digest.hexdigest().encode("utf-8")
+
+def canonurl(url):
+ r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
+ if the URL looks invalid.
+ >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
+ 'http://xn--hgi.ws/'
+ """
+ # strip spaces at the ends and ensure it's prefixed with 'scheme://'
+ url = url.strip()
+ if not url:
+ return ''
+ if not urlparse.urlsplit(url).scheme:
+ url = 'http://' + url
+
+ # turn it into Unicode
+ #try:
+ # url = unicode(url, 'utf-8')
+ #except UnicodeDecodeError:
+ # return '' # bad UTF-8 chars in URL
+
+ # parse the URL into its components
+ parsed = urlparse.urlsplit(url)
+ scheme, netloc, path, query, fragment = parsed
+
+ # ensure scheme is a letter followed by letters, digits, and '+-.' chars
+ if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
+ return ''
+ scheme = str(scheme)
+
+ # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
+ match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
+ if not match:
+ return ''
+ domain, port = match.groups()
+ netloc = domain + (port if port else '')
+ netloc = netloc.encode('idna')
+
+ # ensure path is valid and convert Unicode chars to %-encoded
+ if not path:
+ path = '/' # eg: 'http://google.com' -> 'http://google.com/'
+ path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
+
+ # ensure query is valid