more scraper work


Former-commit-id: f58b0639b55df64a91f425e957bd65ca579545ce

[submodule "couchdb/couchdb-lucene"] [submodule "couchdb/couchdb-lucene"]
path = couchdb/couchdb-lucene path = couchdb/couchdb-lucene
url = https://github.com/rnewson/couchdb-lucene.git url = https://github.com/rnewson/couchdb-lucene.git
[submodule "couchdb/settee"] [submodule "couchdb/settee"]
path = couchdb/settee path = couchdb/settee
url = https://github.com/inadarei/settee.git url = https://github.com/inadarei/settee.git
[submodule "lib/php-diff"] [submodule "lib/php-diff"]
path = lib/php-diff path = lib/php-diff
url = https://github.com/chrisboulton/php-diff.git url = https://github.com/chrisboulton/php-diff.git
[submodule "lib/Requests"] [submodule "lib/Requests"]
path = lib/Requests path = lib/Requests
url = https://github.com/rmccue/Requests.git url = https://github.com/rmccue/Requests.git
[submodule "js/flotr2"] [submodule "js/flotr2"]
path = js/flotr2 path = js/flotr2
url = https://github.com/HumbleSoftware/Flotr2.git url = https://github.com/HumbleSoftware/Flotr2.git
[submodule "lib/phpquery"] [submodule "lib/phpquery"]
path = lib/phpquery path = lib/phpquery
url = https://github.com/TobiaszCudnik/phpquery.git url = https://github.com/TobiaszCudnik/phpquery.git
[submodule "js/sigma"] [submodule "js/sigma"]
path = js/sigma path = js/sigma
url = https://github.com/jacomyal/sigma.js.git url = https://github.com/jacomyal/sigma.js.git
[submodule "js/bubbletree"] [submodule "js/bubbletree"]
path = js/bubbletree path = js/bubbletree
url = https://github.com/okfn/bubbletree.git url = https://github.com/okfn/bubbletree.git
[submodule "lib/querypath"] [submodule "lib/querypath"]
path = lib/querypath path = lib/querypath
url = https://github.com/technosophos/querypath.git url = https://github.com/technosophos/querypath.git
  [submodule "lib/amon-php"]
  path = lib/amon-php
  url = https://github.com/martinrusev/amon-php.git
   
<?php

// Import Right To Know agency page URLs and match them against
// disclosr agency records by name.

require_once '../include/common.inc.php';

$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$accounts = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}

function extractCSVAccounts($url, $nameField, $accountField, $filter = false) {
    global $accounts, $nametoid;
    $request = Requests::get($url);
    echo $url;
    $Data = str_getcsv($request->body, "\n"); // parse the rows
    $headers = Array();
    foreach ($Data as $num => $line) {
        $Row = str_getcsv($line, ",");
        if ($num == 0) {
            // first row contains the column headers
            $headers = $Row;
            print_r($headers);
        } else {
            if (isset($Row[array_search($nameField, $headers)])) {
                $agencyName = $Row[array_search($nameField, $headers)];
                if (!in_array(trim($agencyName), array_keys($nametoid))) {
                    echo "$agencyName missing" . PHP_EOL;
                } else {
                    echo $Row[array_search($nameField, $headers)] . PHP_EOL;
                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/' . $Row[array_search($accountField, $headers)];
                }
            } else {
                echo "error finding any agency in line: " . $line . PHP_EOL;
            }
        }
    }
}

extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv", "Agency", "URL name");
print_r($accounts);
/* foreach ($accounts as $id => $accountTypes) {
    echo $id . "<br>" . PHP_EOL;
    $doc = object_to_array($db->get($id));
    // print_r($doc);

    foreach ($accountTypes as $accountType => $accounts) {
        if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
            $doc["has" . $accountType] = Array();
        }
        $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
    }
    $db->save($doc);
}*/
?>
 
<?php

include_once("../include/common.inc.php");

setlocale(LC_CTYPE, 'C');

$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");

$db = $server->get_db('disclosr-agencies');

$tag = Array();
try {
    $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}

$foiEmail = Array();
try {
    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $foiEmail[$row->key] = $row->value;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}

$fp = fopen('php://output', 'w');
if ($fp && $db) {
    header('Content-Type: text/csv; charset=utf-8');
    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
    header('Pragma: no-cache');
    header('Expires: 0');
    fputcsv($fp, $headers);
    try {
        $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
        //print_r($rows);
        foreach ($agencies as $agency) {
            // print_r($agency);

            if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
                $row = Array();
                $row["#id"] = $agency->id;
                $row["name"] = trim($agency->value->name);
                $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
                $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
                $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");

                $otherBodies = Array();
                if (isset($agency->value->foiBodies)) {
                    $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
                }
                if (isset($agency->value->positions)) {
                    $positions = Array();
                    foreach ($agency->value->positions as $position) {
                        $positions[] = "Office of the " . $position;
                    }
                    $otherBodies = array_merge($otherBodies, $positions);
                }
                sort($otherBodies);
                if (count($otherBodies) > 0) {
                    $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
                }

                $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
                $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
                if ($agency->value->orgType == "FMA-DepartmentOfState") {
                    $row["tag_string"] = $tag[$agency->value->_id];
                } else {
                    $row["tag_string"] = $tag[$agency->value->parentOrg];
                }
                $row["tag_string"] .= " " . $agency->value->orgType;
                $row["tag_string"] .= " federal";
                fputcsv($fp, array_values($row));
            }
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }

    die;
}
?>
   
  google-site-verification: google676a414ad086cefb.html
 
  australian disclosure logs
 
  are you looking for more information about:
  contracts
  gov orgs
  lobbyists
 
  1/1/11 title (Dept dfggdfgdf)
  description:
  source link:
  documents:
  #1 title link
 
 
  rss feed here
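
Each disclosure log entry ends up as a document in the disclosr-foidocuments CouchDB database. A rough sketch of that shape, using the field names the APVMA scraper further below saves (the values here are invented placeholders, not real data):

    # illustrative foidocument record only; field names follow the APVMA scraper
    doc = {
        '_id': 'md5 hash of the source url plus the document id',
        'agencyID': 'couchdb id of the agency in disclosr-agencies',
        'url': 'http://www.example.gov.au/foi/disclosure-log.html',
        'docID': 'FOI 2012/01',
        'date': '1 January 2012',
        'title': 'title',
        'description': 'description',
        'notes': '',
        'links': ['http://www.example.gov.au/foi/documents/1.pdf'],
    }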
<?php

// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimal example of using the Universal Feed Generator class
include_once('include/common.inc.php');
include("lib/FeedWriter.php");
//Creating an instance of FeedWriter class.
$TestFeed = new FeedWriter(RSS2);
//Setting the channel elements
//Use wrapper functions for common channel elements
$TestFeed->setTitle('Last Modified - All');
$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
$TestFeed->setDescription('This is a test of creating an RSS 2.0 feed with Universal Feed Writer');
//Retrieving information from the database
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byLastModified")->rows;
//print_r($rows);
foreach ($rows as $row) {
    //Create an empty FeedItem
    $newItem = $TestFeed->createNewItem();
    //Add elements to the feed item
    $newItem->setTitle($row->value->name);
    $newItem->setLink($row->id);
    $newItem->setDate(date("c", $row->value->metadata->lastModified));
    $newItem->setDescription($row->value->name);
    //Now add the feed item
    $TestFeed->addItem($newItem);
}
//OK. Everything is done. Now generate the feed (the library method name is spelt "genarate").
$TestFeed->genarateFeed();
?>
  #http://packages.python.org/CouchDB/client.html
  import couchdb
  import urllib2
  from BeautifulSoup import BeautifulSoup
  import re
  import hashlib
  from urlparse import urljoin
  import time
  import os
  import mimetypes
  import urllib
  import urlparse
 
def mkhash(input):
    return hashlib.md5(input).hexdigest().encode("utf-8")
 
def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.
    >>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url

    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return ''  # bad UTF-8 chars in URL

    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed

    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)

    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    netloc = netloc.encode('idna')

    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]
 
def fullurl(url, href):
    href = href.replace(" ", "%20")
    href = re.sub('#.*$', '', href)
    return urljoin(url, href)
 
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl
 
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
    # reject non-HTTP urls before trying to canonicalise or request them
    if url == None or url == "" or url.startswith("mailto") or url.startswith("javascript") or url.startswith("#"):
        print "Not a valid HTTP url"
        return (None, None, None)
    url = canonurl(url)
    hash = mkhash(url)
    req = urllib2.Request(url)
    print "Fetching %s" % url
    doc = docsdb.get(hash)
    if doc == None:
        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName}
    else:
        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 999999):
            print "Uh oh, trying to scrape URL again too soon!"
            last_attachment_fname = doc["_attachments"].keys()[-1]
            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
            return (doc['url'], doc['mime_type'], last_attachment.read())
        if scrape_again == False:
            print "Not scraping this URL again as requested"
            return (None, None, None)

    time.sleep(3) # wait 3 seconds to give webserver time to recover

    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
    #if there is a previous version stored in couchdb, load caching helper tags
    if doc.has_key('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.has_key('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])

    opener = urllib2.build_opener(NotModifiedHandler())
    try:
        url_handle = opener.open(req)
        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
        headers = url_handle.info() # the addinfourls have the .info() too
        doc['etag'] = headers.getheader("ETag")
        doc['last_modified'] = headers.getheader("Last-Modified")
        doc['date'] = headers.getheader("Date")
        doc['page_scraped'] = time.time()
        doc['web_server'] = headers.getheader("Server")
        doc['via'] = headers.getheader("Via")
        doc['powered_by'] = headers.getheader("X-Powered-By")
        doc['file_size'] = headers.getheader("Content-Length")
        content_type = headers.getheader("Content-Type")
        if content_type != None:
            doc['mime_type'] = content_type.split(";")[0]
        else:
            (type, encoding) = mimetypes.guess_type(url)
            doc['mime_type'] = type
        if hasattr(url_handle, 'code') and url_handle.code == 304:
            print "the web page has not been modified"
            return (None, None, None)
        else:
            content = url_handle.read()
            docsdb.save(doc)
            doc = docsdb.get(hash) # need to get a _rev
            # store the page content as an attachment named epoch-filename
            docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
            return (doc['url'], doc['mime_type'], content)
    except urllib2.URLError as e:
        error = ""
        if hasattr(e, 'reason'):
            error = "error %s in downloading %s" % (str(e.reason), url)
        elif hasattr(e, 'code'):
            error = "error %s in downloading %s" % (e.code, url)
        print error
        doc['error'] = error
        docsdb.save(doc)
        return (None, None, None)
 
 
 
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
    (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID)
    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
    if content != None and depth > 0 and url not in badURLs:
        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(content)
            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
            for nav in navIDs:
                print "Removing element", nav['id']
                nav.extract()
            navClasses = soup.findAll(attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
            for nav in navClasses:
                print "Removing element", nav['class']
                nav.extract()
            links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
            linkurls = set([])
            for link in links:
                if link.has_key("href"):
                    if link['href'].startswith("http"):
                        # lets not do external links for now
                        # linkurls.add(link['href'])
                        None
                    elif link['href'].startswith("mailto"):
                        # not http
                        None
                    elif link['href'].startswith("javascript"):
                        # not http
                        None
                    else:
                        # remove anchors and spaces in urls
                        linkurls.add(fullurl(url, link['href']))
            for linkurl in linkurls:
                #print linkurl
                scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
 
  couch = couchdb.Server('http://127.0.0.1:5984/')
  # select database
  agencydb = couch['disclosr-agencies']
  docsdb = couch['disclosr-documents']
 
if __name__ == "__main__":
    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
        agency = agencydb.get(row.id)
        print agency['name']
        for key in agency.keys():
            if key == 'website':
                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
            if key.endswith('URL'):
                print key
                depth = 1
                if 'scrapeDepth' in agency.keys():
                    depth = agency['scrapeDepth']
                scrapeAndStore(docsdb, agency[key], depth, key, agency['_id'])
        agency['metadata']['lastScraped'] = time.time()
        agencydb.save(agency)
 
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import scrape
  foidocsdb = scrape.couch['disclosr-foidocuments']
 
#RSS feed not detailed
from bs4 import BeautifulSoup
#http://www.apvma.gov.au/about/foi/disclosure/index.php
agencyID = "3cd40b1240e987cbcd3f0e67054ce259"
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, "http://www.apvma.gov.au/about/foi/disclosure/index.php", "foidocuments", agencyID)
if content != None:
    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
        # http://www.crummy.com/software/BeautifulSoup/documentation.html
        soup = BeautifulSoup(content)
        for row in soup.table.find_all('tr'):
            columns = row.find_all('td')
            if len(columns) == 5:
                (id, date, description, title, notes) = columns
                print id.string
                hash = scrape.mkhash(url + id.string)
                links = []
                for atag in row.find_all("a"):
                    if atag.has_attr('href'):
                        links.append(scrape.fullurl(url, atag['href']))
                doc = foidocsdb.get(hash)
                descriptiontxt = ""
                for string in description.stripped_strings:
                    descriptiontxt = descriptiontxt + string
                if doc == None:
                    print "saving"
                    doc = {'_id': hash, 'agencyID': agencyID, 'url': url, "links": links, 'docID': id.string, "date": date.string, "description": descriptiontxt, "title": title.string, "notes": notes.string}
                    foidocsdb.save(doc)
                else:
                    print "already saved"
            elif len(row.find_all('th')) == 5:
                print "header row"
            else:
                print "ERROR number of columns incorrect"
                print row
 
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import scrape
  foidocsdb = scrape.couch['disclosr-foidocuments']
 
#rss feed has only one entry
#http://www.daff.gov.au/about/foi/ips/disclosure-log
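
# A rough sketch of scraping the HTML log instead, following the APVMA scraper
# above; the DAFF agencyID is a placeholder and the page's markup is an
# assumption, so this only lists candidate document links rather than saving them.
from bs4 import BeautifulSoup
agencyID = "REPLACE-with-daff-id-from-disclosr-agencies"  # placeholder
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, "http://www.daff.gov.au/about/foi/ips/disclosure-log", "foidocuments", agencyID)
if content != None and mime_type in ("text/html", "application/xhtml+xml", "application/xml"):
    soup = BeautifulSoup(content)
    for atag in soup.find_all('a'):
        if atag.has_attr('href'):
            print scrape.fullurl(url, atag['href'])  # candidate disclosure log document links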
 
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import scrape
  foidocsdb = scrape.couch['disclosr-foidocuments']
 
  import feedparser
  feed = feedparser.parse( "http://foi.deewr.gov.au/disclosure-log/rss")
  print feed.entries[0]
  #foreach feed.entries
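
# A sketch of how the remaining entries might be stored, mirroring the fields the
# APVMA scraper uses; the exact feedparser fields available on this feed
# (published, summary) and the agencyID value are assumptions.
agencyID = "REPLACE-with-deewr-id-from-disclosr-agencies"  # placeholder
for entry in feed.entries:
    hash = scrape.mkhash(entry.link)
    doc = foidocsdb.get(hash)
    if doc == None:
        print "saving " + entry.link
        doc = {'_id': hash, 'agencyID': agencyID, 'url': entry.link,
               'title': entry.title, 'description': entry.get("summary", ""),
               'date': entry.get("published", ""), 'links': [entry.link]}
        foidocsdb.save(doc)
    else:
        print "already saved"
# The same loop could presumably handle the finance.gov.au and Right To Know
# feeds listed below.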
 
 
  www.finance.gov.au/foi/disclosure-log/foi-rss.xml
  http://www.righttoknow.org.au/feed/search/%20(latest_status:successful%20OR%20latest_status:partially_successful)
<?php
include_once('include/common.inc.php');
include_header('Search');
?>
<div class="foundation-header">
    <h1><a href="search.php">Search</a></h1>
</div>
<form>
    <input type="text" name="q" value="<?php if (isset($_REQUEST['q'])) echo htmlspecialchars($_REQUEST['q']); ?>"/>
    <input type="submit"/>
</form>

<?php
if (isset($_REQUEST['q'])) {
    $request = Requests::get($serverAddr . "disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=" . urlencode($_REQUEST['q']));
    $results = json_decode($request->body);
    $db = $server->get_db('disclosr-documents');
    foreach ($results->rows as $result) {
        //print_r($result);
        //$row = $db->get($result->id);
        echo $result->doc->_id . " " . $result->doc->url . "<br>" . PHP_EOL;
    }
}
include_footer();
?>
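
The same couchdb-lucene index can be queried outside PHP; a minimal sketch in Python, assuming the local CouchDB address and the by_all index path used by search.php above (the example query string is arbitrary):

import json
import urllib
import urllib2

def search_documents(q, server="http://127.0.0.1:5984/"):
    # hit the couchdb-lucene full-text index defined for disclosr-documents
    url = server + "disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=" + urllib.quote(q)
    results = json.load(urllib2.urlopen(url))
    for row in results["rows"]:
        print row["doc"]["_id"], row["doc"].get("url", "")

search_documents("grants")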
<?php

include_once('include/common.inc.php');


function displayValue($key, $value, $mode) {
    global $db, $schemas;
    if ($mode == "view") {
        if (strpos($key, "_") === 0 || $key == "metadata") return;
        echo "<tr>";

        echo "<td>";
        if (isset($schemas['agency']["properties"][$key])) {
            echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small>";
        }
        echo "</td><td>";
        if (is_array($value)) {
            echo "<ol>";
            foreach ($value as $subkey => $subvalue) {
                echo "<li ";
                if (isset($schemas['agency']["properties"][$key]['x-property'])) {
                    echo ' property="' . $schemas['agency']["properties"][$key]['x-property'] . '" ';
                }
                if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
                    echo ' itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '" ';
                }
                echo " >";
                echo "$subvalue</li>";
            }
            echo "</ol></td></tr>";
        } else {
            if (isset($schemas['agency']["properties"][$key]['x-property'])) {
                echo '<span property="' . $schemas['agency']["properties"][$key]['x-property'] . '">';
            } else {
                echo "<span>";
            }
            if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
                echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>$value</a>";
            } else {
                echo "$value</span>";
            }
        }
        echo "</td></tr>";
    }
    if ($mode == "edit") {
        if (is_array($value)) {
            echo '<div class="row">
                <div class="seven columns">
                    <fieldset>
                        <h5>' . $key . '</h5>';
            foreach ($value as $subkey => $subvalue) {
                echo "<label>$subkey</label><input class='input-text' type='text' id='$key$subkey' name='$key" . '[' . $subkey . "]' value='$subvalue'/></tr>";
            }
            echo "</fieldset>
                </div>
            </div>";
        } else {
            if (strpos($key, "_") === 0) {
                echo "<input type='hidden' id='$key' name='$key' value='$value'/>";
            } else if ($key == "parentOrg") {
                echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>";
                $rows = $db->get_view("app", "byDeptStateName")->rows;
                //print_r($rows);
                foreach ($rows as $row) {
                    echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>";
                }
                echo " </select>";
            } else {
                echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
                if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
                    echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>view</a>";
                }
                if ($key == 'abn') {
                    echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
                }
            }
        }
    }
    //
}
   
function addDefaultFields($row) {
    global $schemas;
    $defaultFields = array_keys($schemas['agency']['properties']);
    foreach ($defaultFields as $defaultField) {
        if (!isset($row[$defaultField])) {
            if ($schemas['agency']['properties'][$defaultField]['type'] == "string") {
                $row[$defaultField] = "";
            }
            if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
                $row[$defaultField] = Array("");
            }
        } else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
            if (is_array($row[$defaultField])) {
                $row[$defaultField][] = "";
                $row[$defaultField][] = "";
                $row[$defaultField][] = "";
            } else {
                $value = $row[$defaultField];
                $row[$defaultField] = Array($value);
                $row[$defaultField][] = "";
                $row[$defaultField][] = "";
            }
        }
    }
    return $row;
}
   
$db = $server->get_db('disclosr-agencies');

if (isset($_REQUEST['id'])) {
    //get an agency record as json/html, search by name/abn/id
    // by name = startkey="Ham"&endkey="Ham\ufff0"
    // edit?

    $obj = $db->get($_REQUEST['id']);
    include_header($obj->name);
    //print_r($row);
    if (sizeof($_POST) > 0) {
        //print_r($_POST);
        foreach ($_POST as $postkey => $postvalue) {
            if ($postvalue == "") {
                unset($_POST[$postkey]);
            }
            if (is_array($postvalue)) {
                if (count($postvalue) == 1 && $postvalue[0] == "") {
                    unset($_POST[$postkey]);
                } else {
                    foreach ($_POST[$postkey] as $key => &$value) {
                        if ($value == "") {
                            unset($_POST[$postkey][$key]);
                        }
                    }
                }
            }
        }
        if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
            echo "Edited version was latest version, continue saving";
            $newdoc = $_POST;
            $newdoc['metadata']['lastModified'] = time();
            $obj = $db->save($newdoc);
        } else {
            echo "ALERT doc revised by someone else while editing. Document not saved.";
        }
    }

    $mode = "view";
    $rowArray = object_to_array($obj);
    ksort($rowArray);
    if ($mode == "edit") {
        $row = addDefaultFields($rowArray);
    } else {
        $row = $rowArray;
    }

    if ($mode == "view") {
        echo '<div itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization" about="#' . $row['_id'] . '"><table width="100%">';
        echo '<tr> <td colspan="2"><h3 itemprop="name">' . $row['name'] . "</h3></td></tr>";
        echo "<tr><th>Field Name</th><th>Field Value</th></tr>";
    }
    if ($mode == "edit") {
        ?>
        <input id="addfield" type="button" value="Add Field"/>
        <script>
            window.onload = function() {
                $(document).ready(function() {
                    // put all your jQuery goodness in here.
                    // http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/
                    $('#addfield').click(function() {
                        var field_name = window.prompt("fieldname?", "");
                        if (field_name != "") {
                            $('#submitbutton').before($('<span></span>')
                                .append("<label>" + field_name + "</label>")
                                .append("<input class='input-text' type='text' id='" + field_name + "' name='" + field_name + "'/>")
                            );
                        }
                    });
                });
            };
        </script>
        <form id="editform" class="nice" method="post">
        <?php
    }
    foreach ($row as $key => $value) {
        echo displayValue($key, $value, $mode);
    }
    if ($mode == "view") {
        echo "</table></div>";
    }
    if ($mode == "edit") {
        echo '<input id="submitbutton" type="submit"/></form>';
    }
} else {
    // show all list
    include_header('Agencies');
    try {
        /* $rows = $db->get_view("app", "showNamesABNs")->rows;
        //print_r($rows);
        foreach ($rows as $row) {
            // print_r($row);
            echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
            (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
            . '</a></li>';
        } */
        $rows = $db->get_view("app", "byCanonicalName")->rows;
        //print_r($rows);
        echo '<ul>';
        foreach ($rows as $row) {
            // print_r($row);
            echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
                <a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">' .
                $row->value->name
                . '</span></a></li>';
        }
        echo "</ul>";
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
}
include_footer();
?>
   
<?php

date_default_timezone_set("Australia/Sydney");

$basePath = "";
if (strstr($_SERVER['PHP_SELF'], "alaveteli/")
        || strstr($_SERVER['PHP_SELF'], "admin/")
        || strstr($_SERVER['PHP_SELF'], "lib/")
        || strstr($_SERVER['PHP_SELF'], "include/"))
    $basePath = "../";

include_once ('couchdb.inc.php');
include_once ('template.inc.php');
require_once $basePath . 'lib/Requests/library/Requests.php';

Requests::register_autoloader();
require $basePath . "lib/amon-php/amon.php";
Amon::config(array('address' => 'http://127.0.0.1:2464',
    'protocol' => 'http',
    'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
Amon::setup_exception_handler();

# Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735

function object_to_array(stdClass $Class) {
    # Typecast to (array) automatically converts stdClass -> array.
    $Class = (array) $Class;

    # Iterate through the former properties looking for any stdClass properties.
    # Recursively apply (array).
    foreach ($Class as $key => $value) {
        if (is_object($value) && get_class($value) === 'stdClass') {
            $Class[$key] = object_to_array($value);
        }
    }
    return $Class;
}

# Convert an Array to stdClass. http://www.php.net/manual/en/language.types.object.php#102735

function array_to_object(array $array) {
    # Iterate through our array looking for array values.
    # If found, recursively call itself.
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            $array[$key] = array_to_object($value);
        }
    }

    # Typecast to (object) will automatically convert array -> stdClass
    return (object) $array;
}

function dept_to_portfolio($deptName) {
    return trim(str_replace("Department of", "", str_replace("Department of the", "Department of", $deptName)));
}

function phrase_to_tag($phrase) {
    return str_replace(" ", "_", str_replace("'", "", str_replace(",", "", strtolower($phrase))));
}

function local_url() {
    return "http://" . $_SERVER['HTTP_HOST'] . rtrim(dirname($_SERVER['PHP_SELF']), '/\\') . "/";
}

function GetDomain($url) {
    $nowww = ereg_replace('www\.', '', $url);
    $domain = parse_url($nowww);
    if (!empty($domain["host"])) {
        return $domain["host"];
    } else {
        return $domain["path"];
    }
}
   
<?php

include $basePath . "schemas/schemas.inc.php";

require ($basePath . 'couchdb/settee/src/settee.php');

function createDocumentsDesignDoc() {
    /* "views": {
        "web_server": {
            "map": "function(doc) {\n  emit(doc.web_server, 1);\n}",
            "reduce": "function (key, values, rereduce) {\n    return sum(values);\n}"
        },
        "byAgency": {
            "map": "function(doc) {\n  emit(doc.agencyID, 1);\n}",
            "reduce": "function (key, values, rereduce) {\n    return sum(values);\n}"
        },
        "byURL": {
            "map": "function(doc) {\n  emit(doc.url, doc);\n}"
        },
        "agency": {
            "map": "function(doc) {\n  emit(doc.agencyID, doc);\n}"
        },
        "byWebServer": {
            "map": "function(doc) {\n  emit(doc.web_server, doc);\n}"
        },
        "getValidationRequired": {
            "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n  emit(doc._id, doc._attachments);\n}\n}"
        }
    } */
}

function createAgencyDesignDoc() {
    global $db;
    $obj = new stdClass();
    $obj->_id = "_design/" . urlencode("app");
    $obj->language = "javascript";
    $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
    $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
    $obj->views->byCanonicalName->map = "function(doc) {
        if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
            emit(doc.name, doc);
        }
    };";
    $obj->views->byDeptStateName->map = "function(doc) {
        if (doc.orgType == 'FMA-DepartmentOfState') {
            emit(doc.name, doc._id);
        }
    };";
    $obj->views->parentOrgs->map = "function(doc) {
        if (doc.parentOrg) {
            emit(doc._id, doc.parentOrg);
        }
    };";
    $obj->views->byName->map = 'function(doc) {
        if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
            emit(doc.name, doc._id);
            if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
                emit(doc.shortName, doc._id);
            }
            for (name in doc.otherNames) {
                if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                    emit(doc.otherNames[name], doc._id);
                }
            }
            for (name in doc.foiBodies) {
                if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
                    emit(doc.foiBodies[name], doc._id);
                }
            }
        }
    };';

    $obj->views->foiEmails->map = "function(doc) {
        emit(doc._id, doc.foiEmail);
    };";

    $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
    $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
    $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
    $obj->views->getScrapeRequired->map = "function(doc) {

        var lastScrape = Date.parse(doc.metadata.lastScraped);

        var today = new Date();

        if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
            emit(doc._id, doc);
        }

    };";
    $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
    $obj->views->getConflicts->map = "function(doc) {
        if (doc._conflicts) {
            emit(null, [doc._rev].concat(doc._conflicts));
        }
    }";
    // http://stackoverflow.com/questions/646628/javascript-startswith
    $obj->views->score->map = 'if(!String.prototype.startsWith){
        String.prototype.startsWith = function (str) {
            return !this.indexOf(str);
        }
    }

    function(doc) {
        count = 0;
        if (doc["status"] != "suspended") {
            for(var propName in doc) {
                if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                    count++;
                }
            }
            portfolio = doc.parentOrg;
            if (doc.orgType == "FMA-DepartmentOfState") {
                portfolio = doc._id;
            }
            if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
                portfolio = doc.orgType;
            }
            emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
        }
    }';
    $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
        String.prototype.startsWith = function (str) {
            return !this.indexOf(str);
        }
    }
    if(!String.prototype.endsWith){
        String.prototype.endsWith = function(suffix) {
            return this.indexOf(suffix, this.length - suffix.length) !== -1;
        };
    }
    function(doc) {
        if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
            for(var propName in doc) {
                if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                    emit(propName, 1);
                }
            }
            emit("total", 1);
        }
    }';
    $obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
        return sum(values);
    }';
    $obj->views->fieldNames->map = '
    function(doc) {
        for(var propName in doc) {
            emit(propName, doc._id);
        }
    }';
    $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
        return values.length;
    }';
    // allow safe updates (even if slightly slower due to extra: rev-detection check).
    return $db->save($obj, true);
}

if (php_uname('n') == "vanille") {
    $serverAddr = 'http://192.168.178.21:5984/';
} else if (php_uname('n') == "KYUUBEY") {
    $serverAddr = 'http://192.168.1.148:5984/';
} else {
    $serverAddr = 'http://127.0.0.1:5984/';
}
$server = new SetteeServer($serverAddr);

function setteErrorHandler($e) {
    Amon::log($e->getMessage() . " " . print_r($_SERVER, true), array('error'));
    echo $e->getMessage() . "<br>" . PHP_EOL;
}
   
directory:b/js/bubbletree (new)
 
directory:b/js/flotr2 (new)
 
directory:b/js/sigma (new)
 
directory:b/lib/amon-php (new)
 
file:a/rss.xml.php (deleted)
file:a/scrape.py (deleted)
 
file:a/search.php (deleted)