FOI stats importer fixed


Former-commit-id: 81a6a149848e27565b7a7052d2a7ff4e5aaa9310

<?php
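// Import the quarterly FOI request/review statistics CSV from data.gov.au
// into the "statistics" field of each matching agency document in the
// disclosr-agencies CouchDB database.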
   
require_once '../include/common.inc.php';
   
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$stats = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}
$row = 0;
$headers = Array();
// source: http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/
if (($handle = fopen("FOI-quarterly-requests-and-reviews-2011-12.csv", "r")) !== FALSE) {
    while (($data = fgetcsv($handle, 10000, ",")) !== FALSE) {
        if ($row >= 1) {
            // print_r($data);
            $name = trim($data[2]);
            // echo "$name <br>";
            if ($data[0] != "TOTALS" && $data[0] != "") {
                if (isset($nametoid[$name])) {
                    $id = $nametoid[$name];
                    $timePeriod = $data[0] . "-Q" . $data[1];
   
echo "$timePeriod <br>"; // echo "$timePeriod <br>";
unset($data[0]); unset($data[0]);
unset($data[1]); unset($data[1]);
unset($data[2]); unset($data[2]);
unset($data[3]); unset($data[3]);
unset($data[4]); unset($data[4]);
unset($data[5]); unset($data[5]);
unset($data[6]); unset($data[6]);
unset($data[7]); unset($data[7]);
unset($data[8]); unset($data[8]);
   
//echo $id . "<br>" . PHP_EOL; //echo $id . "<br>" . PHP_EOL;
$result = Array("source" => "http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/"); $result = Array("source" => "http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/");
foreach ($data as $key => $datum) { foreach ($data as $key => $datum) {
if ($datum != 0) { if ($datum != 0) {
                            // TODO: prefix header with "FOI"
                            if (isset($stats[$id][$timePeriod][$key])) $datum += $stats[$id][$timePeriod][$key];
                            $result[trim($headers[$key])] = $datum;
                        }
                    }
                    $stats[$id][$timePeriod] = $result;
                    // TODO: merge if already exists
                    //print_r($stats);
                } else {
                    echo "<br>ERROR: NAME MISSING FROM ID LIST<br><br> $row" . PHP_EOL;
                    print_r($data);
                    die();
                }
            }
        } else {
            $headers = $data;
            //print_r($headers);
        }
        $row++;
    }
    fclose($handle);
}
  echo "all stats loaded successfuly";
foreach ($stats as $id => $stat) {
    echo $id . "<br>" . PHP_EOL;
    $doc = $db->get($id);
    echo $doc->name . "<br>" . PHP_EOL;
    // print_r($stat);
    // print_r($doc);
    $changed = false;
    if (!isset($doc->statistics)) {
        $changed = true;
        $doc->statistics = Array();
    } else {
        $doc->statistics = object_to_array($doc->statistics);
    }
    foreach ($stat as $timePeriod => $value) {
        if (!isset($doc->statistics["foiRequests"][$timePeriod])
            || $doc->statistics["foiRequests"][$timePeriod] != $value
        ) {
            $changed = true;
            $doc->statistics["foiRequests"][$timePeriod] = $value;
        }
    }
    if ($changed) {
        $db->save($doc);
    } else {
        echo "not changed" . "<br>" . PHP_EOL;
    }
    //print_r($doc);die();
}
?>
   
<?php
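// (Re)create the CouchDB design documents (views) for the disclosr databases.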
   
require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() {
   
$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { if (doc.title != \"Disclosure Log Updated\") { emit(doc.date, doc); } };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = 'function(doc) { for(var propName in doc) { emit(propName, doc._id); }}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { return values.length; }';
// allow safe updates (even if slightly slower due to the extra rev-detection check).
$foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');

$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";

$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->datasetGroups->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n doc.metadata[\"data.gov.au Category\"] && doc.metadata[\"data.gov.au Category\"].forEach(function(tag) {\n emit(tag, doc.url); \n });\n}\n}";
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
    if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc);
    }
};";
$obj->views->byDeptStateName->map = "function(doc) {
    if (doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc._id);
    }
};";
$obj->views->parentOrgs->map = "function(doc) {
    if (doc.parentOrg) {
        emit(doc._id, doc.parentOrg);
    }
};";
$obj->views->byName->map = 'function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        emit(doc.name, doc._id);
        if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
            emit(doc.shortName, doc._id);
        }
        for (name in doc.otherNames) {
            if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                emit(doc.otherNames[name], doc._id);
            }
        }
        for (name in doc.foiBodies) {
            if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
                emit(doc.foiBodies[name], doc._id);
            }
        }
        for (name in doc.positions) {
            if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
                emit(doc.positions[name], doc._id);
            }
        }
    }
};';

$obj->views->foiEmails->map = "function(doc) {
    emit(doc._id, doc.foiEmail);
};";

$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) {
    // Date.parse returns a timestamp in milliseconds (or NaN), not a Date object
    var lastScrape = Date.parse(doc.metadata.lastScraped);
    var today = new Date();
    if (!lastScrape || lastScrape + 1000 != today.getTime()) {
        emit(doc._id, doc);
    }
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
    if (doc._conflicts) {
        emit(null, [doc._rev].concat(doc._conflicts));
    }
}";
$obj->views->getStatistics->map = "
function (doc) {
    if (doc.statistics) {
        for (var statisticSet in doc.statistics) {
            for (var statisticPeriod in doc.statistics[statisticSet]) {
                if (doc.statistics[statisticSet][statisticPeriod]['value']) {
                    emit([statisticSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
                } else {
                    for (var statisticSubSet in doc.statistics[statisticSet][statisticPeriod]) {
                        if (statisticSubSet != 'source' && statisticSubSet != 'value') {
                            emit([statisticSubSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod][statisticSubSet]);
                        }
                    }
                }
            }
        }
    }
}";
$obj->views->getStatistics->reduce = '_sum';
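// Illustrative example (assumed CouchDB host/port, matching the curl call in
// the scraper shell script): once saved, the view can be queried grouped by
// statistic set and period, e.g.
//   curl "http://localhost:5984/disclosr-agencies/_design/app/_view/getStatistics?group=true"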
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}

function(doc) {
    count = 0;
    if (doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                count++;
            }
        }
        portfolio = doc.parentOrg;
        if (doc.orgType == "FMA-DepartmentOfState") {
            portfolio = doc._id;
        }
        if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
            portfolio = doc.orgType;
        }
        emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
    }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}
function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}';
$obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = '
function(doc) {
    for(var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to the extra rev-detection check).
$db->save($obj, true);
?>
   
<?php
include('template.inc.php');
include_header_documents("Charts");
include_once('../include/common.inc.php');
$agenciesdb = $server->get_db('disclosr-agencies');

$idtoname = Array();
$idtofoirequestssuccessful = Array();
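// Tally, per agency, the FOI requests granted in full or in part, using the
// statistics imported from data.gov.au.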
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
    $foirequestssuccessful = 0;
    if (isset($row->value->statistics->foiRequests)) {
        foreach ($row->value->statistics->foiRequests as $statperiod) {
            $statperiod = object_to_array($statperiod);
            if (isset($statperiod["Requests for other information granted in full"])) $foirequestssuccessful += $statperiod["Requests for other information granted in full"];
            if (isset($statperiod["Requests for other information granted in part"])) $foirequestssuccessful += $statperiod["Requests for other information granted in part"];
        }
    }
    $idtofoirequestssuccessful[$row->id] = $foirequestssuccessful;
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
   
?>
<div class="foundation-header">
    <h1><a href="about.php">Charts</a></h1>
    <h4 class="subheader"></h4>
</div>
<div id="bydate" style="width:1000px;height:300px;"></div>
<div id="byagency" style="width:1000px;height:1400px;"></div>
<script id="source"> <script id="source">
window.onload = function () { window.onload = function () {
$(document).ready(function () { $(document).ready(function () {
var var
d1 = [], d1 = [],
options1, options1,
o1; o1;
   
<?php
try {
    $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true", null, false, false, true)->rows;

    $dataValues = Array();
    foreach ($rows as $row) {
        $dataValues[$row->key] = $row->value;
    }
    $i = 0;
    ksort($dataValues);
    foreach ($dataValues as $key => $value) {
        $date = date_create_from_format('Y-m-d', $key);
        if (date_format($date, 'U') != "") {
            echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
            // echo " emplabels.push('$key');" . PHP_EOL;
            $i++;
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
?>
   
   
            options1 = {
                xaxis: {
                    mode: 'time',
                    labelsAngle: 45
                },
                selection: {
                    mode: 'x'
                },
                HtmlText: false,
                title: 'Disclosure Log entries added by Date'
            };
   
            // Draw graph with default options, overwriting with passed options
            function drawGraph(opts) {

                // Clone the options, so the 'options' variable always stays intact.
                o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});

                // Return a new graph.
                return Flotr.draw(
                    document.getElementById("bydate"),
                    [ d1 ],
                    o1
                );
            }

            graph = drawGraph();

            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
                // Draw selected area
                graph = drawGraph({
                    xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
                    yaxis: { min: area.y1, max: area.y2 }
                });
            });

            // When graph is clicked, draw the graph with default area.
            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
                graph = drawGraph();
            });

        });
    };
   
    var d2 = [];
    var d3 = [];
    var agencylabels = [];
    function agencytrackformatter(obj) {
        return agencylabels[Math.floor(obj.y)] + " = " + obj.x;
    }
    function agencytickformatter(val, axis) {
        if (agencylabels[Math.floor(val)]) {
            return (agencylabels[Math.floor(val)]);
        } else {
            return "";
        }
    }
<?php
try {
    $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;
    function cmp($a, $b)
    {
        return $a->value > $b->value;
    }
    usort($rows, "cmp");

    $dataValues = Array();
    $i = 0;
    foreach ($rows as $row) {
        echo " d2.push([ $row->value,$i]);" . PHP_EOL;
  echo " d3.push([ ".$idtofoirequestssuccessful[$row->key].",$i]);" . PHP_EOL;
echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL; echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
   
$i++; $i++;
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
?>
    // Draw the graph
    Flotr.draw(
        document.getElementById("byagency"),
        [d2],
        {
            title: "Disclosure Log entries by Agency",
            bars: {
                show: true,
                horizontal: true,
                shadowSize: 0,
                barWidth: 0.5
            },
            mouse: {
                track: true,
                relative: true,
                trackFormatter: agencytrackformatter
            },
            yaxis: {
                minorTickFreq: 1,
                noTicks: agencylabels.length,
                showMinorLabels: true,
                tickFormatter: agencytickformatter
            },
            xaxis: {
                min: 0,
                autoscaleMargin: 1
            },
            legend: {
                show: true
            }
        }
    );
</script>
   
<?php
include_footer_documents();
?>
   
   
import sys
import os
   
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs
import zipfile
from lxml import etree  # assumed XML parser for the DOCX scraper below
from docx import getdocumenttext  # assumed legacy python-docx helper used by the DOCX scraper
   
import difflib
   
from StringIO import StringIO
   
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
   
   
class GenericDisclogScraper(object):
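    """Abstract base class for disclosure log scrapers: resolves the agency ID
    and disclosure log URL, and defines the doScrape() contract."""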
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None
   
    def remove_control_chars(self, input):
        return "".join([i for i in input if ord(i) in range(32, 127)])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL"""
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return
   
   
class GenericHTMLDisclogScraper(GenericDisclogScraper):
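    """Fallback scraper for HTML disclosure logs that are not structured as
    tables: records a diff against the previously fetched copy of the page."""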
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        content = rcontent
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
            diff = ""  # default if there is no previous copy to compare against
            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
            if last_attach != None:
                html_diff = difflib.HtmlDiff()
                diff = html_diff.make_table(last_attach.read().split('\n'),
                    content.split('\n'))
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated",
                "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericPDFDisclogScraper(GenericDisclogScraper):
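    """Scraper for PDF disclosure logs: extracts the text with pdfminer and
    saves it as a single document keyed by the hash of that text."""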
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        fp = StringIO()
        fp.write(content)

        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericDOCXDisclogScraper(GenericDisclogScraper):
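    """Scraper for Word (.docx) disclosure logs: extracts the paragraph text
    from word/document.xml and saves it keyed by the hash of that text."""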
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
            , self.getURL(), "foidocuments", self.getAgencyID())
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        ## Fetch all the text out of the document we just created
        paratextlist = getdocumenttext(document)
        # Make explicit unicode version
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        ## Join the document's text with two newlines under each paragraph
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)

        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericRSSDisclogScraper(GenericDisclogScraper):
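    """Scraper for RSS disclosure logs: saves one document per feed entry."""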
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                    'url': entry.link, 'docID': entry.id,
                    "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        doc.update({'description': content.summary})

        return
   
   
class GenericOAICDisclogScraper(GenericDisclogScraper):
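    """Scraper for table-based (OAIC-style) disclosure logs: subclasses map
    table columns to (id, date, title, description, notes) via getColumns()."""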
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5
   
    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def findColumns(self, row):
        return row.find_all('td')
   
    def getDocHash(self, id, date, url):
        if id.string is None:
            print "no id, using date as hash"
            return scrape.mkhash(
                self.remove_control_chars(
                    url + (''.join(date.stripped_strings))))
        else:
            return scrape.mkhash(
                self.remove_control_chars(
                    url + (''.join(id.stripped_strings))))
   
    def getDate(self, content, entry, doc):
        strdate = ''.join(content.stripped_strings).strip()
        (a, b, c) = strdate.partition("(")
        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("Janrurary", "January").replace("1012", "2012"))
        print strdate
        try:
            edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        except ValueError:
            print >> sys.stderr, "ERROR date invalid %s " % strdate
            print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
            edate = date.today().strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return
   
    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links != []:
            doc.update({'links': links})
        return
   
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = self.findColumns(row)
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                            description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        dochash = self.getDocHash(id, date, url)
                        doc = foidocsdb.get(dochash)

                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                'agencyID': self.getAgencyID(),
                                'url': self.getURL(),
                                'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                'FOI request(in summary form)',
                                'Summary of FOI request received by the ASC',
                                'Summary of FOI request received by agency/minister',
                                'Description of Documents Requested', 'FOI request',
                                'Description of FOI Request', 'Summary of request', 'Description', 'Summary',
                                'Summary of FOIrequest received by agency/minister',
                                'Summary of FOI request received', 'Description of FOI Request',
                                "FOI request", 'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash

                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"

                    else:
                        print >> sys.stderr, "ERROR number of columns incorrect"
                        print row
   
#!/bin/bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
echo $DIR
cd $DIR
echo "" > /tmp/disclosr-error
for f in $DIR/scrapers/*.py; do
    echo "Processing $f file..";
    md5=`md5sum /tmp/disclosr-error`
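    # 3>&1 1>&2 2>&3 swaps stdout and stderr, so only the scraper's stderr is piped to tee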
    python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error;
    status=${PIPESTATUS[0]}
    md52=`md5sum /tmp/disclosr-error`
    if [ "$md5" != "$md52" ]; then
        echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error;
    fi
    if [ "$status" -ne "0" ]; then
        echo "error";
        sleep 1;
    fi
done
  curl "localhost:5984/disclosr-foidocuments/_design/app/_view/byDate?startkey=\"9999-99-99\"&endkey=\"0000-00-00\"&descending=true&limit=20"
if [ -s /tmp/disclosr-error ] ; then
    echo "emailing logs..";
    mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ;
fi
   
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
   
    def __init__(self):
        super(ScraperImplementation, self).__init__()

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="content_div_148050")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 4

    def getColumns(self, columns):
        (id, date, datepub, title) = columns
        return (id, date, title, title, None)
   
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
   
    nsi = ScraperImplementation()
    nsi.doScrape()
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
                        soup = BeautifulSoup(htcontent)
                        rowtitle = soup.find(class_="wc-title").find("h1").string
                        if rowtitle != None:
                            description = rowtitle + ": "
                        for row in soup.find(class_="wc-content").find_all('td'):
                            if row != None:
                                for text in row.stripped_strings:
                                    description = description + text + "\n"
                                for atag in row.find_all("a"):
                                    if atag.has_attr('href'):
                                        links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})
    def getRows(self, table):
        return table.find_all(class_="dl-row")
    def findColumns(self, table):
        return table.find_all('div')
    def getColumnCount(self):
        return 2
    def getTable(self, soup):
        return soup.find(class_="foi-dl-list")
    def getColumns(self, columns):
        (title, date) = columns
        return (title, date, title, title, None)
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getTable(self, soup):
        return soup
    def getColumnCount(self):
        return 5
    def getColumns(self, columns):
        (id, date, title, description, notes) = columns
        return (id, date, title, description, notes)
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()
    def getTable(self, soup):
        return soup.find(id='zone-content')
   
    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="foidetails")
                        if row == None:
                            row = soup.find(id="content").table
                        if row == None:
                            row = soup.find(id="content")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 3

    def getColumns(self, columns):
        (id, title, date) = columns
        return (id, date, title, title, None)
   
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()