[submodule "couchdb/couchdb-lucene"] | [submodule "couchdb/couchdb-lucene"] |
path = couchdb/couchdb-lucene | path = couchdb/couchdb-lucene |
url = https://github.com/rnewson/couchdb-lucene.git | url = https://github.com/rnewson/couchdb-lucene.git |
[submodule "couchdb/settee"] | [submodule "couchdb/settee"] |
path = couchdb/settee | path = couchdb/settee |
url = https://github.com/inadarei/settee.git | url = https://github.com/inadarei/settee.git |
[submodule "lib/php-diff"] | [submodule "lib/php-diff"] |
path = lib/php-diff | path = lib/php-diff |
url = https://github.com/chrisboulton/php-diff.git | url = https://github.com/chrisboulton/php-diff.git |
[submodule "lib/Requests"] | [submodule "lib/Requests"] |
path = lib/Requests | path = lib/Requests |
url = https://github.com/rmccue/Requests.git | url = https://github.com/rmccue/Requests.git |
[submodule "js/flotr2"] | [submodule "js/flotr2"] |
path = js/flotr2 | path = js/flotr2 |
url = https://github.com/HumbleSoftware/Flotr2.git | url = https://github.com/HumbleSoftware/Flotr2.git |
[submodule "lib/phpquery"] | [submodule "lib/phpquery"] |
path = lib/phpquery | path = lib/phpquery |
url = https://github.com/TobiaszCudnik/phpquery.git | url = https://github.com/TobiaszCudnik/phpquery.git |
[submodule "js/sigma"] | [submodule "js/sigma"] |
path = js/sigma | path = js/sigma |
url = https://github.com/jacomyal/sigma.js.git | url = https://github.com/jacomyal/sigma.js.git |
[submodule "js/bubbletree"] | [submodule "js/bubbletree"] |
path = js/bubbletree | path = js/bubbletree |
url = https://github.com/okfn/bubbletree.git | url = https://github.com/okfn/bubbletree.git |
[submodule "lib/querypath"] | [submodule "lib/querypath"] |
path = lib/querypath | path = lib/querypath |
url = https://github.com/technosophos/querypath.git | url = https://github.com/technosophos/querypath.git |
[submodule "lib/amon-php"] | |
path = lib/amon-php | |
url = https://github.com/martinrusev/amon-php.git | |
<?php

require_once '../include/common.inc.php';

// Build a lookup of agency name => CouchDB document id from the "byName" view,
// plus an empty $accounts map that extractCSVAccounts() fills in below.
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$accounts = Array();
foreach ($rows as $nameRow) {
    // Trim the view key: agency names sometimes carry stray whitespace.
    $nametoid[trim($nameRow->key)] = $nameRow->value;
}
/**
 * Fetch a CSV of agencies from $url and record each agency's Right To Know
 * URL into the global $accounts map, keyed by the CouchDB id from $nametoid.
 *
 * @param string $url          CSV endpoint to download.
 * @param string $nameField    Header of the column holding the agency name.
 * @param string $accountField Header of the column holding the RTK URL slug.
 * @param mixed  $filter       Unused; now defaults to null — the parameter was
 *                             required but the only call site passed 3 args,
 *                             a fatal ArgumentCountError on modern PHP.
 */
function extractCSVAccounts($url, $nameField, $accountField, $filter = null) {
    global $accounts, $nametoid;
    $request = Requests::get($url);
    echo $url;
    $Data = str_getcsv($request->body, "\n"); //parse the rows
    $headers = Array();
    // Column indices, resolved once from the header row instead of
    // re-running array_search() up to three times per data row.
    $nameIdx = false;
    $accountIdx = false;
    foreach ($Data as $num => $line) {
        $Row = str_getcsv($line, ",");
        if ($num == 0) {
            $headers = $Row;
            $nameIdx = array_search($nameField, $headers);
            $accountIdx = array_search($accountField, $headers);
            print_r($headers);
        } else {
            if ($nameIdx !== false && isset($Row[$nameIdx])) {
                $agencyName = $Row[$nameIdx];
                // array_key_exists replaces in_array(..., array_keys(...)):
                // same check without materialising the key list every row.
                if (!array_key_exists(trim($agencyName), $nametoid)) {
                    echo "$agencyName missing" . PHP_EOL;
                } else {
                    echo $Row[$nameIdx] . PHP_EOL;
                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/' . $Row[$accountIdx];
                }
            } else {
                echo "error finding any agency" . $line . PHP_EOL;
            }
        }
    }
}
// Pass an explicit value for the $filter parameter: extractCSVAccounts() is
// declared with four parameters, so calling it with three is a fatal
// ArgumentCountError on PHP >= 7.1.
extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv", "Agency", "URL name", null);
print_r($accounts);
/* foreach ($accounts as $id => $accountTypes) {
echo $id . "<br>" . PHP_EOL;
$doc = object_to_array($db->get($id));
// print_r($doc);
foreach ($accountTypes as $accountType => $accounts) {
if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
$doc["has" . $accountType] = Array();
}
$doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
}
$db->save($doc);
}*/
?>
<?php
include_once("../include/common.inc.php");
setlocale(LC_CTYPE, 'C');

// Column order expected by the CSV export written below.
$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
$db = $server->get_db('disclosr-agencies');

// Map document id => portfolio tag from the byDeptStateName view.
$tag = Array();
try {
    $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
    foreach ($rows as $deptRow) {
        $tag[$deptRow->id] = phrase_to_tag(dept_to_portfolio($deptRow->key));
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}
// Map agency name => FOI email address from the foiEmails view.
$foiEmail = Array();
try {
    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
    foreach ($rows as $emailRow) {
        $foiEmail[$emailRow->key] = $emailRow->value;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}
// Stream the agency export as CSV straight to the HTTP client.
$fp = fopen('php://output', 'w');
if ($fp && $db) {
    header('Content-Type: text/csv; charset=utf-8');
    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
    header('Pragma: no-cache');
    header('Expires: 0');
    fputcsv($fp, $headers);
    try {
        $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
        foreach ($agencies as $agency) {
            // Export only agencies with a real FOI email and no status flag
            // (status marks suspended/defunct entries elsewhere in the data).
            if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
                $row = Array();
                $row["#id"] = $agency->id;
                $row["name"] = trim($agency->value->name);
                $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
                $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
                $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");

                // Other bodies and office holders this department answers
                // FOI requests on behalf of.
                $otherBodies = Array();
                if (isset($agency->value->foiBodies)) {
                    $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
                }
                if (isset($agency->value->positions)) {
                    // Positions are office holders, so label them as offices.
                    $positions = Array();
                    foreach ($agency->value->positions as $position) {
                        $positions[] = "Office of the " . $position;
                    }
                    $otherBodies = array_merge($otherBodies, $positions);
                }
                sort($otherBodies);
                if (count($otherBodies) > 0) {
                    // ", " separator: a bare "," ran the names together.
                    $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
                }

                $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
                $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");

                // Departments of State carry their own tag; every other body
                // inherits its parent department's tag. Guard each lookup so
                // a missing mapping emits "" instead of a PHP notice.
                if (isset($agency->value->orgType) && $agency->value->orgType == "FMA-DepartmentOfState") {
                    $row["tag_string"] = (isset($tag[$agency->value->_id]) ? $tag[$agency->value->_id] : "");
                } else {
                    $row["tag_string"] = (isset($agency->value->parentOrg) && isset($tag[$agency->value->parentOrg]) ? $tag[$agency->value->parentOrg] : "");
                }
                $row["tag_string"] .= " " . (isset($agency->value->orgType) ? $agency->value->orgType : "");
                $row["tag_string"] .= " federal";
                fputcsv($fp, array_values($row));
            }
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
    die;
}
?>
google-site-verification: google676a414ad086cefb.html | |
Australian Disclosure Logs
Are you looking for more information about:
contracts | |
gov orgs | |
lobbyists | |
1/1/11 title (Dept dfggdfgdf) | |
description: | |
source link: | |
documents: | |
#1 title link | |
rss feed here |
<?php
// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class

// common.inc.php defines $server; without it, $db below was undefined —
// every other script in this codebase obtains $db the same way.
include_once("../include/common.inc.php");
include("lib/FeedWriter.php");

// Creating an instance of the FeedWriter class and setting the channel
// elements via the wrapper functions for common channel elements.
$TestFeed = new FeedWriter(RSS2);
$TestFeed->setTitle('Last Modified - All');
$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');

// Retrieving information from the database.
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byLastModified")->rows;
//print_r($rows);
foreach ($rows as $row) {
    // NOTE(review): the other scripts here read view rows as objects
    // ($row->key / $row->value); confirm array-style access is correct
    // for this view before relying on it.
    $newItem = $TestFeed->createNewItem();
    $newItem->setTitle($row['name']);
    $newItem->setLink($row['id']);
    $newItem->setDate(date("c", $row['metadata']['lastModified']));
    $newItem->setDescription($row['name']);
    $TestFeed->addItem($newItem);
}
// Generate the feed ("genarateFeed" is the library's own method spelling).
$TestFeed->genarateFeed();
?>
#http://packages.python.org/CouchDB/client.html | |
import couchdb | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
import re | |
import hashlib | |
from urlparse import urljoin | |
import time | |
import os | |
import mimetypes | |
import re | |
import urllib | |
import urlparse | |
def mkhash(input):
    """Return the hex MD5 digest of a URL string (used as a CouchDB doc id).

    The original hashed the raw input and encoded the *digest*
    (``md5(input).hexdigest().encode("utf-8")``) — a no-op for ASCII byte
    strings, but a TypeError for unicode input. Encode the input instead
    and return the plain hex string; output is unchanged for ASCII URLs.
    """
    return hashlib.md5(input.encode("utf-8")).hexdigest()
def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.

    A missing scheme defaults to http://, the netloc is IDNA-encoded, path/
    query/fragment are re-%-quoted, and the result is capped at 4096 chars.

    NOTE(review): Python 2 only — relies on the py2 `urlparse`/`urllib`
    module APIs and on byte-str `.encode('idna')`; confirm before porting.

    >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url

    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return ''  # bad UTF-8 chars in URL

    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed

    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)

    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    # IDNA-encode the host so non-ASCII domains become xn-- punycode
    netloc = netloc.encode('idna')

    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    # unquote-then-quote normalises any existing %-escapes
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]
def fullurl(url, href):
    """Resolve href against the base url, %-escaping spaces and dropping any fragment."""
    cleaned = re.sub('#.*$', '', href.replace(" ", "%20"))
    return urljoin(url, cleaned)
# See http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    """urllib2 handler that turns an HTTP 304 into a normal response object."""

    def http_error_304(self, req, fp, code, message, headers):
        # Wrap the open file and headers in a response-like object and tag
        # it with the 304 status so callers can detect "not modified".
        response = urllib2.addinfourl(fp, headers, req.get_full_url())
        response.code = code
        return response
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True): | |
url = canonurl(url) | |
hash = mkhash(url) | |
req = urllib2.Request(url) | |
print "Fetching %s" % url | |
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": | |
print "Not a valid HTTP url" | |
return (None,None,None) | |
doc = docsdb.get(hash) | |
if doc == None: | |
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} | |
else: | |
if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 999999): | |
print "Uh oh, trying to scrape URL again too soon!" | |
last_attachment_fname = doc["_attachments"].keys()[-1] | |
last_attachment = docsdb.get_attachment(doc,last_attachment_fname) | |
return (doc['url'],doc['mime_type'],last_attachment.read()) | |
if scrape_again == False: | |
print "Not scraping this URL again as requested" | |
return (None,None,None) | |