beginning deewr scraper
beginning deewr scraper


Former-commit-id: c17beeeab98ca8e389303cfcc75566c09aaf49cc

[submodule "couchdb/couchdb-lucene"] [submodule "couchdb/couchdb-lucene"]
path = couchdb/couchdb-lucene path = couchdb/couchdb-lucene
url = https://github.com/rnewson/couchdb-lucene.git url = https://github.com/rnewson/couchdb-lucene.git
[submodule "couchdb/settee"] [submodule "couchdb/settee"]
path = couchdb/settee path = couchdb/settee
url = https://github.com/inadarei/settee.git url = https://github.com/inadarei/settee.git
[submodule "lib/php-diff"] [submodule "lib/php-diff"]
path = lib/php-diff path = lib/php-diff
url = https://github.com/chrisboulton/php-diff.git url = https://github.com/chrisboulton/php-diff.git
[submodule "lib/Requests"] [submodule "lib/Requests"]
path = lib/Requests path = lib/Requests
url = https://github.com/rmccue/Requests.git url = https://github.com/rmccue/Requests.git
[submodule "js/flotr2"] [submodule "js/flotr2"]
path = js/flotr2 path = js/flotr2
url = https://github.com/HumbleSoftware/Flotr2.git url = https://github.com/HumbleSoftware/Flotr2.git
[submodule "lib/phpquery"] [submodule "lib/phpquery"]
path = lib/phpquery path = lib/phpquery
url = https://github.com/TobiaszCudnik/phpquery.git url = https://github.com/TobiaszCudnik/phpquery.git
[submodule "js/sigma"] [submodule "js/sigma"]
path = js/sigma path = js/sigma
url = https://github.com/jacomyal/sigma.js.git url = https://github.com/jacomyal/sigma.js.git
[submodule "js/bubbletree"] [submodule "js/bubbletree"]
path = js/bubbletree path = js/bubbletree
url = https://github.com/okfn/bubbletree.git url = https://github.com/okfn/bubbletree.git
[submodule "lib/querypath"] [submodule "lib/querypath"]
path = lib/querypath path = lib/querypath
url = https://github.com/technosophos/querypath.git url = https://github.com/technosophos/querypath.git
[submodule "sigma.js"] [submodule "lib/amon-php"]
path = sigma.js path = lib/amon-php
url = https://github.com/jacomyal/sigma.js.git url = https://github.com/martinrusev/amon-php.git
  [submodule "documents/lib/parsedatetime"]
  path = documents/lib/parsedatetime
  url = git://github.com/bear/parsedatetime.git
   
  <?php
 
  require_once '../include/common.inc.php';
 
  $db = $server->get_db('disclosr-agencies');
  $rows = $db->get_view("app", "byName")->rows;
  $nametoid = Array();
  $accounts = Array();
  foreach ($rows as $row) {
  $nametoid[trim($row->key)] = $row->value;
  }
 
  function extractCSVAccounts($url, $nameField, $accountField, $filter) {
  global $accounts, $nametoid;
  $request = Requests::get($url);
  echo $url;
  $Data = str_getcsv($request->body, "\n"); //parse the rows
  $headers = Array();
  foreach ($Data as $num => $line) {
  $Row = str_getcsv($line, ",");
  if ($num == 0) {
  $headers = $Row;
  print_r($headers);
  } else {
  if (isset($Row[array_search($nameField, $headers)])) {
  $agencyName = $Row[array_search($nameField, $headers)];
  if (!in_array(trim($agencyName), array_keys($nametoid))) {
  echo "$agencyName missing" . PHP_EOL;
  } else {
  echo $Row[array_search($nameField, $headers)] . PHP_EOL;
  $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/'.$Row[array_search($accountField, $headers)];
  }
  } else {
  echo "error finding any agency" . $line . PHP_EOL;
  }
  }
  }
  }
 
  extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name");
  print_r($accounts);
  /* foreach ($accounts as $id => $accountTypes) {
  echo $id . "<br>" . PHP_EOL;
  $doc = object_to_array($db->get($id));
  // print_r($doc);
 
  foreach ($accountTypes as $accountType => $accounts) {
  if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
  $doc["has" . $accountType] = Array();
  }
  $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
  }
  $db->save($doc);
  }*/
  ?>
 
<?php <?php
   
include_once("../include/common.inc.php"); include_once("../include/common.inc.php");
   
setlocale(LC_CTYPE, 'C'); setlocale(LC_CTYPE, 'C');
   
$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string"); $headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
   
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
   
$tag = Array(); $tag = Array();
try { try {
$rows = $db->get_view("app", "byDeptStateName", null, true)->rows; $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
$tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key)); $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
die(); die();
} }
   
$foiEmail = Array(); $foiEmail = Array();
try { try {
$rows = $db->get_view("app", "foiEmails", null, true)->rows; $rows = $db->get_view("app", "foiEmails", null, true)->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
$foiEmail[$row->key] = $row->value; $foiEmail[$row->key] = $row->value;
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
die(); die();
} }
   
$fp = fopen('php://output', 'w'); $fp = fopen('php://output', 'w');
if ($fp && $db) { if ($fp && $db) {
header('Content-Type: text/csv; charset=utf-8'); header('Content-Type: text/csv; charset=utf-8');
header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"'); header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
header('Pragma: no-cache'); header('Pragma: no-cache');
header('Expires: 0'); header('Expires: 0');
fputcsv($fp, $headers); fputcsv($fp, $headers);
try { try {
$agencies = $db->get_view("app", "byCanonicalName", null, true)->rows; $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
//print_r($rows); //print_r($rows);
foreach ($agencies as $agency) { foreach ($agencies as $agency) {
// print_r($agency); // print_r($agency);
   
if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) { if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
$row = Array(); $row = Array();
$row["#id"] = $agency->id; $row["#id"] = $agency->id;
$row["name"] = trim($agency->value->name); $row["name"] = trim($agency->value->name);
$row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : ""); $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
$row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : ""); $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
$row["notes"] = (isset($agency->value->description) ? $agency->value->description : ""); $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");
   
$otherBodies = Array(); $otherBodies = Array();
if (isset($agency->value->foiBodies)) { if (isset($agency->value->foiBodies)) {
$otherBodies = array_merge($otherBodies, $agency->value->foiBodies); $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
} }
if (isset($agency->value->positions)) { if (isset($agency->value->positions)) {
$otherBodies = array_merge($otherBodies, $agency->value->positions); $positions = Array();
  foreach ($agency->value->positions as $position) {
  $positions[] = "Office of the ".$position;
  }
  $otherBodies = array_merge($otherBodies, $positions);
} }
  sort($otherBodies);
if (count($otherBodies) > 0) { if (count($otherBodies) > 0) {
$row["notes"] .= "<br/> This department also responds to requests for information held by ".implode(",",$otherBodies); $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
} }
   
$row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : ""); $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
$row["home_page"] = (isset($agency->value->website) ? $agency->value->website : ""); $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
if ($agency->value->orgType == "FMA-DepartmentOfState") { if ($agency->value->orgType == "FMA-DepartmentOfState") {
$row["tag_string"] = $tag[$agency->value->_id]; $row["tag_string"] = $tag[$agency->value->_id];
} else { } else {
$row["tag_string"] = $tag[$agency->value->parentOrg]; $row["tag_string"] = $tag[$agency->value->parentOrg];
} }
$row["tag_string"] .= " " . $agency->value->orgType; $row["tag_string"] .= " " . $agency->value->orgType;
$row["tag_string"] .= " federal"; $row["tag_string"] .= " federal";
fputcsv($fp, array_values($row)); fputcsv($fp, array_values($row));
   
   
   
} }
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
   
die; die;
} }
?> ?>
   
  <!doctype html>
  <html lang="en">
  <head>
  <meta charset="utf-8">
  <title>Page Not Found :(</title>
  <style>
  ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
  ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
  html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
  html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
  body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
  h1 { margin: 0 10px; font-size: 50px; text-align: center; }
  h1 span { color: #bbb; }
  h3 { margin: 1.5em 0 0.5em; }
  p { margin: 1em 0; }
  ul { padding: 0 0 0 40px; margin: 1em 0; }
  .container { max-width: 380px; _width: 380px; margin: 0 auto; }
  /* google search */
  #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
  #goog-fixurl form { margin: 0; }
  #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
  #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
  #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
  #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
  #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
  input::-moz-focus-inner { padding: 0; border: 0; }
  </style>
  </head>
  <body>
  <div class="container">
  <h1>Not found <span>:(</span></h1>
  <p>Sorry, but the page you were trying to view does not exist.</p>
  <p>It looks like this was the result of either:</p>
  <ul>
  <li>a mistyped address</li>
  <li>an out-of-date link</li>
  </ul>
  <script>
  var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
  </script>
  <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
  </div>
 
 
 Binary files /dev/null and b/documents/apple-touch-icon-114x114-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon-57x57-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon-72x72-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon-precomposed.png differ
 Binary files /dev/null and b/documents/apple-touch-icon.png differ
  <?xml version="1.0"?>
  <!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd">
  <cross-domain-policy>
 
 
  <!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
 
  <!-- Most restrictive policy: -->
  <site-control permitted-cross-domain-policies="none"/>
 
 
 
  <!-- Least restrictive policy: -->
  <!--
  <site-control permitted-cross-domain-policies="all"/>
  <allow-access-from domain="*" to-ports="*" secure="false"/>
  <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
  -->
  <!--
  If you host a crossdomain.xml file with allow-access-from domain="*"
  and don’t understand all of the points described here, you probably
  have a nasty security vulnerability. ~ simon willison
  -->
 
  </cross-domain-policy>
 
<
  /*!
  * Bootstrap Responsive v2.2.1
  *
  * Copyright 2012 Twitter, Inc
  * Licensed under the Apache License v2.0
  * http://www.apache.org/licenses/LICENSE-2.0