scraper fixes
scraper fixes


Former-commit-id: 7c09d22a257167842febb35ef0a1605548e871c2

<?php

include_once("../include/common.inc.php");
require($basePath . 'lib/phpquery/phpQuery/phpQuery.php');

setlocale(LC_CTYPE, 'C');

$db = $server->get_db('disclosr-agencies');

// Pass 1: strip the scrapeDepth / lastScraped fields from every agency
// document. A document is only written back when one of the fields was
// actually removed, so untouched documents do not get a new revision.
try {
    $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
    foreach ($agencies as $agency) {
        $changed = false;
        if (isset($agency->value->scrapeDepth)) {
            unset($agency->value->scrapeDepth);
            $changed = true;
        }
        if (isset($agency->value->lastScraped)) {
            unset($agency->value->lastScraped);
            $changed = true;
        }
        if ($changed) {
            $db->save($agency->value);
        }
        echo "<hr>";
        flush();
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}

// Pass 2: for every agency that has a website but no stored metaTags yet,
// download the home page and record its <meta name="..." content="..."> pairs.
try {
    // Re-read the view so this pass works on the rows as they are after pass 1.
    $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
    foreach ($agencies as $agency) {
        if (isset($agency->value->metaTags) || !isset($agency->value->website)) {
            continue;
        }
        echo $agency->value->name . " " . $agency->value->website . "<br />\n";

        // A single unreachable site must not abort the whole run. Because
        // metaTags is left unset on failure, the agency is retried next time.
        try {
            $request = Requests::get($agency->value->website);
        } catch (Exception $e) {
            echo "fetch failed: "
                . htmlspecialchars($e->getMessage(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
                . "<br>\n<hr>";
            flush();
            continue;
        }

        $agency->value->metaTags = Array();
        $html = phpQuery::newDocumentHTML($request->body);
        phpQuery::selectDocument($html);
        foreach (pq('meta')->elements as $meta) {
            $tagName = $meta->getAttribute('name');
            $content = $meta->getAttribute('content');
            if ($tagName != "") {
                // Names and contents come from third-party pages: escape them
                // for the HTML progress output, but store the raw values.
                echo htmlspecialchars($tagName, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
                    . " == "
                    . htmlspecialchars($content, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
                    . " <br>\n";
                $agency->value->metaTags[$tagName] = $content;
            }
        }
        $db->save($agency->value);
        echo "<hr>";
        flush();
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
?>
   
<?php

require_once '../include/common.inc.php';

// Handle on the agencies database; $server is expected to come from the include above.
$db = $server->get_db('disclosr-agencies');

// Trimmed agency name => value emitted by the app/byName view.
$nametoid = Array();
// Filled in by extractCSVAccounts(), keyed by the values stored in $nametoid.
$accounts = Array();

$rows = $db->get_view("app", "byName")->rows;
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}
   
/**
 * Download a CSV of authorities and record, for every row whose agency name
 * is known in the global $nametoid map, its righttoknow.org.au body URL and
 * (when a "Notes" column exists) its description in the global $accounts.
 *
 * Results are written to:
 *   $accounts[<id>]["rtkURLs"][<agency name>]         = body URL
 *   $accounts[<id>]["rtkDescriptions"][<agency name>] = notes text
 *
 * Progress and diagnostics are echoed as the file is processed.
 *
 * @param string $url          Location of the CSV file to download.
 * @param string $nameField    Header of the column holding the agency name.
 * @param string $accountField Header of the column holding the URL name.
 * @param mixed  $filter       Accepted for backward compatibility; not used here.
 *                             Optional, because the existing caller omits it.
 * @return void
 */
function extractCSVAccounts($url, $nameField, $accountField, $filter = null) {
    global $accounts, $nametoid;

    $request = Requests::get($url);
    echo $url;

    // Parse through a stream with fgetcsv() so quoted fields that contain
    // commas or line breaks stay intact. Splitting on "\n" with str_getcsv()
    // first consumes the enclosure quotes before the per-row parse sees them.
    $stream = fopen('php://temp', 'r+');
    fwrite($stream, $request->body);
    rewind($stream);

    $headers = fgetcsv($stream, 0, ",", '"', "\\");
    if (!is_array($headers)) {
        fclose($stream);
        echo "error: no header row found" . PHP_EOL;
        return;
    }
    print_r($headers);

    // Resolve the column positions once. array_search() returns false for a
    // missing header, and using false as an index would silently read column 0.
    $nameIndex = array_search($nameField, $headers, true);
    $accountIndex = array_search($accountField, $headers, true);
    $notesIndex = array_search("Notes", $headers, true);
    if ($nameIndex === false || $accountIndex === false) {
        fclose($stream);
        echo "error: required column missing from header" . PHP_EOL;
        return;
    }

    while (($Row = fgetcsv($stream, 0, ",", '"', "\\")) !== false) {
        if (!isset($Row[$nameIndex])) {
            echo "error finding any agency" . implode(",", $Row) . PHP_EOL;
            continue;
        }

        $agencyName = $Row[$nameIndex];
        $lookupName = trim($agencyName);
        if (!array_key_exists($lookupName, $nametoid)) {
            echo "$agencyName missing" . PHP_EOL;
            continue;
        }

        echo $agencyName . PHP_EOL;
        $agencyId = $nametoid[$lookupName];
        if (isset($Row[$accountIndex])) {
            $accounts[$agencyId]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/' . $Row[$accountIndex];
        }
        // The notes column is optional; only record a description when present.
        if ($notesIndex !== false && isset($Row[$notesIndex])) {
            $accounts[$agencyId]["rtkDescriptions"][$agencyName] = $Row[$notesIndex];
        }
    }

    fclose($stream);
}
   
// Collect the righttoknow.org.au URLs/descriptions into $accounts. The unused
// $filter argument is passed explicitly so the call matches the declared arity.
extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv", "Agency", "URL name", null);
//print_r($accounts);

// Merge the collected values into each agency document and write it back.
foreach ($accounts as $id => $allvalues) {
    echo $id . "<br>" . PHP_EOL;
    // $doc is an array from here on, so every field access uses array syntax.
    $doc = object_to_array($db->get($id));
    // print_r($doc);

    foreach ($allvalues as $valueType => $values) {
        if (!isset($doc[$valueType]) || !is_array($doc[$valueType])) {
            $doc[$valueType] = Array();
        }
        $doc[$valueType] = array_unique(array_merge($doc[$valueType], $values));

        if ($valueType == "rtkDescriptions" && isset($doc["name"])) {
            // Use the notes of the CSV row whose agency name matches this
            // document's own name as the document description.
            // NOTE(review): assumes the document keeps its name in "name" — confirm.
            foreach ($values as $descriptionAgency => $descriptionValue) {
                if (trim((string) $descriptionAgency) === trim((string) $doc["name"])) {
                    $doc["description"] = $descriptionValue;
                }
            }
        }
    }
    $db->save($doc);
}
?>
   
  /*!
  * Bootstrap Responsive v2.2.1
  *
  * Copyright 2012 Twitter, Inc
  * Licensed under the Apache License v2.0
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Designed and built with all the love in the world @twitter by @mdo and @fat.
  */
 
  /* Clearfix host element. The *-prefixed zoom declaration is not standard CSS;
     presumably a legacy-IE star hack - confirm before removing. */
  .clearfix {
  *zoom: 1;
  }
 
  /* Empty generated boxes on both sides of the element, rendered as tables
     with zero line-height. */
  .clearfix:before,
  .clearfix:after {
  display: table;
  line-height: 0;
  content: "";
  }
 
  /* The trailing generated box clears floats on both sides. */
  .clearfix:after {
  clear: both;
  }
 
  /* Makes an element's text invisible: zero font size and line height,
     transparent colour and background, no text shadow, no border. */
  .hide-text {
  font: 0/0 a;
  color: transparent;
  text-shadow: none;
  background-color: transparent;
  border: 0;
  }
 
  /* Block-level, full-width control at least 30px tall; border-box sizing
     (vendor-prefixed and unprefixed) keeps padding and border inside the 100% width. */
  .input-block-level {
  display: block;
  width: 100%;
  min-height: 30px;
  -webkit-box-sizing: border-box;
  -moz-box-sizing: border-box;
  box-sizing: border-box;
  }
 
  /* Removes the element from layout and also marks it invisible. */
  .hidden {
  display: none;
  visibility: hidden;
  }
 
  /* Default state: hidden. Shown again inside the max-width: 767px media query. */
  .visible-phone {
  display: none !important;
  }
 
  /* Default state: hidden. Shown again inside the 768px-979px media query. */
  .visible-tablet {
  display: none !important;
  }
 
  /* Default state: hidden. The narrower-viewport media queries below
     switch it back to display: inherit. */
  .hidden-desktop {
  display: none !important;
  }
 
  /* Default state: shown. The narrower-viewport media queries below hide it. */
  .visible-desktop {
  display: inherit !important;
  }
 
  /* Viewports 768px to 979px wide: show .hidden-desktop and .visible-tablet,
     hide .visible-desktop and .hidden-tablet. */
  @media (min-width: 768px) and (max-width: 979px) {
  .hidden-desktop {
  display: inherit !important;
  }
  .visible-desktop {
  display: none !important ;
  }
  .visible-tablet {
  display: inherit !important;
  }
  .hidden-tablet {
  display: none !important;
  }
  }
 
  /* Viewports up to 767px wide: show .hidden-desktop and .visible-phone,
     hide .visible-desktop and .hidden-phone. */
  @media (max-width: 767px) {
  .hidden-desktop {
  display: inherit !important;
  }
  .visible-desktop {
  display: none !important;
  }
  .visible-phone {
  display: inherit !important;
  }
  .hidden-phone {
  display: none !important;
  }
  }
 
  @media (min-width: 1200px) {
  .row {
  margin-left: -30px;
  *zoom: 1;
  }
  .row:before,
  .row:after {
  display: table;
  line-height: 0;
  content: "";
  }
  .row:after {
  clear: both;
  }
  [class*="span"] {
  float: left;
  min-height: 1px;
  margin-left: 30px;
  }
  .container,
  .navbar-static-top .container,
  .navbar-fixed-top .container,
  .navbar-fixed-bottom .container {
  width: 1170px;
  }
  .span12 {
  width: 1170px;
  }
  .span11 {
  width: 1070px;
  }
  .span10 {
  width: 970px;
  }
  .span9 {
  width: 870px;
  }
  .span8 {
  width: 770px;
  }
  .span7 {
  width: 670px;
  }
  .span6 {
  width: 570px;
  }
  .span5 {
  width: 470px;
  }
  .span4 {
  width: 370px;
  }
  .span3 {
  width: 270px;
  }
  .span2 {
  width: 170px;
  }
  .span1 {
  width: 70px;
  }
  .offset12 {
  margin-left: 1230px;
  }
  .offset11 {
  margin-left: 1130px;
  }
  .offset10 {
  margin-left: 1030px;
  }
  .offset9 {
  margin-left: 930px;
  }
  .offset8 {
  margin-left: 830px;
  }
  .offset7 {
  margin-left: 730px;
  }
  .offset6 {
  margin-left: 630px;
  }
  .offset5 {
  margin-left: 530px;
  }
  .offset4 {
  margin-left: 430px;
  }
  .offset3 {
  margin-left: 330px;
  }
  .offset2 {
  margin-left: 230px;
  }
  .offset1 {
  margin-left: 130px;
  }
  .row-fluid {
  width: 100%;
  *zoom: 1;
  }
  .row-fluid:before,
  .row-fluid:after {
  display: table;
  line-height: 0;
  content: "";
  }
  .row-fluid:after {
  clear: both;
  }
  .row-fluid [class*="span"] {
  display: block;
  float: left;
  width: 100%;
  min-height: 30px;
  margin-left: 2.564102564102564%;
  *margin-left: 2.5109110747408616%;
  -webkit-box-sizing: border-box;
  -moz-box-sizing: border-box;
  box-sizing: border-box;
  }
  .row-fluid [class*="span"]:first-child {
  margin-left: 0;
  }
  .row-fluid .controls-row [class*="span"] + [class*="span"] {
  margin-left: 2.564102564102564%;
  }
  .row-fluid .span12 {
  width: 100%;
  *width: 99.94680851063829%;
  }
  .row-fluid .span11 {
  width: 91.45299145299145%;
  *width: 91.39979996362975%;
  }
  .row-fluid .span10 {
  width: 82.90598290598291%;
  *width: 82.8527914166212%;
  }
  .row-fluid .span9 {
  width: 74.35897435897436%;
  *width: 74.30578286961266%;
  }
  .row-fluid .span8 {
  width: 65.81196581196582%;
  *width: 65.75877432260411%;
  }
  .row-fluid .span7 {
  width: 57.26495726495726%;
  *width: 57.21176577559556%;
  }
  .row-fluid .span6 {
  width: 48.717948717948715%;
  *width: 48.664757228587014%;
  }
  .row-fluid .span5 {