scraper monitoring page
scraper monitoring page


Former-commit-id: 9c90d43c41be813659e4e4d1768a29299f552ba4

[submodule "couchdb/couchdb-lucene"]
	path = couchdb/couchdb-lucene
	url = https://github.com/rnewson/couchdb-lucene.git
[submodule "couchdb/settee"]
	path = couchdb/settee
	url = https://github.com/inadarei/settee.git
[submodule "lib/php-diff"]
	path = lib/php-diff
	url = https://github.com/chrisboulton/php-diff.git
[submodule "lib/Requests"]
	path = lib/Requests
	url = https://github.com/rmccue/Requests.git
[submodule "js/flotr2"]
	path = js/flotr2
	url = https://github.com/HumbleSoftware/Flotr2.git
[submodule "lib/phpquery"]
	path = lib/phpquery
	url = https://github.com/TobiaszCudnik/phpquery.git
[submodule "js/sigma"]
	path = js/sigma
	url = https://github.com/jacomyal/sigma.js.git
[submodule "js/bubbletree"]
	path = js/bubbletree
	url = https://github.com/okfn/bubbletree.git
[submodule "lib/querypath"]
	path = lib/querypath
	url = https://github.com/technosophos/querypath.git
[submodule "lib/amon-php"]
	path = lib/amon-php
	url = https://github.com/martinrusev/amon-php.git
   
  <?php
 
  require_once '../include/common.inc.php';
 
  $db = $server->get_db('disclosr-agencies');
  $rows = $db->get_view("app", "byName")->rows;
  $nametoid = Array();
  $accounts = Array();
  foreach ($rows as $row) {
  $nametoid[trim($row->key)] = $row->value;
  }
 
  function extractCSVAccounts($url, $nameField, $accountField, $filter) {
  global $accounts, $nametoid;
  $request = Requests::get($url);
  echo $url;
  $Data = str_getcsv($request->body, "\n"); //parse the rows
  $headers = Array();
  foreach ($Data as $num => $line) {
  $Row = str_getcsv($line, ",");
  if ($num == 0) {
  $headers = $Row;
  print_r($headers);
  } else {
  if (isset($Row[array_search($nameField, $headers)])) {
  $agencyName = $Row[array_search($nameField, $headers)];
  if (!in_array(trim($agencyName), array_keys($nametoid))) {
  echo "$agencyName missing" . PHP_EOL;
  } else {
  echo $Row[array_search($nameField, $headers)] . PHP_EOL;
  $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/'.$Row[array_search($accountField, $headers)];
  }
  } else {
  echo "error finding any agency" . $line . PHP_EOL;
  }
  }
  }
  }
 
  // Import the Right To Know authority list, matching on the "Agency" column
  // and building URLs from the "URL name" column, then dump what was collected.
  // NOTE(review): the $filter parameter of extractCSVAccounts() is not supplied here.
  extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name");
  print_r($accounts);
  /* foreach ($accounts as $id => $accountTypes) {
  echo $id . "<br>" . PHP_EOL;
  $doc = object_to_array($db->get($id));
  // print_r($doc);
 
  foreach ($accountTypes as $accountType => $accounts) {
  if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
  $doc["has" . $accountType] = Array();
  }
  $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
  }
  $db->save($doc);
  }*/
  ?>
 
<?php

/*
 * Streams a CSV attachment with one row per agency that has an FOI email
 * address (other than the string "null") and no "status" field.
 *
 * Columns are those listed in $headers. "notes" is the agency description
 * plus a sentence naming any foiBodies / positions the agency also answers
 * for; "tag_string" is the portfolio tag of the agency (or of its parent
 * organisation), followed by the organisation type and "federal".
 */
include_once("../include/common.inc.php");

setlocale(LC_CTYPE, 'C');

$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");

$db = $server->get_db('disclosr-agencies');

// Portfolio tag per document id, derived from the key of the byDeptStateName view.
$tag = Array();
try {
    $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
    foreach ($rows as $row) {
        $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}

// FOI email view, keyed by view key. Not read again in this script; kept so
// that a failing view still aborts the export as before.
$foiEmail = Array();
try {
    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
    foreach ($rows as $row) {
        $foiEmail[$row->key] = $row->value;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}

$fp = fopen('php://output', 'w');
if ($fp && $db) {
    header('Content-Type: text/csv; charset=utf-8');
    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
    header('Pragma: no-cache');
    header('Expires: 0');
    fputcsv($fp, $headers);
    try {
        $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
        foreach ($agencies as $agency) {
            $hasFoiEmail = isset($agency->value->foiEmail) && $agency->value->foiEmail != "null";
            if (!$hasFoiEmail || isset($agency->value->status)) {
                continue;
            }

            $row = Array();
            $row["#id"] = $agency->id;
            $row["name"] = trim($agency->value->name);
            $row["request_email"] = $agency->value->foiEmail;
            $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
            $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");

            // Other bodies this agency answers FOI requests for; positions are
            // listed as "Office of the <position>".
            $otherBodies = Array();
            if (isset($agency->value->foiBodies)) {
                $otherBodies = array_merge($otherBodies, (array) $agency->value->foiBodies);
            }
            if (isset($agency->value->positions)) {
                $positions = Array();
                foreach ((array) $agency->value->positions as $position) {
                    $positions[] = "Office of the " . $position;
                }
                $otherBodies = array_merge($otherBodies, $positions);
            }
            sort($otherBodies);
            if (count($otherBodies) > 0) {
                $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
            }

            $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
            $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");

            // A department of state carries its own tag; every other body uses
            // its parent's. Missing fields or tags yield an empty tag rather
            // than a PHP notice written into the CSV output.
            $orgType = (isset($agency->value->orgType) ? $agency->value->orgType : "");
            if ($orgType == "FMA-DepartmentOfState") {
                $tagKey = (isset($agency->value->_id) ? $agency->value->_id : null);
            } else {
                $tagKey = (isset($agency->value->parentOrg) ? $agency->value->parentOrg : null);
            }
            $row["tag_string"] = (is_scalar($tagKey) && isset($tag[$tagKey])) ? $tag[$tagKey] : "";
            $row["tag_string"] .= " " . $orgType;
            $row["tag_string"] .= " federal";
            fputcsv($fp, array_values($row));
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }

    die;
}
?>
   
  <?php
 
  include_once('../include/common.inc.php');
  include_header('Webserver and Accessiblity');
 
  echo "<table>
  <tr><th>name</th><th>webserver</th><th>accessiblity errors</th></tr>";
  $agenciesdb = $server->get_db('disclosr-agencies');
  $docsdb = $server->get_db('disclosr-documents');
  try {
  $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
 
 
  if ($rows) {
  foreach ($rows as $row) {
 
  echo "<tr><td>" . $row->value->name . "</td>\n";
  echo "<td>";
  if (isset($row->value->FOIDocumentsURL)) {
  echo '<a href="viewDocument.php?hash='.md5($row->value->FOIDocumentsURL).'">'
  .$row->value->FOIDocumentsURL.'</a>';
  }
  echo "</td>\n";
  echo "</tr>\n";
  }
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
  include_footer();
  ?>
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import scrape
 
  from bs4 import BeautifulSoup
  import abc
 
  class GenericOAICDisclogScraper(object):
  __metaclass__ = abc.ABCMeta
  @abc.abstractmethod
  def getAgencyID(self):
  """ disclosr agency id """
  return
 
  @abc.abstractmethod
  def getURL(self):
  """ disclog URL"""
  return
 
  @abc.abstractmethod
  def getColumns(self,columns):
  """ rearranges columns if required """
  return
 
  def doScrape(self):
  foidocsdb = scrape.couch['disclosr-foidocuments']
  (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
  if content != None:
  if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
  # http://www.crummy.com/software/BeautifulSoup/documentation.html
  soup = BeautifulSoup(content)
  for row in soup.table.find_all('tr'):
  columns = row.find_all('td')
  if len(columns) == 5:
  (id, date, description, title, notes) = self.getColumns(columns)
  print id.string
  hash = scrape.mkhash(url+id.string)
  links = []
  for atag in row.find_all("a"):
  if atag.has_key('href'):
  links.append(scrape.fullurl(url,atag['href']))
  doc = foidocsdb.get(hash)
  descriptiontxt = ""
  for string in description.stripped_strings:
  descriptiontxt = descriptiontxt + string
 
  if doc == None:
  print "saving"
  doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
  "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
  foidocsdb.save(doc)
  else:
  print "already saved"
 
  elif len(row.find_all('th')) == 5:
  print "header row"
 
  else:
  print "ERROR number of columns incorrect"
  print row
 
  google-site-verification: google676a414ad086cefb.html
 
  australian disclosure logs
 
  are you looking for more information about:
  contracts
  gov orgs
  lobbyists
 
  1/1/11 title (Dept dfggdfgdf)
  description:
  source link:
  documents:
  #1 title link
 
 
  rss feed here
  <?php

  // Builds an RSS 2.0 feed titled "Last Modified - All" with one item per row
  // of the "byLastModified" view.
  // Planned content: Agency X updated Y, new files, diff of plain text/link text,
  // feed for just one agency or all
  // This is a minimal example of using the Universal Feed Generator class
  include("lib/FeedWriter.php");
  // Create an instance of the FeedWriter class.
  $TestFeed = new FeedWriter(RSS2);
  // Set the channel elements
  // Use wrapper functions for common channel elements
  $TestFeed->setTitle('Last Modified - All');
  $TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
  $TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
  // Retrieve the rows from the database
  // NOTE(review): $db is not assigned anywhere in this script and only
  // lib/FeedWriter.php is included -- confirm where it is meant to come from.
  $rows = $db->get_view("app", "byLastModified")->rows;
  //print_r($rows);
  foreach ($rows as $row) {
  // Create an empty FeedItem
  $newItem = $TestFeed->createNewItem();
  // Add elements to the feed item
  // NOTE(review): rows are indexed as arrays here ('name', 'id',
  // 'metadata'/'lastModified') -- confirm the view returns arrays, not objects.
  $newItem->setTitle($row['name']);
  $newItem->setLink($row['id']);
  $newItem->setDate(date("c", $row['metadata']['lastModified']));
  $newItem->setDescription($row['name']);
  // Now add the feed item
  $TestFeed->addItem($newItem);
  }
  // Everything is done; now generate the feed ("genarateFeed" is the method name as called here).
  $TestFeed->genarateFeed();