added colors to scrapers validator


Former-commit-id: ab47cd0593c47f1a2099381e7118267a82ce2fe3

<?php

require_once '../include/common.inc.php';

$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$accounts = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}

function extractCSVAccounts($url, $nameField, $accountField, $filter = null) {
    // $filter is currently unused; it defaults to null so the call below can omit it
    global $accounts, $nametoid;
    $request = Requests::get($url);
    echo $url;
    $Data = str_getcsv($request->body, "\n"); // parse the rows
    $headers = Array();
    foreach ($Data as $num => $line) {
        $Row = str_getcsv($line, ",");
        if ($num == 0) {
            $headers = $Row;
            print_r($headers);
        } else {
            if (isset($Row[array_search($nameField, $headers)])) {
                $agencyName = $Row[array_search($nameField, $headers)];
                if (!in_array(trim($agencyName), array_keys($nametoid))) {
                    echo "$agencyName missing" . PHP_EOL;
                } else {
                    echo $Row[array_search($nameField, $headers)] . PHP_EOL;
                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/' . $Row[array_search($accountField, $headers)];
                }
            } else {
                echo "error finding any agency" . $line . PHP_EOL;
            }
        }
    }
}

extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv", "Agency", "URL name");
print_r($accounts);
/* foreach ($accounts as $id => $accountTypes) {
    echo $id . "<br>" . PHP_EOL;
    $doc = object_to_array($db->get($id));
    // print_r($doc);

    foreach ($accountTypes as $accountType => $accounts) {
        if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
            $doc["has" . $accountType] = Array();
        }
        $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
    }
    $db->save($doc);
} */
?>
 
<?php

include_once("../include/common.inc.php");

setlocale(LC_CTYPE, 'C');

$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");

$db = $server->get_db('disclosr-agencies');

$tag = Array();
try {
    $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}

$foiEmail = Array();
try {
    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $foiEmail[$row->key] = $row->value;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}

$fp = fopen('php://output', 'w');
if ($fp && $db) {
    header('Content-Type: text/csv; charset=utf-8');
    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
    header('Pragma: no-cache');
    header('Expires: 0');
    fputcsv($fp, $headers);
    try {
        $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
        //print_r($rows);
        foreach ($agencies as $agency) {
            // print_r($agency);

            if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
                $row = Array();
                $row["#id"] = $agency->id;
                $row["name"] = trim($agency->value->name);
                $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
                $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
                $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");

                $otherBodies = Array();
                if (isset($agency->value->foiBodies)) {
                    $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
                }
                if (isset($agency->value->positions)) {
                    $positions = Array();
                    foreach ($agency->value->positions as $position) {
                        $positions[] = "Office of the " . $position;
                    }
                    $otherBodies = array_merge($otherBodies, $positions);
                }
                sort($otherBodies);
                if (count($otherBodies) > 0) {
                    $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
                }

                $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
                $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
                if ($agency->value->orgType == "FMA-DepartmentOfState") {
                    $row["tag_string"] = $tag[$agency->value->_id];
                } else {
                    $row["tag_string"] = $tag[$agency->value->parentOrg];
                }
                $row["tag_string"] .= " " . $agency->value->orgType;
                $row["tag_string"] .= " federal";
                fputcsv($fp, array_values($row));
            }
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }

    die;
}
?>
   
<?php

include_once('../include/common.inc.php');
include_header('Webserver and Accessibility');

echo "<table>
    <tr><th>name</th><th>disclog</th><th>scraper?</th></tr>";
$agenciesdb = $server->get_db('disclosr-agencies');
$docsdb = $server->get_db('disclosr-documents');
try {
    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;

    if ($rows) {
        foreach ($rows as $row) {

            echo "<tr><td>" . $row->value->name . " (" . $row->id . ")</td>\n";
            echo "<td>";
            if (isset($row->value->FOIDocumentsURL)) {
                echo '<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
                    . $row->value->FOIDocumentsURL . '</a>';
            } else {
                echo "<font color='red'>✘</font>";
            }
            echo "</td>\n<td>";
            if (isset($row->value->FOIDocumentsURL)) {
                if (file_exists("./scrapers/" . $row->id . '.py')) {
                    echo "<font color='green'>✔</font>";
                } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
                    echo "pass";
                } else {
                    echo "<font color='red'>✘</font>";
                }
            }
            echo "</td></tr>\n";
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
include_footer();
?>
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape

from bs4 import BeautifulSoup
import abc

class GenericOAICDisclogScraper(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getAgencyID(self):
        """ disclosr agency id """
        return

    @abc.abstractmethod
    def getURL(self):
        """ disclog URL """
        return

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content != None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        hash = scrape.mkhash(url + id.string)
                        links = []
                        for atag in row.find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(url, atag['href']))
                        doc = foidocsdb.get(hash)
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + string

                        if doc == None:
                            print "saving"
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
                                   "date": date.string, "description": descriptiontxt, "title": title.string, "notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"

                    elif len(row.find_all('th')) == 5:
                        print "header row"

                    else:
                        print "ERROR number of columns incorrect"
                        print row
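
A minimal sketch of a concrete disclog scraper built on the abstract class above, assuming that class is saved as genericScrapers.py; the agency ID, disclog URL and column order used here are hypothetical placeholders. Each subclass only describes what differs per agency, while doScrape() handles fetching, parsing and saving to CouchDB.

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers  # assumed module name for the abstract scraper above

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getAgencyID(self):
        # hypothetical disclosr agency id
        return "3cd40b1240e987cbcd3f0e67054ce259"

    def getURL(self):
        # hypothetical disclosure log URL
        return "http://www.example.gov.au/about/foi/disclosure-log"

    def getColumns(self, columns):
        # reorder this agency's five <td> cells into the order doScrape() expects
        (id, date, title, description, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    ScraperImplementation().doScrape()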
 
  google-site-verification: google676a414ad086cefb.html
 
  australian disclosure logs
 
  are you looking for more information about:
  contracts
  gov orgs
  lobbyists
 
  1/1/11 title (Dept dfggdfgdf)
  description:
  source link:
  documents:
  #1 title link
 
 
  rss feed here
<?php

// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimal example of using the Universal Feed Generator Class
include_once('../include/common.inc.php'); // provides $server; without it $db below is undefined
include("lib/FeedWriter.php");
//Creating an instance of FeedWriter class.
$TestFeed = new FeedWriter(RSS2);
//Setting the channel elements
//Use wrapper functions for common channel elements
$TestFeed->setTitle('Last Modified - All');
$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
$TestFeed->setDescription('This is a test of creating an RSS 2.0 feed with Universal Feed Writer');
//Retrieving information from the database
$db = $server->get_db('disclosr-agencies'); // assumed: the byLastModified view lives in the agencies database
$rows = $db->get_view("app", "byLastModified")->rows;
//print_r($rows);
foreach ($rows as $row) {
    //Create an empty FeedItem
    $newItem = $TestFeed->createNewItem();
    //Add elements to the feed item
    $newItem->setTitle($row['name']);
    $newItem->setLink($row['id']);
    $newItem->setDate(date("c", $row['metadata']['lastModified']));
    $newItem->setDescription($row['name']);
    //Now add the feed item
    $TestFeed->addItem($newItem);
}
//OK. Everything is done. Now generate the feed (the library method name is spelt genarateFeed).
$TestFeed->genarateFeed();
?>
file:b/documents/run.bat (new)
  python scrape.py
  pause
  #http://packages.python.org/CouchDB/client.html
  import couchdb
  import urllib2
  from BeautifulSoup import BeautifulSoup
  import re
  import hashlib
  from urlparse import urljoin
  import time
  import os
  import mimetypes
  import re
  import urllib
  import urlparse
 
def mkhash(input):
    return hashlib.md5(input).hexdigest().encode("utf-8")

def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.
    >>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url: