more scraper work
more scraper work


Former-commit-id: f58b0639b55df64a91f425e957bd65ca579545ce

[submodule "couchdb/couchdb-lucene"] [submodule "couchdb/couchdb-lucene"]
path = couchdb/couchdb-lucene path = couchdb/couchdb-lucene
url = https://github.com/rnewson/couchdb-lucene.git url = https://github.com/rnewson/couchdb-lucene.git
[submodule "couchdb/settee"] [submodule "couchdb/settee"]
path = couchdb/settee path = couchdb/settee
url = https://github.com/inadarei/settee.git url = https://github.com/inadarei/settee.git
[submodule "lib/php-diff"] [submodule "lib/php-diff"]
path = lib/php-diff path = lib/php-diff
url = https://github.com/chrisboulton/php-diff.git url = https://github.com/chrisboulton/php-diff.git
[submodule "lib/Requests"] [submodule "lib/Requests"]
path = lib/Requests path = lib/Requests
url = https://github.com/rmccue/Requests.git url = https://github.com/rmccue/Requests.git
[submodule "js/flotr2"] [submodule "js/flotr2"]
path = js/flotr2 path = js/flotr2
url = https://github.com/HumbleSoftware/Flotr2.git url = https://github.com/HumbleSoftware/Flotr2.git
[submodule "lib/phpquery"] [submodule "lib/phpquery"]
path = lib/phpquery path = lib/phpquery
url = https://github.com/TobiaszCudnik/phpquery.git url = https://github.com/TobiaszCudnik/phpquery.git
[submodule "js/sigma"] [submodule "js/sigma"]
path = js/sigma path = js/sigma
url = https://github.com/jacomyal/sigma.js.git url = https://github.com/jacomyal/sigma.js.git
[submodule "js/bubbletree"] [submodule "js/bubbletree"]
path = js/bubbletree path = js/bubbletree
url = https://github.com/okfn/bubbletree.git url = https://github.com/okfn/bubbletree.git
[submodule "lib/querypath"] [submodule "lib/querypath"]
path = lib/querypath path = lib/querypath
url = https://github.com/technosophos/querypath.git url = https://github.com/technosophos/querypath.git
  [submodule "lib/amon-php"]
  path = lib/amon-php
  url = https://github.com/martinrusev/amon-php.git
   
  <?php
 
  require_once '../include/common.inc.php';
 
  $db = $server->get_db('disclosr-agencies');
  $rows = $db->get_view("app", "byName")->rows;
  $nametoid = Array();
  $accounts = Array();
  foreach ($rows as $row) {
  $nametoid[trim($row->key)] = $row->value;
  }
 
  function extractCSVAccounts($url, $nameField, $accountField, $filter) {
  global $accounts, $nametoid;
  $request = Requests::get($url);
  echo $url;
  $Data = str_getcsv($request->body, "\n"); //parse the rows
  $headers = Array();
  foreach ($Data as $num => $line) {
  $Row = str_getcsv($line, ",");
  if ($num == 0) {
  $headers = $Row;
  print_r($headers);
  } else {
  if (isset($Row[array_search($nameField, $headers)])) {
  $agencyName = $Row[array_search($nameField, $headers)];
  if (!in_array(trim($agencyName), array_keys($nametoid))) {
  echo "$agencyName missing" . PHP_EOL;
  } else {
  echo $Row[array_search($nameField, $headers)] . PHP_EOL;
  $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/'.$Row[array_search($accountField, $headers)];
  }
  } else {
  echo "error finding any agency" . $line . PHP_EOL;
  }
  }
  }
  }
 
  extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name");
  print_r($accounts);
  /* foreach ($accounts as $id => $accountTypes) {
  echo $id . "<br>" . PHP_EOL;
  $doc = object_to_array($db->get($id));
  // print_r($doc);
 
  foreach ($accountTypes as $accountType => $accounts) {
  if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
  $doc["has" . $accountType] = Array();
  }
  $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
  }
  $db->save($doc);
  }*/
  ?>
 
<?php <?php
   
include_once("../include/common.inc.php"); include_once("../include/common.inc.php");
   
setlocale(LC_CTYPE, 'C'); setlocale(LC_CTYPE, 'C');
   
$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string"); $headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
   
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
   
$tag = Array(); $tag = Array();
try { try {
$rows = $db->get_view("app", "byDeptStateName", null, true)->rows; $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
$tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key)); $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
die(); die();
} }
   
$foiEmail = Array(); $foiEmail = Array();
try { try {
$rows = $db->get_view("app", "foiEmails", null, true)->rows; $rows = $db->get_view("app", "foiEmails", null, true)->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
$foiEmail[$row->key] = $row->value; $foiEmail[$row->key] = $row->value;
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
die(); die();
} }
   
$fp = fopen('php://output', 'w'); $fp = fopen('php://output', 'w');
if ($fp && $db) { if ($fp && $db) {
header('Content-Type: text/csv; charset=utf-8'); header('Content-Type: text/csv; charset=utf-8');
header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"'); header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
header('Pragma: no-cache'); header('Pragma: no-cache');
header('Expires: 0'); header('Expires: 0');
fputcsv($fp, $headers); fputcsv($fp, $headers);
try { try {
$agencies = $db->get_view("app", "byCanonicalName", null, true)->rows; $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
//print_r($rows); //print_r($rows);
foreach ($agencies as $agency) { foreach ($agencies as $agency) {
// print_r($agency); // print_r($agency);
   
if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) { if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
$row = Array(); $row = Array();
$row["#id"] = $agency->id; $row["#id"] = $agency->id;
$row["name"] = trim($agency->value->name); $row["name"] = trim($agency->value->name);
$row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : ""); $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
$row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : ""); $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
$row["notes"] = (isset($agency->value->description) ? $agency->value->description : ""); $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");
   
$otherBodies = Array(); $otherBodies = Array();
if (isset($agency->value->foiBodies)) { if (isset($agency->value->foiBodies)) {
$otherBodies = array_merge($otherBodies, $agency->value->foiBodies); $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
} }
if (isset($agency->value->positions)) { if (isset($agency->value->positions)) {
$otherBodies = array_merge($otherBodies, $agency->value->positions); $positions = Array();
  foreach ($agency->value->positions as $position) {
  $positions[] = "Office of the ".$position;
  }
  $otherBodies = array_merge($otherBodies, $positions);
} }
  sort($otherBodies);
if (count($otherBodies) > 0) { if (count($otherBodies) > 0) {
$row["notes"] .= "<br/> This department also responds to requests for information held by ".implode(",",$otherBodies); $row["notes"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
} }
   
$row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : ""); $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
$row["home_page"] = (isset($agency->value->website) ? $agency->value->website : ""); $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
if ($agency->value->orgType == "FMA-DepartmentOfState") { if ($agency->value->orgType == "FMA-DepartmentOfState") {
$row["tag_string"] = $tag[$agency->value->_id]; $row["tag_string"] = $tag[$agency->value->_id];
} else { } else {
$row["tag_string"] = $tag[$agency->value->parentOrg]; $row["tag_string"] = $tag[$agency->value->parentOrg];
} }
$row["tag_string"] .= " " . $agency->value->orgType; $row["tag_string"] .= " " . $agency->value->orgType;
$row["tag_string"] .= " federal"; $row["tag_string"] .= " federal";
fputcsv($fp, array_values($row)); fputcsv($fp, array_values($row));
   
   
   
} }
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
   
die; die;
} }
?> ?>
   
  google-site-verification: google676a414ad086cefb.html
 
  australian disclosure logs
 
  are you looking for more information about:
  contracts
  gov orgs
  lobbyists
 
  1/1/11 title (Dept dfggdfgdf)
  description:
  source link:
  documents:
  #1 title link
 
 
  rss feed here
  <?php
 
  // Agency X updated Y, new files, diff of plain text/link text,
  // feed for just one agency or all
  // This is a minimum example of using the Universal Feed Generator Class
  include("lib/FeedWriter.php");
  //Creating an instance of FeedWriter class.
  $TestFeed = new FeedWriter(RSS2);
  //Setting the channel elements
  //Use wrapper functions for common channelelements
  $TestFeed->setTitle('Last Modified - All');
  $TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
  $TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
  //Retriving informations from database
  $rows = $db->get_view("app", "byLastModified")->rows;
  //print_r($rows);
  foreach ($rows as $row) {
  //Create an empty FeedItem
  $newItem = $TestFeed->createNewItem();
  //Add elements to the feed item
  $newItem->setTitle($row['name']);
  $newItem->setLink($row['id']);
  $newItem->setDate(date("c", $row['metadata']['lastModified']));
  $newItem->setDescription($row['name']);
  //Now add the feed item
  $TestFeed->addItem($newItem);
  }
  //OK. Everything is done. Now genarate the feed.
  $TestFeed->genarateFeed();
  ?>
  #http://packages.python.org/CouchDB/client.html
  import couchdb
  import urllib2
  from BeautifulSoup import BeautifulSoup
  import re
  import hashlib
  from urlparse import urljoin
  import time
  import os
  import mimetypes
  import re
  import urllib
  import urlparse
 
  def mkhash(input):
  return hashlib.md5(input).hexdigest().encode("utf-8")
 
  def canonurl(url):
  r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
  if the URL looks invalid.
  >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
  'http://xn--hgi.ws/'
  """
  # strip spaces at the ends and ensure it's prefixed with 'scheme://'
  url = url.strip()
  if not url:
  return ''
  if not urlparse.urlsplit(url).scheme:
  url = 'http://' + url
 
  # turn it into Unicode
  #try:
  # url = unicode(url, 'utf-8')
  #except UnicodeDecodeError:
  # return '' # bad UTF-8 chars in URL
 
  # parse the URL into its components
  parsed = urlparse.urlsplit(url)
  scheme, netloc, path, query, fragment = parsed
 
  # ensure scheme is a letter followed by letters, digits, and '+-.' chars
  if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
  return ''
  scheme = str(scheme)
 
  # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
  match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
  if not match:
  return ''
  domain, port = match.groups()
  netloc = domain + (port if port else '')
  netloc = netloc.encode('idna')
 
  # ensure path is valid and convert Unicode chars to %-encoded
  if not path:
  path = '/' # eg: 'http://google.com' -> 'http://google.com/'
  path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
 
  # ensure query is valid
  query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
 
  # ensure fragment is valid
  fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
 
  # piece it all back together, truncating it to a maximum of 4KB
  url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
  return url[:4096]
 
  def fullurl(url,href):
  href = href.replace(" ","%20")
  href = re.sub('#.*$','',href)
  return urljoin(url,href)
 
  #http://diveintopython.org/http_web_services/etags.html
  class NotModifiedHandler(urllib2.BaseHandler):
  def http_error_304(self, req, fp, code, message, headers):
  addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
  addinfourl.code = code
  return addinfourl
 
  def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
  url = canonurl(url)
  hash = mkhash(url)
  req = urllib2.Request(url)
  print "Fetching %s" % url
  if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
  print "Not a valid HTTP url"
  return (None,None,None)
  doc = docsdb.get(hash)
  if doc == None:
  doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
  else:
  if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 999999):
  print "Uh oh, trying to scrape URL again too soon!"
  last_attachment_fname = doc["_attachments"].keys()[-1]
  last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
  return (doc['url'],doc['mime_type'],last_attachment.read())
  if scrape_again == False:
  print "Not scraping this URL again as requested"
  return (None,None,None)