argh
argh


Former-commit-id: 5633b69c577c7553ef393e89754b6647eedbf014

<?php <?php
   
/** /**
* Databaase class. * Databaase class.
*/ */
class SetteeDatabase { class SetteeDatabase {
   
/** /**
* Base URL of the CouchDB REST API * Base URL of the CouchDB REST API
*/ */
private $conn_url; private $conn_url;
/** /**
* HTTP REST Client instance * HTTP REST Client instance
*/ */
protected $rest_client; protected $rest_client;
/** /**
* Name of the database * Name of the database
*/ */
private $dbname; private $dbname;
/** /**
* Default constructor * Default constructor
*/ */
function __construct($conn_url, $dbname) { function __construct($conn_url, $dbname) {
$this->conn_url = $conn_url; $this->conn_url = $conn_url;
$this->dbname = $dbname; $this->dbname = $dbname;
$this->rest_client = SetteeRestClient::get_instance($this->conn_url); $this->rest_client = SetteeRestClient::get_instance($this->conn_url);
} }
   
   
/** /**
* Get UUID from CouchDB * Get UUID from CouchDB
* *
* @return * @return
* CouchDB-generated UUID string * CouchDB-generated UUID string
* *
*/ */
function gen_uuid() { function gen_uuid() {
$ret = $this->rest_client->http_get('_uuids'); $ret = $this->rest_client->http_get('_uuids');
return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking
} }
   
/** /**
* Create or update a document database * Create or update a document database
* *
* @param $document * @param $document
* PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically. * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically.
* *
* <p>If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation). * <p>If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation).
* If "_id" is missing, CouchDB will be used to generate a UUID. * If "_id" is missing, CouchDB will be used to generate a UUID.
* *
* <p>If $document has a "_rev" property (revision), document will be updated, rather than creating a new document. * <p>If $document has a "_rev" property (revision), document will be updated, rather than creating a new document.
* You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be
* one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but
* not provide "_id" since that is an invalid input. * not provide "_id" since that is an invalid input.
* *
* @param $allowRevAutoDetection * @param $allowRevAutoDetection
* Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision
* for a document and use it. This option is "false" by default because it involves an extra http HEAD request and * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and
* therefore can make save() operation slightly slower if such auto-detection is not required. * therefore can make save() operation slightly slower if such auto-detection is not required.
* *
* @return * @return
* document object with the database id (uuid) and revision attached; * document object with the database id (uuid) and revision attached;
* *
* @throws SetteeCreateDatabaseException * @throws SetteeCreateDatabaseException
*/ */
function save($document, $allowRevAutoDetection = false) { function save($document, $allowRevAutoDetection = false) {
if (is_string($document)) { if (is_string($document)) {
$document = json_decode($document); $document = json_decode($document);
} }
   
// Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter) // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter)
if(is_array($document)) { if(is_array($document)) {
$document = (object) $document; $document = (object) $document;
} }
   
if (empty($document->_id) && empty($document->_rev)) { if (empty($document->_id) && empty($document->_rev)) {
$id = $this->gen_uuid(); $id = $this->gen_uuid();
} }
elseif (empty($document->_id) && !empty($document->_rev)) { elseif (empty($document->_id) && !empty($document->_rev)) {
throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id"); throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id");
} }
else { else {
$id = $document->_id; $id = $document->_id;
   
if ($allowRevAutoDetection) { if ($allowRevAutoDetection) {
try { try {
$rev = $this->get_rev($id); $rev = $this->get_rev($id);
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
// auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error
} }
if (!empty($rev)) { if (!empty($rev)) {
$document->_rev = $rev; $document->_rev = $rev;
} }
} }
} }
$full_uri = $this->dbname . "/" . $this->safe_urlencode($id); $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
$document_json = json_encode($document, JSON_NUMERIC_CHECK); $document_json = json_encode($document, JSON_NUMERIC_CHECK);
$ret = $this->rest_client->http_put($full_uri, $document_json); $ret = $this->rest_client->http_put($full_uri, $document_json);
   
$document->_id = $ret['decoded']->id; $document->_id = $ret['decoded']->id;
$document->_rev = $ret['decoded']->rev; $document->_rev = $ret['decoded']->rev;
   
return $document; return $document;
} }
   
/** /**
* @param $doc * @param $doc
* @param $name * @param $name
* @param $content * @param $content
* Content of the attachment in a string-buffer format. This function will automatically base64-encode content for * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for
* you, so you don't have to do it. * you, so you don't have to do it.
* @param $mime_type * @param $mime_type
* Optional. Will be auto-detected if not provided * Optional. Will be auto-detected if not provided
* @return void * @return void
*/ */
public function add_attachment($doc, $name, $content, $mime_type = null) { public function add_attachment($doc, $name, $content, $mime_type = null) {
if (empty($doc->_attachments) || !is_object($doc->_attachments)) { if (empty($doc->_attachments) || !is_object($doc->_attachments)) {
$doc->_attachments = new stdClass(); $doc->_attachments = new stdClass();
} }
   
if (empty($mime_type)) { if (empty($mime_type)) {
$mime_type = $this->rest_client->content_mime_type($content); $mime_type = $this->rest_client->content_mime_type($content);
} }
   
$doc->_attachments->$name = new stdClass(); $doc->_attachments->$name = new stdClass();
$doc->_attachments->$name->content_type = $mime_type; $doc->_attachments->$name->content_type = $mime_type;
$doc->_attachments->$name->data = base64_encode($content); $doc->_attachments->$name->data = base64_encode($content);
} }
   
/** /**
* @param $doc * @param $doc
* @param $name * @param $name
* @param $file * @param $file
* Full path to a file (e.g. as returned by PHP's realpath function). * Full path to a file (e.g. as returned by PHP's realpath function).
* @param $mime_type * @param $mime_type
* Optional. Will be auto-detected if not provided * Optional. Will be auto-detected if not provided
* @return void * @return void
*/ */
public function add_attachment_file($doc, $name, $file, $mime_type = null) { public function add_attachment_file($doc, $name, $file, $mime_type = null) {
$content = file_get_contents($file); $content = file_get_contents($file);
$this->add_attachment($doc, $name, $content, $mime_type); $this->add_attachment($doc, $name, $content, $mime_type);
} }
   
/** /**
* *
* Retrieve a document from CouchDB * Retrieve a document from CouchDB
* *
* @throws SetteeWrongInputException * @throws SetteeWrongInputException
* *
* @param $id * @param $id
* Unique ID (usually: UUID) of the document to be retrieved. * Unique ID (usually: UUID) of the document to be retrieved.
* @return * @return
* database document in PHP object format. * database document in PHP object format.
*/ */
function get($id) { function get($id) {
if (empty($id)) { if (empty($id)) {
throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
} }
   
$full_uri = $this->dbname . "/" . $this->safe_urlencode($id); $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
$full_uri = str_replace("%3Frev%3D","?rev=",$full_uri); $full_uri = str_replace("%3Frev%3D","?rev=",$full_uri);
$ret = $this->rest_client->http_get($full_uri); $ret = $this->rest_client->http_get($full_uri);
return $ret['decoded']; return $ret['decoded'];
} }
   
/** /**
* *
* Get the latest revision of a document with document id: $id in CouchDB. * Get the latest revision of a document with document id: $id in CouchDB.
* *
* @throws SetteeWrongInputException * @throws SetteeWrongInputException
* *
* @param $id * @param $id
* Unique ID (usually: UUID) of the document to be retrieved. * Unique ID (usually: UUID) of the document to be retrieved.
* @return * @return
* database document in PHP object format. * database document in PHP object format.
*/ */
function get_rev($id) { function get_rev($id) {
if (empty($id)) { if (empty($id)) {
throw new SetteeWrongInputException("Error: Can't query a document without a uuid."); throw new SetteeWrongInputException("Error: Can't query a document without a uuid.");
} }
   
$full_uri = $this->dbname . "/" . $this->safe_urlencode($id); $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
$headers = $this->rest_client->http_head($full_uri); $headers = $this->rest_client->http_head($full_uri);
if (empty($headers['Etag'])) { if (empty($headers['Etag'])) {
throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag"); throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag");
} }
$etag = str_replace('"', '', $headers['Etag']); $etag = str_replace('"', '', $headers['Etag']);
return $etag; return $etag;
} }
/** /**
* Delete a document * Delete a document
* *
* @param $document * @param $document
* a PHP object or JSON representation of the document that has _id and _rev fields. * a PHP object or JSON representation of the document that has _id and _rev fields.
* *
* @return void * @return void
*/ */
function delete($document) { function delete($document) {
if (!is_object($document)) { if (!is_object($document)) {
$document = json_decode($document); $document = json_decode($document);
} }
   
$full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev; $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev;
$this->rest_client->http_delete($full_uri); $this->rest_client->http_delete($full_uri);
} }
   
/*----------------- View-related functions --------------*/ /*----------------- View-related functions --------------*/
   
/** /**
* Create a new view or update an existing one. * Create a new view or update an existing one.
* *
* @param $design_doc * @param $design_doc
* @param $view_name * @param $view_name
* @param $map_src * @param $map_src
* Source code of the map function in Javascript * Source code of the map function in Javascript
* @param $reduce_src * @param $reduce_src
* Source code of the reduce function in Javascript (optional) * Source code of the reduce function in Javascript (optional)
* @return void * @return void
*/ */
function save_view($design_doc, $view_name, $map_src, $reduce_src = null) { function save_view($design_doc, $view_name, $map_src, $reduce_src = null) {
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode($design_doc); $obj->_id = "_design/" . urlencode($design_doc);
$view_name = urlencode($view_name); $view_name = urlencode($view_name);
$obj->views->$view_name->map = $map_src; $obj->views->$view_name->map = $map_src;
if (!empty($reduce_src)) { if (!empty($reduce_src)) {
$obj->views->$view_name->reduce = $reduce_src; $obj->views->$view_name->reduce = $reduce_src;
} }
   
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
return $this->save($obj, true); return $this->save($obj, true);
} }
   
/** /**
* Create a new view or update an existing one. * Create a new view or update an existing one.
* *
* @param $design_doc * @param $design_doc
* @param $view_name * @param $view_name
* @param $key * @param $key
* key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes
* that first element is startkey, second: endkey. * that first element is startkey, second: endkey.
* @param $descending * @param $descending
* return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change
* order you also need to swap startkey and endkey values! * order you also need to swap startkey and endkey values!
* *
* @return void * @return void
*/ */
function get_view($design_doc, $view_name, $key = null, $descending = false, $limit = false, $reduce=false) { function get_view($design_doc, $view_name, $key = null, $descending = false, $limit = false, $reduce=null) {
$id = "_design/" . urlencode($design_doc); $id = "_design/" . urlencode($design_doc);
$view_name = urlencode($view_name); $view_name = urlencode($view_name);
$id .= "/_view/$view_name"; $id .= "/_view/$view_name";
   
$data = array(); $data = array();
if (!empty($key)) { if (!empty($key)) {
if (is_string($key)) { if (is_string($key)) {
$data = "key=" . '"' . $key . '"'; $data = "key=" . '"' . $key . '"';
} }
elseif (is_array($key)) { elseif (is_array($key)) {
list($startkey, $endkey) = $key; list($startkey, $endkey) = $key;
$data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"'; $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"';
} }
   
if ($descending) { if ($descending) {
$data .= "&descending=true"; $data .= "&descending=true";
} }
if ($reduce) { if ($reduce != null) {
  if ($reduce == true) {
$data .= "&reduce=true"; $data .= "&reduce=true";
} else { } else {
$data .= "&reduce=false"; $data .= "&reduce=false";
  }
} }
if ($limit) { if ($limit) {
$data .= "&limit=".$limit; $data .= "&limit=".$limit;
} }
} }
   
   
   
if (empty($id)) { if (empty($id)) {
throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
} }
   
$full_uri = $this->dbname . "/" . $this->safe_urlencode($id); $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
   
$full_uri = str_replace("%253Fgroup%253D","?group=",$full_uri); $full_uri = str_replace("%253Fgroup%253D","?group=",$full_uri);
$full_uri = str_replace("%253Flimit%253D","?limit=",$full_uri); $full_uri = str_replace("%253Flimit%253D","?limit=",$full_uri);
$ret = $this->rest_client->http_get($full_uri, $data); $ret = $this->rest_client->http_get($full_uri, $data);
//$ret['decoded'] = str_replace("?k","&k",$ret['decoded']); //$ret['decoded'] = str_replace("?k","&k",$ret['decoded']);
return $ret['decoded']; return $ret['decoded'];
} }
   
/** /**
* @param $id * @param $id
* @return * @return
* return a properly url-encoded id. * return a properly url-encoded id.
*/ */
private function safe_urlencode($id) { private function safe_urlencode($id) {
//-- System views like _design can have "/" in their URLs. //-- System views like _design can have "/" in their URLs.
$id = rawurlencode($id); $id = rawurlencode($id);
if (substr($id, 0, 1) == '_') { if (substr($id, 0, 1) == '_') {
$id = str_replace('%2F', '/', $id); $id = str_replace('%2F', '/', $id);
} }
return $id; return $id;
} }
/** Getter for a database name */ /** Getter for a database name */
function get_name() { function get_name() {
return $this->dbname; return $this->dbname;
} }
   
} }
import sys import sys
import os import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
import scrape import scrape
from datetime import date from datetime import date
from pyquery import PyQuery as pq from pyquery import PyQuery as pq
from lxml import etree from lxml import etree
import urllib import urllib
  import dateutil
  from dateutil.parser import *
   
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
   
def doScrape(self): def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments'] foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
self.getURL(), "foidocuments", self.getAgencyID()) self.getURL(), "foidocuments", self.getAgencyID())
   
d = pq(content) d = pq(content.read())
d.make_links_absolute() d.make_links_absolute(base_url = self.getURL())
d.table.filter('.ncTAF_DataTABLE') for table in d('table').items():
print [i.text() for i in d.items('span')] title= table('thead').text()
description = "" print title
dochash = scrape.mkhash(description) (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
doc = foidocsdb.get(dochash) links = table('a').map(lambda i, e: pq(e).attr('href'))
if doc is None: description = descA+" "+descB
print "saving " + dochash edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
edate = date.today().strftime("%Y-%m-%d") print edate
doc = {'_id': dochash, 'agencyID': self.getAgencyID() dochash = scrape.mkhash(self.remove_control_chars(title))
, 'url': self.getURL(), 'docID': dochash, doc = foidocsdb.get(dochash)
"date": edate, "title": "Disclosure Log Updated", "description": description} if doc is None:
#foidocsdb.save(doc) print "saving " + dochash
else: edate = date.today().strftime("%Y-%m-%d")
print "already saved" doc = {'_id': dochash, 'agencyID': self.getAgencyID()
  , 'url': self.getURL(), 'docID': dochash,
  "links": links,