Handle more http edge cases in scraper
Handle more http edge cases in scraper


Former-commit-id: 994d782d8883843a55bf2558f8e6a6c9ffbcebde

  <?php
 
  include_once("../include/common.inc.php");
 
 
  setlocale(LC_CTYPE, 'C');
 
  $headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
 
  $db = $server->get_db('disclosr-agencies');
  $headers = Array();
  try {
  $rows = $db->get_view("app", "fieldNames?group=true", null, true)->rows;
 
  $dataValues = Array();
  foreach ($rows as $row) {
  $headers[] = $row->key;
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
 
  $fp = fopen('php://output', 'w');
  if ($fp && $db) {
  header('Content-Type: text/csv; charset=utf-8');
  header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
  header('Pragma: no-cache');
  header('Expires: 0');
  fputcsv($fp, $headers);
  try {
  $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
  //print_r($rows);
  foreach ($agencies as $agency) {
  // print_r($agency);
 
  if ( !isset($agency->value->status)) {
  $row = Array();
  $agencyArray = object_to_array($agency->value);
  foreach ($headers as $fieldName) {
  if (isset($agencyArray[$fieldName])) {
  if (is_array($agencyArray[$fieldName])) {
  $row[] = implode(";",$agencyArray[$fieldName]);
  } else {
  $row[] = $agencyArray[$fieldName];
  }
  } else {
  $row[] = "";
  }
  }
 
  fputcsv($fp, array_values($row));
 
 
  }
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
 
  die;
  }
  ?>
 
  <?php
 
  /**
  * Databaase class.
  */
  class SetteeDatabase {
 
  /**
  * Base URL of the CouchDB REST API
  */
  private $conn_url;
 
  /**
  * HTTP REST Client instance
  */
  protected $rest_client;
 
  /**
  * Name of the database
  */
  private $dbname;
 
  /**
  * Default constructor
  */
  function __construct($conn_url, $dbname) {
  $this->conn_url = $conn_url;
  $this->dbname = $dbname;
  $this->rest_client = SetteeRestClient::get_instance($this->conn_url);
  }
 
 
  /**
  * Get UUID from CouchDB
  *
  * @return
  * CouchDB-generated UUID string
  *
  */
  function gen_uuid() {
  $ret = $this->rest_client->http_get('_uuids');
  return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking
  }
 
  /**
  * Create or update a document database
  *
  * @param $document
  * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically.
  *
  * <p>If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation).
  * If "_id" is missing, CouchDB will be used to generate a UUID.
  *
  * <p>If $document has a "_rev" property (revision), document will be updated, rather than creating a new document.
  * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be
  * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but
  * not provide "_id" since that is an invalid input.
  *
  * @param $allowRevAutoDetection
  * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision
  * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and
  * therefore can make save() operation slightly slower if such auto-detection is not required.
  *
  * @return
  * document object with the database id (uuid) and revision attached;
  *
  * @throws SetteeCreateDatabaseException
  */
  function save($document, $allowRevAutoDetection = false) {
  if (is_string($document)) {
  $document = json_decode($document);
  }
 
  // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter)
  if(is_array($document)) {
  $document = (object) $document;
  }
 
  if (empty($document->_id) && empty($document->_rev)) {
  $id = $this->gen_uuid();
  }
  elseif (empty($document->_id) && !empty($document->_rev)) {
  throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id");
  }
  else {
  $id = $document->_id;
 
  if ($allowRevAutoDetection) {
  try {
  $rev = $this->get_rev($id);
  } catch (SetteeRestClientException $e) {
  // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error
  }
  if (!empty($rev)) {
  $document->_rev = $rev;
  }
  }
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $document_json = json_encode($document, JSON_NUMERIC_CHECK);
 
  $ret = $this->rest_client->http_put($full_uri, $document_json);
 
  $document->_id = $ret['decoded']->id;
  $document->_rev = $ret['decoded']->rev;
 
  return $document;
  }
 
  /**
  * @param $doc
  * @param $name
  * @param $content
  * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for
  * you, so you don't have to do it.
  * @param $mime_type
  * Optional. Will be auto-detected if not provided
  * @return void
  */
  public function add_attachment($doc, $name, $content, $mime_type = null) {
  if (empty($doc->_attachments) || !is_object($doc->_attachments)) {
  $doc->_attachments = new stdClass();
  }
 
  if (empty($mime_type)) {
  $mime_type = $this->rest_client->content_mime_type($content);
  }
 
  $doc->_attachments->$name = new stdClass();
  $doc->_attachments->$name->content_type = $mime_type;
  $doc->_attachments->$name->data = base64_encode($content);
  }
 
  /**
  * @param $doc
  * @param $name
  * @param $file
  * Full path to a file (e.g. as returned by PHP's realpath function).
  * @param $mime_type
  * Optional. Will be auto-detected if not provided
  * @return void
  */
  public function add_attachment_file($doc, $name, $file, $mime_type = null) {
  $content = file_get_contents($file);
  $this->add_attachment($doc, $name, $content, $mime_type);
  }
 
  /**
  *
  * Retrieve a document from CouchDB
  *
  * @throws SetteeWrongInputException
  *
  * @param $id
  * Unique ID (usually: UUID) of the document to be retrieved.
  * @return
  * database document in PHP object format.
  */
  function get($id) {
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $full_uri = str_replace("%3Frev%3D","?rev=",$full_uri);
  $ret = $this->rest_client->http_get($full_uri);
  return $ret['decoded'];
  }
 
  /**
  *
  * Get the latest revision of a document with document id: $id in CouchDB.
  *
  * @throws SetteeWrongInputException
  *
  * @param $id
  * Unique ID (usually: UUID) of the document to be retrieved.
  * @return
  * database document in PHP object format.
  */
  function get_rev($id) {
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't query a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $headers = $this->rest_client->http_head($full_uri);
  if (empty($headers['Etag'])) {
  throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag");
  }
  $etag = str_replace('"', '', $headers['Etag']);
  return $etag;
  }
 
  /**
  * Delete a document
  *
  * @param $document
  * a PHP object or JSON representation of the document that has _id and _rev fields.
  *
  * @return void
  */
  function delete($document) {
  if (!is_object($document)) {
  $document = json_decode($document);
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev;
  $this->rest_client->http_delete($full_uri);
  }
 
 
  /*----------------- View-related functions --------------*/
 
  /**
  * Create a new view or update an existing one.
  *
  * @param $design_doc
  * @param $view_name
  * @param $map_src
  * Source code of the map function in Javascript
  * @param $reduce_src
  * Source code of the reduce function in Javascript (optional)
  * @return void
  */
  function save_view($design_doc, $view_name, $map_src, $reduce_src = null) {
  $obj = new stdClass();
  $obj->_id = "_design/" . urlencode($design_doc);
  $view_name = urlencode($view_name);
  $obj->views->$view_name->map = $map_src;
  if (!empty($reduce_src)) {
  $obj->views->$view_name->reduce = $reduce_src;
  }
 
  // allow safe updates (even if slightly slower due to extra: rev-detection check).
  return $this->save($obj, true);
  }
 
  /**
  * Create a new view or update an existing one.
  *
  * @param $design_doc
  * @param $view_name
  * @param $key
  * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes
  * that first element is startkey, second: endkey.
  * @param $descending
  * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change
  * order you also need to swap startkey and endkey values!
  *
  * @return void
  */
  function get_view($design_doc, $view_name, $key = null, $descending = false) {
  $id = "_design/" . urlencode($design_doc);
  $view_name = urlencode($view_name);
  $id .= "/_view/$view_name";
 
  $data = array();
  if (!empty($key)) {
  if (is_string($key)) {
  $data = "key=" . '"' . $key . '"';
  }
  elseif (is_array($key)) {
  list($startkey, $endkey) = $key;
  $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"';
  }
 
  if ($descending) {
  $data .= "&descending=true";
  }
  }
 
 
 
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $full_uri = str_replace("%253Fgroup%253Dtrue","?group=true",$full_uri);
  $ret = $this->rest_client->http_get($full_uri, $data);
  return $ret['decoded'];
 
  }
 
  /**
  * @param $id
  * @return
  * return a properly url-encoded id.
  */
  private function safe_urlencode($id) {
  //-- System views like _design can have "/" in their URLs.
  $id = rawurlencode($id);
  if (substr($id, 0, 1) == '_') {
  $id = str_replace('%2F', '/', $id);
  }
  return $id;
  }
 
  /** Getter for a database name */
  function get_name() {
  return $this->dbname;
  }
 
  }
file:a/scrape.py -> file:b/scrape.py
#http://packages.python.org/CouchDB/client.html #http://packages.python.org/CouchDB/client.html
import couchdb import couchdb
import urllib2 import urllib2
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import re import re
import hashlib import hashlib
from urlparse import urljoin from urlparse import urljoin
import time import time
import os import os
  import mimetypes
  import re
  import urllib
  import urlparse
   
  def canonurl(url):
  r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
  if the URL looks invalid.
  >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
  'http://xn--hgi.ws/'
  """
  # strip spaces at the ends and ensure it's prefixed with 'scheme://'
  url = url.strip()
  if not url:
  return ''
  if not urlparse.urlsplit(url).scheme:
  url = 'http://' + url
   
  # turn it into Unicode
  #try:
  # url = unicode(url, 'utf-8')
  #except UnicodeDecodeError:
  # return '' # bad UTF-8 chars in URL
   
  # parse the URL into its components
  parsed = urlparse.urlsplit(url)
  scheme, netloc, path, query, fragment = parsed
   
  # ensure scheme is a letter followed by letters, digits, and '+-.' chars
  if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
  return ''
  scheme = str(scheme)
   
  # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
  match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
  if not match:
  return ''
  domain, port = match.groups()
  netloc = domain + (port if port else '')
  netloc = netloc.encode('idna')
   
  # ensure path is valid and convert Unicode chars to %-encoded
  if not path:
  path = '/' # eg: 'http://google.com' -> 'http://google.com/'
  path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
   
  # ensure query is valid
  query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
   
  # ensure fragment is valid
  fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))