design doc updater



[submodule "couchdb/couchdb-lucene"] [submodule "couchdb/couchdb-lucene"]
path = couchdb/couchdb-lucene path = couchdb/couchdb-lucene
url = https://github.com/rnewson/couchdb-lucene.git url = https://github.com/rnewson/couchdb-lucene.git
[submodule "couchdb/settee"] [submodule "couchdb/settee"]
path = couchdb/settee path = couchdb/settee
url = https://github.com/inadarei/settee.git url = https://github.com/inadarei/settee.git
[submodule "lib/php-diff"] [submodule "lib/php-diff"]
path = lib/php-diff path = lib/php-diff
url = https://github.com/chrisboulton/php-diff.git url = https://github.com/chrisboulton/php-diff.git
[submodule "lib/Requests"] [submodule "lib/Requests"]
path = lib/Requests path = lib/Requests
url = https://github.com/rmccue/Requests.git url = https://github.com/rmccue/Requests.git
[submodule "js/flotr2"] [submodule "js/flotr2"]
path = js/flotr2 path = js/flotr2
url = https://github.com/HumbleSoftware/Flotr2.git url = https://github.com/HumbleSoftware/Flotr2.git
[submodule "lib/phpquery"] [submodule "lib/phpquery"]
path = lib/phpquery path = lib/phpquery
url = https://github.com/TobiaszCudnik/phpquery.git url = https://github.com/TobiaszCudnik/phpquery.git
[submodule "js/sigma"] [submodule "js/sigma"]
path = js/sigma path = js/sigma
url = https://github.com/jacomyal/sigma.js.git url = https://github.com/jacomyal/sigma.js.git
[submodule "js/bubbletree"] [submodule "js/bubbletree"]
path = js/bubbletree path = js/bubbletree
url = https://github.com/okfn/bubbletree.git url = https://github.com/okfn/bubbletree.git
[submodule "lib/querypath"] [submodule "lib/querypath"]
path = lib/querypath path = lib/querypath
url = https://github.com/technosophos/querypath.git url = https://github.com/technosophos/querypath.git
[submodule "lib/amon-php"] [submodule "lib/amon-php"]
path = lib/amon-php path = lib/amon-php
url = https://github.com/martinrusev/amon-php.git url = https://github.com/martinrusev/amon-php.git
  [submodule "documents/lib/parsedatetime"]
  path = documents/lib/parsedatetime
  url = git://github.com/bear/parsedatetime.git
   
<?php
   
require_once '../include/common.inc.php';
  //function createFOIDocumentsDesignDoc() {
   
  $foidb = $server->get_db('disclosr-foidocuments');
  $obj = new stdClass();
  $obj->_id = "_design/" . urlencode("app");
  $obj->language = "javascript";
  $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
  $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
   
// allow safe updates (even if slightly slower due to the extra rev-detection check).
  $foidb->save($obj, true);
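
// Usage sketch (assumption, not in the original updater): querying a view
// straight after saving the design doc triggers CouchDB's lazy index build,
// so later reads are fast. get_view() here is settee's view helper; the
// "app" design doc and "byDate" view are the ones defined just above.
$byDate = $foidb->get_view("app", "byDate");
echo "FOI documents indexed by date: " . count($byDate->rows) . PHP_EOL;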
   
   
  function createDocumentsDesignDoc() {
  /*
  global $db;
  $obj = new stdClass();
  $obj->_id = "_design/" . urlencode("app");
  $obj->language = "javascript";
  $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
  $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
  "views": {
  "web_server": {
  "map": "function(doc) {\n emit(doc.web_server, 1);\n}",
  "reduce": "function (key, values, rereduce) {\n return sum(values);\n}"
  },
  "byAgency": {
  "map": "function(doc) {\n emit(doc.agencyID, 1);\n}",
  "reduce": "function (key, values, rereduce) {\n return sum(values);\n}"
  },
  "byURL": {
  "map": "function(doc) {\n emit(doc.url, doc);\n}"
  },
  "agency": {
  "map": "function(doc) {\n emit(doc.agencyID, doc);\n}"
  },
  "byWebServer": {
  "map": "function(doc) {\n emit(doc.web_server, doc);\n}"
  },
  "getValidationRequired": {
  "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"
  }
  } */
  }
   
  //function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
  $obj->_id = "_design/" . urlencode("app");
  $obj->language = "javascript";
  $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
  $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
  $obj->views->byCanonicalName->map = "function(doc) {
  if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
  emit(doc.name, doc);
  }
  };";
  $obj->views->byDeptStateName->map = "function(doc) {
  if (doc.orgType == 'FMA-DepartmentOfState') {
  emit(doc.name, doc._id);
  }
  };";
  $obj->views->parentOrgs->map = "function(doc) {
  if (doc.parentOrg) {
  emit(doc._id, doc.parentOrg);
  }
  };";
  $obj->views->byName->map = 'function(doc) {
  if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
  emit(doc.name, doc._id);
  if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
  emit(doc.shortName, doc._id);
  }
  for (name in doc.otherNames) {
  if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
  emit(doc.otherNames[name], doc._id);
  }
  }
  for (name in doc.foiBodies) {
  if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
  emit(doc.foiBodies[name], doc._id);
  }
  }
  }
  };';
   
  $obj->views->foiEmails->map = "function(doc) {
  emit(doc._id, doc.foiEmail);
  };";
   
  $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
  $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
  $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
  $obj->views->getScrapeRequired->map = "function(doc) {
   
  var lastScrape = Date.parse(doc.metadata.lastScraped);
   
  var today = new Date();
   
// Date.parse returns a millisecond timestamp (or NaN), not a Date object
if (isNaN(lastScrape) || lastScrape + 1000 != today.getTime()) {
  emit(doc._id, doc);
  }
   
  };";
  $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
  $obj->views->getConflicts->map = "function(doc) {
  if (doc._conflicts) {
  emit(null, [doc._rev].concat(doc._conflicts));
  }
  }";
  // http://stackoverflow.com/questions/646628/javascript-startswith
  $obj->views->score->map = 'if(!String.prototype.startsWith){
  String.prototype.startsWith = function (str) {
  return !this.indexOf(str);
  }
  }
   
  function(doc) {
var count = 0;
  if (doc["status"] != "suspended") {
  for(var propName in doc) {
  if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
  count++;
  }
  }
var portfolio = doc.parentOrg;
  if (doc.orgType == "FMA-DepartmentOfState") {
  portfolio = doc._id;
  }
  if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
  portfolio = doc.orgType;
  }
  emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
  }
  }';
  $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
  String.prototype.startsWith = function (str) {
  return !this.indexOf(str);
  }
  }
  if(!String.prototype.endsWith){
  String.prototype.endsWith = function(suffix) {
      return this.indexOf(suffix, this.length - suffix.length) !== -1;
  };
  }
  function(doc) {
  if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
  for(var propName in doc) {
  if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
  emit(propName, 1);
  }
  }
  emit("total", 1);
  }
  }';
  $obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
  return sum(values);
  }';
  $obj->views->fieldNames->map = '
  function(doc) {
  for(var propName in doc) {
  emit(propName, doc._id);
  }
   
  }';
  $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
  return values.length;
  }';
// allow safe updates (even if slightly slower due to the extra rev-detection check).
  $db->save($obj, true);
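
// Usage sketch (assumptions: CouchDB on localhost:5984, view already built):
// the scoreHas reduce can be read grouped per field name via CouchDB's plain
// HTTP view API, which is handy for checking the design doc took effect.
// $scoreHas = json_decode(file_get_contents(
//     'http://localhost:5984/disclosr-agencies/_design/app/_view/scoreHas?group=true'));
// foreach ($scoreHas->rows as $row) {
//     echo $row->key . ": " . $row->value . PHP_EOL;
// }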
   
   
?>
   
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
import parsedatetime as pdt
from time import mktime
from datetime import datetime
import feedparser
import abc


class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL """
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return


class GenericRSSDisclogScraper(GenericDisclogScraper):
    def getDescription(self, entry, doc):
        """ get description from rss entry """
        doc['description'] = entry.summary
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            hash = scrape.mkhash(entry.id)
            #print hash
            doc = foidocsdb.get(hash)
            #print doc
            if doc is None:
                print "saving"
                edate = datetime.fromtimestamp(mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
                       "date": edate, "title": entry.title}
                self.getDescription(entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"
   
class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def doScrape(self):
        cal = pdt.Calendar()
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        hash = scrape.mkhash(url + id.string)
                        links = []
                        for atag in row.find_all("a"):
                            # bs4 dropped Tag.has_key(); has_attr() is the supported check
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(url, atag['href']))
                        doc = foidocsdb.get(hash)
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + " \n" + string
                        if doc is None:
                            print "saving"
                            dtresult = cal.parseDateText(date.string)
                            if len(dtresult) == 2:
                                (dtdate, dtr) = dtresult
                                print dtdate
                                edate = str(dtdate[0]) + '-' + str(dtdate[1]) + '-' + str(dtdate[2])
                            else:
                                edate = ""
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links,
                                   'docID': id.string, "date": edate, "description": descriptiontxt,
                                   "title": title.string, "notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"
                    elif len(row.find_all('th')) == 5:
                        print "header row"
                    else:
                        print "ERROR number of columns incorrect"
                        print row
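
# Usage sketch (assumption, not part of the original module): a table-based
# disclosure log scraper only has to map its five <td> cells onto
# (id, date, description, title, notes). The column order below is
# hypothetical and would be adjusted to match each agency's table layout;
# the module is again assumed importable as genericScrapers.
#
#     import genericScrapers
#
#     class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
#         def getColumns(self, columns):
#             (id, date, title, description, notes) = columns
#             return (id, date, description, title, notes)
#
#     if __name__ == '__main__':
#         ScraperImplementation().doScrape()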
   
<?php
include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');