Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr
Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr

Conflicts:
documents/rss.xml.php
lib/FeedWriter

Former-commit-id: 9f314c20fca6d7ffb1eaa4892e7b316bc0ea3628

<?php <?php
   
require_once '../include/common.inc.php'; require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() { //function createFOIDocumentsDesignDoc() {
   
$foidb = $server->get_db('disclosr-foidocuments'); $foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDate->reduce = "_count";  
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count"; $obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count"; $obj->views->byAgencyID->reduce = "_count";
   
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true); $foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() { //function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents'); $docdb = $server->get_db('disclosr-documents');
   
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}"; $obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; $obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}"; $obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; $obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; $obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; $obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; $obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; $obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
   
   
   
   
//function createAgencyDesignDoc() { //function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) { $obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc); emit(doc.name, doc);
} }
};"; };";
$obj->views->byDeptStateName->map = "function(doc) { $obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') { if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id); emit(doc.name, doc._id);
} }
};"; };";
$obj->views->parentOrgs->map = "function(doc) { $obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) { if (doc.parentOrg) {
emit(doc._id, doc.parentOrg); emit(doc._id, doc.parentOrg);
} }
};"; };";
$obj->views->byName->map = 'function(doc) { $obj->views->byName->map = 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id); emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
emit(doc.shortName, doc._id); emit(doc.shortName, doc._id);
} }
for (name in doc.otherNames) { for (name in doc.otherNames) {
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) { if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id); emit(doc.otherNames[name], doc._id);
} }
} }
for (name in doc.foiBodies) { for (name in doc.foiBodies) {
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) { if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
emit(doc.foiBodies[name], doc._id); emit(doc.foiBodies[name], doc._id);
} }
} }
for (name in doc.positions) { for (name in doc.positions) {
if (doc.positions[name] != "" && doc.positions[name] != doc.name) { if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
emit(doc.positions[name], doc._id); emit(doc.positions[name], doc._id);
} }
} }
} }
};'; };';
   
$obj->views->foiEmails->map = "function(doc) { $obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail); emit(doc._id, doc.foiEmail);
};"; };";
   
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) { $obj->views->getScrapeRequired->map = "function(doc) {
   
var lastScrape = Date.parse(doc.metadata.lastScraped); var lastScrape = Date.parse(doc.metadata.lastScraped);
   
var today = new Date(); var today = new Date();
   
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) { if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc); emit(doc._id, doc);
} }
   
};"; };";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) { $obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) { if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts)); emit(null, [doc._rev].concat(doc._conflicts));
} }
}"; }";
// http://stackoverflow.com/questions/646628/javascript-startswith // http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){ $obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
   
function(doc) { function(doc) {
count = 0; count = 0;
if (doc["status"] != "suspended") { if (doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") { if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
count++; count++;
} }
} }
portfolio = doc.parentOrg; portfolio = doc.parentOrg;
if (doc.orgType == "FMA-DepartmentOfState") { if (doc.orgType == "FMA-DepartmentOfState") {
portfolio = doc._id; portfolio = doc._id;
} }
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") { if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
portfolio = doc.orgType; portfolio = doc.orgType;
} }
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
} }
}'; }';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
if(!String.prototype.endsWith){ if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) { String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;     return this.indexOf(suffix, this.length - suffix.length) !== -1;
}; };
} }
function(doc) { function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1); emit(propName, 1);
} }
} }
emit("total", 1); emit("total", 1);
} }
}'; }';
$obj->views->scoreHas->reduce = 'function (key, values, rereduce) { $obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
return sum(values); return sum(values);
}'; }';
$obj->views->fieldNames->map = ' $obj->views->fieldNames->map = '
function(doc) { function(doc) {
for(var propName in doc) { for(var propName in doc) {
emit(propName, doc._id); emit(propName, doc._id);
} }
}'; }';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
return values.length; return values.length;
}'; }';
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true); $db->save($obj, true);
?> ?>
   
  <?php
 
  include('template.inc.php');
  include_header_documents("");
  include_once('../include/common.inc.php');
  ?>
  <h1>About</h1>
  <?php
  include_footer_documents();
  ?>
 
import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from time import mktime from time import mktime
import feedparser import feedparser
import abc import abc
import unicodedata, re import unicodedata, re
import dateutil import dateutil
from dateutil.parser import * from dateutil.parser import *
from datetime import * from datetime import *
import codecs import codecs
   
from StringIO import StringIO from StringIO import StringIO
   
from docx import * from docx import *
from lxml import etree from lxml import etree
import zipfile import zipfile
   
from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
   
class GenericDisclogScraper(object): class GenericDisclogScraper(object):
__metaclass__ = abc.ABCMeta __metaclass__ = abc.ABCMeta
agencyID = None agencyID = None
disclogURL = None disclogURL = None
def remove_control_chars(self, input): def remove_control_chars(self, input):
return "".join([i for i in input if ord(i) in range(32, 127)]) return "".join([i for i in input if ord(i) in range(32, 127)])
def getAgencyID(self): def getAgencyID(self):
""" disclosr agency id """ """ disclosr agency id """
if self.agencyID == None: if self.agencyID == None:
self.agencyID = os.path.basename(sys.argv[0]).replace(".py","") self.agencyID = os.path.basename(sys.argv[0]).replace(".py","")
return self.agencyID return self.agencyID
   
def getURL(self): def getURL(self):
""" disclog URL""" """ disclog URL"""
if self.disclogURL == None: if self.disclogURL == None:
agency = scrape.agencydb.get(self.getAgencyID()) agency = scrape.agencydb.get(self.getAgencyID())
self.disclogURL = agency['FOIDocumentsURL'] self.disclogURL = agency['FOIDocumentsURL']
return self.disclogURL return self.disclogURL
   
@abc.abstractmethod @abc.abstractmethod
def doScrape(self): def doScrape(self):
""" do the scraping """ """ do the scraping """
return return
   
class GenericPDFDisclogScraper(GenericDisclogScraper): class GenericPDFDisclogScraper(GenericDisclogScraper):
   
def doScrape(self): def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments'] foidocsdb = scrape.couch['disclosr-foidocuments']
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
laparams = LAParams() laparams = LAParams()
rsrcmgr = PDFResourceManager(caching=True) rsrcmgr = PDFResourceManager(caching=True)
   
outfp = StringIO.StringIO() outfp = StringIO.StringIO()
device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams) device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
fp = StringIO.StringIO() fp = StringIO.StringIO()
fp.write(content) fp.write(content)
description = output.getvalue(); description = output.getvalue();
process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True) process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
fp.close() fp.close()
device.close() device.close()
outfp.close() outfp.close()
   
hash = scrape.mkhash(description) hash = scrape.mkhash(description)
#print hash #print hash
doc = foidocsdb.get(hash) doc = foidocsdb.get(hash)
#print doc #print doc
if doc == None: if doc == None:
print "saving "+ hash print "saving "+ hash
edate = datetime.fromtimestamp(mktime( )).strftime("%Y-%m-%d") edate = datetime.fromtimestamp(mktime( )).strftime("%Y-%m-%d")
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': hash, doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': hash,
"date": edate,"title": "Disclosure Log Updated"} "date": edate,"title": "Disclosure Log Updated"}
self.getDescription(entry,entry, doc) self.getDescription(entry,entry, doc)
foidocsdb.save(doc) foidocsdb.save(doc)
else: else:
print "already saved" print "already saved"
   
   
class GenericDOCXDisclogScraper(GenericDisclogScraper): class GenericDOCXDisclogScraper(GenericDisclogScraper):
   
def doScrape(self): def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments'] foidocsdb = scrape.couch['disclosr-foidocuments']
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
mydoc = zipfile.ZipFile(file) mydoc = zipfile.ZipFile(file)
xmlcontent = mydoc.read('word/document.xml') xmlcontent = mydoc.read('word/document.xml')
document = etree.fromstring(xmlcontent) document = etree.fromstring(xmlcontent)
## Fetch all the text out of the document we just created ## Fetch all the text out of the document we just created
paratextlist = getdocumenttext(document) paratextlist = getdocumenttext(document)
   
# Make explicit unicode version # Make explicit unicode version
newparatextlist = [] newparatextlist = []
for paratext in paratextlist: for paratext in paratextlist:
newparatextlist.append(paratext.encode("utf-8")) newparatextlist.append(paratext.encode("utf-8"))
## Print our documnts test with two newlines under each paragraph ## Print our documnts test with two newlines under each paragraph
description = '\n\n'.join(newparatextlist) description = '\n\n'.join(newparatextlist)
   
hash = scrape.mkhash(description) hash = scrape.mkhash(description)
#print hash #print hash
doc = foidocsdb.get(hash) doc = foidocsdb.get(hash)
#print doc #print doc
if doc == None: if doc == None:
print "saving "+ hash print "saving "+ hash
edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d") edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': hash, doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': hash,
"date": edate,"title": "Disclosure Log Updated"} "date": edate,"title": "Disclosure Log Updated"}
self.getDescription(entry,entry, doc) self.getDescription(entry,entry, doc)
foidocsdb.save(doc) foidocsdb.save(doc)
else: else:
print "already saved" print "already saved"
   
   
class GenericRSSDisclogScraper(GenericDisclogScraper): class GenericRSSDisclogScraper(GenericDisclogScraper):
   
def doScrape(self): def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments'] foidocsdb = scrape.couch['disclosr-foidocuments']
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
feed = feedparser.parse(content) feed = feedparser.parse(content)
for entry in feed.entries: for entry in feed.entries:
#print entry #print entry
print entry.id print entry.id
hash = scrape.mkhash(entry.id) hash = scrape.mkhash(entry.id)
#print hash #print hash
doc = foidocsdb.get(hash) doc = foidocsdb.get(hash)
#print doc #print doc
if doc == None: if doc == None:
print "saving "+ hash print "saving "+ hash
edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
"date": edate,"title": entry.title} "date": edate,"title": entry.title}
self.getDescription(entry,entry, doc) self.getDescription(entry,entry, doc)
foidocsdb.save(doc) foidocsdb.save(doc)
else: else:
print "already saved" print "already saved"