[submodule "couchdb/couchdb-lucene"] | [submodule "couchdb/couchdb-lucene"] |
path = couchdb/couchdb-lucene | path = couchdb/couchdb-lucene |
url = https://github.com/rnewson/couchdb-lucene.git | url = https://github.com/rnewson/couchdb-lucene.git |
[submodule "couchdb/settee"] | [submodule "couchdb/settee"] |
path = couchdb/settee | path = couchdb/settee |
url = https://github.com/inadarei/settee.git | url = https://github.com/inadarei/settee.git |
[submodule "lib/php-diff"] | [submodule "lib/php-diff"] |
path = lib/php-diff | path = lib/php-diff |
url = https://github.com/chrisboulton/php-diff.git | url = https://github.com/chrisboulton/php-diff.git |
[submodule "lib/Requests"] | [submodule "lib/Requests"] |
path = lib/Requests | path = lib/Requests |
url = https://github.com/rmccue/Requests.git | url = https://github.com/rmccue/Requests.git |
[submodule "js/flotr2"] | [submodule "js/flotr2"] |
path = js/flotr2 | path = js/flotr2 |
url = https://github.com/HumbleSoftware/Flotr2.git | url = https://github.com/HumbleSoftware/Flotr2.git |
[submodule "lib/phpquery"] | [submodule "lib/phpquery"] |
path = lib/phpquery | path = lib/phpquery |
url = https://github.com/TobiaszCudnik/phpquery.git | url = https://github.com/TobiaszCudnik/phpquery.git |
[submodule "js/sigma"] | [submodule "js/sigma"] |
path = js/sigma | path = js/sigma |
url = https://github.com/jacomyal/sigma.js.git | url = https://github.com/jacomyal/sigma.js.git |
[submodule "js/bubbletree"] | [submodule "js/bubbletree"] |
path = js/bubbletree | path = js/bubbletree |
url = https://github.com/okfn/bubbletree.git | url = https://github.com/okfn/bubbletree.git |
[submodule "lib/querypath"] | [submodule "lib/querypath"] |
path = lib/querypath | path = lib/querypath |
url = https://github.com/technosophos/querypath.git | url = https://github.com/technosophos/querypath.git |
[submodule "lib/amon-php"] | [submodule "lib/amon-php"] |
path = lib/amon-php | path = lib/amon-php |
url = https://github.com/martinrusev/amon-php.git | url = https://github.com/martinrusev/amon-php.git |
[submodule "documents/lib/parsedatetime"] | |
path = documents/lib/parsedatetime | |
url = git://github.com/bear/parsedatetime.git | |
<?php | <?php |
require_once '../include/common.inc.php'; | require_once '../include/common.inc.php'; |
//function createFOIDocumentsDesignDoc() { | |
$foidb = $server->get_db('disclosr-foidocuments'); | |
$obj = new stdClass(); | |
$obj->_id = "_design/" . urlencode("app"); | |
$obj->language = "javascript"; | |
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; | |
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; | |
// allow safe updates (even if slightly slower due to extra: rev-detection check). | |
$foidb->save($obj, true); | |
function createDocumentsDesignDoc() { | |
/* | |
global $db; | |
$obj = new stdClass(); | |
$obj->_id = "_design/" . urlencode("app"); | |
$obj->language = "javascript"; | |
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; | |
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; | |
"views": { | |
"web_server": { | |
"map": "function(doc) {\n emit(doc.web_server, 1);\n}", | |
"reduce": "function (key, values, rereduce) {\n return sum(values);\n}" | |
}, | |
"byAgency": { | |
"map": "function(doc) {\n emit(doc.agencyID, 1);\n}", | |
"reduce": "function (key, values, rereduce) {\n return sum(values);\n}" | |
}, | |
"byURL": { | |
"map": "function(doc) {\n emit(doc.url, doc);\n}" | |
}, | |
"agency": { | |
"map": "function(doc) {\n emit(doc.agencyID, doc);\n}" | |
}, | |
"byWebServer": { | |
"map": "function(doc) {\n emit(doc.web_server, doc);\n}" | |
}, | |
"getValidationRequired": { | |
"map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}" | |
} | |
} */ | |
} | |
//function createAgencyDesignDoc() { | |
$db = $server->get_db('disclosr-agencies'); | $db = $server->get_db('disclosr-agencies'); |
createAgencyDesignDoc(); | $obj = new stdClass(); |
$obj->_id = "_design/" . urlencode("app"); | |
$obj->language = "javascript"; | |
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; | |
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; | |
$obj->views->byCanonicalName->map = "function(doc) { | |
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { | |
emit(doc.name, doc); | |
} | |
};"; | |
$obj->views->byDeptStateName->map = "function(doc) { | |
if (doc.orgType == 'FMA-DepartmentOfState') { | |
emit(doc.name, doc._id); | |
} | |
};"; | |
$obj->views->parentOrgs->map = "function(doc) { | |
if (doc.parentOrg) { | |
emit(doc._id, doc.parentOrg); | |
} | |
};"; | |
$obj->views->byName->map = 'function(doc) { | |
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { | |
emit(doc.name, doc._id); | |
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { | |
emit(doc.shortName, doc._id); | |
} | |
for (name in doc.otherNames) { | |
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) { | |
emit(doc.otherNames[name], doc._id); | |
} | |
} | |
for (name in doc.foiBodies) { | |
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) { | |
emit(doc.foiBodies[name], doc._id); | |
} | |
} | |
} | |
};'; | |
$obj->views->foiEmails->map = "function(doc) { | |
emit(doc._id, doc.foiEmail); | |
};"; | |
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; | |
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; | |
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; | |
$obj->views->getScrapeRequired->map = "function(doc) { | |
var lastScrape = Date.parse(doc.metadata.lastScraped); | |
var today = new Date(); | |
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) { | |
emit(doc._id, doc); | |
} | |
};"; | |
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; | |
$obj->views->getConflicts->map = "function(doc) { | |
if (doc._conflicts) { | |
emit(null, [doc._rev].concat(doc._conflicts)); | |
} | |
}"; | |
// http://stackoverflow.com/questions/646628/javascript-startswith | |
$obj->views->score->map = 'if(!String.prototype.startsWith){ | |
String.prototype.startsWith = function (str) { | |
return !this.indexOf(str); | |
} | |
} | |
function(doc) { | |
count = 0; | |
if (doc["status"] != "suspended") { | |
for(var propName in doc) { | |
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") { | |
count++; | |
} | |
} | |
portfolio = doc.parentOrg; | |
if (doc.orgType == "FMA-DepartmentOfState") { | |
portfolio = doc._id; | |
} | |
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") { | |
portfolio = doc.orgType; | |
} | |
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); | |
} | |
}'; | |
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ | |
String.prototype.startsWith = function (str) { | |
return !this.indexOf(str); | |
} | |
} | |
if(!String.prototype.endsWith){ | |
String.prototype.endsWith = function(suffix) { | |
return this.indexOf(suffix, this.length - suffix.length) !== -1; | |
}; | |
} | |
function(doc) { | |
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { | |
for(var propName in doc) { | |
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { | |
emit(propName, 1); | |
} | |
} | |
emit("total", 1); | |
} | |
}'; | |
$obj->views->scoreHas->reduce = 'function (key, values, rereduce) { | |
return sum(values); | |
}'; | |
$obj->views->fieldNames->map = ' | |
function(doc) { | |
for(var propName in doc) { | |
emit(propName, doc._id); | |
} | |
}'; | |
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { | |
return values.length; | |
}'; | |
// allow safe updates (even if slightly slower due to extra: rev-detection check). | |
$db->save($obj, true); | |
?> | ?> |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import scrape | import scrape |
from bs4 import BeautifulSoup | |
import parsedatetime as pdt | |
from time import mktime | |
from datetime import datetime | |
import feedparser | |
import abc | |
from bs4 import BeautifulSoup | class GenericDisclogScraper(object): |
import abc | __metaclass__ = abc.ABCMeta |
import dateutil.parser | agencyID = None |
disclogURL = None | |
def getAgencyID(self): | |
""" disclosr agency id """ | |
if self.agencyID == None: | |
self.agencyID = os.path.basename(sys.argv[0]).replace(".py","") | |
return self.agencyID | |
class GenericOAICDisclogScraper(object): | def getURL(self): |
__metaclass__ = abc.ABCMeta | """ disclog URL""" |
if self.disclogURL == None: | |
agency = scrape.agencydb.get(self.getAgencyID()) | |
self.disclogURL = agency['FOIDocumentsURL'] | |
return self.disclogURL | |
@abc.abstractmethod | @abc.abstractmethod |
def getAgencyID(self): | def doScrape(self): |
""" disclosr agency id """ | """ do the scraping """ |
return | return |
@abc.abstractmethod | @abc.abstractmethod |
def getURL(self): | def getDescription(self, content, entry, doc): |
""" disclog URL""" | """ get description""" |
return | return |
class GenericRSSDisclogScraper(GenericDisclogScraper): | |
def doScrape(self): | |
foidocsdb = scrape.couch['disclosr-foidocuments'] | |
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) | |
feed = feedparser.parse(content) | |
for entry in feed.entries: | |
#print entry | |
print entry.id | |
hash = scrape.mkhash(entry.id) | |
#print hash | |
doc = foidocsdb.get(hash) | |
#print doc | |
if doc == None: | |
print "saving" | |
edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") | |
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, | |
"date": edate,"title": entry.title} | |
self.getDescription(entry,entry, doc) | |
foidocsdb.save(doc) | |
else: | |
print "already saved" | |
def getDescription(self, content, entry, doc): | |
""" get description from rss entry""" | |
doc.update({'description': content.summary}) | |
return | |
class GenericOAICDisclogScraper(GenericDisclogScraper): | |
__metaclass__ = abc.ABCMeta | |
@abc.abstractmethod | @abc.abstractmethod |
def getColumns(self,columns): | def getColumns(self,columns): |
""" rearranges columns if required """ | """ rearranges columns if required """ |
return | return |
def getColumnCount(self): | |
return 5 | |
def getDescription(self, content, entry, doc): | |
""" get description from rss entry""" | |
descriptiontxt = "" | |
for string in content.stripped_strings: | |
descriptiontxt = descriptiontxt + " \n" + string | |
doc.update({'description': descriptiontxt}) | |
return | |
def doScrape(self): | def doScrape(self): |
cal = pdt.Calendar() | |
foidocsdb = scrape.couch['disclosr-foidocuments'] | foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) | (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) |
if content != None: | if content != None: |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": | if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | # http://www.crummy.com/software/BeautifulSoup/documentation.html |
soup = BeautifulSoup(content) | soup = BeautifulSoup(content) |
for row in soup.table.find_all('tr'): | for row in soup.table.find_all('tr'): |
columns = row.find_all('td') | columns = row.find_all('td') |
if len(columns) == 5: | if len(columns) == self.getColumnCount(): |
(id, date, description, title, notes) = self.getColumns(columns) | (id, date, description, title, notes) = self.getColumns(columns) |
print id.string | print id.string |
hash = scrape.mkhash(url+id.string) | hash = scrape.mkhash(url+id.string) |
links = [] | links = [] |
for atag in row.find_all("a"): | for atag in row.find_all("a"): |
if atag.has_key('href'): | if atag.has_key('href'): |
links.append(scrape.fullurl(url,atag['href'])) | links.append(scrape.fullurl(url,atag['href'])) |
doc = foidocsdb.get(hash) | doc = foidocsdb.get(hash) |
descriptiontxt = "" | |
for string in description.stripped_strings: | |
descriptiontxt = descriptiontxt + string | |
if doc == None: | if doc == None: |
print "saving" | print "saving" |
edate = dateutil.parser.parse(date.string).date().strftime("%Y-%m-%d") | dtresult = cal.parseDateText(date.string) |
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, | if len(dtresult) == 2: |
"date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string} | (dtdate,dtr) = dtresult |
print dtdate | |
edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) | |
else: | |