<?php
require_once '../include/common.inc.php';

//function createFOIDocumentsDesignDoc() {
$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
// allow safe updates (even if slightly slower due to the extra rev-detection check).
$foidb->save($obj, true);
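
// Example (sketch): once saved, the byDate view above can be read straight
// over CouchDB's HTTP API to list the most recent disclosure log entries:
//   GET /disclosr-foidocuments/_design/app/_view/byDate?descending=true&limit=20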
function createDocumentsDesignDoc() {
    /*
    global $db;
    $obj = new stdClass();
    $obj->_id = "_design/" . urlencode("app");
    $obj->language = "javascript";
    $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
    $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
    "views": {
        "web_server": {
            "map": "function(doc) {\n emit(doc.web_server, 1);\n}",
            "reduce": "function (key, values, rereduce) {\n return sum(values);\n}"
        },
        "byAgency": {
            "map": "function(doc) {\n emit(doc.agencyID, 1);\n}",
            "reduce": "function (key, values, rereduce) {\n return sum(values);\n}"
        },
        "byURL": {
            "map": "function(doc) {\n emit(doc.url, doc);\n}"
        },
        "agency": {
            "map": "function(doc) {\n emit(doc.agencyID, doc);\n}"
        },
        "byWebServer": {
            "map": "function(doc) {\n emit(doc.web_server, doc);\n}"
        },
        "getValidationRequired": {
            "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"
        }
    } */
}
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); | |
$obj->language = "javascript"; | |
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; | |
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; | |
$obj->views->byCanonicalName->map = "function(doc) { | |
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { | |
emit(doc.name, doc); | |
} | |
};"; | |
$obj->views->byDeptStateName->map = "function(doc) { | |
if (doc.orgType == 'FMA-DepartmentOfState') { | |
emit(doc.name, doc._id); | |
} | |
};"; | |
$obj->views->parentOrgs->map = "function(doc) { | |
if (doc.parentOrg) { | |
emit(doc._id, doc.parentOrg); | |
} | |
};"; | |
$obj->views->byName->map = 'function(doc) { | |
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { | |
emit(doc.name, doc._id); | |
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { | |
emit(doc.shortName, doc._id); | |
} | |
for (name in doc.otherNames) { | |
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) { | |
emit(doc.otherNames[name], doc._id); | |
} | |
} | |
for (name in doc.foiBodies) { | |
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) { | |
emit(doc.foiBodies[name], doc._id); | |
} | |
} | |
} | |
};'; | |
$obj->views->foiEmails->map = "function(doc) { | |
emit(doc._id, doc.foiEmail); | |
};"; | |
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; | |
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; | |
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; | |
$obj->views->getScrapeRequired->map = "function(doc) {
    // Date.parse returns milliseconds since epoch (or NaN), not a Date object,
    // so compare it numerically; a one-day staleness window is assumed here.
    var lastScrape = Date.parse(doc.metadata.lastScraped);
    var today = new Date();
    if (isNaN(lastScrape) || (today.getTime() - lastScrape) > 1000 * 60 * 60 * 24) {
        emit(doc._id, doc);
    }
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
    if (doc._conflicts) {
        emit(null, [doc._rev].concat(doc._conflicts));
    }
}";
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
function(doc) {
    var count = 0;
    if (doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                count++;
            }
        }
        var portfolio = doc.parentOrg;
        if (doc.orgType == "FMA-DepartmentOfState") {
            portfolio = doc._id;
        }
        if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
            portfolio = doc.orgType;
        }
        emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
    }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}
function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}';
$obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
    return sum(values);
}';
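
// Example (sketch): queried with group=true, the scoreHas map/reduce pair
// returns one summed count per key, i.e. a row for each has*/...URL field
// plus a "total" row:
//   GET /disclosr-agencies/_design/app/_view/scoreHas?group=true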
$obj->views->fieldNames->map = '
function(doc) {
    for(var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
    // on rereduce the values are already counts, so sum them instead
    if (rereduce) {
        return sum(values);
    }
    return values.length;
}';
// allow safe updates (even if slightly slower due to the extra rev-detection check).
$db->save($obj, true);
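
// Example usage (sketch, assuming the Settee-style client used above exposes
// get_view($designDoc, $viewName, $key, $descending)): look an agency up by
// any of its known names via the byName view:
//   $result = $db->get_view("app", "byName", "Australian Taxation Office");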
?>
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
import parsedatetime as pdt
from time import mktime
from datetime import datetime
import feedparser
import abc
class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL """
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return


class GenericRSSDisclogScraper(GenericDisclogScraper):
    def getDescription(self, entry, doc):
        """ get description from rss entry """
        doc['description'] = entry.summary
        return
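
    # Subclasses can override getDescription when the feed's summary field is
    # not the right source for the description (e.g. to use entry.content).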
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            hash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(hash)
            if doc is None:
                print "saving"
                edate = datetime.fromtimestamp(mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
                       "date": edate, "title": entry.title}
                self.getDescription(entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"
class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta
    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return
    def doScrape(self):
        cal = pdt.Calendar()
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        hash = scrape.mkhash(url + id.string)
                        links = []
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(url, atag['href']))
                        doc = foidocsdb.get(hash)
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + " \n" + string
                        if doc is None:
                            print "saving"
                            dtresult = cal.parseDateText(date.string)
                            if len(dtresult) == 2:
                                (dtdate, dtr) = dtresult
                                print dtdate
                                edate = "" + str(dtdate[0]) + '-' + str(dtdate[1]) + '-' + str(dtdate[2])
                            else:
                                edate = ""
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links,
                                   'docID': id.string, "date": edate, "description": descriptiontxt,
                                   "title": title.string, "notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"
                    elif len(row.find_all('th')) == 5:
                        print "header row"
                    else:
                        print "ERROR number of columns incorrect"
                        print row
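
# The OAIC-style scraper above assumes a five-column disclosure log table,
# roughly of the form:
#   <tr><th>ID</th><th>Date</th><th>Description</th><th>Title</th><th>Notes</th></tr>
#   <tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>
# Subclasses reorder the tuple in getColumns() when an agency lays the
# columns out differently.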
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
#RSS feed not detailed
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getColumns(self, columns):
        (id, date, description, title, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
#RSS feed not detailed
#http://www.doughellmann.com/PyMOTW/abc/ |