Scrape required and chart of complied features views


Former-commit-id: 1f70b43713b7686e9f7a8a0f6a5aced655d53221

<?php

// Export the departments of state as an Alaveteli PublicBodyCategories
// fixture (Ruby source), served to the browser as a file download.
include_once("../include/common.inc.php");

setlocale(LC_CTYPE, 'C');

// NOTE(review): Content-Type says text/csv but the payload is Ruby code
// with an .rb filename — presumably deliberate for download handling.
header('Content-Type: text/csv');
header('Content-Disposition: attachment; filename="public_body_categories_en.rb"');
header('Pragma: no-cache');
header('Expires: 0');

echo 'PublicBodyCategories.add(:en, [' . PHP_EOL;
echo ' "Portfolios",' . PHP_EOL;

$db = $server->get_db('disclosr-agencies');

try {
    // One row per department of state, keyed by name.
    $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
    foreach ($rows as $row) {
        $portfolio = dept_to_portfolio($row->key);
        echo ' [ "' . phrase_to_tag($portfolio) . '","' . $portfolio
            . '","part of the ' . $portfolio . ' portfolio" ],' . PHP_EOL;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo '])';
?>
   
<?php

// Shared schema definitions plus the settee CouchDB client library.
include $basePath . "schemas/schemas.inc.php";

require $basePath . 'couchdb/settee/src/settee.php';
   
/**
 * Create or update the "_design/app" CouchDB design document holding every
 * view used against the disclosr-agencies database.
 *
 * Relies on the module-level $db handle; returns the result of
 * $db->save($obj, true) (safe update with rev-detection enabled).
 */
function createAgencyDesignDoc() {
    global $db;

    $obj = new stdClass();
    $obj->_id = "_design/" . urlencode("app");
    $obj->language = "javascript";

    // Build the views container explicitly instead of relying on PHP's
    // implicit "creating default object from empty value" behaviour
    // (a warning, and dynamic property creation is deprecated in 8.2+).
    $obj->views = new stdClass();
    $addView = function ($name, $map) use ($obj) {
        $obj->views->$name = new stdClass();
        $obj->views->$name->map = $map;
    };

    $addView('all', "function(doc) { emit(doc._id, doc); };");
    $addView('byABN', "function(doc) { emit(doc.abn, doc); };");
    $addView('byCanonicalName', "function(doc) {
        if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
            emit(doc.name, doc);
        }
    };");
    $addView('byDeptStateName', "function(doc) {
        if (doc.orgType == 'FMA-DepartmentOfState') {
            emit(doc.name, doc._id);
        }
    };");
    $addView('parentOrgs', "function(doc) {
        if (doc.parentOrg) {
            emit(doc._id, doc.parentOrg);
        }
    };");
    // Emits the canonical name plus every distinct alternative name.
    $addView('byName', "function(doc) {
        emit(doc.name, doc._id);
        for (name in doc.otherNames) {
            if (doc.otherNames[name] != '' && doc.otherNames[name] != doc.name) {
                emit(doc.otherNames[name], doc._id);
            }
        }
    };");
    $addView('foiEmails', "function(doc) {
        emit(doc._id, doc.foiEmail);
    };");
    $addView('byLastModified', "function(doc) { emit(doc.metadata.lastModified, doc); }");
    $addView('getActive', 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };');
    $addView('getSuspended', 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };');
    // BUG FIX: Date.parse() returns a millisecond timestamp (or NaN), not a
    // Date object, so the previous lastScrape.getTime() call threw inside
    // CouchDB and this view emitted nothing.
    // NOTE(review): a 1000 ms re-scrape window looks far too short — confirm
    // the intended scrape interval.
    $addView('getScrapeRequired', "function(doc) {
        var lastScrape = Date.parse(doc.metadata.lastScraped);
        var today = new Date();
        if (!lastScrape || lastScrape + 1000 < today.getTime()) {
            emit(doc._id, doc);
        }
    };");
    $addView('showNamesABNs', "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };");
    $addView('getConflicts', "function(doc) {
        if (doc._conflicts) {
            emit(null, [doc._rev].concat(doc._conflicts));
        }
    }");
    // Counts 'has*' / '*URL' feature properties per non-suspended agency.
    // http://stackoverflow.com/questions/646628/javascript-startswith
    $addView('scoreHas', 'if(!String.prototype.startsWith){
        String.prototype.startsWith = function (str) {
            return !this.indexOf(str);
        }
    }
    if(!String.prototype.endsWith){
        String.prototype.endsWith = function(suffix) {
            return this.indexOf(suffix, this.length - suffix.length) !== -1;
        };
    }
    function(doc) {
        if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
            for(var propName in doc) {
                if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                    emit(propName, 1);
                }
            }
            emit("total", 1);
        }
    }');
    $addView('score', 'if(!String.prototype.startsWith){
        String.prototype.startsWith = function (str) {
            return !this.indexOf(str);
        }
    }
    function(doc) {
        count = 0;
        if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
            for(var propName in doc) {
                if(typeof(doc[propName]) != "undefined" && propName.startsWith("l")) {
                    count++
                }
            }
            emit(count+doc._id, {id:doc._id, name: doc.name, score:count});
        }
    }');

    // allow safe updates (even if slightly slower due to extra: rev-detection check).
    return $db->save($obj, true);
}
   
// Pick the CouchDB server endpoint based on the machine we are running on.
$hostname = php_uname('n');
if ($hostname == "vanille") {
    $server = new SetteeServer('http://192.168.178.21:5984');
} elseif ($hostname == "KYUUBEY") {
    $server = new SetteeServer('http://192.168.1.148:5984');
} else {
    // Default: local CouchDB instance.
    $server = new SetteeServer('http://127.0.0.1:5984');
}
   
/**
 * Report a settee/CouchDB client error to the page output.
 *
 * @param Exception $e the caught exception whose message is shown.
 */
function setteErrorHandler($e) {
    // Message followed by an HTML line break and a newline.
    echo $e->getMessage(), "<br>", PHP_EOL;
}
# file:a/scrape.py -> file:b/scrape.py
# http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
from BeautifulSoup import BeautifulSoup
import re
   
# http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    # urllib2 handler that converts an HTTP 304 response into a normal
    # response object (with .code set) instead of raising an error.

    def http_error_304(self, req, fp, code, message, headers):
        # Wrap the raw response so callers can inspect .code == 304.
        response = urllib2.addinfourl(fp, headers, req.get_full_url())
        response.code = code
        return response
   
def scrapeAndStore(URL, depth, agency): def scrapeAndStore(URL, depth, agency):
URL = "http://www.hole.fi/jajvirta/weblog/" URL = "http://www.google.com"
req = urllib2.Request(URL) req = urllib2.Request(URL)
  etag = 'y'
  last_modified = 'y'
#if there is a previous version sotred in couchdb, load caching helper tags #if there is a previous version sotred in couchdb, load caching helper tags
if etag: if etag:
req.add_header("If-None-Match", etag) req.add_header("If-None-Match", etag)
if last_modified: if last_modified:
req.add_header("If-Modified-Since", last_modified) req.add_header("If-Modified-Since", last_modified)
opener = urllib2.build_opener(NotModifiedHandler()) opener = urllib2.build_opener(NotModifiedHandler())
url_handle = opener.open(req) url_handle = opener.open(req)
headers = url_handle.info() # the addinfourls have the .info() too headers = url_handle.info() # the addinfourls have the .info() too
etag = headers.getheader("ETag") etag = headers.getheader("ETag")
last_modified = headers.getheader("Last-Modified") last_modified = headers.getheader("Last-Modified")
web_server = headers.getheader("Server") web_server = headers.getheader("Server")
file_size = headers.getheader("Content-Length") file_size = headers.getheader("Content-Length")
mime_type = headers.getheader("Content-Type") mime_type = headers.getheader("Content-Type")
if hasattr(url_handle, 'code') if hasattr(url_handle, 'code'):
if url_handle.code == 304: if url_handle.code == 304:
print "the web page has not been modified" print "the web page has not been modified"
else: else:
#do scraping #do scraping
html = url_handle.read() html = url_handle.read()
# http://www.crummy.com/software/BeautifulSoup/documentation.html # http://www.crummy.com/software/BeautifulSoup/documentation.html
soup = BeautifulSoup(html) soup = BeautifulSoup(html)
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
for link in links: for link in links:
print link['href'] print link['href']
#for each unique link #for each unique link
#if html mimetype #if html mimetype
# go down X levels, # go down X levels,
# diff with last stored attachment, store in document # diff with last stored attachment, store in document
#if not #if not
# remember to save parentURL and title (link text that lead to document) # remember to save parentURL and title (link text that lead to document)
#store as attachment epoch-filename #store as attachment epoch-filename
else: else:
print "error %s in downloading %s", url_handle.code, URL print "error %s in downloading %s", url_handle.code, URL
#record/alert error to error database #record/alert error to error database
   
   
   
   
   
   
   
   
   
  couch = couchdb.Server('http://192.168.1.148:5984/')
   
  # select database
  agencydb = couch['disclosr-agencies']
   
  for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
  agency = agencydb.get(row.id)
  print agency['name']
  scrapeAndStore("A",1,1)