Scrape required and chart of complied features views
Scrape required and chart of complied features views


Former-commit-id: 1f70b43713b7686e9f7a8a0f6a5aced655d53221

--- a/alaveteli/exportCategories.rb.php
+++ b/alaveteli/exportCategories.rb.php
@@ -1,19 +1,20 @@
 <?php
+
 include_once("../include/common.inc.php");
 setlocale(LC_CTYPE, 'C');
- header('Content-Type: text/csv');
-    header('Content-Disposition: attachment; filename="public_body_categories_en.rb"');
-    header('Pragma: no-cache');
-    header('Expires: 0');
-echo 'PublicBodyCategories.add(:en, ['.PHP_EOL;
-echo '    "Portfolios",'.PHP_EOL;
+header('Content-Type: text/csv');
+header('Content-Disposition: attachment; filename="public_body_categories_en.rb"');
+header('Pragma: no-cache');
+header('Expires: 0');
+echo 'PublicBodyCategories.add(:en, [' . PHP_EOL;
+echo '    "Portfolios",' . PHP_EOL;
 $db = $server->get_db('disclosr-agencies');
 
 try {
     $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
     //print_r($rows);
     foreach ($rows as $row) {
-        echo '        [ "'.phrase_to_tag(dept_to_portfolio($row->key)).'","'. dept_to_portfolio($row->key).'","part of the '.dept_to_portfolio($row->key).' portfolio" ],'.PHP_EOL;
+        echo '        [ "' . phrase_to_tag(dept_to_portfolio($row->key)) . '","' . dept_to_portfolio($row->key) . '","part of the ' . dept_to_portfolio($row->key) . ' portfolio" ],' . PHP_EOL;
     }
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);

--- a/include/couchdb.inc.php
+++ b/include/couchdb.inc.php
@@ -1,8 +1,8 @@
 <?php
 
-include $basePath."schemas/schemas.inc.php";
+include $basePath . "schemas/schemas.inc.php";
 
-require ($basePath.'couchdb/settee/src/settee.php');
+require ($basePath . 'couchdb/settee/src/settee.php');
 
 function createAgencyDesignDoc() {
     global $db;
@@ -11,17 +11,17 @@
     $obj->language = "javascript";
     $obj->views->all->map = "function(doc) {   emit(doc._id, doc); };";
     $obj->views->byABN->map = "function(doc) {   emit(doc.abn, doc); };";
-        $obj->views->byCanonicalName->map = "function(doc) {  
+    $obj->views->byCanonicalName->map = "function(doc) {  
             if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
         emit(doc.name, doc); 
         }
 };";
-        $obj->views->byDeptStateName->map = "function(doc) {  
+    $obj->views->byDeptStateName->map = "function(doc) {  
             if (doc.orgType == 'FMA-DepartmentOfState') {
         emit(doc.name, doc._id); 
         }
 };";
-        $obj->views->parentOrgs->map = "function(doc) {
+    $obj->views->parentOrgs->map = "function(doc) {
             if (doc.parentOrg) {
         emit(doc._id, doc.parentOrg);
         }
@@ -34,15 +34,25 @@
 }
         }
 };";
-    
-   $obj->views->foiEmails->map = "function(doc) {  
+
+    $obj->views->foiEmails->map = "function(doc) {  
         emit(doc._id, doc.foiEmail);
 };";
-    
+
     $obj->views->byLastModified->map = "function(doc) {   emit(doc.metadata.lastModified, doc); }";
     $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") {  emit(doc._id, doc); } };';
     $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") {  emit(doc._id, doc); } };';
-    $obj->views->getScrapeRequired->map = "function(doc) {   emit(doc.abn, doc); };";
+    $obj->views->getScrapeRequired->map = "function(doc) {   
+// Date.parse yields epoch milliseconds (NaN when missing or unparsable), not a Date object.
+var lastScrape = doc.metadata ? Date.parse(doc.metadata.lastScraped) : NaN;
+// NOTE(review): 1000 ms threshold kept from the original; 'today' is the moment the view is indexed, not queried - confirm intent.
+var today = new Date();
+
+if (!lastScrape || lastScrape + 1000 < today.getTime()) {
+ emit(doc._id, doc); 
+}
+
+};";
     $obj->views->showNamesABNs->map = "function(doc) {   emit(doc._id, {name: doc.name, abn: doc.abn}); };";
     $obj->views->getConflicts->map = "function(doc) {
   if (doc._conflicts) {
@@ -50,6 +60,26 @@
   }
 }";
     // http://stackoverflow.com/questions/646628/javascript-startswith
+    $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
+    String.prototype.startsWith = function (str) {
+        return !this.indexOf(str);
+    }
+}
+if(!String.prototype.endsWith){
+	String.prototype.endsWith = function(suffix) {
+	    return this.indexOf(suffix, this.length - suffix.length) !== -1;
+	};
+}
+function(doc) {
+if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
+for(var propName in doc) {
+      if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
+  	emit(propName, 1);
+	}
+}
+  emit("total", 1);
+  }
+}';
     $obj->views->score->map = 'if(!String.prototype.startsWith){
     String.prototype.startsWith = function (str) {
         return !this.indexOf(str);
@@ -72,17 +102,17 @@
     return $db->save($obj, true);
 }
 
+if (php_uname('n') == "vanille") {
 
-if( php_uname('n') == "vanille") {
+    $server = new SetteeServer('http://192.168.178.21:5984');
+} else
+if (php_uname('n') == "KYUUBEY") {
 
-$server = new SetteeServer('http://192.168.178.21:5984');
-} else 
-    if( php_uname('n') == "KYUUBEY") {
-
-$server = new SetteeServer('http://192.168.1.148:5984');
+    $server = new SetteeServer('http://192.168.1.148:5984');
 } else {
     $server = new SetteeServer('http://127.0.0.1:5984');
 }
+
 function setteErrorHandler($e) {
     echo $e->getMessage() . "<br>" . PHP_EOL;
 }

file:a/scrape.py -> file:b/scrape.py
--- a/scrape.py
+++ b/scrape.py
@@ -3,15 +3,6 @@
 import urllib2
 from BeautifulSoup import BeautifulSoup
 import re
-
-couch = couchdb.Server('http://192.168.1.148:5984/')
-
-# select database
-agencydb = couch['disclosr-agencies']
-
-for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-    agency = agencydb.get(row.id)
-    print agency['agencyName']
 
 #http://diveintopython.org/http_web_services/etags.html
 class NotModifiedHandler(urllib2.BaseHandler):  
@@ -21,9 +12,10 @@
         return addinfourl
 
 def scrapeAndStore(URL, depth, agency):
-    URL = "http://www.hole.fi/jajvirta/weblog/"
+    URL = "http://www.google.com"
     req = urllib2.Request(URL)
-    
+    etag = 'y'
+    last_modified = 'y'
     #if there is a previous version sotred in couchdb, load caching helper tags
     if etag:
         req.add_header("If-None-Match", etag)
@@ -39,7 +31,7 @@
     file_size = headers.getheader("Content-Length") 
     mime_type = headers.getheader("Content-Type") 
      
-    if hasattr(url_handle, 'code') 
+    if hasattr(url_handle, 'code'): 
         if url_handle.code == 304:
             print "the web page has not been modified"
         else: 
@@ -63,3 +55,22 @@
         #record/alert error to error database
     
     
+
+
+
+
+
+
+
+
+
+couch = couchdb.Server('http://192.168.1.148:5984/')
+
+# select database
+agencydb = couch['disclosr-agencies']
+
+for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+    agency = agencydb.get(row.id)
+    print agency['name']
+scrapeAndStore("A",1,1)
+