From: Maxious <maxious@lambdacomplex.org>
Date: Mon, 23 Jan 2012 04:05:18 +0000
Subject: Handling of minister/secretary names in FOI export
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=c82c38421a63b171d9ce94168ab1f4073cf349de
---
Handling of minister/secretary names in FOI export


Former-commit-id: 4d6a601bd2ae9012300836e1ddf12bc147981e10
---


--- a/alaveteli/exportAgencies.csv.php
+++ b/alaveteli/exportAgencies.csv.php
@@ -1,6 +1,21 @@
 <?php
 
 include_once("../include/common.inc.php");
+
+/** Short name for an agency: ministerial titles lose filler words, spaces and quotes; other names become their capital letters. */
+function shortName($name) {
+    $name = trim($name);
+    $isOffice = strstr($name, "Minister ") || strstr($name, "Treasurer") || strstr($name, "Parliamentary Secretary");
+    if (!$isOffice) {
+        $capitals = Array();
+        preg_match_all('/[A-Z]/', $name, $capitals);
+        return implode("", $capitals[0]);
+    }
+    // str_replace() applies the entries in order, so the long phrases must precede the single words.
+    $fillers = Array("Assisting the Prime Minister on", "Assisting on", " the ", " of ", " for ", " on ", " and ", " to ", ",", " ", "'", "`");
+    return str_replace($fillers, "", $name);
+}
+
 setlocale(LC_CTYPE, 'C');
 
 $headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
@@ -12,15 +27,28 @@
     $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
     //print_r($rows);
     foreach ($rows as $row) {
-        $tag[$row->value] = phrase_to_tag(dept_to_portfolio($row->key));
+        $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
     }
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);
+    die();
+}
+
+$foiEmail = Array(); // lookup built from the foiEmails view: row key => row value
+try {
+    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
+    //print_r($rows);
+    foreach ($rows as $row) {
+        $foiEmail[$row->key] = $row->value;
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+    die(); // stop here, before any CSV header or row has been written
 }
 
 $fp = fopen('php://output', 'w');
 if ($fp && $db) {
-    header('Content-Type: text/csv');
+    header('Content-Type: text/csv; charset=utf-8');
     header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
     header('Pragma: no-cache');
     header('Expires: 0');
@@ -29,39 +57,43 @@
         $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
         //print_r($rows);
         foreach ($agencies as $agency) {
-        //    print_r($agency);
-            $row = Array();
-            $row["#id"] = $agency->value->_id;
-            $row["name"] = $agency->value->name;
-            if (isset($agency->value->foi_email)) {
-            $row["request_email"] = $agency->value->foiEmail;
-            } else {
-                $row["request_email"] = "foi@".GetDomain($agency->value->website);
-                // 
-            }
-            if (isset($agency->value->shortName)) {
-            $row["short_name"] = $agency->value->shortName;
-            } else {
-                $out = Array();
-                preg_match_all('/[A-Z]/', $agency->value->name, $out);
-                $row["short_name"] = implode("",$out[0]);
-            }
-            $row["notes"] = "";
-            $row["publication_scheme"] = $agency->value->infoPublicationSchemeURL;
-            $row["home_page"] = $agency->value->website;
-            if ($agency->value->orgType == "FMA-DepartmentOfState") {
-                $row["tag_string"] = $tag[$agency->value->_id];
-            } else {
-                $row["tag_string"] = $tag[$agency->value->parentOrg];;
-            }
-            
-            fputcsv($fp, array_values($row));
-            
-            if (isset($agency->value->foiBodies)) {
-                foreach ($agency->value->foiBodies as $foiBody) {
-                    $row['name'] = $foiBody;
-                    $row['short_name'] = "";
-                    fputcsv($fp, array_values($row));
+            // print_r($agency);
+            // Skip agencies without an FOI address (unset or the literal string "null") and those with a status set.
+            if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
+                $row = Array();
+                $row["#id"] = $agency->id;
+                $row["name"] = trim($agency->value->name);
+                // NOTE(review): the guard above already requires foiEmail, so the else branch below never runs.
+                if (isset($agency->value->foiEmail)) {
+                    $row["request_email"] = $agency->value->foiEmail;
+                } else {
+                    if ($agency->value->orgType == "FMA-DepartmentOfState") {
+                        $row["request_email"] = "foi@" . GetDomain($agency->value->website);
+                    } else {
+                        $row["request_email"] = $foiEmail[$agency->value->parentOrg];
+                    }
+                }
+                if (isset($agency->value->shortName)) {
+                    $row["short_name"] = $agency->value->shortName;
+                } else {
+                    $row["short_name"] = shortName($agency->value->name);
+                }
+                $row["notes"] = "";
+                $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
+                $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
+                // Departments are tagged via their own id, every other body via its parent's id; a missing
+                // $tag entry now gives an empty tag instead of raising an undefined-index notice.
+                $tagKey = ($agency->value->orgType == "FMA-DepartmentOfState") ? $agency->value->_id : $agency->value->parentOrg;
+                $row["tag_string"] = (isset($tag[$tagKey]) ? $tag[$tagKey] : "") . " " . $agency->value->orgType;
+
+                fputcsv($fp, array_values($row));
+
+                if (isset($agency->value->foiBodies)) {
+                    foreach ($agency->value->foiBodies as $foiBody) {
+                        $row['name'] = iconv("UTF-8", "ASCII//TRANSLIT",$foiBody);
+                        $row["short_name"] = shortName($foiBody);
+                        fputcsv($fp, array_values($row));
+                    }
                 }
             }
         }

--- a/graph.php
+++ b/graph.php
@@ -1,21 +1,51 @@
 <?php
 include_once('include/common.inc.php');
 //include_header();
-?>
-<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js"></script>
-<script src="lib/springy/springy.js"></script>
-<script src="lib/springy/springyui.js"></script>
-<script>
-    var graph = new Graph();
-    var nodes = [];
-<?php
+$format = "html"; // default output; "dot" is the only other value the visible code acts on
+if (isset($_REQUEST['format'])) {
+    $format = $_REQUEST['format']; // taken as-is from the request; add_node()/add_edge() echo nothing for other values
+}
+
+/** Prints one node: a Springy newNode() call for "html" (values escaped via json_encode()), a quoted DOT statement for "dot". */
+function add_node($id, $label) {
+    global $format;
+    if ($format == "html") {
+        echo "nodes[" . json_encode($id) . "] = graph.newNode({label: " . json_encode($label) . "});" . PHP_EOL;
+    } elseif ($format == "dot" && $label != "") {
+        echo '"' . addcslashes($id, '"\\') . '" [label="' . addcslashes($label, '"\\') . '"];' . PHP_EOL;
+    }
+}
+
+/** Prints one edge: a Springy newEdge() call for "html", a DOT edge statement for "dot"; ids and colour are quoted and escaped. */
+function add_edge($from, $to, $color) {
+    global $format;
+    if ($format == "html") {
+        echo "graph.newEdge(nodes[" . json_encode($from) . "], nodes[" . json_encode($to) . "], {color: " . json_encode($color) . "});" . PHP_EOL;
+    } elseif ($format == "dot") {
+        echo '"' . addcslashes($from, '"\\') . '" -> "' . addcslashes($to, '"\\') . '"' . ($color != "" ? ' [color="' . addcslashes($color, '"\\') . '"]' : "") . ";" . PHP_EOL;
+    }
+}
+
+if ($format == "html") { // page preamble: load jQuery and Springy, then open the inline script the node/edge calls are printed into
+    ?>
+    <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js"></script>
+    <script src="lib/springy/springy.js"></script>
+    <script src="lib/springy/springyui.js"></script>
+    <script>
+        var graph = new Graph();
+        var nodes = [];
+    <?php
+}
+if ($format == "dot") { // open the DOT digraph; its closing brace is echoed near the end of this file
+    echo 'digraph g {'. PHP_EOL;
+}
 $db = $server->get_db('disclosr-agencies');
-echo "nodes[\"fedg\"] = graph.newNode({label: \"Federal Government - Commonwealth of Australia\"});" . PHP_EOL;
+ add_node("fedg","Federal Government - Commonwealth of Australia");
 try {
     $rows = $db->get_view("app", "byCanonicalName", null, true)->rows;
-    //print_r($rows);
+//print_r($rows);
     foreach ($rows as $row) {
-        echo "nodes[\"{$row->id}\"] = graph.newNode({label: \"{$row->key}\"});" . PHP_EOL;
+        add_node($row->id, $row->key);
     }
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);
@@ -23,9 +53,9 @@
 
 try {
     $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
-    //print_r($rows);
+//print_r($rows);
     foreach ($rows as $row) {
-        echo "graph.newEdge(nodes[\"fedg\"], nodes['{$row->value}'], {color: '#00A0B0'});" . PHP_EOL;
+        add_edge("fedg", $row->value, 'yellow');
     }
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);
@@ -33,25 +63,30 @@
 
 try {
     $rows = $db->get_view("app", "parentOrgs", null, true)->rows;
-    //   print_r($rows);
+//   print_r($rows);
     foreach ($rows as $row) {
-        echo "graph.newEdge(nodes[\"{$row->key}\"], nodes['{$row->value}'], {color: '#FFA0B0'});" . PHP_EOL;
+        add_edge($row->key, $row->value, 'blue');
     }
 } catch (SetteeRestClientException $e) {
     setteErrorHandler($e);
 }
-?>
-    window.onload = function() {
-        $(document).ready(function() {
-            var springy = $('#springydemo').springy({
-                graph: graph
+if ($format == "html") {
+    ?>
+        window.onload = function() {
+            $(document).ready(function() {
+                var springy = $('#springydemo').springy({
+                    graph: graph
+                });
             });
-        });
-    };
-</script>
+        };
+    </script>
 
-<canvas id="springydemo" width="1260" height="680" />
-<?php
+    <canvas id="springydemo" width="1260" height="680" />
+    <?php
+}
+if ($format == "dot") {
+    echo "}";
+}
 //include_footer();
 ?>
 

--- a/include/common.inc.php
+++ b/include/common.inc.php
@@ -62,7 +62,3 @@
      }
 }
 
-?>
-
-
-

--- a/include/couchdb.inc.php
+++ b/include/couchdb.inc.php
@@ -13,7 +13,7 @@
     $obj->views->byABN->map = "function(doc) {   emit(doc.abn, doc); };";
         $obj->views->byCanonicalName->map = "function(doc) {  
             if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
-        emit(doc.name, doc._id); 
+        emit(doc.name, doc); 
         }
 };";
         $obj->views->byDeptStateName->map = "function(doc) {  
@@ -34,6 +34,11 @@
 }
         }
 };";
+    // foiEmails view: one row per document, keyed by _id, with the document's foiEmail field as the value.
+   $obj->views->foiEmails->map = "function(doc) {  
+        emit(doc._id, doc.foiEmail);
+};";
+    
     $obj->views->byLastModified->map = "function(doc) {   emit(doc.metadata.lastModified, doc); }";
     $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") {  emit(doc._id, doc); } };';
     $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") {  emit(doc._id, doc); } };';
@@ -81,5 +86,3 @@
 function setteErrorHandler($e) {
     echo $e->getMessage() . "<br>" . PHP_EOL;
 }
-?>
-

--- a/include/template.inc.php
+++ b/include/template.inc.php
@@ -74,5 +74,3 @@
     </html>
 
 <?php }
-?>
-

--- /dev/null
+++ b/scrape.py
@@ -1,1 +1,65 @@
+#http://packages.python.org/CouchDB/client.html
+import couchdb
+import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
 
+# NOTE(review): the CouchDB address is hard-coded to a private-network host; adjust it per deployment
+couch = couchdb.Server('http://192.168.1.148:5984/')
+# select database
+agencydb = couch['disclosr-agencies']
+
+for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+    agency = agencydb.get(row.id)
+    print agency['agencyName'] # NOTE(review): the PHP views in this patch read doc.name; confirm this key exists
+
+class NotModifiedHandler(urllib2.BaseHandler):
+    """Lets a 304 reply come back as a normal response; see http://diveintopython.org/http_web_services/etags.html"""
+    def http_error_304(self, req, fp, code, message, headers):
+        response = urllib2.addinfourl(fp, headers, req.get_full_url())
+        response.code = code
+        return response
+
+def scrapeAndStore(URL, depth, agency):
+    """Conditionally GET URL and print the href of each link; depth and agency are not used yet."""
+    req = urllib2.Request(URL)
+
+    #TODO: load the caching helper tags of a previous version stored in couchdb; none are known yet
+    etag = None
+    last_modified = None
+    if etag:
+        req.add_header("If-None-Match", etag)
+    if last_modified:
+        req.add_header("If-Modified-Since", last_modified)
+
+    opener = urllib2.build_opener(NotModifiedHandler())
+    url_handle = opener.open(req)
+    headers = url_handle.info() # the addinfourls have the .info() too
+    etag = headers.getheader("ETag")
+    last_modified = headers.getheader("Last-Modified")
+    web_server = headers.getheader("Server")
+    file_size = headers.getheader("Content-Length")
+    mime_type = headers.getheader("Content-Type")
+
+    status = getattr(url_handle, 'code', None)
+    if status is None:
+        print "error %s in downloading %s" % (status, URL)
+        #record/alert error to error database
+        return
+    if status == 304:
+        print "the web page has not been modified"
+        return
+    #do scraping
+    html = url_handle.read()
+    # http://www.crummy.com/software/BeautifulSoup/documentation.html
+    soup = BeautifulSoup(html)
+    # href=True skips anchors without an href, which would raise KeyError on link['href']
+    for link in soup.findAll('a', href=True): # soup.findAll('a', id=re.compile("^p-"))
+        print link['href']
+        #for each unique link
+        #if html mimetype
+        # go down X levels,
+        # diff with last stored attachment, store in document
+        #if not
+        #   remember to save parentURL and title (link text that lead to document)
+    #store as attachment epoch-filename

--- a/unimplemented/exportAgencies.csv.php
+++ /dev/null
@@ -1,65 +1,1 @@
-<?php
 
-include_once("./lib/common.inc.php");
-setlocale(LC_CTYPE, 'C');
-// source: http://stackoverflow.com/questions/81934/easy-way-to-export-a-sql-table-without-access-to-the-server-or-phpmyadmin#81951
-
-$unspsc = Array();
-$unspscresult = $conn->prepare('select * from "UNSPSCcategories" where "UNSPSC"::text like \'%00000\';');
-$unspscresult->execute();
-foreach ($unspscresult->fetchAll() as $row) {
-    $unspsc[$row['UNSPSC']] = $row['Title'];
-}
-
-$query = $conn->prepare('
-SELECT "CNID",contractnotice."agencyName",agency_nametoabn.abn as "agencyABN",
-EXTRACT(EPOCH FROM "publishDate") as "publishDate",
-EXTRACT(EPOCH FROM "contractStart") as "contractStart",
-EXTRACT(EPOCH FROM "contractEnd") as "contractEnd",
-value,description,category,
-"supplierName",(case when "supplierABN" != 0 THEN "supplierABN"::text ELSE "supplierName" END) as supplierID,
-(\'https://www.tenders.gov.au/?event=public.advancedsearch.keyword&keyword=CN\'::text || "CNID"::text) as sourceURL 
-FROM contractnotice join agency_nametoabn on contractnotice."agencyName"=agency_nametoabn."agencyName"  
-where "childCN" is null'
-        , array(PDO::ATTR_CURSOR => PDO::FETCH_ORI_NEXT));
-$query->execute();
-$errors = $conn->errorInfo();
-if ($errors[2] != "") {
-    die("Export terminated, db error" . print_r($errors, true));
-}
-
-$num_fields = $query->columnCount();
-$headers = Array();
-for ($i = 0; $i < $num_fields; $i++) { // for each column in query, make a CSV header
-    $meta = $query->getColumnMeta($i);
-    $headers[] = $meta['name'];
-}
-$fp = fopen('php://output', 'w');
-if ($fp && $query) {
-    header('Content-Type: text/csv');
-    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
-    header('Pragma: no-cache');
-    header('Expires: 0');
-    fputcsv($fp, $headers);
-    while ($row = $query->fetch(PDO::FETCH_NUM, PDO::FETCH_ORI_NEXT)) {
-        foreach ($row as $key => &$colvalue) {
-
-            $colvalue = preg_replace('/[^[:print:]]/', '', utf8_encode($colvalue));
-            if ($headers[$key] == "publishDate" || $headers[$key] == "contractStart"
-                    || $headers[$key] == "contractEnd") {
-                $colvalue = date("Y-m-d", $colvalue);
-            }
-           /* if ($headers[$key] == "CNID") {
-                $colvalue = str_replace("A","", $colvalue);
-}*/
-            if ($headers[$key] == "cat1" || $headers[$key] == "cat2"
-                    || $headers[$key] == "cat3") {
-                $colvalue = $unspsc[$colvalue];
-            }
-        }
-        fputcsv($fp, array_values($row));
-    }
-    die;
-}
-?>
-

--- a/unimplemented/scrape.py
+++ /dev/null
@@ -1,64 +1,1 @@
-#http://packages.python.org/CouchDB/client.html
-import couchdb
-import urllib2
-from BeautifulSoup import BeautifulSoup
-import re
 
-couch = couchdb.Server() # Assuming localhost:5984
-# If your CouchDB server is running elsewhere, set it up like this:
-# couch = couchdb.Server('http://example.com:5984/')
-
-# select database
-agencydb = couch['disclosr-agencies']
-
-for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-    agency = agencydb.get(row.id)
-    print agency['agencyName']
-
-#http://diveintopython.org/http_web_services/etags.html
-class NotModifiedHandler(urllib2.BaseHandler):  
-    def http_error_304(self, req, fp, code, message, headers):
-        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
-        addinfourl.code = code
-        return addinfourl
-
-def scrapeAndStore(URL, depth, agency):
-    URL = "http://www.hole.fi/jajvirta/weblog/"
-    req = urllib2.Request(URL)
-    
-    #if there is a previous version sotred in couchdb, load caching helper tags
-    if etag:
-        req.add_header("If-None-Match", etag)
-    if last_modified:
-        req.add_header("If-Modified-Since", last_modified)
-     
-    opener = urllib2.build_opener(NotModifiedHandler())
-    url_handle = opener.open(req)
-    headers = url_handle.info() # the addinfourls have the .info() too
-    etag = headers.getheader("ETag")
-    last_modified = headers.getheader("Last-Modified") 
-    web_server = headers.getheader("Server") 
-    file_size = headers.getheader("Content-Length") 
-    mime_type = headers.getheader("Content-Type") 
-     
-    if hasattr(url_handle, 'code') and url_handle.code == 304:
-        print "the web page has not been modified"
-    else:
-        print "error %s in downloading %s", url_handle.code, URL
-        #record/alert error to error database
-    
-    #do scraping
-    html = ?
-    # http://www.crummy.com/software/BeautifulSoup/documentation.html
-    soup = BeautifulSoup(html)
-links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
-for link in links:
-    print link['href']
-    #for each unique link
-    #if html mimetype
-    # go down X levels,
-    # diff with last stored attachment, store in document
-    #if not
-    #   remember to save parentURL and title (link text that lead to document)
-    
-    #store as attachment epoch-filename