From: maxious
Date: Sun, 11 Mar 2012 10:39:08 +0000
Subject: Testing facebook imports
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=758723a1e47a93641ef639806f8d72c2dfeeb1f7

---

Testing facebook imports

Former-commit-id: 7c36c79b53a3bdd3ff5acaec5e5ceadb4b73aff1
---

--- a/admin/importGov2RegisterRSSFacebookTwitter.php
+++ b/admin/importGov2RegisterRSSFacebookTwitter.php
@@ -1,17 +1,51 @@
 create_db('disclosr-agencies');
-} catch (SetteeRestClientException $e) {
-    setteErrorHandler($e);
+
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$accounts = Array();
+foreach ($rows as $row) {
+    $nametoid[trim($row->key)] = $row->value;
 }
-$db = $server->get_db('disclosr-agencies');
-createAgencyDesignDoc();
-// twitter https://docs.google.com/spreadsheet/fm?id=tsJVd9EYoAjbl014y3qMgWg.03918275400592898296.8568379511161083736&hl=en&fmcmd=5&gid=0
-// RSS https://docs.google.com/spreadsheet/fm?id=tbqjwIDHKHiVYF_glJ93GgA.03918275400592898296.8789688748524615194&authkey=CJDP-uQG&hl=en_GB&fmcmd=5&gid=0
-// facebook https://docs.google.com/spreadsheet/fm?id=tkcqoo9wrgzNWmoANuVhsBw.03918275400592898296.3040387705062056060&authkey=CKzl7r0I&hl=en_GB&fmcmd=5&gid=0
+function extractCSVAccounts($url, $nameField, $accountField, $filter) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    $Data = str_getcsv($request->body, "\n"); //parse the rows
+    $headers = Array();
+    foreach ($Data as $num => $line) {
+        $Row = str_getcsv($line, ",");
+        if ($num == 0) {
+
+        } else if ($num == 1) {
+            $headers = $Row;
+            //print_r($headers);
+        } else {
+            if (isset($Row[array_search($nameField, $headers)])) {
+                $agencyName = $Row[array_search($nameField, $headers)];
+                if (!$filter || $Row[array_search("State", $headers)] == "NAT") {
+                    if (!in_array(trim($agencyName), array_keys($nametoid))) {
+                        echo "$agencyName missing" . PHP_EOL;
+                    } else {
+                        // echo $Row[array_search($nameField, $headers)] . PHP_EOL;
+                    }
+                }
+            } else {
+                //echo "error finding agency" . $line . PHP_EOL;
+            }
+        }
+    }
+}
+
+// http://agimo.govspace.gov.au/page/gov2register/
+// twitter
+//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true);
+// RSS
+// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv
+// facebook
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv", "Agency", "Name");
 /*
  * http://australia.gov.au/news-and-media/media-release-rss-feeds
--- a/couchdb/settee
+++ b/couchdb/settee
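The new extractCSVAccounts() helper downloads a published Google Spreadsheet as CSV and reports each agency name that has no matching record in the CouchDB byName view; when $filter is set it only considers rows whose State column is "NAT". A minimal standalone sketch of that matching step, assuming a plain file_get_contents() fetch rather than the Requests library (the URL, the "Agency" column and the $nametoid contents below are placeholders, not part of the commit):

<?php
// Sketch only, not the committed importer: flag CSV rows whose agency name
// has no CouchDB record. $url and $nametoid are illustrative; the real script
// builds $nametoid from the "byName" view and fetches the sheet via Requests.
$url = "https://example.org/gov2-register.csv";
$nametoid = Array("Department of Example" => "exampleAgencyID");

$lines = str_getcsv(file_get_contents($url), "\n"); // split the body into rows
$headers = str_getcsv($lines[1], ",");              // row 1 carries the column names
foreach (array_slice($lines, 2) as $line) {         // data rows start at row 2
    $row = str_getcsv($line, ",");
    $name = trim($row[array_search("Agency", $headers)]);
    if (!isset($nametoid[$name])) {
        echo "$name missing" . PHP_EOL;              // no matching agency record
    }
}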
    "; foreach ($value as $subkey => $subvalue) { if (isset($schemas['agency']["properties"][$key]['x-property'])) { - echo '
  1. '; - } else { - echo "
  2. "; - } + echo '
  3. '; + } else { + echo "
  4. "; + } echo "$subvalue
  5. "; } echo "
"; @@ -27,11 +27,11 @@ } else { echo ""; } - if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { - echo "view"; - } else { - echo "$value"; - } + if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { + echo "view"; + } else { + echo "$value"; + } } echo ""; } @@ -53,12 +53,12 @@ } else if ($key == "parentOrg") { echo ""; - } else { + } else { echo ""; if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { echo "view"; @@ -69,7 +69,7 @@ } } } - // +// } function addDefaultFields($row) { @@ -78,13 +78,17 @@ foreach ($defaultFields as $defaultField) { if (!isset($row[$defaultField])) { if ($schemas['agency']['properties'][$defaultField]['type'] == "string") { - - $row[$defaultField] = ""; - + $row[$defaultField] = ""; } if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { - $row[$defaultField] = Array(""); + } + } else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { + if (is_array($row[$defaultField])) { + $row[$defaultField][] = ""; + } else { + $value = $row[$defaultField]; + $row[$defaultField] = Array($value); } } } @@ -94,20 +98,28 @@ $db = $server->get_db('disclosr-agencies'); if (isset($_REQUEST['id'])) { - //get an agency record as json/html, search by name/abn/id +//get an agency record as json/html, search by name/abn/id // by name = startkey="Ham"&endkey="Ham\ufff0" // edit? $row = $db->get($_REQUEST['id']); - //print_r($row); +//print_r($row); if (sizeof($_POST) > 0) { - //print_r($_POST); +//print_r($_POST); foreach ($_POST as $postkey => $postvalue) { if ($postvalue == "") { unset($_POST[$postkey]); } - if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") { - unset($_POST[$postkey]); + if (is_array($postvalue)) { + if (count($postvalue) == 1 && $postvalue[0] == "") { + unset($_POST[$postkey]); + } else { + foreach ($_POST[$postkey] as $key => &$value) { + if ($value == "") { + unset($_POST[$postkey][$key]); + } + } + } } } if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) { @@ -124,9 +136,9 @@ if ($mode == "edit") { $row = addDefaultFields(object_to_array($row)); } else { - $row = object_to_array($row); - } - + $row = object_to_array($row); + } + if ($mode == "view") { echo '
'; echo '"; @@ -153,44 +165,44 @@ }; - $value) { - echo displayValue($key, $value, $mode); - } - if ($mode == "view") { - echo "

' . $row['name'] . "

"; - } - if ($mode == "edit") { - echo ''; - } -} else { - - try { - /* $rows = $db->get_view("app", "showNamesABNs")->rows; - //print_r($rows); - foreach ($rows as $row) { - // print_r($row); - echo '
  • ' . - (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) - . '
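The reworked addDefaultFields() above now distinguishes a field that is missing from one that already holds data: an existing array gets an extra empty slot for the edit form, while a bare scalar is wrapped in an array. A minimal sketch of that branch under the same data shapes (normaliseArrayField and the example field are illustrative only, not part of the commit):

<?php
// Sketch only: how the new branch treats an existing value for an
// "array"-typed schema field before the edit form is rendered.
function normaliseArrayField(array $row, $field) {
    if (isset($row[$field])) {
        if (is_array($row[$field])) {
            $row[$field][] = "";                 // already a list: append a blank entry
        } else {
            $row[$field] = Array($row[$field]);  // single value: wrap it in a list
        }
    }
    return $row;
}

print_r(normaliseArrayField(Array("hasTwitter" => "@example"), "hasTwitter"));
// Array ( [hasTwitter] => Array ( [0] => @example ) )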
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -33,6 +33,8 @@
     "appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies, mandated by the Senate"),
     "advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => "Agency advertising and public information projects, mandated by the Senate"),
     "hasRSS" => Array("type" => "array", "required" => true, "x-title" => "Has RSS", "description" => ""),
+    "hasBlog" => Array("type" => "array", "required" => true, "x-title" => "Has Blog", "description" => ""),
+    "hasMobileApp" => Array("type" => "array", "required" => true, "x-title" => "Has Mobile App", "description" => ""),
     "hasMailingList" => Array("type" => "array", "required" => true, "x-title" => "Has Mailing List", "description" => "", "items" => Array("type" => "string")),
     "hasTwitter" => Array("type" => "array", "required" => true, "x-title" => "Has Twitter", "description" => "",
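hasBlog and hasMobileApp follow the same pattern as the existing hasRSS and hasTwitter entries: array-typed, required, with an x-title used as the form label. An agency document exercising the new fields might look like the fragment below (the values are made up for illustration):

<?php
// Illustrative only: an agency record fragment using the two new schema fields.
$agency = Array(
    "name"         => "Example Agency",
    "hasBlog"      => Array("http://blog.example.gov.au/"),
    "hasMobileApp" => Array("http://example.gov.au/app"),
);
echo json_encode($agency) . PHP_EOL;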
--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
     print "Fetching %s" % url
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
         print "Not a valid HTTP url"
-        return (None,None)
+        return (None,None,None)
     doc = docsdb.get(hash)
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
@@ -86,13 +86,14 @@
             print "Uh oh, trying to scrape URL again too soon!"
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            return (doc['mime_type'],last_attachment)
+            return (doc['url'],doc['mime_type'],last_attachment)
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (None,None)
+            return (None,None,None)

     time.sleep(3) # wait 3 seconds to give webserver time to recover

+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
     if doc.has_key('etag'):
         req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
     opener = urllib2.build_opener(NotModifiedHandler())
     try:
         url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
         headers = url_handle.info() # the addinfourls have the .info() too
         doc['etag'] = headers.getheader("ETag")
         doc['last_modified'] = headers.getheader("Last-Modified")
         doc['date'] = headers.getheader("Date")
         doc['page_scraped'] = time.time()
         doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
         doc['powered_by'] = headers.getheader("X-Powered-By")
         doc['file_size'] = headers.getheader("Content-Length")
         content_type = headers.getheader("Content-Type")
@@ -119,13 +122,13 @@
         if hasattr(url_handle, 'code'):
             if url_handle.code == 304:
                 print "the web page has not been modified"
-                return (None,None)
+                return (None,None,None)
             else:
                 content = url_handle.read()
                 docsdb.save(doc)
                 doc = docsdb.get(hash) # need to get a _rev
                 docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
-                return (doc['mime_type'], content)
+                return (doc['url'], doc['mime_type'], content)
                 #store as attachment epoch-filename
     except urllib2.URLError as e:
         error = ""
@@ -136,21 +139,22 @@
         print error
         doc['error'] = error
         docsdb.save(doc)
-        return (None,None)
+        return (None,None,None)

 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-    (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
-    if content != None and depth > 0:
+    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
         if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
             soup = BeautifulSoup(content)
-            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
             for nav in navIDs:
                 print "Removing element", nav['id']
                 nav.extract()
-            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
             for nav in navClasses:
                 print "Removing element", nav['class']
                 nav.extract()
@@ -169,7 +173,10 @@
                     # not http
                     None
                 else:
-                    linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+                    # remove anchors and spaces in urls
+                    link['href'] = link['href'].replace(" ","%20")
+                    link['href'] = re.sub('#.*$','',link['href'])
+                    linkurls.add(urljoin(url,link['href']))
             for linkurl in linkurls:
                 #print linkurl
                 scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
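In scrape.py, fetchURL() now returns the final URL alongside the MIME type and content so that redirects are recorded, and scrapeAndStore() encodes spaces and strips fragment anchors before resolving each link against the page URL. A small standalone sketch of that link normalisation, in Python 2 to match the scraper, using made-up example URLs:

# Sketch only, not part of scrape.py: the href clean-up applied before urljoin.
import re
from urlparse import urljoin  # urllib.parse.urljoin on Python 3

def normalise_link(page_url, href):
    href = href.replace(" ", "%20")   # encode spaces so the path survives urljoin
    href = re.sub('#.*$', '', href)   # drop the anchor so fragments don't create duplicate URLs
    return urljoin(page_url, href)

print normalise_link("http://www.example.gov.au/grants/", "report 2011.html#summary")
# http://www.example.gov.au/grants/report%202011.html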