From: Maxious <maxious@lambdacomplex.org>
Date: Mon, 12 Mar 2012 04:52:08 +0000
Subject: Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=607d96fff60ea5674237dc394f422ec6143a65f1
---
Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr

Conflicts:
	admin/importGov2RegisterRSSFacebookTwitter.php

Former-commit-id: 3fa2d8260298f3ae808000c29ba85d0e80e1b507
---
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,4 +16,7 @@
 [submodule "javascripts/flotr2"]
 	path = javascripts/flotr2
 	url = https://github.com/HumbleSoftware/Flotr2.git
+[submodule "lib/phpquery"]
+	path = lib/phpquery
+	url = https://github.com/TobiaszCudnik/phpquery.git
--- /dev/null
+++ b/admin/importAustraliaGovAuGov2.php
@@ -1,1 +1,61 @@
+<?php
+
+include_once "../include/common.inc.php";
+
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$accounts = Array();
+foreach ($rows as $row) {
+    $nametoid[trim($row->key)] = $row->value;
+}
+
+function extractCSVAccounts($url, $nameField, $accountField, $filter) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    $Data = str_getcsv($request->body, "\n"); //parse the rows
+    $headers = Array();
+    foreach ($Data as $num => $line) {
+        $Row = str_getcsv($line, ",");
+        if ($num == 0) {
+
+        } else if ($num == 1) {
+            $headers = $Row;
+            //print_r($headers);
+        } else {
+            if (isset($Row[array_search($nameField, $headers)])) {
+                $agencyName = $Row[array_search($nameField, $headers)];
+                if (!$filter || $Row[array_search("State", $headers)] == "NAT") {
+                    if (!in_array(trim($agencyName), array_keys($nametoid))) {
+                        echo "$agencyName missing" . PHP_EOL;
+                    } else {
+                        // echo $Row[array_search($nameField, $headers)] . PHP_EOL;
+                    }
+                }
+            } else {
+                //echo "error finding agency" . $line . PHP_EOL;
+            }
+        }
+    }
+}
+
+// http://agimo.govspace.gov.au/page/gov2register/
+// twitter
+//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true);
+// RSS
+// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv
+// facebook
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv", "Agency", "Name");
+
+/*
+ * http://australia.gov.au/news-and-media/media-release-rss-feeds
+ * http://australia.gov.au/news-and-media/social-media/blogs
+ * http://australia.gov.au/news-and-media/social-media/twitter
+ * http://australia.gov.au/news-and-media/social-media/facebook
+ * http://australia.gov.au/news-and-media/social-media/youtube
+ * http://australia.gov.au/news-and-media/social-media/flickr
+ * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx
+ * http://australia.gov.au/news-and-media/social-media/podcasts
+ */
+?>
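The importer above treats row 0 of the published CSV as a spreadsheet title, row 1 as the header row, and every later row as data addressed by header position, keeping only national ("NAT") entries when the filter flag is set. The same walk, sketched in Python against a made-up two-record sample (the data and module choices here are illustrative, not part of the commit):

    import csv
    from StringIO import StringIO

    # hypothetical body in the published-spreadsheet shape: title, headers, then data
    body = ("Gov 2.0 register\n"
            "Agency,Name,State\n"
            "Australian Taxation Office,atogovau,NAT\n"
            "Some State Agency,anotherhandle,VIC\n")

    headers = []
    for num, row in enumerate(csv.reader(StringIO(body))):
        if num == 0:
            continue              # skip the spreadsheet title row
        elif num == 1:
            headers = row         # ['Agency', 'Name', 'State']
        else:
            record = dict(zip(headers, row))
            if record.get("State") == "NAT":  # the $filter case
                print record["Agency"], record["Name"]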
--- a/admin/importGov2RegisterRSSFacebookTwitter.php
+++ b/admin/importGov2RegisterRSSFacebookTwitter.php
@@ -1,6 +1,7 @@
 <?php
 include_once "../include/common.inc.php";
+include_once "../lib/phpquery/phpQuery/phpQuery.php";
 $db = $server->get_db('disclosr-agencies');
 $rows = $db->get_view("app", "byName")->rows;
@@ -9,14 +10,50 @@
 foreach ($rows as $row) {
     $nametoid[trim($row->key)] = $row->value;
 }
+function extractHTMLAccounts($url, $accountType) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    $doc = phpQuery::newDocumentHTML($request->body);
+    phpQuery::selectDocument($doc);
+    foreach (pq('tr')->elements as $tr) {
+        //echo $tr->nodeValue.PHP_EOL;
+        $agency = "";
+        $url = "";
+        foreach ($tr->childNodes as $td) {
+            $class = $td->getAttribute("class");
+            //echo "cccc $class ".$td->nodeValue.PHP_EOL;
+            if ($class == "s11" || $class == "s10" || $class == "s7") {
+                $agency = $td->nodeValue;
+            } else if ($class == "s6" || $class == "s9") {
+                $url = $td->nodeValue;
+                foreach ($td->childNodes as $a) {
+                    $href = $a->getAttribute("href");
+                    if ($href != "") {
+                        $url = $href;
+                    }
+                }
+            }
+        }
+        if ($agency != "" && $url != "") {
+            if (!in_array(trim($agency), array_keys($nametoid))) {
+                echo trim($agency)." missing" . PHP_EOL;
+            } else {
+                // echo $agency." = ".$url.PHP_EOL;
+                $accounts[$nametoid[trim($agency)]][$accountType][] = $url;
+            }
+
+        }
+    }
+
+}
-function extractCSVAccounts($url, $nameField, $accountField, $filter) {
+function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) {
     global $accounts, $nametoid;
     $request = Requests::get($url);
     $Data = str_getcsv($request->body, "\n"); //parse the rows
     $headers = Array();
     foreach ($Data as $num => $line) {
-        $Row = str_getcsv($line, ",");
+        $Row = str_getcsv($line, ",", '"');
         if ($num == 0) {

         } else if ($num == 1) {
@@ -27,9 +64,10 @@
             $agencyName = $Row[array_search($nameField, $headers)];
             if (!$filter || $Row[array_search("State", $headers)] == "NAT") {
                 if (!in_array(trim($agencyName), array_keys($nametoid))) {
-                    echo "$agencyName missing" . PHP_EOL;
+                    echo trim($agencyName)." missing" . PHP_EOL;
                 } else {
                     // echo $Row[array_search($nameField, $headers)] . PHP_EOL;
+                    $accounts[$nametoid[trim($agencyName)]][$accountType][] = $Row[array_search($accountField, $headers)];
                 }
             }
         } else {
@@ -41,21 +79,11 @@

 // http://agimo.govspace.gov.au/page/gov2register/
 // twitter
-extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true);
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Twitter", "Agency/Body/Event", "", true);
 // RSS
-// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv
+extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS");
 // facebook
-//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv","","");
+extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook");

-/*
- * http://australia.gov.au/news-and-media/media-release-rss-feeds
- * http://australia.gov.au/news-and-media/social-media/blogs
- * http://australia.gov.au/news-and-media/social-media/twitter
- * http://australia.gov.au/news-and-media/social-media/facebook
- * http://australia.gov.au/news-and-media/social-media/youtube
- * http://australia.gov.au/news-and-media/social-media/flickr
- * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx
- * http://australia.gov.au/news-and-media/social-media/podcasts
- */
 ?>
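extractHTMLAccounts pairs each table row's name cell (spreadsheet cell classes s7/s10/s11) with its link cell (s6/s9), preferring a child anchor's href over the cell text, then files the URL under the agency's CouchDB id. A minimal sketch of the same row walk using BeautifulSoup (which scrape.py below already uses), against a hypothetical fragment of the published sheet:

    from BeautifulSoup import BeautifulSoup

    # stand-in for the published-HTML spreadsheet fetched in extractHTMLAccounts
    html = '''<table>
    <tr><td class="s7">Australian Taxation Office</td>
    <td class="s6"><a href="http://www.facebook.com/atogovau">atogovau</a></td></tr>
    </table>'''

    soup = BeautifulSoup(html)
    for tr in soup.findAll('tr'):
        agency, url = "", ""
        for td in tr.findAll('td'):
            cls = td.get('class', '')
            text = ''.join(td.findAll(text=True)).strip()
            if cls in ('s11', 's10', 's7'):   # name cells
                agency = text
            elif cls in ('s6', 's9'):         # link cells
                url = text
                a = td.find('a')
                if a is not None and a.get('href'):
                    url = a['href']           # prefer the real href over the cell text
        if agency and url:
            print agency, "=>", url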
--- /dev/null
+++ b/lib/phpquery
--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
     print "Fetching %s" % url
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
         print "Not a valid HTTP url"
-        return (None,None)
+        return (None,None,None)
     doc = docsdb.get(hash)
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
@@ -86,13 +86,14 @@
             print "Uh oh, trying to scrape URL again too soon!"
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            return (doc['mime_type'],last_attachment)
+            return (doc['url'],doc['mime_type'],last_attachment)
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (None,None)
+            return (None,None,None)

     time.sleep(3) # wait 3 seconds to give webserver time to recover

+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
     if doc.has_key('etag'):
         req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
     opener = urllib2.build_opener(NotModifiedHandler())
     try:
         url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
         headers = url_handle.info() # the addinfourls have the .info() too
         doc['etag'] = headers.getheader("ETag")
         doc['last_modified'] = headers.getheader("Last-Modified")
         doc['date'] = headers.getheader("Date")
         doc['page_scraped'] = time.time()
         doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
         doc['powered_by'] = headers.getheader("X-Powered-By")
         doc['file_size'] = headers.getheader("Content-Length")
         content_type = headers.getheader("Content-Type")
@@ -119,13 +122,13 @@
         if hasattr(url_handle, 'code'):
             if url_handle.code == 304:
                 print "the web page has not been modified"
-                return (None,None)
+                return (None,None,None)
             else:
                 content = url_handle.read()
                 docsdb.save(doc)
                 doc = docsdb.get(hash) # need to get a _rev
                 docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
-                return (doc['mime_type'], content)
+                return (doc['url'], doc['mime_type'], content) #store as attachment epoch-filename
     except urllib2.URLError as e:
         error = ""
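fetchURL now returns a (url, mime_type, content) triple instead of a pair, so callers also learn the final URL after any redirect (doc['url'] is updated from url_handle.geturl()). A minimal caller sketch; the server address, database name, and field/agency arguments are hypothetical:

    import couchdb
    from scrape import fetchURL   # assumes scrape.py is importable as a module

    couch = couchdb.Server('http://127.0.0.1:5984/')
    docsdb = couch['disclosr-documents']

    (url, mime_type, content) = fetchURL(docsdb, "http://www.example.gov.au/", "someField", "someAgencyID")
    if content is None:
        # a None triple: bad scheme, fetched too recently, 304 Not Modified, or fetch error
        print "nothing fetched"
    else:
        # url may differ from the one requested if a redirect was followed
        print "%s (%s): %d bytes" % (url, mime_type, len(content))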
@@ -136,21 +139,22 @@
         print error
         doc['error'] = error
         docsdb.save(doc)
-        return (None,None)
+        return (None,None,None)

 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-    (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
-    if content != None and depth > 0:
+    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
         if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
             soup = BeautifulSoup(content)
-            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
             for nav in navIDs:
                 print "Removing element", nav['id']
                 nav.extract()
-            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
             for nav in navClasses:
                 print "Removing element", nav['class']
                 nav.extract()
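One quirk in the hunk above: badURLs is introduced as a list, but the guard still compares url against the literal string, so the list itself is never consulted. A membership test would make the skip-list extensible; a sketch of that presumably intended form (not what the commit ships):

    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]

    def shouldFollow(url, content, depth):
        # recurse only into fetched pages, within depth, that are not known spider traps
        return content != None and depth > 0 and url not in badURLs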
@@ -169,7 +173,10 @@
                     # not http
                     None
                 else:
-                    linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+                    # remove anchors and spaces in urls
+                    link['href'] = link['href'].replace(" ","%20")
+                    link['href'] = re.sub('#.*$','',link['href'])
+                    linkurls.add(urljoin(url,link['href']))
             for linkurl in linkurls:
                 #print linkurl
                 scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
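The href cleanup in the final hunk encodes spaces, strips #fragment anchors, and only then resolves against the page URL, so equivalent links collapse to a single entry in the linkurls set rather than being queued once per anchor. The same normalisation as a standalone helper (the name and example values are illustrative):

    import re
    from urlparse import urljoin

    def cleanLinkURL(pageurl, href):
        # encode spaces and drop any #fragment so equivalent urls dedupe in the set
        href = href.replace(" ", "%20")
        href = re.sub('#.*$', '', href)
        return urljoin(pageurl, href)

    # cleanLinkURL("http://example.gov.au/a/b", "c d.pdf#top")
    #   -> 'http://example.gov.au/a/c%20d.pdf'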