Beginning of socmed import
[disclosr.git] / admin / importGov2RegisterRSSFacebookTwitter.php
blob:a/admin/importGov2RegisterRSSFacebookTwitter.php -> blob:b/admin/importGov2RegisterRSSFacebookTwitter.php
--- a/admin/importGov2RegisterRSSFacebookTwitter.php
+++ b/admin/importGov2RegisterRSSFacebookTwitter.php
@@ -1,6 +1,7 @@
 <?php
 
 require_once '../include/common.inc.php';
+require($basePath . 'lib/phpquery/phpQuery/phpQuery.php');
 
 $db = $server->get_db('disclosr-agencies');
 $rows = $db->get_view("app", "byName")->rows;
@@ -10,13 +11,66 @@
     $nametoid[trim($row->key)] = $row->value;
 }
 
-function extractCSVAccounts($url, $nameField, $accountField, $filter) {
+/**
+ * Collect account URLs from a published spreadsheet fetched as HTML.
+ *
+ * Every <tr> is scanned for an agency-name cell and a link cell; the two are
+ * told apart only by the CSS class on the cell (s11/s10/s7 vs. s6/s9). When a
+ * row yields both and the trimmed agency name is a key of $nametoid, the URL is
+ * appended to $accounts[<agency id>][$accountType]. Names that are not in
+ * $nametoid are echoed as "<name> missing".
+ *
+ * @param string $url         Address of the HTML export to download.
+ * @param string $accountType Key the URLs are filed under, e.g. "RSS".
+ */
+function extractHTMLAccounts($url, $accountType) {
+    global $accounts, $nametoid;
+    $agencyClasses = array("s11", "s10", "s7");
+    $linkClasses = array("s6", "s9");
+
+    $request = Requests::get($url);
+    $doc = phpQuery::newDocumentHTML($request->body);
+    phpQuery::selectDocument($doc);
+    foreach (pq('tr')->elements as $tr) {
+        $agency = "";
+        $link = "";
+        foreach ($tr->childNodes as $td) {
+            // Text and comment nodes between cells have no getAttribute().
+            if (!($td instanceof DOMElement)) {
+                continue;
+            }
+            $class = $td->getAttribute("class");
+            if (in_array($class, $agencyClasses, true)) {
+                $agency = $td->nodeValue;
+            } else if (in_array($class, $linkClasses, true)) {
+                // Cell text is the fallback; an href inside the cell wins.
+                $link = $td->nodeValue;
+                foreach ($td->childNodes as $a) {
+                    if ($a instanceof DOMElement && $a->getAttribute("href") != "") {
+                        $link = $a->getAttribute("href");
+                    }
+                }
+            }
+        }
+        if ($agency == "" || $link == "") {
+            continue;
+        }
+        $name = trim($agency);
+        if (!array_key_exists($name, $nametoid)) {
+            echo $name . " missing" . PHP_EOL;
+        } else {
+            $accounts[$nametoid[$name]][$accountType][] = $link;
+        }
+    }
+}
+
+function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) {
     global $accounts, $nametoid;
     $request = Requests::get($url);
     $Data = str_getcsv($request->body, "\n"); //parse the rows 
     $headers = Array();
     foreach ($Data as $num => $line) {
-        $Row = str_getcsv($line, ",");
+        $Row = str_getcsv($line, ",", '"');
         if ($num == 0) {
             
         } else if ($num == 1) {
@@ -27,9 +63,10 @@
                 $agencyName = $Row[array_search($nameField, $headers)];
                 if (!$filter || $Row[array_search("State", $headers)] == "NAT") {
                     if (!in_array(trim($agencyName), array_keys($nametoid))) {
-                        echo "$agencyName missing" . PHP_EOL;
+                        echo trim($agencyName) . " missing" . PHP_EOL;
                     } else {
                         // echo $Row[array_search($nameField, $headers)] . PHP_EOL;
+                        $accounts[$nametoid[trim($agencyName)]][$accountType][] = $Row[array_search($accountField, $headers)];
                     }
                 }
             } else {
@@ -41,21 +78,27 @@
 
 // http://agimo.govspace.gov.au/page/gov2register/
 // twitter
-extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true);
+// NOTE(review): the account column name is passed as "" here; extractCSVAccounts looks it up with array_search(), so confirm which column should supply the Twitter account.
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Twitter", "Agency/Body/Event", "", true);
 // RSS
-// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv
+extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS");
 // facebook 
-//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv","","");
+extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook");
+// Merge the collected URLs into each agency document under "has" . <account type>, then save it.
+foreach ($accounts as $id => $accountTypes) {
+    echo $id . "<br>" . PHP_EOL;
+    $doc = object_to_array($db->get($id));
 
-/*
- * http://australia.gov.au/news-and-media/media-release-rss-feeds
- * http://australia.gov.au/news-and-media/social-media/blogs
- * http://australia.gov.au/news-and-media/social-media/twitter
- * http://australia.gov.au/news-and-media/social-media/facebook
- * http://australia.gov.au/news-and-media/social-media/youtube
- * http://australia.gov.au/news-and-media/social-media/flickr
- * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx
- * http://australia.gov.au/news-and-media/social-media/podcasts
- */
+    foreach ($accountTypes as $accountType => $urls) {
+        $field = "has" . $accountType;
+        if (!isset($doc[$field]) || !is_array($doc[$field])) {
+            $doc[$field] = Array();
+        }
+        // array_unique() keeps the surviving elements' original keys, which leaves gaps once a
+        // duplicate is dropped; array_values() reindexes so the field stays a 0..n-1 list.
+        $doc[$field] = array_values(array_unique(array_merge($doc[$field], $urls)));
+    }
+    $db->save($doc);
+}
 ?>