be permissive with datagov resource file size
[disclosr.git] / admin / importGov2RegisterRSSFacebookTwitter.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
<?php
 
require_once '../include/common.inc.php';
require($basePath . 'lib/phpquery/phpQuery/phpQuery.php');
 
// Shared state for the extract* functions below (they pull these in via `global`).
// $nametoid: trimmed agency name => value emitted by the app/byName view
//            (that value is later passed to $db->get() as the document id).
// $accounts: collected account URLs, filled in by the extract* functions.
$nametoid = Array();
$accounts = Array();

$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
foreach ($rows as $row) {
    // View keys are trimmed so lookups with trimmed spreadsheet names match.
    $agencyName = trim($row->key);
    $nametoid[$agencyName] = $row->value;
}
 
/**
 * Collect agency account URLs from an HTML table and record them in the
 * global $accounts map.
 *
 * Fetches $url with Requests::get(), parses the response body as HTML and
 * walks every <tr>. Within a row, a cell whose class attribute is "s11",
 * "s10" or "s7" supplies the agency name, and a cell whose class is "s6" or
 * "s9" supplies the account URL (the href of a child element if one has a
 * non-empty href, otherwise the cell text).
 * NOTE(review): these class names presumably come from the published
 * spreadsheet's generated markup — confirm they are still current.
 *
 * Rows with both values are stored as
 * $accounts[<id from $nametoid>][$accountType][] = <url>; agencies that are
 * not a key of $nametoid are reported with "<name> missing".
 *
 * @param string $url         Address of the HTML page to fetch.
 * @param string $accountType Key under which the URLs are stored (e.g. "RSS").
 * @return void
 */
function extractHTMLAccounts($url, $accountType) {
    global $accounts, $nametoid;
    $request = Requests::get($url);
    $doc = phpQuery::newDocumentHTML($request->body);
    phpQuery::selectDocument($doc);
    foreach (pq('tr')->elements as $tr) {
        //echo $tr->nodeValue.PHP_EOL;
        $agency = "";
        // Separate name so the $url parameter is not reused as scratch space.
        $accountUrl = "";
        foreach ($tr->childNodes as $td) {
            // childNodes also yields text/comment nodes, which have no
            // getAttribute() method; only element nodes can be table cells.
            if (!($td instanceof DOMElement)) {
                continue;
            }
            $class = $td->getAttribute("class");
            //echo "cccc $class ".$td->nodeValue.PHP_EOL;
            if ($class == "s11" || $class == "s10" || $class == "s7") {
                $agency = $td->nodeValue;
            } else if ($class == "s6" || $class == "s9") {
                // Default to the cell text; a child href overrides it below.
                $accountUrl = $td->nodeValue;
                foreach ($td->childNodes as $a) {
                    // A cell holding plain text has a DOMText child: skip it.
                    if (!($a instanceof DOMElement)) {
                        continue;
                    }
                    $href = $a->getAttribute("href");
                    if ($href != "") {
                        $accountUrl = $href;
                    }
                }
            }
        }
        if ($agency == "" || $accountUrl == "") {
            continue;
        }
        $agencyName = trim($agency);
        // Direct key lookup instead of a loose in_array() scan over array_keys().
        if (!array_key_exists($agencyName, $nametoid)) {
            echo $agencyName . " missing" . PHP_EOL;
            continue;
        }
        //   echo $agency." = ".$accountUrl.PHP_EOL;
        $accounts[$nametoid[$agencyName]][$accountType][] = $accountUrl;
    }
}
 
/**
 * Collect agency accounts from a CSV document and record them in the global
 * $accounts map.
 *
 * Fetches $url with Requests::get() and parses the response body as CSV.
 * The first row is ignored, the second row is treated as the header row and
 * every later row is treated as data. For each data row the agency name is
 * read from the $nameField column and the account from the $accountField
 * column, then stored as
 * $accounts[<id from $nametoid>][$accountType][] = <account>.
 * Agencies that are not a key of $nametoid are reported with "<name> missing".
 * Rows with no (or an empty) account value are skipped, matching
 * extractHTMLAccounts(), which also ignores rows without a URL.
 *
 * @param string $url          Address of the CSV document to fetch.
 * @param string $accountType  Key under which the accounts are stored (e.g. "Twitter").
 * @param string $nameField    Header text of the agency-name column.
 * @param string $accountField Header text of the account column.
 * @param bool   $filter       When truthy, only rows whose "State" column equals "NAT" are used.
 * @return void
 */
function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) {
    global $accounts, $nametoid;
    $request = Requests::get($url);

    // Parse through a stream with fgetcsv() rather than splitting rows with
    // str_getcsv($body, "\n"): that trick strips the quotes from any line
    // whose first field is quoted, so commas inside it split the row wrongly.
    $stream = fopen('php://temp', 'r+');
    if ($stream === false) {
        return;
    }
    fwrite($stream, $request->body);
    rewind($stream);

    $nameIdx = 0;
    $stateIdx = 0;
    $accountIdx = 0;
    $num = -1;
    while (($Row = fgetcsv($stream, 0, ",", '"', "\\")) !== false) {
        $num++;
        if ($num == 0) {
            // First row is not data and not the header row; ignore it.
            continue;
        }
        if ($num == 1) {
            // Header row: resolve the column positions once instead of on
            // every data row.
            // NOTE(review): a header that is not found makes array_search()
            // return false, which is used as column 0 (same as before, now
            // explicit). The existing caller passes "" as $accountField, so
            // this fallback is kept — confirm it is intended.
            $nameIdx = (int) array_search($nameField, $Row);
            $stateIdx = (int) array_search("State", $Row);
            $accountIdx = (int) array_search($accountField, $Row);
            //print_r($Row);
            continue;
        }
        if (!isset($Row[$nameIdx])) {
            //echo "error finding agency" . PHP_EOL;
            continue;
        }
        // isset() guard avoids an undefined-offset notice on short rows.
        if ($filter && !(isset($Row[$stateIdx]) && $Row[$stateIdx] == "NAT")) {
            continue;
        }
        $agencyName = trim($Row[$nameIdx]);
        // Direct key lookup instead of a loose in_array() scan over array_keys().
        if (!array_key_exists($agencyName, $nametoid)) {
            echo $agencyName . " missing" . PHP_EOL;
            continue;
        }
        $account = isset($Row[$accountIdx]) ? $Row[$accountIdx] : "";
        if ($account == "") {
            // Nothing to record; do not store null/empty account entries.
            continue;
        }
        $accounts[$nametoid[$agencyName]][$accountType][] = $account;
    }
    fclose($stream);
}
 
// Source register: http://agimo.govspace.gov.au/page/gov2register/
// twitter
extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Twitter", "Agency/Body/Event", "", true);
// RSS
extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS");
// facebook
extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook");

// Merge the collected accounts into each agency document and save it.
// $accounts is keyed by document id, then by account type.
foreach ($accounts as $id => $accountTypes) {
    echo $id . "<br>" . PHP_EOL;
    $doc = object_to_array($db->get($id));
    // print_r($doc);

    // The inner value variable must not be called $accounts: that would
    // overwrite the map the outer loop is iterating.
    foreach ($accountTypes as $accountType => $newAccounts) {
        $field = "has" . $accountType;
        if (!isset($doc[$field]) || !is_array($doc[$field])) {
            $doc[$field] = Array();
        }
        // array_unique() keeps the original keys, leaving gaps once
        // duplicates are dropped; array_values() reindexes so the field stays
        // a plain list (a gapped array would presumably be serialised as an
        // object rather than an array when saved — verify against $db->save()).
        $doc[$field] = array_values(array_unique(array_merge($doc[$field], $newAccounts)));
    }
    $db->save($doc);
}
?>