From: Maxious Date: Fri, 23 Mar 2012 10:49:40 +0000 Subject: Upgrade graph.php to use sigma.js and output gexf X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=0ab527607f0af1ce5ff412b33b71f822935415f0 --- Upgrade graph.php to use sigma.js and output gexf Former-commit-id: f7184884c78601da7d51fa0016334c0f329d43cd --- --- a/.gitmodules +++ b/.gitmodules @@ -4,9 +4,6 @@ [submodule "couchdb/settee"] path = couchdb/settee url = https://github.com/inadarei/settee.git -[submodule "lib/springy"] - path = lib/springy - url = https://github.com/dhotson/springy.git [submodule "lib/php-diff"] path = lib/php-diff url = https://github.com/chrisboulton/php-diff.git @@ -16,4 +13,10 @@ [submodule "javascripts/flotr2"] path = javascripts/flotr2 url = https://github.com/HumbleSoftware/Flotr2.git +[submodule "lib/phpquery"] + path = lib/phpquery + url = https://github.com/TobiaszCudnik/phpquery.git +[submodule "javascripts/sigma"] + path = javascripts/sigma + url = https://github.com/jacomyal/sigma.js.git --- a/admin/exportEmployees.csv.php +++ b/admin/exportEmployees.csv.php @@ -22,6 +22,7 @@ if (isset($row->value->statistics->employees)) { $headers = array_unique(array_merge($headers, array_keys(object_to_array($row->value->statistics->employees)))); + } } } catch (SetteeRestClientException $e) { --- /dev/null +++ b/admin/exportScore.csv.php @@ -1,1 +1,73 @@ +get_db('disclosr-agencies'); +$format = "csv"; +//$format = "json"; +if (isset($_REQUEST['format'])) $format = $_REQUEST['format']; + +setlocale(LC_CTYPE, 'C'); + + $headers = Array(); + +$fp = fopen('php://output', 'w'); +if ($fp && $db) { + if ($format == "csv") { + header('Content-Type: text/csv; charset=utf-8'); + header('Content-Disposition: attachment; filename="export.score.' . date("c") . '.csv"'); + } + header('Pragma: no-cache'); + header('Expires: 0'); + + try { + $agencies = $db->get_view("score", "score", null, true)->rows; + //print_r($agencies); + $first = true; + if ($format == "json") { + echo '"data" : ['.PHP_EOL; + + } + foreach ($agencies as $agency) { + $agencyArray = object_to_array($agency->value); + if ($first) { + $headers = array_keys($agencyArray); +if ($format == "csv") { + fputcsv($fp, $headers); + } else if ($format == "json") { + echo '{ + "labels" : ["' . implode('","', $headers) . '"],'.PHP_EOL; + } + } + $row = Array(); + + foreach ($headers as $i => $fieldName) { + if (isset($agencyArray[$fieldName])) { + $row[] = $agencyArray[$fieldName]; + } else { + $row[] = ''; + } + } + if ($format == "csv") { + fputcsv($fp, array_values($row)); + } else if ($format == "json") { + if (!$first) echo ","; + echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "'.$agency->value->name.'", "lines" : { "show" : true }, "points" : { "show" : true }}'.PHP_EOL; + + } + $first = false; + } + + if ($format == "json") { + echo '] + }'.PHP_EOL; + + } + } catch (SetteeRestClientException $e) { + setteErrorHandler($e); + } + + die; +} +?> + --- /dev/null +++ b/admin/importAustraliaGovAuGov2.php @@ -1,1 +1,61 @@ +get_db('disclosr-agencies'); +$rows = $db->get_view("app", "byName")->rows; +$nametoid = Array(); +$accounts = Array(); +foreach ($rows as $row) { + $nametoid[trim($row->key)] = $row->value; +} + +function extractCSVAccounts($url, $nameField, $accountField, $filter) { + global $accounts, $nametoid; + $request = Requests::get($url); + $Data = str_getcsv($request->body, "\n"); //parse the rows + $headers = Array(); + foreach ($Data as $num => $line) { + $Row = str_getcsv($line, ","); + if ($num == 0) { + + } else if ($num == 1) { + $headers = $Row; + //print_r($headers); + } else { + if (isset($Row[array_search($nameField, $headers)])) { + $agencyName = $Row[array_search($nameField, $headers)]; + if (!$filter || $Row[array_search("State", $headers)] == "NAT") { + if (!in_array(trim($agencyName), array_keys($nametoid))) { + echo "$agencyName missing" . PHP_EOL; + } else { + // echo $Row[array_search($nameField, $headers)] . PHP_EOL; + } + } + } else { + //echo "error finding agency" . $line . PHP_EOL; + } + } + } +} + +// http://agimo.govspace.gov.au/page/gov2register/ +// twitter +//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true); +// RSS +// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv +// facebook +extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv","Agency","Name"); + +/* + * http://australia.gov.au/news-and-media/media-release-rss-feeds + * http://australia.gov.au/news-and-media/social-media/blogs + * http://australia.gov.au/news-and-media/social-media/twitter + * http://australia.gov.au/news-and-media/social-media/facebook + * http://australia.gov.au/news-and-media/social-media/youtube + * http://australia.gov.au/news-and-media/social-media/flickr + * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx + * http://australia.gov.au/news-and-media/social-media/podcasts + */ +?> + --- a/admin/importGov2RegisterRSSFacebookTwitter.php +++ b/admin/importGov2RegisterRSSFacebookTwitter.php @@ -1,27 +1,89 @@ create_db('disclosr-agencies'); -} catch (SetteeRestClientException $e) { - setteErrorHandler($e); +require($basePath.'lib/phpquery/phpQuery/phpQuery.php'); + +$db = $server->get_db('disclosr-agencies'); +$rows = $db->get_view("app", "byName")->rows; +$nametoid = Array(); +$accounts = Array(); +foreach ($rows as $row) { + $nametoid[trim($row->key)] = $row->value; } -$db = $server->get_db('disclosr-agencies'); -createAgencyDesignDoc(); +function extractHTMLAccounts($url, $accountType) { + global $accounts, $nametoid; + $request = Requests::get($url); + $doc = phpQuery::newDocumentHTML($request->body); + phpQuery::selectDocument($doc); + foreach (pq('tr')->elements as $tr) { + //echo $tr->nodeValue.PHP_EOL; + $agency = ""; + $url = ""; + foreach ($tr->childNodes as $td) { + $class = $td->getAttribute("class"); + //echo "cccc $class ".$td->nodeValue.PHP_EOL; + if ($class == "s11" || $class == "s10" || $class == "s7") { + $agency = $td->nodeValue; + } else if ($class == "s6" || $class == "s9"){ + $url = $td->nodeValue; + foreach($td->childNodes as $a) { + $href = $a->getAttribute("href"); + if ($href != "") { + $url = $href; + } + } + } + } + if ($agency != "" && $url != "") { + if (!in_array(trim($agency), array_keys($nametoid))) { + echo trim($agency)." missing" . PHP_EOL; + } else { + // echo $agency." = ".$url.PHP_EOL; + $accounts[$nametoid[trim($agency)]][$accountType][] = $url; + } + + } + } + +} -// twitter https://docs.google.com/spreadsheet/fm?id=tsJVd9EYoAjbl014y3qMgWg.03918275400592898296.8568379511161083736&hl=en&fmcmd=5&gid=0 -// RSS https://docs.google.com/spreadsheet/fm?id=tbqjwIDHKHiVYF_glJ93GgA.03918275400592898296.8789688748524615194&authkey=CJDP-uQG&hl=en_GB&fmcmd=5&gid=0 -// facebook https://docs.google.com/spreadsheet/fm?id=tkcqoo9wrgzNWmoANuVhsBw.03918275400592898296.3040387705062056060&authkey=CKzl7r0I&hl=en_GB&fmcmd=5&gid=0 +function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) { + global $accounts, $nametoid; + $request = Requests::get($url); + $Data = str_getcsv($request->body, "\n"); //parse the rows + $headers = Array(); + foreach ($Data as $num => $line) { + $Row = str_getcsv($line, ",",'"'); + if ($num == 0) { + + } else if ($num == 1) { + $headers = $Row; + //print_r($headers); + } else { + if (isset($Row[array_search($nameField, $headers)])) { + $agencyName = $Row[array_search($nameField, $headers)]; + if (!$filter || $Row[array_search("State", $headers)] == "NAT") { + if (!in_array(trim($agencyName), array_keys($nametoid))) { + echo trim($agencyName)." missing" . PHP_EOL; + } else { + // echo $Row[array_search($nameField, $headers)] . PHP_EOL; + $accounts[$nametoid[trim($agencyName)]][$accountType][] = $Row[array_search($accountField, $headers)]; + } + } + } else { + //echo "error finding agency" . $line . PHP_EOL; + } + } + } +} -/* - * http://australia.gov.au/news-and-media/media-release-rss-feeds - * http://australia.gov.au/news-and-media/social-media/blogs - * http://australia.gov.au/news-and-media/social-media/twitter - * http://australia.gov.au/news-and-media/social-media/facebook - * http://australia.gov.au/news-and-media/social-media/youtube - * http://australia.gov.au/news-and-media/social-media/flickr - * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx - * http://australia.gov.au/news-and-media/social-media/podcasts - */ +// http://agimo.govspace.gov.au/page/gov2register/ +// twitter +extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Twitter", "Agency/Body/Event", "", true); +// RSS +extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS"); +// facebook +extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook"); + ?> --- a/getAgency.php +++ b/getAgency.php @@ -14,10 +14,10 @@ echo "
    "; foreach ($value as $subkey => $subvalue) { if (isset($schemas['agency']["properties"][$key]['x-property'])) { - echo '
  1. '; - } else { - echo "
  2. "; - } + echo '
  3. '; + } else { + echo "
  4. "; + } echo "$subvalue
  5. "; } echo "
"; @@ -27,11 +27,11 @@ } else { echo ""; } - if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { - echo "view"; - } else { - echo "$value"; - } + if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { + echo "view"; + } else { + echo "$value"; + } } echo ""; } @@ -53,12 +53,12 @@ } else if ($key == "parentOrg") { echo ""; - } else { + } else { echo ""; if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { echo "view"; @@ -69,7 +69,7 @@ } } } - // +// } function addDefaultFields($row) { @@ -78,13 +78,22 @@ foreach ($defaultFields as $defaultField) { if (!isset($row[$defaultField])) { if ($schemas['agency']['properties'][$defaultField]['type'] == "string") { - - $row[$defaultField] = ""; - + $row[$defaultField] = ""; } if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { - $row[$defaultField] = Array(""); + } + } else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { + if (is_array($row[$defaultField])) { + $row[$defaultField][] = ""; + $row[$defaultField][] = ""; + $row[$defaultField][] = ""; + } else { + $value = $row[$defaultField]; + $row[$defaultField] = Array($value); + $row[$defaultField][] = ""; + $row[$defaultField][] = ""; + } } } @@ -94,39 +103,49 @@ $db = $server->get_db('disclosr-agencies'); if (isset($_REQUEST['id'])) { - //get an agency record as json/html, search by name/abn/id +//get an agency record as json/html, search by name/abn/id // by name = startkey="Ham"&endkey="Ham\ufff0" // edit? - $row = $db->get($_REQUEST['id']); - //print_r($row); + $obj = $db->get($_REQUEST['id']); +//print_r($row); if (sizeof($_POST) > 0) { - //print_r($_POST); +//print_r($_POST); foreach ($_POST as $postkey => $postvalue) { if ($postvalue == "") { unset($_POST[$postkey]); } - if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") { - unset($_POST[$postkey]); + if (is_array($postvalue)) { + if (count($postvalue) == 1 && $postvalue[0] == "") { + unset($_POST[$postkey]); + } else { + foreach ($_POST[$postkey] as $key => &$value) { + if ($value == "") { + unset($_POST[$postkey][$key]); + } + } + } } } if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) { echo "Edited version was latest version, continue saving"; $newdoc = $_POST; $newdoc['metadata']['lastModified'] = time(); - $row = $db->save($newdoc); + $obj = $db->save($newdoc); } else { echo "ALERT doc revised by someone else while editing. Document not saved."; } } $mode = "edit"; + $rowArray = object_to_array($obj); +ksort($rowArray); if ($mode == "edit") { - $row = addDefaultFields(object_to_array($row)); + $row = addDefaultFields($rowArray); } else { - $row = object_to_array($row); - } - + $row = $rowArray; + } + if ($mode == "view") { echo '
'; echo '"; @@ -153,44 +172,44 @@ }; - $value) { - echo displayValue($key, $value, $mode); - } - if ($mode == "view") { - echo "

' . $row['name'] . "

"; - } - if ($mode == "edit") { - echo ''; - } -} else { - - try { - /* $rows = $db->get_view("app", "showNamesABNs")->rows; - //print_r($rows); - foreach ($rows as $row) { - // print_r($row); - echo '
  • ' . - (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) - . '
  • '; - } */ - $rows = $db->get_view("app", "byName")->rows; - //print_r($rows); -echo '"; - } catch (SetteeRestClientException $e) { - setteErrorHandler($e); - } -} -include_footer(); -?> - + $value) { + echo displayValue($key, $value, $mode); + } + if ($mode == "view") { + echo ""; + } + if ($mode == "edit") { + echo ''; + } + } else { + + try { + /* $rows = $db->get_view("app", "showNamesABNs")->rows; + //print_r($rows); + foreach ($rows as $row) { + // print_r($row); + echo '
  • ' . + (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) + . '
  • '; + } */ + $rows = $db->get_view("app", "byCanonicalName")->rows; + //print_r($rows); + echo '"; + } catch (SetteeRestClientException $e) { + setteErrorHandler($e); + } + } + include_footer(); + ?> + --- a/graph.php +++ b/graph.php @@ -6,36 +6,46 @@ $format = $_REQUEST['format']; } -function add_node($id, $label) { +function add_node($id, $label, $parent="") { global $format; if ($format == "html") { - echo "nodes[\"$id\"] = graph.newNode({label: \"$label\"});" . PHP_EOL; + // echo "nodes[\"$id\"] = graph.newNode({label: \"$label\"});" . PHP_EOL; } if ($format == "dot" && $label != "") { echo "$id [label=\"$label\"];". PHP_EOL; } + if ($format == "gexf") { + echo "":">") + ."" + ."". PHP_EOL; + } } function add_edge($from, $to, $color) { global $format; if ($format == "html") { - echo "graph.newEdge(nodes[\"$from\"], nodes['$to'], {color: '$color'});" . PHP_EOL; + // echo "graph.newEdge(nodes[\"$from\"], nodes['$to'], {color: '$color'});" . PHP_EOL; } if ($format == "dot") { echo "$from -> $to ".($color != ""? "[color=$color]":"").";". PHP_EOL; } + if ($format == "gexf") { + echo "". PHP_EOL; + } +} +if ($format == "gexf") { + //header('Content-Type: text/xml'); + header('Content-Type: application/gexf+xml'); +echo ' + + + Gexf.net + A hello world! file + + + '. PHP_EOL; } -if ($format == "html") { - ?> - - - - +
    + + + + + +
    +
    '. PHP_EOL; +} //include_footer(); ?> --- a/include/couchdb.inc.php +++ b/include/couchdb.inc.php @@ -92,25 +92,29 @@ } }"; // http://stackoverflow.com/questions/646628/javascript-startswith - $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ +$obj->views->score->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { return !this.indexOf(str); } } -if(!String.prototype.endsWith){ - String.prototype.endsWith = function(suffix) { -     return this.indexOf(suffix, this.length - suffix.length) !== -1; - }; -} + function(doc) { -if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { -for(var propName in doc) { - if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { - emit(propName, 1); - } -} - emit("total", 1); - } + count = 0; + if (doc["status"] != "suspended") { + for(var propName in doc) { + if(typeof(doc[propName]) != "undefined" && doc[propName] != "") { + count++; + } + } + portfolio = doc.parentOrg; + if (doc.orgType == "FMA-DepartmentOfState") { + portfolio = doc._id; + } + if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") { + portfolio = doc.orgType; + } + emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); + } }'; $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { --- /dev/null +++ b/javascripts/sigma --- /dev/null +++ b/javascripts/sigma.min.js @@ -1,1 +1,63 @@ +/* sigmajs.org - an open-source light-weight JavaScript graph drawing library - Version: 0.1 - Author: Alexis Jacomy - License: MIT */ +var sigma={tools:{},classes:{},instances:{}}; +(function(){if(!Array.prototype.some)Array.prototype.some=function(i,n){var g=this.length;if("function"!=typeof i)throw new TypeError;for(var j=0;j";a+="

    LOCAL :

    ";for(b in d.p.localProbes)a+="

    "+b+" : "+d.p.localProbes[b]()+"

    ";d.p.dom.innerHTML= +a;return d}sigma.classes.Cascade.call(this);var d=this;this.instance=b;this.monitoring=!1;this.p={fps:40,dom:h,globalProbes:{"Time (ms)":sigma.chronos.getExecutionTime,Queue:sigma.chronos.getQueuedTasksCount,Tasks:sigma.chronos.getTasksCount,FPS:sigma.chronos.getFPS},localProbes:{"Nodes count":function(){return d.instance.graph.nodes.length},"Edges count":function(){return d.instance.graph.edges.length}}};this.activate=function(){if(!d.monitoring)d.monitoring=window.setInterval(e,1E3/d.p.fps);return d}; +this.desactivate=function(){if(d.monitoring)window.clearInterval(d.monitoring),d.monitoring=null,d.p.dom.innerHTML="";return d}}function j(b){function h(b){if(a.p.mouseEnabled&&(e(a.mouseX,a.mouseY,a.ratio*(0<(void 0!=b.wheelDelta&&b.wheelDelta||void 0!=b.detail&&-b.detail)?a.p.zoomMultiply:1/a.p.zoomMultiply)),a.p.blockScroll))b.preventDefault?b.preventDefault():b.returnValue=!1}function e(b,c,e){if(!a.isMouseDown&&(window.clearInterval(a.interpolationID),n=void 0!=e,i=a.stageX,j=b,k=a.stageY,l= +c,f=e||a.ratio,f=Math.min(Math.max(f,a.p.minRatio),a.p.maxRatio),u=a.p.directZooming?1-(n?a.p.zoomDelta:a.p.dragDelta):0,a.ratio!=f||a.stageX!=j||a.stageY!=l))d(),a.interpolationID=window.setInterval(d,50),a.dispatch("startinterpolate")}function d(){u+=n?a.p.zoomDelta:a.p.dragDelta;u=Math.min(u,1);var b=sigma.easing.quadratic.easeout(u),c=a.ratio;a.ratio=c*(1-b)+f*b;n?(a.stageX=j+(a.stageX-j)*a.ratio/c,a.stageY=l+(a.stageY-l)*a.ratio/c):(a.stageX=i*(1-b)+j*b,a.stageY=k*(1-b)+l*b);a.dispatch("interpolate"); +if(1<=u)window.clearInterval(a.interpolationID),b=a.ratio,n?(a.ratio=f,a.stageX=j+(a.stageX-j)*a.ratio/b,a.stageY=l+(a.stageY-l)*a.ratio/b):(a.stageX=j,a.stageY=l),a.dispatch("stopinterpolate")}sigma.classes.Cascade.call(this);sigma.classes.EventDispatcher.call(this);var a=this;this.p={minRatio:1,maxRatio:32,marginRatio:1,zoomDelta:0.1,dragDelta:0.3,zoomMultiply:2,directZooming:!1,blockScroll:!0,inertia:1.1,mouseEnabled:!0};var g=0,c=0,i=0,k=0,f=1,j=0,l=0,s=0,q=0,z=0,m=0,u=0,n=!1;this.stageY=this.stageX= +0;this.ratio=1;this.mouseY=this.mouseX=0;this.isMouseDown=!1;b.addEventListener("DOMMouseScroll",h,!0);b.addEventListener("mousewheel",h,!0);b.addEventListener("mousemove",function(b){a.mouseX=void 0!=b.offsetX&&b.offsetX||void 0!=b.layerX&&b.layerX||void 0!=b.clientX&&b.clientX;a.mouseY=void 0!=b.offsetY&&b.offsetY||void 0!=b.layerY&&b.layerY||void 0!=b.clientY&&b.clientY;if(a.isMouseDown){var d=a.mouseX-g+i,f=a.mouseY-c+k;if(d!=a.stageX||f!=a.stageY)q=s,m=z,s=d,z=f,a.stageX=d,a.stageY=f,a.dispatch("drag")}a.dispatch("move"); +b.preventDefault?b.preventDefault():b.returnValue=!1},!0);b.addEventListener("mousedown",function(b){if(a.p.mouseEnabled)a.isMouseDown=!0,a.dispatch("mousedown"),i=a.stageX,k=a.stageY,g=a.mouseX,c=a.mouseY,q=s=a.stageX,m=z=a.stageY,a.dispatch("startdrag"),b.preventDefault?b.preventDefault():b.returnValue=!1},!0);document.addEventListener("mouseup",function(b){if(a.p.mouseEnabled&&a.isMouseDown)a.isMouseDown=!1,a.dispatch("mouseup"),(i!=a.stageX||k!=a.stageY)&&e(a.stageX+a.p.inertia*(a.stageX-q),a.stageY+ +a.p.inertia*(a.stageY-m)),b.preventDefault?b.preventDefault():b.returnValue=!1},!0);this.checkBorders=function(){return a};this.interpolate=e}function m(b,h,e,d,a,g,c){function i(a){var b=d,c="fixed"==f.p.labelSize?f.p.defaultLabelSize:f.p.labelSizeRatio*a.displaySize;b.font=(f.p.hoverFontStyle||f.p.fontStyle||"")+" "+c+"px "+(f.p.hoverFont||f.p.font||"");b.fillStyle="node"==f.p.labelHoverBGColor?a.color||f.p.defaultNodeColor:f.p.defaultHoverLabelBGColor;b.beginPath();if(f.p.labelHoverShadow)b.shadowOffsetX= +0,b.shadowOffsetY=0,b.shadowBlur=4,b.shadowColor=f.p.labelHoverShadowColor;sigma.tools.drawRoundRect(b,Math.round(a.displayX-c/2-2),Math.round(a.displayY-c/2-2),Math.round(b.measureText(a.label).width+1.5*a.displaySize+c/2+4),Math.round(c+4),Math.round(c/2+2),"left");b.closePath();b.fill();b.shadowOffsetX=0;b.shadowOffsetY=0;b.shadowBlur=0;b.beginPath();b.fillStyle="node"==f.p.nodeBorderColor?a.color||f.p.defaultNodeColor:f.p.defaultNodeBorderColor;b.arc(Math.round(a.displayX),Math.round(a.displayY), +a.displaySize+f.p.borderSize,0,2*Math.PI,!0);b.closePath();b.fill();b.beginPath();b.fillStyle="node"==f.p.nodeHoverColor?a.color||f.p.defaultNodeColor:f.p.defaultNodeHoverColor;b.arc(Math.round(a.displayX),Math.round(a.displayY),a.displaySize,0,2*Math.PI,!0);b.closePath();b.fill();b.fillStyle="node"==f.p.labelHoverColor?a.color||f.p.defaultNodeColor:f.p.defaultLabelHoverColor;b.fillText(a.label,Math.round(a.displayX+1.5*a.displaySize),Math.round(a.displayY+c/2-3));return f}function k(a){if(isNaN(a.x)|| +isNaN(a.y))throw Error("A node's coordinate is not a number (id: "+a.id+")");return!a.hidden&&a.displayX+a.displaySize>-j/3&&a.displayX-a.displaySize<4*j/3&&a.displayY+a.displaySize>-l/3&&a.displayY-a.displaySize<4*l/3}sigma.classes.Cascade.call(this);var f=this;this.p={labelColor:"default",defaultLabelColor:"#000",labelHoverBGColor:"default",defaultHoverLabelBGColor:"#fff",labelHoverShadow:!0,labelHoverShadowColor:"#000",labelHoverColor:"default",defaultLabelHoverColor:"#000",labelActiveBGColor:"default", +defaultActiveLabelBGColor:"#fff",labelActiveShadow:!0,labelActiveShadowColor:"#000",labelActiveColor:"default",defaultLabelActiveColor:"#000",labelSize:"fixed",defaultLabelSize:12,labelSizeRatio:2,labelThreshold:6,font:"Arial",hoverFont:"",activeFont:"",fontStyle:"",hoverFontStyle:"",activeFontStyle:"",edgeColor:"source",defaultEdgeColor:"#aaa",defaultEdgeType:"line",defaultNodeColor:"#aaa",nodeHoverColor:"node",defaultNodeHoverColor:"#fff",nodeActiveColor:"node",defaultNodeActiveColor:"#fff",borderSize:0, +nodeBorderColor:"node",defaultNodeBorderColor:"#fff",edgesSpeed:200,nodesSpeed:200,labelsSpeed:200};var j=g,l=c;this.currentLabelIndex=this.currentNodeIndex=this.currentEdgeIndex=0;this.task_drawLabel=function(){for(var b=a.nodes.length,c=0;c++=f.p.labelThreshold){var k="fixed"==f.p.labelSize?f.p.defaultLabelSize:f.p.labelSizeRatio*d.displaySize;h.font= +f.p.fontStyle+k+"px "+f.p.font;h.fillStyle="node"==f.p.labelColor?d.color||f.p.defaultNodeColor:f.p.defaultLabelColor;h.fillText(d.label,Math.round(d.displayX+1.5*d.displaySize),Math.round(d.displayY+k/2-3))}}else f.currentLabelIndex++;return f.currentLabelIndex(b*=2)?0.5*b*b:-0.5*(--b*(b-2)-1)};sigma.tools.drawRoundRect=function(b,h,e,d,a,g,c){var g=g?g:0,i=c?c:[],i="string"==typeof i?i.split(" "):i,c=g&&(0<=i.indexOf("topleft")||0<=i.indexOf("top")||0<=i.indexOf("left")),j=g&&(0<=i.indexOf("topright")||0<=i.indexOf("top")||0<=i.indexOf("right")),f=g&&(0<=i.indexOf("bottomleft")||0<=i.indexOf("bottom")||0<=i.indexOf("left")),i=g&&(0<=i.indexOf("bottomright")|| +0<=i.indexOf("bottom")||0<=i.indexOf("right"));b.moveTo(h,e+g);c?b.arcTo(h,e,h+g,e,g):b.lineTo(h,e);j?(b.lineTo(h+d-g,e),b.arcTo(h+d,e,h+d,e+g,g)):b.lineTo(h+d,e);i?(b.lineTo(h+d,e+a-g),b.arcTo(h+d,e+a,h+d-g,e+a,g)):b.lineTo(h+d,e+a);f?(b.lineTo(h+g,e+a),b.arcTo(h,e+a,h,e+a-g,g)):b.lineTo(h,e+a);b.lineTo(h,e+g)};sigma.tools.getRGB=function(b,g){var b=b.toString(),e={r:0,g:0,b:0};if(3<=b.length&&"#"==b.charAt(0)){var d=b.length-1;6==d?e={r:parseInt(b.charAt(1)+b.charAt(2),16),g:parseInt(b.charAt(3)+ +b.charAt(4),16),b:parseInt(b.charAt(5)+b.charAt(5),16)}:3==d&&(e={r:parseInt(b.charAt(1)+b.charAt(1),16),g:parseInt(b.charAt(2)+b.charAt(2),16),b:parseInt(b.charAt(3)+b.charAt(3),16)})}g&&(e=[e.r,e.g,e.b]);return e};sigma.tools.rgbToHex=function(b,g,e){return sigma.tools.toHex(b)+sigma.tools.toHex(g)+sigma.tools.toHex(e)};sigma.tools.toHex=function(b){b=parseInt(b,10);if(isNaN(b))return"00";b=Math.max(0,Math.min(b,255));return"0123456789ABCDEF".charAt((b-b%16)/16)+"0123456789ABCDEF".charAt(b%16)}; +sigma.publicPrototype=p.prototype})(); --- /dev/null +++ b/lib/phpquery --- a/lib/springy +++ /dev/null --- a/schemas/agency.json.php +++ b/schemas/agency.json.php @@ -17,6 +17,7 @@ "parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"), "website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"), "abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"), + "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"), "contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, mandated by the Senate" ), "budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"), "grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded", @@ -33,6 +34,8 @@ "appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , mandated by the Senate"), "advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, mandated by the Senate "), "hasRSS" => Array("type" => "array", "required" => true, "x-title" => "Has RSS", "description" => ""), + "hasBlog" => Array("type" => "array", "required" => true, "x-title" => "Has Blog", "description" => ""), + "hasMobileApp" => Array("type" => "array", "required" => true, "x-title" => "Has Mobile App", "description" => ""), "hasMailingList" => Array("type" => "array", "required" => true, "x-title" => "Has Mailing List", "description" => "", "items" => Array("type" => "string")), "hasTwitter" => Array("type" => "array", "required" => true, "x-title" => "Has Twitter", "description" => "", @@ -47,6 +50,8 @@ "items" => Array("type" => "string")), "hasRestrictiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright", "items" => Array("type" => "string")), + "hasPermissiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Permissive Licence", "description" => "Has any page licenced under terms more permissive than Crown Copyright but not clear CCBY", + "items" => Array("type" => "string")), "hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration", "items" => Array("type" => "string")), ), --- a/scrape.py +++ b/scrape.py @@ -77,7 +77,7 @@ print "Fetching %s" % url if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": print "Not a valid HTTP url" - return (None,None) + return (None,None,None) doc = docsdb.get(hash) if doc == None: doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} @@ -86,13 +86,14 @@ print "Uh oh, trying to scrape URL again too soon!" last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc,last_attachment_fname) - return (doc['mime_type'],last_attachment) + return (doc['url'],doc['mime_type'],last_attachment) if scrape_again == False: print "Not scraping this URL again as requested" - return (None,None) + return (None,None,None) time.sleep(3) # wait 3 seconds to give webserver time to recover + req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags if doc.has_key('etag'): req.add_header("If-None-Match", doc['etag']) @@ -102,12 +103,14 @@ opener = urllib2.build_opener(NotModifiedHandler()) try: url_handle = opener.open(req) + doc['url'] = url_handle.geturl() # may have followed a redirect to a new url headers = url_handle.info() # the addinfourls have the .info() too doc['etag'] = headers.getheader("ETag") doc['last_modified'] = headers.getheader("Last-Modified") doc['date'] = headers.getheader("Date") doc['page_scraped'] = time.time() doc['web_server'] = headers.getheader("Server") + doc['via'] = headers.getheader("Via") doc['powered_by'] = headers.getheader("X-Powered-By") doc['file_size'] = headers.getheader("Content-Length") content_type = headers.getheader("Content-Type") @@ -119,13 +122,13 @@ if hasattr(url_handle, 'code'): if url_handle.code == 304: print "the web page has not been modified" - return (None,None) + return (None,None,None) else: content = url_handle.read() docsdb.save(doc) doc = docsdb.get(hash) # need to get a _rev docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) - return (doc['mime_type'], content) + return (doc['url'], doc['mime_type'], content) #store as attachment epoch-filename except urllib2.URLError as e: error = "" @@ -136,21 +139,22 @@ print error doc['error'] = error docsdb.save(doc) - return (None,None) + return (None,None,None) def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): - (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) - if content != None and depth > 0: + (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) + badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] + if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(content) - navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar')) + navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) for nav in navIDs: print "Removing element", nav['id'] nav.extract() - navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')}) + navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) for nav in navClasses: print "Removing element", nav['class'] nav.extract() @@ -169,7 +173,10 @@ # not http None else: - linkurls.add(urljoin(url,link['href'].replace(" ","%20"))) + # remove anchors and spaces in urls + link['href'] = link['href'].replace(" ","%20") + link['href'] = re.sub('#.*$','',link['href']) + linkurls.add(urljoin(url,link['href'])) for linkurl in linkurls: #print linkurl scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)