Add extra blank fields
Former-commit-id: a6c55876b5cd8b26bbb91de39e18f73ad1a36a86
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,4 +16,7 @@
[submodule "javascripts/flotr2"]
path = javascripts/flotr2
url = https://github.com/HumbleSoftware/Flotr2.git
+[submodule "lib/phpquery"]
+ path = lib/phpquery
+ url = https://github.com/TobiaszCudnik/phpquery.git
--- a/admin/exportEmployees.csv.php
+++ b/admin/exportEmployees.csv.php
@@ -22,6 +22,7 @@
if (isset($row->value->statistics->employees)) {
$headers = array_unique(array_merge($headers, array_keys(object_to_array($row->value->statistics->employees))));
+
}
}
} catch (SetteeRestClientException $e) {
--- /dev/null
+++ b/admin/exportScore.csv.php
@@ -1,1 +1,73 @@
+<?php
+/**
+ * Exports the "score" view of the disclosr-agencies database.
+ *
+ * The "format" request parameter picks the output: "csv" (the default) sends
+ * a CSV attachment with a header row followed by one row per view row; "json"
+ * sends one JSON object of the form {"labels": [...], "data": [{series}, ...]}.
+ */
+include_once("../include/common.inc.php");
+
+$db = $server->get_db('disclosr-agencies');
+$format = "csv";
+if (isset($_REQUEST['format'])) {
+    $format = $_REQUEST['format'];
+}
+
+setlocale(LC_CTYPE, 'C');
+
+$headers = array();
+$fp = fopen('php://output', 'w');
+if ($fp && $db) {
+    if ($format == "csv") {
+        header('Content-Type: text/csv; charset=utf-8');
+        header('Content-Disposition: attachment; filename="export.score.' . date("c") . '.csv"');
+    } else if ($format == "json") {
+        header('Content-Type: application/json; charset=utf-8');
+    }
+    header('Pragma: no-cache');
+    header('Expires: 0');
+
+    try {
+        $agencies = $db->get_view("score", "score", null, true)->rows;
+        $series = array();
+        $first = true;
+        foreach ($agencies as $agency) {
+            $agencyArray = object_to_array($agency->value);
+            if ($first) {
+                // The first row fixes the column set and order for every row.
+                $headers = array_keys($agencyArray);
+                if ($format == "csv") {
+                    fputcsv($fp, $headers);
+                }
+                $first = false;
+            }
+            $row = array();
+            foreach ($headers as $fieldName) {
+                $row[] = isset($agencyArray[$fieldName]) ? $agencyArray[$fieldName] : '';
+            }
+            if ($format == "csv") {
+                fputcsv($fp, $row);
+            } else if ($format == "json") {
+                $series[] = array(
+                    "data" => $row,
+                    "label" => isset($agency->value->name) ? $agency->value->name : '',
+                    "lines" => array("show" => true),
+                    "points" => array("show" => true),
+                );
+            }
+        }
+        if ($format == "json") {
+            // One json_encode() call quotes and escapes every value, so the
+            // document is valid JSON whatever the names and fields contain.
+            echo json_encode(array("labels" => $headers, "data" => $series)) . PHP_EOL;
+        }
+    } catch (SetteeRestClientException $e) {
+        setteErrorHandler($e);
+    }
+
+    die;
+}
+?>
+
--- /dev/null
+++ b/admin/importAustraliaGovAuGov2.php
@@ -1,1 +1,61 @@
+<?php
+require_once '../include/common.inc.php';
+
+// Map each trimmed key of the app/byName view to that row's value.
+$nametoid = array();
+$accounts = array();
+$db = $server->get_db('disclosr-agencies');
+foreach ($db->get_view("app", "byName")->rows as $agencyRow) {
+    $nametoid[trim($agencyRow->key)] = $agencyRow->value;
+}
+
+/**
+ * Prints "<name> missing" for each agency in a published CSV sheet that is not a
+ * key of the global $nametoid map. CSV line 0 is skipped; line 1 is the header row.
+ * $filter defaults to false so that three-argument calls are valid.
+ *
+ * @param string $url          address of the CSV export
+ * @param string $nameField    header of the agency-name column
+ * @param string $accountField not used by this function
+ * @param bool   $filter       when true, only rows whose "State" is "NAT" are checked
+ */
+function extractCSVAccounts($url, $nameField, $accountField, $filter = false) {
+    global $nametoid;
+    $request = Requests::get($url);
+    $headers = array();
+    foreach (str_getcsv($request->body, "\n") as $num => $line) {
+        $cells = str_getcsv($line, ",");
+        if ($num == 1) $headers = $cells;
+        // array_search() returns false for an unknown header; never index with it.
+        $nameCol = array_search($nameField, $headers);
+        $stateCol = array_search("State", $headers);
+        if ($num < 2 || $nameCol === false || !isset($cells[$nameCol])) continue;
+        $national = $stateCol !== false && isset($cells[$stateCol]) && $cells[$stateCol] == "NAT";
+        if ($filter && !$national) continue;
+        if (!array_key_exists(trim($cells[$nameCol]), $nametoid)) {
+            echo $cells[$nameCol] . " missing" . PHP_EOL;
+        }
+    }
+}
+
+// Source register: http://agimo.govspace.gov.au/page/gov2register/
+// Twitter sheet (call currently disabled)
+//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true);
+// RSS sheet (URL kept for reference; no call is made for it)
+// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv
+// Facebook sheet: the only source this script actually fetches
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv","Agency","Name");
+
+/* Other australia.gov.au listings, recorded for reference and not fetched here:
+ * http://australia.gov.au/news-and-media/media-release-rss-feeds
+ * http://australia.gov.au/news-and-media/social-media/blogs
+ * http://australia.gov.au/news-and-media/social-media/twitter
+ * http://australia.gov.au/news-and-media/social-media/facebook
+ * http://australia.gov.au/news-and-media/social-media/youtube
+ * http://australia.gov.au/news-and-media/social-media/flickr
+ * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx
+ * http://australia.gov.au/news-and-media/social-media/podcasts
+ */
+?>
+
--- a/admin/importGov2RegisterRSSFacebookTwitter.php
+++ b/admin/importGov2RegisterRSSFacebookTwitter.php
@@ -1,27 +1,89 @@
<?php
require_once '../include/common.inc.php';
-try {
- $server->create_db('disclosr-agencies');
-} catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
+require($basePath.'lib/phpquery/phpQuery/phpQuery.php');
+
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$accounts = Array();
+foreach ($rows as $row) {
+ $nametoid[trim($row->key)] = $row->value;
}
-$db = $server->get_db('disclosr-agencies');
-createAgencyDesignDoc();
+/**
+ * Collects account links from a spreadsheet published as HTML: each table row
+ * with both an agency name and a link is appended to
+ * $accounts[<agency id>][$accountType]; unknown agencies are echoed as "missing".
+ *
+ * @param string $url         address of the published HTML sheet
+ * @param string $accountType key the links are stored under
+ */
+function extractHTMLAccounts($url, $accountType) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    $doc = phpQuery::newDocumentHTML($request->body);
+    phpQuery::selectDocument($doc);
+    foreach (pq('tr')->elements as $tr) {
+        $agency = $link = "";
+        foreach ($tr->childNodes as $td) {
+            // Only element nodes have getAttribute(); skip text and comment nodes.
+            if (!($td instanceof DOMElement)) continue;
+            $class = $td->getAttribute("class");
+            if ($class == "s11" || $class == "s10" || $class == "s7") {
+                $agency = $td->nodeValue;
+            } else if ($class == "s6" || $class == "s9") {
+                $link = $td->nodeValue; // cell text, unless an href is found below
+                foreach ($td->childNodes as $a) {
+                    if ($a instanceof DOMElement && $a->getAttribute("href") != "") $link = $a->getAttribute("href");
+                }
+            }
+        }
+        if ($agency == "" || $link == "") continue;
+        if (!array_key_exists(trim($agency), $nametoid)) {
+            echo trim($agency) . " missing" . PHP_EOL;
+        } else {
+            $accounts[$nametoid[trim($agency)]][$accountType][] = $link;
+        }
+    }
+}
-// twitter https://docs.google.com/spreadsheet/fm?id=tsJVd9EYoAjbl014y3qMgWg.03918275400592898296.8568379511161083736&hl=en&fmcmd=5&gid=0
-// RSS https://docs.google.com/spreadsheet/fm?id=tbqjwIDHKHiVYF_glJ93GgA.03918275400592898296.8789688748524615194&authkey=CJDP-uQG&hl=en_GB&fmcmd=5&gid=0
-// facebook https://docs.google.com/spreadsheet/fm?id=tkcqoo9wrgzNWmoANuVhsBw.03918275400592898296.3040387705062056060&authkey=CKzl7r0I&hl=en_GB&fmcmd=5&gid=0
+/**
+ * Collects accounts from a CSV sheet (line 0 skipped, line 1 = header row) into
+ * $accounts[<agency id>][$accountType]; unknown agencies are echoed as "missing".
+ * @param string $url          address of the CSV export
+ * @param string $accountType  key the accounts are stored under
+ * @param string $nameField    header of the agency-name column
+ * @param string $accountField header of the account column; nothing is stored if it is absent
+ * @param bool   $filter       when true, only rows whose "State" is "NAT" are used
+ */
+function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter = false) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    $headers = array();
+    foreach (str_getcsv($request->body, "\n") as $num => $line) {
+        $cells = str_getcsv($line, ",", '"');
+        if ($num == 1) $headers = $cells;
+        $nameCol = array_search($nameField, $headers); // false when the header is unknown
+        $stateCol = array_search("State", $headers);
+        $accountCol = array_search($accountField, $headers);
+        if ($num < 2 || $nameCol === false || !isset($cells[$nameCol])) continue;
+        if ($filter && !($stateCol !== false && isset($cells[$stateCol]) && $cells[$stateCol] == "NAT")) continue;
+        $agencyName = trim($cells[$nameCol]);
+        if (!array_key_exists($agencyName, $nametoid)) {
+            echo $agencyName . " missing" . PHP_EOL;
+        } else if ($accountCol !== false && isset($cells[$accountCol])) {
+            $accounts[$nametoid[$agencyName]][$accountType][] = $cells[$accountCol];
+        }
+    }
+}
-/*
- * http://australia.gov.au/news-and-media/media-release-rss-feeds
- * http://australia.gov.au/news-and-media/social-media/blogs
- * http://australia.gov.au/news-and-media/social-media/twitter
- * http://australia.gov.au/news-and-media/social-media/facebook
- * http://australia.gov.au/news-and-media/social-media/youtube
- * http://australia.gov.au/news-and-media/social-media/flickr
- * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx
- * http://australia.gov.au/news-and-media/social-media/podcasts
- */
+// http://agimo.govspace.gov.au/page/gov2register/
+// twitter
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Twitter", "Agency/Body/Event", "", true);
+// RSS
+extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS");
+// facebook
+extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook");
+
?>
--- a/getAgency.php
+++ b/getAgency.php
@@ -14,10 +14,10 @@
echo "<ol>";
foreach ($value as $subkey => $subvalue) {
if (isset($schemas['agency']["properties"][$key]['x-property'])) {
- echo '<li property="' . $schemas['agency']["properties"][$key]['x-property'] . '">';
- } else {
- echo "<li>";
- }
+ echo '<li property="' . $schemas['agency']["properties"][$key]['x-property'] . '">';
+ } else {
+ echo "<li>";
+ }
echo "$subvalue</li>";
}
echo "</ol></td></tr>";
@@ -27,11 +27,11 @@
} else {
echo "<span>";
}
- if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
- echo "<a href='$value'>view</a></span>";
- } else {
- echo "$value</span>";
- }
+ if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
+ echo "<a href='$value'>view</a></span>";
+ } else {
+ echo "$value</span>";
+ }
}
echo "</td></tr>";
}
@@ -53,12 +53,12 @@
} else if ($key == "parentOrg") {
echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>";
$rows = $db->get_view("app", "byDeptStateName")->rows;
- //print_r($rows);
+//print_r($rows);
foreach ($rows as $row) {
echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>";
}
echo" </select>";
- } else {
+ } else {
echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
echo "<a href='$value'>view</a>";
@@ -69,7 +69,7 @@
}
}
}
- //
+//
}
function addDefaultFields($row) {
@@ -78,13 +78,22 @@
foreach ($defaultFields as $defaultField) {
if (!isset($row[$defaultField])) {
if ($schemas['agency']['properties'][$defaultField]['type'] == "string") {
-
- $row[$defaultField] = "";
-
+ $row[$defaultField] = "";
}
if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
-
$row[$defaultField] = Array("");
+ }
+ } else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
+ if (is_array($row[$defaultField])) {
+ $row[$defaultField][] = "";
+ $row[$defaultField][] = "";
+ $row[$defaultField][] = "";
+ } else {
+ $value = $row[$defaultField];
+ $row[$defaultField] = Array($value);
+ $row[$defaultField][] = "";
+ $row[$defaultField][] = "";
+
}
}
}
@@ -94,39 +103,49 @@
$db = $server->get_db('disclosr-agencies');
if (isset($_REQUEST['id'])) {
- //get an agency record as json/html, search by name/abn/id
+//get an agency record as json/html, search by name/abn/id
// by name = startkey="Ham"&endkey="Ham\ufff0"
// edit?
- $row = $db->get($_REQUEST['id']);
- //print_r($row);
+ $obj = $db->get($_REQUEST['id']);
+//print_r($row);
if (sizeof($_POST) > 0) {
- //print_r($_POST);
+//print_r($_POST);
foreach ($_POST as $postkey => $postvalue) {
if ($postvalue == "") {
unset($_POST[$postkey]);
}
- if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") {
- unset($_POST[$postkey]);
+ if (is_array($postvalue)) {
+     // Keep only the non-blank entries, renumbered from 0 so no key gaps
+     // are left behind; iterating by value avoids a dangling reference.
+     $kept = array();
+     foreach ($postvalue as $entry) {
+         if ($entry != "") $kept[] = $entry;
+     }
+     $_POST[$postkey] = $kept;
+     // A field with nothing left is dropped, like a single blank entry was.
+     if (count($kept) == 0) unset($_POST[$postkey]);
+ }
}
if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
echo "Edited version was latest version, continue saving";
$newdoc = $_POST;
$newdoc['metadata']['lastModified'] = time();
- $row = $db->save($newdoc);
+ $obj = $db->save($newdoc);
} else {
echo "ALERT doc revised by someone else while editing. Document not saved.";
}
}
$mode = "edit";
+ $rowArray = object_to_array($obj);
+ksort($rowArray);
if ($mode == "edit") {
- $row = addDefaultFields(object_to_array($row));
+ $row = addDefaultFields($rowArray);
} else {
- $row = object_to_array($row);
- }
-
+ $row = $rowArray;
+ }
+
if ($mode == "view") {
echo '<div typeof="schema:GovernmentOrganisation" about="#' . $row['_id'] . '"><table width="100%">';
echo '<tr> <td colspan="2"><h3>' . $row['name'] . "</h3></td></tr>";
@@ -153,44 +172,44 @@
};
</script>
<form id="editform" class="nice" method="post">
- <?php
-
- }
- foreach ($row as $key => $value) {
- echo displayValue($key, $value, $mode);
- }
- if ($mode == "view") {
- echo "</table></div>";
- }
- if ($mode == "edit") {
- echo '<input id="submitbutton" type="submit"/></form>';
- }
-} else {
-
- try {
- /* $rows = $db->get_view("app", "showNamesABNs")->rows;
- //print_r($rows);
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
- (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
- . '</a></li>';
- } */
- $rows = $db->get_view("app", "byName")->rows;
- //print_r($rows);
-echo '<ul>';
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li typeof="schema:GovernmentOrganisation foaf:Organization" about="getAgency.php?id=' . $row->value . '">
-<a href="getAgency.php?id=' . $row->value . '" rel="schema:url foaf:page" property="schema:name foaf:name">' .
- $row->key
- . '</a></li>';
- }
-echo "</ul>";
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
-}
-include_footer();
-?>
-
+ <?php
+
+ }
+ foreach ($row as $key => $value) {
+ echo displayValue($key, $value, $mode);
+ }
+ if ($mode == "view") {
+ echo "</table></div>";
+ }
+ if ($mode == "edit") {
+ echo '<input id="submitbutton" type="submit"/></form>';
+ }
+ } else {
+
+ try {
+ /* $rows = $db->get_view("app", "showNamesABNs")->rows;
+ //print_r($rows);
+ foreach ($rows as $row) {
+ // print_r($row);
+ echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
+ (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
+ . '</a></li>';
+ } */
+ $rows = $db->get_view("app", "byCanonicalName")->rows;
+ //print_r($rows);
+ echo '<ul>';
+ foreach ($rows as $row) {
+ // print_r($row);
+ echo '<li typeof="schema:GovernmentOrganisation foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
+<a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name">' .
+ $row->value->name
+ . '</a></li>';
+ }
+ echo "</ul>";
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
+ }
+ }
+ include_footer();
+ ?>
+
--- a/include/couchdb.inc.php
+++ b/include/couchdb.inc.php
@@ -92,25 +92,29 @@
}
}";
// http://stackoverflow.com/questions/646628/javascript-startswith
- $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
+$obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}
-if(!String.prototype.endsWith){
- String.prototype.endsWith = function(suffix) {
- return this.indexOf(suffix, this.length - suffix.length) !== -1;
- };
-}
+
function(doc) {
-if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
-for(var propName in doc) {
- if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
- emit(propName, 1);
- }
-}
- emit("total", 1);
- }
+ count = 0;
+ if (doc["status"] != "suspended") {
+ for(var propName in doc) {
+ if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
+ count++;
+ }
+ }
+ portfolio = doc.parentOrg;
+ if (doc.orgType == "FMA-DepartmentOfState") {
+ portfolio = doc._id;
+ }
+ if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
+ portfolio = doc.orgType;
+ }
+ emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
+ }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
--- /dev/null
+++ b/lib/phpquery
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -17,6 +17,7 @@
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
+ "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>" ),
"budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
"grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
@@ -33,6 +34,8 @@
"appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "),
"hasRSS" => Array("type" => "array", "required" => true, "x-title" => "Has RSS", "description" => ""),
+ "hasBlog" => Array("type" => "array", "required" => true, "x-title" => "Has Blog", "description" => ""),
+ "hasMobileApp" => Array("type" => "array", "required" => true, "x-title" => "Has Mobile App", "description" => ""),
"hasMailingList" => Array("type" => "array", "required" => true, "x-title" => "Has Mailing List", "description" => "",
"items" => Array("type" => "string")),
"hasTwitter" => Array("type" => "array", "required" => true, "x-title" => "Has Twitter", "description" => "",
@@ -47,6 +50,8 @@
"items" => Array("type" => "string")),
"hasRestrictiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright",
"items" => Array("type" => "string")),
+ "hasPermissiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Permissive Licence", "description" => "Has any page licenced under terms more permissive than Crown Copyright but not clear CCBY",
+ "items" => Array("type" => "string")),
"hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
"items" => Array("type" => "string")),
),
--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
print "Fetching %s" % url
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
print "Not a valid HTTP url"
- return (None,None)
+ return (None,None,None)
doc = docsdb.get(hash)
if doc == None:
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
@@ -86,13 +86,14 @@
print "Uh oh, trying to scrape URL again too soon!"
last_attachment_fname = doc["_attachments"].keys()[-1]
last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
- return (doc['mime_type'],last_attachment)
+ return (doc['url'],doc['mime_type'],last_attachment)
if scrape_again == False:
print "Not scraping this URL again as requested"
- return (None,None)
+ return (None,None,None)
time.sleep(3) # wait 3 seconds to give webserver time to recover
+ req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
#if there is a previous version stored in couchdb, load caching helper tags
if doc.has_key('etag'):
req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
opener = urllib2.build_opener(NotModifiedHandler())
try:
url_handle = opener.open(req)
+ doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
headers = url_handle.info() # the addinfourls have the .info() too
doc['etag'] = headers.getheader("ETag")
doc['last_modified'] = headers.getheader("Last-Modified")
doc['date'] = headers.getheader("Date")
doc['page_scraped'] = time.time()
doc['web_server'] = headers.getheader("Server")
+ doc['via'] = headers.getheader("Via")
doc['powered_by'] = headers.getheader("X-Powered-By")
doc['file_size'] = headers.getheader("Content-Length")
content_type = headers.getheader("Content-Type")
@@ -119,13 +122,13 @@
if hasattr(url_handle, 'code'):
if url_handle.code == 304:
print "the web page has not been modified"
- return (None,None)
+ return (None,None,None)
else:
content = url_handle.read()
docsdb.save(doc)
doc = docsdb.get(hash) # need to get a _rev
docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
- return (doc['mime_type'], content)
+ return (doc['url'], doc['mime_type'], content)
#store as attachment epoch-filename
except urllib2.URLError as e:
error = ""
@@ -136,21 +139,22 @@
print error
doc['error'] = error
docsdb.save(doc)
- return (None,None)
+ return (None,None,None)
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
- (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
- if content != None and depth > 0:
+ (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+ badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+ if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
# http://www.crummy.com/software/BeautifulSoup/documentation.html
soup = BeautifulSoup(content)
- navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+ navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
for nav in navIDs:
print "Removing element", nav['id']
nav.extract()
- navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+ navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
for nav in navClasses:
print "Removing element", nav['class']
nav.extract()
@@ -169,7 +173,10 @@
# not http
None
else:
- linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+ # remove anchors and spaces in urls
+ link['href'] = link['href'].replace(" ","%20")
+ link['href'] = re.sub('#.*$','',link['href'])
+ linkurls.add(urljoin(url,link['href']))
for linkurl in linkurls:
#print linkurl
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)