ausaid scraper
Former-commit-id: d2ca4331376808c570b588bad9a3c97df495d510
--- a/admin/ausbudget/importAusbudget.php
+++ /dev/null
@@ -1,7 +1,1 @@
-<?php
-$html = phpQuery::newDocumentHTML($request->body);
- phpQuery::selectDocument($html);
- foreach (pq('meta')->elements as $meta) {
-
-
--- /dev/null
+++ b/admin/importAusbudget.php
@@ -1,1 +1,72 @@
+<?php
+/**
+ * Streams the budget tables scraped into ./ausbudget/ as one CSV download.
+ *
+ * Each <table> of each file in that directory is walked row by row. The first
+ * <td> of a row is taken as the field name; the value is the text of all the
+ * row's <td> cells with the first cell's text removed. A row named "Source"
+ * closes a record: the fields gathered so far are written as one CSV line in
+ * the column order of $headers, with missing fields left empty.
+ */
+include_once("../include/common.inc.php");
+require($basePath . 'lib/phpquery/phpQuery/phpQuery.php');
+
+setlocale(LC_CTYPE, 'C');
+
+$dir = "./ausbudget/";
+$dhandle = opendir($dir);
+$headers = Array("Table ID", "Portfolio", "Agency", "Program", "Scheme", "2011-2012", "2012-2013", "Difference", "Source");
+
+$fp = fopen('php://output', 'w');
+if ($fp) {
+    header('Content-Type: text/csv; charset=utf-8');
+    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
+    header('Pragma: no-cache');
+    header('Expires: 0');
+    fputcsv($fp, $headers);
+    if ($dhandle) {
+        while (false !== ($fname = readdir($dhandle))) {
+            $path = $dir . $fname;
+            // Skips ".", ".." and any sub-directory: reading those would emit a
+            // PHP warning into the CSV stream.
+            if (!is_file($path)) {
+                continue;
+            }
+            $contents = file_get_contents($path);
+            if ($contents === false) {
+                continue;
+            }
+            $html = phpQuery::newDocumentHTML($contents);
+            phpQuery::selectDocument($html);
+            foreach (pq('table')->elements as $table) {
+                $data = Array();
+                foreach (pq('tr', $table)->elements as $row) {
+                    // Rows whose text mentions "Twitter" are not budget data.
+                    if (strpos(pq($row)->text(), "Twitter") !== false) {
+                        continue;
+                    }
+                    $firstCell = pq("td:first", $row)->text();
+                    $key = trim($firstCell);
+                    $value = trim(str_replace($firstCell, "", pq("td", $row)->text()));
+                    if ($key == "2011-2012" || $key == "2012-2013") {
+                        // Keep only the text before the first "-" and drop "$", "," and newlines.
+                        $eValue = explode("-", $value);
+                        $value = trim(str_replace(Array("$", ",", "\n"), "", $eValue[0]));
+                    }
+                    $data[$key] = trim(str_replace(Array("\n"), "", $value));
+                    if ($key == "Source") {
+                        $csvrow = Array();
+                        foreach ($headers as $fieldName) {
+                            $csvrow[] = isset($data[$fieldName]) ? $data[$fieldName] : "";
+                        }
+                        fputcsv($fp, $csvrow);
+                        $data = Array();
+                    }
+                }
+            }
+        }
+        closedir($dhandle);
+    }
+    exit;
+}
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -1,4 +1,5 @@
<?php
+
include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');
@@ -14,24 +15,31 @@
if ($rows) {
foreach ($rows as $row) {
- echo "<tr><td><b>" . $row->value->name . "</b> (".$row->id.")</td>\n";
-
- echo "<td>";
- if (isset($row->value->FOIDocumentsURL)) {
- echo '<a href="viewDocument.php?hash='.md5($row->value->FOIDocumentsURL).'">'
- .$row->value->FOIDocumentsURL.'</a>';
- } else {
+ echo "<tr><td><b>" . $row->value->name . "</b>";
+ if ($ENV == "DEV")
+ echo "<br>(" . $row->id . ")";
+ echo "</td>\n";
+
+
+ echo "<td>";
+ if (isset($row->value->FOIDocumentsURL)) {
+ echo '<a href="' . $row->value->FOIDocumentsURL . '">'
+ . $row->value->FOIDocumentsURL . '</a>';
+ if ($ENV == "DEV")
+ echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+ . 'view local copy</a>)</small>';
+ } else {
echo "<font color='red'>✘</font>";
}
echo "</td>\n<td>";
if (isset($row->value->FOIDocumentsURL)) {
- if (file_exists("./scrapers/".$row->id.'.py')) {
- echo "<font color='green'>✔</font>";
- } else if (file_exists("./scrapers/".$row->id.'.txt')){
- echo "<font color='blue'><b>▬</b></font>";
- } else {
- echo "<font color='red'>✘</font>";
- }
+ if (file_exists("./scrapers/" . $row->id . '.py')) {
+ echo "<font color='green'>✔</font>";
+ } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+ echo "<font color='blue'><b>▬</b></font>";
+ } else {
+ echo "<font color='red'>✘</font>";
+ }
}
echo "</td></tr>\n";
}
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -82,8 +82,33 @@
descriptiontxt = descriptiontxt + " \n" + string
doc.update({'description': descriptiontxt})
return
+    def getTitle(self, content, entry, doc):
+        """Store the text of `content` in `doc` under 'title'."""
+        doc.update({'title': content.string})
+        return
def getTable(self, soup):
return soup.table
+    def getDate(self, content, entry, doc):
+        """Work out the document date from `content` and store it in `doc`.
+
+        The text of `content` goes through pdt.Calendar().parseDateText();
+        a two-item result is unpacked and its first item supplies year,
+        month and day. Any other result falls back to strptime() with the
+        "%d %B %Y" format. `entry` is not used by this implementation.
+        """
+        # `cal` in doScrape() is local to that method, so build a calendar here.
+        cal = pdt.Calendar()
+        dtresult = cal.parseDateText(content.string)
+        if len(dtresult) == 2:
+            (dtdate,dtr) = dtresult
+            # NOTE(review): components are not zero-padded here, unlike the strptime branch below.
+            edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
+        else:
+            # `date` only exists inside doScrape(); the text to parse is `content`.
+            edate = datetime.strptime(content.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
+        return
def doScrape(self):
cal = pdt.Calendar()
@@ -111,19 +124,13 @@
if doc == None:
print "saving"
- dtresult = cal.parseDateText(date.string)
- if len(dtresult) == 2:
- (dtdate,dtr) = dtresult
- print dtdate
- edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
- else:
- edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
- doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string,
- "date": edate,"title": title.string}
+ doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
if links != []:
doc.update({'links': links})
+ self.getTitle(title,row, doc)
+ self.getDate(date,row, doc)
self.getDescription(description,row, doc)
- if notes != None:
+ if notes != None:
doc.update({ 'notes': notes.string})
foidocsdb.save(doc)
else:
--- a/getAgency.php
+++ b/getAgency.php
@@ -1,31 +1,31 @@
<?php
include_once('include/common.inc.php');
-
function displayValue($key, $value, $mode) {
global $db, $schemas;
if ($mode == "view") {
-if (strpos($key, "_") === 0 || $key== "metadata") return;
+ if (strpos($key, "_") === 0 || $key == "metadata")
+ return;
echo "<tr>";
echo "<td>";
if (isset($schemas['agency']["properties"][$key])) {
- echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description']."</small>";
- }
- echo "</td><td>";
+ echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small>";
+ }
+ echo "</td><td>";
if (is_array($value)) {
echo "<ol>";
foreach ($value as $subkey => $subvalue) {
-
- echo "<li ";
- if (isset($schemas['agency']["properties"][$key]['x-property'])) {
+
+ echo "<li ";
+ if (isset($schemas['agency']["properties"][$key]['x-property'])) {
echo ' property="' . $schemas['agency']["properties"][$key]['x-property'] . '" ';
- } if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
+ } if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
echo ' itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '" ';
}
echo " >";
-
+
echo "$subvalue</li>";
}
echo "</ol></td></tr>";
@@ -35,11 +35,15 @@
} else {
echo "<span>";
}
+
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
- echo "<a ".($key == 'website' ? 'itemprop="url"':'')." href='$value'>$value</a>";
- } else {
- echo "$value</span>";
- }
+ echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>$value</a>";
+ } else if ($key == 'abn') {
+ echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>$value</a>";
+ } else {
+ echo "$value";
+ }
+ echo "</span>";
}
echo "</td></tr>";
}
@@ -69,8 +73,7 @@
} else {
echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
- echo "<a ".($key == 'website' ? 'itemprop="url"':'')." href='$value'>view</a>";
-
+ echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>view</a>";
}
if ($key == 'abn') {
echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
@@ -102,7 +105,6 @@
$row[$defaultField] = Array($value);
$row[$defaultField][] = "";
$row[$defaultField][] = "";
-
}
}
}
@@ -117,7 +119,7 @@
// edit?
$obj = $db->get($_REQUEST['id']);
- include_header($obj->name);
+ include_header(isset($obj->name) ? $obj->name : "");
//print_r($row);
if (sizeof($_POST) > 0) {
//print_r($_POST);
@@ -149,7 +151,7 @@
$mode = "view";
$rowArray = object_to_array($obj);
-ksort($rowArray);
+ ksort($rowArray);
if ($mode == "edit") {
$row = addDefaultFields($rowArray);
} else {
@@ -182,45 +184,37 @@
};
</script>
<form id="editform" class="nice" method="post">
- <?php
-
- }
- foreach ($row as $key => $value) {
- echo displayValue($key, $value, $mode);
- }
- if ($mode == "view") {
- echo "</table></div>";
- }
- if ($mode == "edit") {
- echo '<input id="submitbutton" type="submit"/></form>';
- }
- } else {
+ <?php
+
+ }
+ foreach ($row as $key => $value) {
+ echo displayValue($key, $value, $mode);
+ }
+ if ($mode == "view") {
+ echo "</table></div>";
+ }
+ if ($mode == "edit") {
+ echo '<input id="submitbutton" type="submit"/></form>';
+ }
+} else {
// show all list
- include_header('Agencies');
- try {
- /* $rows = $db->get_view("app", "showNamesABNs")->rows;
- //print_r($rows);
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
- (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
- . '</a></li>';
- } */
- $rows = $db->get_view("app", "byCanonicalName")->rows;
- //print_r($rows);
- echo '<ul>';
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
+ include_header('Agencies');
+ try {
+ $rows = $db->get_view("app", "byCanonicalName")->rows;
+ //print_r($rows);
+ echo '<ul>';
+ foreach ($rows as $row) {
+ // print_r($row);
+ echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
<a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">' .
- $row->value->name
- . '</span></a></li>';
- }
- echo "</ul>";
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
- }
- include_footer();
- ?>
-
+ (isset($row->value->name) ? $row->value->name : "ERROR NAME MISSING")
+ . '</span></a></li>';
+ }
+ echo "</ul>";
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
+ }
+}
+include_footer();
+?>
+
--- a/include/common.inc.php
+++ b/include/common.inc.php
@@ -17,7 +17,7 @@
require_once $basePath.'lib/Requests/library/Requests.php';
Requests::register_autoloader();
-
+$ENV = "DEV";
if (isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') {
require $basePath."lib/amon-php/amon.php";
@@ -25,6 +25,7 @@
'protocol' => 'http',
'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
Amon::setup_exception_handler();
+ $ENV = "PROD";
}
# Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -6,17 +6,23 @@
"properties" => Array(
"name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-property" => "schema:name foaf:name", "x-title" => "Name", "description" => "Name, most recent and broadest"),
"shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
+ "description" => Array("type" => "string", "required" => false, "x-title" => "Description", "description" => "Description of roles and responsibilities of organisation"),
"foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "x-itemprop" => "email", "description" => "FOI contact email if not foi@"),
"sameAs" => Array("type" => "array", "required" => false, "x-property"=>"owl:sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
"items" => Array("type" => "string")),
"otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
"items" => Array("type" => "string")),
+ "positions" => Array("type" => "array", "required" => true, "x-title" => "Political Positions", "description" => "Ministers and Parliamentary Secretaries",
+ "items" => Array("type" => "string")),
"foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-property"=>"schema:members foaf:knows", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
+ "items" => Array("type" => "string")),
+ "legislation" => Array("type" => "array", "required" => true, "x-title" => "Legislation", "description" => "Legislation administered by or created for the establishment of this organisation",
"items" => Array("type" => "string")),
"orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
+ "established" => Array("type" => "string", "required" => true, "x-title" => "Date established", "description" => "Date established"),
"employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>" ),
"budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
@@ -55,12 +61,7 @@
"hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
"items" => Array("type" => "string")),
),
- /* "org":{"type":"object",
- "properties":{
- "organizationName":{"type":"string"},
- "organizationUnit":{"type":"string"}},
- }
- } */
+
);
?>