Use a better date parser: replace parsedatetime with dateutil in the generic disclosure log scraper
Former-commit-id: 64dff9bcaeb72426a713440e995584a6ea0472b9
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -1,4 +1,5 @@
<?php
+
include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');
@@ -14,24 +15,31 @@
if ($rows) {
foreach ($rows as $row) {
- echo "<tr><td><b>" . $row->value->name . "</b> (".$row->id.")</td>\n";
-
- echo "<td>";
- if (isset($row->value->FOIDocumentsURL)) {
- echo '<a href="viewDocument.php?hash='.md5($row->value->FOIDocumentsURL).'">'
- .$row->value->FOIDocumentsURL.'</a>';
- } else {
+ echo "<tr><td><b>" . $row->value->name . "</b>";
+ if ($ENV == "DEV")
+ echo "<br>(" . $row->id . ")";
+ echo "</td>\n";
+
+
+ echo "<td>";
+ if (isset($row->value->FOIDocumentsURL)) {
+ echo '<a href="' . $row->value->FOIDocumentsURL . '">'
+ . $row->value->FOIDocumentsURL . '</a>';
+ if ($ENV == "DEV")
+ echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+ . 'view local copy</a>)</small>';
+ } else {
echo "<font color='red'>✘</font>";
}
echo "</td>\n<td>";
if (isset($row->value->FOIDocumentsURL)) {
- if (file_exists("./scrapers/".$row->id.'.py')) {
- echo "<font color='green'>✔</font>";
- } else if (file_exists("./scrapers/".$row->id.'.txt')){
- echo "<font color='blue'><b>▬</b></font>";
- } else {
- echo "<font color='red'>✘</font>";
- }
+ if (file_exists("./scrapers/" . $row->id . '.py')) {
+ echo "<font color='green'>✔</font>";
+ } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+ echo "<font color='blue'><b>▬</b></font>";
+ } else {
+ echo "<font color='red'>✘</font>";
+ }
}
echo "</td></tr>\n";
}
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -2,12 +2,13 @@
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
-import parsedatetime as pdt
from time import mktime
-from datetime import datetime
import feedparser
import abc
import unicodedata, re
+import dateutil
+from dateutil.parser import *
+from datetime import *
class GenericDisclogScraper(object):
__metaclass__ = abc.ABCMeta
@@ -82,11 +83,18 @@
descriptiontxt = descriptiontxt + " \n" + string
doc.update({'description': descriptiontxt})
return
+    def getTitle(self, content, entry, doc):
+        """Store the title cell's text in doc['title']; joins every text node because content.string is None when the cell holds nested markup (assumes content is a bs4 Tag -- confirm)."""
+        doc.update({'title': ' '.join(content.stripped_strings)})
def getTable(self, soup):
return soup.table
+    def getDate(self, content, entry, doc):
+        """Parse the date cell (day-first, fuzzy) and store it in doc['date'] as YYYY-MM-DD; joins every text node because content.string is None when the cell holds nested markup (assumes content is a bs4 Tag -- confirm)."""
+        edate = parse(' '.join(content.stripped_strings), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
def doScrape(self):
- cal = pdt.Calendar()
foidocsdb = scrape.couch['disclosr-foidocuments']
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
if content != None:
@@ -111,19 +119,13 @@
if doc == None:
print "saving"
- dtresult = cal.parseDateText(date.string)
- if len(dtresult) == 2:
- (dtdate,dtr) = dtresult
- print dtdate
- edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
- else:
- edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
- doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string,
- "date": edate,"title": title.string}
+ doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
if links != []:
doc.update({'links': links})
+ self.getTitle(title,row, doc)
+ self.getDate(date,row, doc)
self.getDescription(description,row, doc)
- if notes != None:
+ if notes != None:
doc.update({ 'notes': notes.string})
foidocsdb.save(doc)
else:
--- a/getAgency.php
+++ b/getAgency.php
@@ -1,31 +1,31 @@
<?php
include_once('include/common.inc.php');
-
function displayValue($key, $value, $mode) {
global $db, $schemas;
if ($mode == "view") {
-if (strpos($key, "_") === 0 || $key== "metadata") return;
+ if (strpos($key, "_") === 0 || $key == "metadata")
+ return;
echo "<tr>";
echo "<td>";
if (isset($schemas['agency']["properties"][$key])) {
- echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description']."</small>";
- }
- echo "</td><td>";
+ echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small>";
+ }
+ echo "</td><td>";
if (is_array($value)) {
echo "<ol>";
foreach ($value as $subkey => $subvalue) {
-
- echo "<li ";
- if (isset($schemas['agency']["properties"][$key]['x-property'])) {
+
+ echo "<li ";
+ if (isset($schemas['agency']["properties"][$key]['x-property'])) {
echo ' property="' . $schemas['agency']["properties"][$key]['x-property'] . '" ';
- } if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
+ } if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
echo ' itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '" ';
}
echo " >";
-
+
echo "$subvalue</li>";
}
echo "</ol></td></tr>";
@@ -35,11 +35,15 @@
} else {
echo "<span>";
}
+
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
- echo "<a ".($key == 'website' ? 'itemprop="url"':'')." href='$value'>$value</a>";
- } else {
- echo "$value</span>";
- }
+ echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>$value</a>";
+ } else if ($key == 'abn') {
+ echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>$value</a>";
+ } else {
+ echo "$value";
+ }
+ echo "</span>";
}
echo "</td></tr>";
}
@@ -69,8 +73,7 @@
} else {
echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
- echo "<a ".($key == 'website' ? 'itemprop="url"':'')." href='$value'>view</a>";
-
+ echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>view</a>";
}
if ($key == 'abn') {
echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
@@ -102,7 +105,6 @@
$row[$defaultField] = Array($value);
$row[$defaultField][] = "";
$row[$defaultField][] = "";
-
}
}
}
@@ -117,7 +119,7 @@
// edit?
$obj = $db->get($_REQUEST['id']);
- include_header($obj->name);
+ include_header(isset($obj->name) ? $obj->name : "");
//print_r($row);
if (sizeof($_POST) > 0) {
//print_r($_POST);
@@ -149,7 +151,7 @@
$mode = "view";
$rowArray = object_to_array($obj);
-ksort($rowArray);
+ ksort($rowArray);
if ($mode == "edit") {
$row = addDefaultFields($rowArray);
} else {
@@ -182,45 +184,37 @@
};
</script>
<form id="editform" class="nice" method="post">
- <?php
-
- }
- foreach ($row as $key => $value) {
- echo displayValue($key, $value, $mode);
- }
- if ($mode == "view") {
- echo "</table></div>";
- }
- if ($mode == "edit") {
- echo '<input id="submitbutton" type="submit"/></form>';
- }
- } else {
+ <?php
+
+ }
+ foreach ($row as $key => $value) {
+ echo displayValue($key, $value, $mode);
+ }
+ if ($mode == "view") {
+ echo "</table></div>";
+ }
+ if ($mode == "edit") {
+ echo '<input id="submitbutton" type="submit"/></form>';
+ }
+} else {
// show all list
- include_header('Agencies');
- try {
- /* $rows = $db->get_view("app", "showNamesABNs")->rows;
- //print_r($rows);
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
- (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
- . '</a></li>';
- } */
- $rows = $db->get_view("app", "byCanonicalName")->rows;
- //print_r($rows);
- echo '<ul>';
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
+ include_header('Agencies');
+ try {
+ $rows = $db->get_view("app", "byCanonicalName")->rows;
+ //print_r($rows);
+ echo '<ul>';
+ foreach ($rows as $row) {
+ // print_r($row);
+ echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
<a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">' .
- $row->value->name
- . '</span></a></li>';
- }
- echo "</ul>";
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
- }
- include_footer();
- ?>
-
+ (isset($row->value->name) ? $row->value->name : "ERROR NAME MISSING")
+ . '</span></a></li>';
+ }
+ echo "</ul>";
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
+ }
+}
+include_footer();
+?>
+
--- a/include/common.inc.php
+++ b/include/common.inc.php
@@ -17,7 +17,7 @@
require_once $basePath.'lib/Requests/library/Requests.php';
Requests::register_autoloader();
-
+$ENV = "DEV";
if (isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') {
require $basePath."lib/amon-php/amon.php";
@@ -25,6 +25,7 @@
'protocol' => 'http',
'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
Amon::setup_exception_handler();
+ $ENV = "PROD";
}
# Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -6,17 +6,23 @@
"properties" => Array(
"name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-property" => "schema:name foaf:name", "x-title" => "Name", "description" => "Name, most recent and broadest"),
"shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
+ "description" => Array("type" => "string", "required" => false, "x-title" => "Description", "description" => "Description of roles and responsibilities of organisation"),
"foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "x-itemprop" => "email", "description" => "FOI contact email if not foi@"),
"sameAs" => Array("type" => "array", "required" => false, "x-property"=>"owl:sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
"items" => Array("type" => "string")),
"otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
"items" => Array("type" => "string")),
+ "positions" => Array("type" => "array", "required" => true, "x-title" => "Political Positions", "description" => "Ministers and Parliamentary Secretaries",
+ "items" => Array("type" => "string")),
"foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-property"=>"schema:members foaf:knows", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
+ "items" => Array("type" => "string")),
+ "legislation" => Array("type" => "array", "required" => true, "x-title" => "Legislation", "description" => "Legislation administered by or created for the establishment of this organisation",
"items" => Array("type" => "string")),
"orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
+ "established" => Array("type" => "string", "required" => true, "x-title" => "Date established", "description" => "Date established"),
"employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>" ),
"budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
@@ -55,12 +61,7 @@
"hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
"items" => Array("type" => "string")),
),
- /* "org":{"type":"object",
- "properties":{
- "organizationName":{"type":"string"},
- "organizationUnit":{"type":"string"}},
- }
- } */
+
);
?>