ausaid scraper
Former-commit-id: d2ca4331376808c570b588bad9a3c97df495d510
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -1,4 +1,5 @@
<?php
+
include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');
@@ -14,24 +15,31 @@
if ($rows) {
foreach ($rows as $row) {
- echo "<tr><td><b>" . $row->value->name . "</b> (".$row->id.")</td>\n";
-
- echo "<td>";
- if (isset($row->value->FOIDocumentsURL)) {
- echo '<a href="viewDocument.php?hash='.md5($row->value->FOIDocumentsURL).'">'
- .$row->value->FOIDocumentsURL.'</a>';
- } else {
+ echo "<tr><td><b>" . $row->value->name . "</b>";
+ if ($ENV == "DEV")
+ echo "<br>(" . $row->id . ")";
+ echo "</td>\n";
+
+
+ echo "<td>";
+ if (isset($row->value->FOIDocumentsURL)) {
+ echo '<a href="' . $row->value->FOIDocumentsURL . '">'
+ . $row->value->FOIDocumentsURL . '</a>';
+ if ($ENV == "DEV")
+ echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+ . 'view local copy</a>)</small>';
+ } else {
echo "<font color='red'>✘</font>";
}
echo "</td>\n<td>";
if (isset($row->value->FOIDocumentsURL)) {
- if (file_exists("./scrapers/".$row->id.'.py')) {
- echo "<font color='green'>✔</font>";
- } else if (file_exists("./scrapers/".$row->id.'.txt')){
- echo "<font color='blue'><b>▬</b></font>";
- } else {
- echo "<font color='red'>✘</font>";
- }
+ if (file_exists("./scrapers/" . $row->id . '.py')) {
+ echo "<font color='green'>✔</font>";
+ } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+ echo "<font color='blue'><b>▬</b></font>";
+ } else {
+ echo "<font color='red'>✘</font>";
+ }
}
echo "</td></tr>\n";
}
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -82,8 +82,21 @@
descriptiontxt = descriptiontxt + " \n" + string
doc.update({'description': descriptiontxt})
return
+    def getTitle(self, content, entry, doc):
+        """Store content.string in doc under the 'title' key; entry is not used."""
+        doc.update({'title': content.string})
def getTable(self, soup):
return soup.table
+    def getDate(self, content, entry, doc):
+        """Parse content.string as a date and store the result in doc['date']; entry is not used."""
+        dtresult = pdt.Calendar().parseDateText(content.string)  # 'cal' is only assigned inside doScrape, so build a parser here
+        if len(dtresult) == 2:
+            (dtdate,dtr) = dtresult
+            edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])  # NOTE(review): fields are not zero-padded, unlike the fallback below
+        else:
+            edate = datetime.strptime(content.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")  # was date.string: 'date' is not defined in this method
+        print edate
+        doc.update({'date': edate})
def doScrape(self):
cal = pdt.Calendar()
@@ -111,19 +124,13 @@
if doc == None:
print "saving"
- dtresult = cal.parseDateText(date.string)
- if len(dtresult) == 2:
- (dtdate,dtr) = dtresult
- print dtdate
- edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
- else:
- edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
- doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string,
- "date": edate,"title": title.string}
+ doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
if links != []:
doc.update({'links': links})
+ self.getTitle(title,row, doc)
+ self.getDate(date,row, doc)
self.getDescription(description,row, doc)
- if notes != None:
+ if notes != None:
doc.update({ 'notes': notes.string})
foidocsdb.save(doc)
else:
--- a/getAgency.php
+++ b/getAgency.php
@@ -119,7 +119,7 @@
// edit?
$obj = $db->get($_REQUEST['id']);
- include_header($obj->name);
+ include_header(isset($obj->name) ? $obj->name : "");
//print_r($row);
if (sizeof($_POST) > 0) {
//print_r($_POST);
@@ -200,14 +200,6 @@
// show all list
include_header('Agencies');
try {
- /* $rows = $db->get_view("app", "showNamesABNs")->rows;
- //print_r($rows);
- foreach ($rows as $row) {
- // print_r($row);
- echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
- (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
- . '</a></li>';
- } */
$rows = $db->get_view("app", "byCanonicalName")->rows;
//print_r($rows);
echo '<ul>';
@@ -215,7 +207,7 @@
// print_r($row);
echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
<a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">' .
- $row->value->name
+ (isset($row->value->name) ? $row->value->name : "ERROR NAME MISSING")
. '</span></a></li>';
}
echo "</ul>";
--- a/include/common.inc.php
+++ b/include/common.inc.php
@@ -17,7 +17,7 @@
require_once $basePath.'lib/Requests/library/Requests.php';
Requests::register_autoloader();
-
+$ENV = "DEV";
if (isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') {
require $basePath."lib/amon-php/amon.php";
@@ -25,6 +25,7 @@
'protocol' => 'http',
'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
Amon::setup_exception_handler();
+ $ENV = "PROD";
}
# Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735