ausaid scraper
ausaid scraper


Former-commit-id: d2ca4331376808c570b588bad9a3c97df495d510

--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -1,4 +1,5 @@
 <?php
+
 include('template.inc.php');
 include_header_documents("");
 include_once('../include/common.inc.php');
@@ -14,24 +15,31 @@
     if ($rows) {
         foreach ($rows as $row) {
 
-            echo "<tr><td><b>" . $row->value->name . "</b> (".$row->id.")</td>\n";
-            
-             echo "<td>";
-             if (isset($row->value->FOIDocumentsURL)) {
-                 echo '<a href="viewDocument.php?hash='.md5($row->value->FOIDocumentsURL).'">'
-                     .$row->value->FOIDocumentsURL.'</a>';
-             } else {
+            echo "<tr><td><b>" . $row->value->name . "</b>";
+            if ($ENV == "DEV")
+                echo "<br>(" . $row->id . ")";
+            echo "</td>\n";
+
+
+            echo "<td>";
+            if (isset($row->value->FOIDocumentsURL)) {
+                echo '<a href="' . $row->value->FOIDocumentsURL . '">'
+                . $row->value->FOIDocumentsURL . '</a>';
+                if ($ENV == "DEV")
+                    echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+                    . 'view local copy</a>)</small>';
+            } else {
                 echo "<font color='red'>✘</font>";
             }
             echo "</td>\n<td>";
             if (isset($row->value->FOIDocumentsURL)) {
-            if (file_exists("./scrapers/".$row->id.'.py')) {
-                echo "<font color='green'>✔</font>";
-            } else if (file_exists("./scrapers/".$row->id.'.txt')){
-                echo "<font color='blue'><b>▬</b></font>";
-            } else {
-                echo "<font color='red'>✘</font>";
-            }
+                if (file_exists("./scrapers/" . $row->id . '.py')) {
+                    echo "<font color='green'>✔</font>";
+                } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+                    echo "<font color='blue'><b>▬</b></font>";
+                } else {
+                    echo "<font color='red'>✘</font>";
+                }
             }
             echo "</td></tr>\n";
         }

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -82,8 +82,21 @@
                 	descriptiontxt = descriptiontxt + " \n" + string
                 doc.update({'description': descriptiontxt})
 		return
+        def getTitle(self, content, entry, doc):
+                doc.update({'title': content.string})  # store the text of `content` under doc['title']; `entry` is not used here
+		return
 	def getTable(self, soup):
 		return soup.table
+	def getDate(self, content, entry, doc):
+		# Parse content.string into doc['date']; builds its own Calendar since doScrape's `cal` is local to it.
+		dtresult = pdt.Calendar().parseDateText(content.string)
+		if len(dtresult) == 2:
+			(dtdate,dtr) = dtresult
+			edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
+		else:
+			edate = datetime.strptime(content.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
+		print edate
+		doc.update({'date': edate})
 
 	def doScrape(self):
 		cal = pdt.Calendar()
@@ -111,19 +124,13 @@
 							
 						if doc == None:
 							print "saving"
-                                                        dtresult = cal.parseDateText(date.string)
-							if len(dtresult) == 2:
-								(dtdate,dtr) = dtresult
-								print dtdate
-                                                        	edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
-							else:
-								edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
-							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string,
-			 				 "date": edate,"title": title.string}
+							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
                                 			if links != []:
                                         			doc.update({'links': links})
+                                			self.getTitle(title,row, doc)
+                                			self.getDate(date,row, doc)
 							self.getDescription(description,row, doc)
-                                			if notes != None:
+							if notes != None:
                                         			doc.update({ 'notes': notes.string})
 							foidocsdb.save(doc)
 						else:

--- a/getAgency.php
+++ b/getAgency.php
@@ -1,31 +1,31 @@
 <?php
 
 include_once('include/common.inc.php');
-
 
 function displayValue($key, $value, $mode) {
     global $db, $schemas;
     if ($mode == "view") {
-if (strpos($key, "_") === 0 || $key== "metadata") return;
+        if (strpos($key, "_") === 0 || $key == "metadata")
+            return;
         echo "<tr>";
 
         echo "<td>";
         if (isset($schemas['agency']["properties"][$key])) {
-             echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description']."</small>";
-        }
-         echo                "</td><td>";
+            echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small>";
+        }
+        echo "</td><td>";
         if (is_array($value)) {
             echo "<ol>";
             foreach ($value as $subkey => $subvalue) {
-                 
-                    echo "<li ";
-                    if (isset($schemas['agency']["properties"][$key]['x-property'])) {
+
+                echo "<li ";
+                if (isset($schemas['agency']["properties"][$key]['x-property'])) {
                     echo ' property="' . $schemas['agency']["properties"][$key]['x-property'] . '" ';
-                }    if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
+                } if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
                     echo ' itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '" ';
                 }
                 echo " >";
-                
+
                 echo "$subvalue</li>";
             }
             echo "</ol></td></tr>";
@@ -35,11 +35,15 @@
             } else {
                 echo "<span>";
             }
+
             if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
-                echo "<a ".($key == 'website' ? 'itemprop="url"':'')." href='$value'>$value</a>";
-                          } else {
-                echo "$value</span>";
-            }
+                echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>$value</a>";
+            } else if ($key == 'abn') {
+                echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>$value</a>";
+            } else {
+                echo "$value";
+            }
+            echo "</span>";
         }
         echo "</td></tr>";
     }
@@ -69,8 +73,7 @@
             } else {
                 echo "<label>$key</label><input  class='input-text' type='text' id='$key' name='$key' value='$value'/>";
                 if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
-                    echo "<a ".($key == 'website' ? 'itemprop="url"':'')." href='$value'>view</a>";
-                        
+                    echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>view</a>";
                 }
                 if ($key == 'abn') {
                     echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
@@ -102,7 +105,6 @@
                 $row[$defaultField] = Array($value);
                 $row[$defaultField][] = "";
                 $row[$defaultField][] = "";
-                
             }
         }
     }
@@ -117,7 +119,7 @@
 // edit?
 
     $obj = $db->get($_REQUEST['id']);
-    include_header($obj->name);
+    include_header(isset($obj->name) ? $obj->name : "");
 //print_r($row);
     if (sizeof($_POST) > 0) {
 //print_r($_POST);
@@ -149,7 +151,7 @@
 
     $mode = "view";
     $rowArray = object_to_array($obj);
-ksort($rowArray);
+    ksort($rowArray);
     if ($mode == "edit") {
         $row = addDefaultFields($rowArray);
     } else {
@@ -182,45 +184,37 @@
             };
         </script>
         <form id="editform" class="nice" method="post">
-            <?php
-
-        }
-        foreach ($row as $key => $value) {
-            echo displayValue($key, $value, $mode);
-        }
-        if ($mode == "view") {
-            echo "</table></div>";
-        }
-        if ($mode == "edit") {
-            echo '<input id="submitbutton" type="submit"/></form>';
-        }
-    } else {
+        <?php
+
+    }
+    foreach ($row as $key => $value) {
+        echo displayValue($key, $value, $mode);
+    }
+    if ($mode == "view") {
+        echo "</table></div>";
+    }
+    if ($mode == "edit") {
+        echo '<input id="submitbutton" type="submit"/></form>';
+    }
+} else {
 // show all list
-        include_header('Agencies');
-        try {
-            /* $rows = $db->get_view("app", "showNamesABNs")->rows;
-              //print_r($rows);
-              foreach ($rows as $row) {
-              //   print_r($row);
-              echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
-              (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
-              . '</a></li>';
-              } */
-            $rows = $db->get_view("app", "byCanonicalName")->rows;
-            //print_r($rows);
-            echo '<ul>';
-            foreach ($rows as $row) {
-                //   print_r($row);
-                echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
+    include_header('Agencies');
+    try {
+        $rows = $db->get_view("app", "byCanonicalName")->rows;
+        //print_r($rows);
+        echo '<ul>';
+        foreach ($rows as $row) {
+            //   print_r($row);
+            echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
 <a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">' .
-                $row->value->name
-                . '</span></a></li>';
-            }
-            echo "</ul>";
-        } catch (SetteeRestClientException $e) {
-            setteErrorHandler($e);
-        }
-    }
-    include_footer();
-    ?>
-
+            (isset($row->value->name) ? $row->value->name : "ERROR NAME MISSING") 
+            . '</span></a></li>';
+        }
+        echo "</ul>";
+    } catch (SetteeRestClientException $e) {
+        setteErrorHandler($e);
+    }
+}
+include_footer();
+?>
+

--- a/include/common.inc.php
+++ b/include/common.inc.php
@@ -17,7 +17,7 @@
 require_once $basePath.'lib/Requests/library/Requests.php';
 
 Requests::register_autoloader();
-
+$ENV = "DEV";
 if (isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') {
 
     require $basePath."lib/amon-php/amon.php";
@@ -25,6 +25,7 @@
                     'protocol' => 'http', 
                     'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
     Amon::setup_exception_handler();
+    $ENV = "PROD";
 }
 
 # Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735

--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -6,17 +6,23 @@
     "properties" => Array(
         "name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-property" => "schema:name foaf:name", "x-title" => "Name", "description" => "Name, most recent and broadest"),
         "shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
+   "description" => Array("type" => "string", "required" => false, "x-title" => "Description", "description" => "Description of roles and responsibilities of organisation"),
         "foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "x-itemprop" => "email", "description" => "FOI contact email if not foi@"),
         "sameAs" => Array("type" => "array", "required" => false, "x-property"=>"owl:sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
             "items" => Array("type" => "string")),
         "otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
             "items" => Array("type" => "string")),
+                "positions" => Array("type" => "array", "required" => true, "x-title" => "Political Positions", "description" => "Ministers and Parliamentary Secretaries",
+            "items" => Array("type" => "string")),
         "foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-property"=>"schema:members foaf:knows",  "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
+            "items" => Array("type" => "string")),
+        "legislation" => Array("type" => "array", "required" => true, "x-title" => "Legislation", "description" => "Legislation administered by or created for the establishment of this organisation",
             "items" => Array("type" => "string")),
         "orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
         "parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
         "website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
         "abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
+        "established" => Array("type" => "string", "required" => true, "x-title" => "Date established", "description" => "Date established"),
         "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),
         "contractListURL" => Array("type" => "string", "required" => true,  "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>" ),
         "budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
@@ -55,12 +61,7 @@
            "hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
             "items" => Array("type" => "string")),
     ),
-        /* "org":{"type":"object",
-          "properties":{
-          "organizationName":{"type":"string"},
-          "organizationUnit":{"type":"string"}},
-          }
-          } */
+   
 );
 ?>