From: Maxious Date: Thu, 22 Nov 2012 05:47:09 +0000 Subject: ausaid scraper X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=b33710797e56bcf3bd111f01e00a487ab199cecd --- ausaid scraper Former-commit-id: d2ca4331376808c570b588bad9a3c97df495d510 --- --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -1,4 +1,5 @@ " . $row->value->name . " (".$row->id.")\n"; - - echo ""; - if (isset($row->value->FOIDocumentsURL)) { - echo '' - .$row->value->FOIDocumentsURL.''; - } else { + echo "" . $row->value->name . ""; + if ($ENV == "DEV") + echo "
(" . $row->id . ")"; + echo "\n"; + + + echo ""; + if (isset($row->value->FOIDocumentsURL)) { + echo '' + . $row->value->FOIDocumentsURL . ''; + if ($ENV == "DEV") + echo '
(' + . 'view local copy)'; + } else { echo ""; } echo "\n"; if (isset($row->value->FOIDocumentsURL)) { - if (file_exists("./scrapers/".$row->id.'.py')) { - echo ""; - } else if (file_exists("./scrapers/".$row->id.'.txt')){ - echo ""; - } else { - echo ""; - } + if (file_exists("./scrapers/" . $row->id . '.py')) { + echo ""; + } else if (file_exists("./scrapers/" . $row->id . '.txt')) { + echo ""; + } else { + echo ""; + } } echo "\n"; } --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -82,8 +82,21 @@ descriptiontxt = descriptiontxt + " \n" + string doc.update({'description': descriptiontxt}) return + def getTitle(self, content, entry, doc): + doc.update({'title': content.string}) + return def getTable(self, soup): return soup.table + def getDate(self, content, entry, doc): + dtresult = cal.parseDateText(content.string) + if len(dtresult) == 2: + (dtdate,dtr) = dtresult + edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) + else: + edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return def doScrape(self): cal = pdt.Calendar() @@ -111,19 +124,13 @@ if doc == None: print "saving" - dtresult = cal.parseDateText(date.string) - if len(dtresult) == 2: - (dtdate,dtr) = dtresult - print dtdate - edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) - else: - edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d") - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string, - "date": edate,"title": title.string} + doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string} if links != []: doc.update({'links': links}) + self.getTitle(title,row, doc) + self.getDate(date,row, doc) self.getDescription(description,row, doc) - if notes != None: + if notes != None: doc.update({ 'notes': notes.string}) foidocsdb.save(doc) else: --- a/getAgency.php +++ b/getAgency.php @@ -1,31 +1,31 @@ "; echo ""; if (isset($schemas['agency']["properties"][$key])) { - echo $schemas['agency']["properties"][$key]['x-title'] . "
" . $schemas['agency']["properties"][$key]['description'].""; - } - echo ""; + echo $schemas['agency']["properties"][$key]['x-title'] . "
" . $schemas['agency']["properties"][$key]['description'] . ""; + } + echo ""; if (is_array($value)) { echo "
    "; foreach ($value as $subkey => $subvalue) { - - echo "
  1. "; - + echo "$subvalue
  2. "; } echo "
"; @@ -35,11 +35,15 @@ } else { echo ""; } + if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { - echo "$value"; - } else { - echo "$value"; - } + echo "$value"; + } else if ($key == 'abn') { + echo "$value"; + } else { + echo "$value"; + } + echo ""; } echo ""; } @@ -69,8 +73,7 @@ } else { echo ""; if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { - echo "view"; - + echo "view"; } if ($key == 'abn') { echo "view abn"; @@ -102,7 +105,6 @@ $row[$defaultField] = Array($value); $row[$defaultField][] = ""; $row[$defaultField][] = ""; - } } } @@ -117,7 +119,7 @@ // edit? $obj = $db->get($_REQUEST['id']); - include_header($obj->name); + include_header(isset($obj->name) ? $obj->name : ""); //print_r($row); if (sizeof($_POST) > 0) { //print_r($_POST); @@ -149,7 +151,7 @@ $mode = "view"; $rowArray = object_to_array($obj); -ksort($rowArray); + ksort($rowArray); if ($mode == "edit") { $row = addDefaultFields($rowArray); } else { @@ -182,45 +184,37 @@ };
- $value) { - echo displayValue($key, $value, $mode); - } - if ($mode == "view") { - echo ""; - } - if ($mode == "edit") { - echo '
'; - } - } else { + $value) { + echo displayValue($key, $value, $mode); + } + if ($mode == "view") { + echo ""; + } + if ($mode == "edit") { + echo ''; + } +} else { // show all list - include_header('Agencies'); - try { - /* $rows = $db->get_view("app", "showNamesABNs")->rows; - //print_r($rows); - foreach ($rows as $row) { - // print_r($row); - echo '
  • ' . - (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) - . '
  • '; - } */ - $rows = $db->get_view("app", "byCanonicalName")->rows; - //print_r($rows); - echo '"; + } catch (SetteeRestClientException $e) { + setteErrorHandler($e); + } +} +include_footer(); +?> + --- a/include/common.inc.php +++ b/include/common.inc.php @@ -17,7 +17,7 @@ require_once $basePath.'lib/Requests/library/Requests.php'; Requests::register_autoloader(); - +$ENV = "DEV"; if (isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') { require $basePath."lib/amon-php/amon.php"; @@ -25,6 +25,7 @@ 'protocol' => 'http', 'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw")); Amon::setup_exception_handler(); + $ENV = "PROD"; } # Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735 --- a/schemas/agency.json.php +++ b/schemas/agency.json.php @@ -6,17 +6,23 @@ "properties" => Array( "name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-property" => "schema:name foaf:name", "x-title" => "Name", "description" => "Name, most recent and broadest"), "shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"), + "description" => Array("type" => "string", "required" => false, "x-title" => "Description", "description" => "Description of roles and responsiblities of organisation"), "foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "x-itemprop" => "email", "description" => "FOI contact email if not foi@"), "sameAs" => Array("type" => "array", "required" => false, "x-property"=>"owl:sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity", "items" => Array("type" => "string")), "otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation", "items" => Array("type" => "string")), + "positions" => Array("type" => "array", "required" => true, "x-title" => "Political Positions", "description" => "Ministers and Parliamentary Secretaries", + "items" => Array("type" => "string")), "foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-property"=>"schema:members foaf:knows", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous", + "items" => Array("type" => "string")), + "legislation" => Array("type" => "array", "required" => true, "x-title" => "Legislation", "description" => "Legislation administered by or created for the establishment of this organisation", "items" => Array("type" => "string")), "orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."), "parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"), "website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"), "abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"), + "established" => Array("type" => "string", "required" => true, "x-title" => "Date established", "description" => "Date established"), "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"), "contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, mandated by the Senate" ), "budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"), @@ -55,12 +61,7 @@ "hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration", "items" => Array("type" => "string")), ), - /* "org":{"type":"object", - "properties":{ - "organizationName":{"type":"string"}, - "organizationUnit":{"type":"string"}}, - } - } */ + ); ?>