fix sitemap
fix sitemap


Former-commit-id: ddc69719d5b15a16ac0ec2110cd9fdd9fb44cc0e

import sys,os import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers import genericScrapers
import scrape import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
   
#http://www.doughellmann.com/PyMOTW/abc/ #http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
def getTable(self,soup): def getTable(self,soup):
return soup.find(id = "ctl00_PlaceHolderMain_PublishingPageContent__ControlWrapper_RichHtmlField").table return soup.find(id = "block-system-main").table
def getColumnCount(self): def getColumnCount(self):
return 7 return 2
def getColumns(self,columns): def getColumns(self,columns):
(id, date, title, description,link,deldate, notes) = columns (date, title) = columns
return (id, date, title, description, notes) return (date, date, title, title, None)
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
<?php <?php
   
include ('../include/common.inc.php'); include ('../include/common.inc.php');
$last_updated = date('Y-m-d', @filemtime('cbrfeed.zip'));  
header("Content-Type: text/xml"); header("Content-Type: text/xml");
echo "<?xml version='1.0' encoding='UTF-8'?>"; echo "<?xml version='1.0' encoding='UTF-8'?>";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n"; echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
echo " <url><loc>" . local_url() . "index.php</loc><priority>1.0</priority></url>\n"; echo " <url><loc>" . local_url() . "index.php</loc><priority>1.0</priority></url>\n";
foreach (scandir("./") as $file) { foreach (scandir("./") as $file) {
if (strpos($file, ".php") !== false && $file != "index.php" && $file != "sitemap.xml.php") if (strpos($file, ".php") !== false && $file != "index.php" && $file != "sitemap.xml.php") {
echo " <url><loc>" . local_url() . "$file</loc><priority>0.6</priority></url>\n"; echo " <url><loc>" . local_url() . "$file</loc><priority>0.6</priority></url>\n";
  }
} }
$agenciesdb = $server->get_db('disclosr-agencies'); $agenciesdb = $server->get_db('disclosr-agencies');
  $foidocsdb = $server->get_db('disclosr-foidocuments');
try { try {
$rows = $agenciesdb->get_view("app", "byCanonicalName")->rows; $rows = $agenciesdb->get_view("app", "byCanonicalName")->rows;
foreach ($rows as $row) { foreach ($rows as $row) {
echo '<url><loc>' . local_url() . 'agency.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n"; echo '<url><loc>' . local_url() . 'agency.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
} }
  unset($rows);
  $rows = null;
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
$foidocsdb = $server->get_db('disclosr-foidocuments');  
  foreach (range(0, 8) as $number) {
try { try {
$rows = $foidocsdb->get_view("app", "all")->rows; $rows = $foidocsdb->get_view("app", "all", Array($number,$number+1))->rows;
foreach ($rows as $row) { foreach ($rows as $row) {
echo '<url><loc>' . local_url() . 'view.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n"; echo '<url><loc>' . local_url() . 'view.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
} }
  unset($rows);
  $rows = null;
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
  }
   
  try {
  $rows = $foidocsdb->get_view("app", "all", Array('9','fffffffff'))->rows;
  foreach ($rows as $row) {
  echo '<url><loc>' . local_url() . 'view.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
  }
  unset($rows);
  $rows = null;
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
echo '</urlset>'; echo '</urlset>';
?> ?>
   
<?php <?php
   
date_default_timezone_set("Australia/Sydney"); date_default_timezone_set("Australia/Sydney");
   
$basePath = ""; $basePath = "";
if (strstr($_SERVER['PHP_SELF'], "alaveteli/") if (strstr($_SERVER['PHP_SELF'], "alaveteli/")
|| strstr($_SERVER['PHP_SELF'], "admin/") || strstr($_SERVER['PHP_SELF'], "admin/")
|| strstr($_SERVER['PHP_SELF'], "lib/") || strstr($_SERVER['PHP_SELF'], "lib/")
|| strstr($_SERVER['PHP_SELF'], "include/") || strstr($_SERVER['PHP_SELF'], "include/")
|| strstr($_SERVER['PHP_SELF'], "documents/") || strstr($_SERVER['PHP_SELF'], "documents/")
|| $_SERVER['SERVER_NAME'] == "disclosurelo.gs" || $_SERVER['SERVER_NAME'] == "disclosurelo.gs"
|| $_SERVER['SERVER_NAME'] == "www.disclosurelo.gs" || $_SERVER['SERVER_NAME'] == "www.disclosurelo.gs"
  || $_SERVER['SERVER_NAME'] == "direct.disclosurelo.gs"
) )
$basePath = "../"; $basePath = "../";
   
include_once ('couchdb.inc.php'); include_once ('couchdb.inc.php');
include_once ('template.inc.php'); include_once ('template.inc.php');
require_once $basePath.'lib/Requests/library/Requests.php'; require_once $basePath.'lib/Requests/library/Requests.php';
   
Requests::register_autoloader(); Requests::register_autoloader();
$ENV = "DEV"; $ENV = "DEV";
if (false && isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') { if (false && isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') {
   
require $basePath."lib/amon-php/amon.php"; require $basePath."lib/amon-php/amon.php";
Amon::config(array('address'=> 'http://127.0.0.1:2464', Amon::config(array('address'=> 'http://127.0.0.1:2464',
'protocol' => 'http', 'protocol' => 'http',
'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw")); 'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
Amon::setup_exception_handler(); Amon::setup_exception_handler();
$ENV = "PROD"; $ENV = "PROD";
} }
   
# Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735 # Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735
   
function object_to_array(stdClass $Class) { function object_to_array(stdClass $Class) {
# Typecast to (array) automatically converts stdClass -> array. # Typecast to (array) automatically converts stdClass -> array.
$Class = (array) $Class; $Class = (array) $Class;
   
# Iterate through the former properties looking for any stdClass properties. # Iterate through the former properties looking for any stdClass properties.
# Recursively apply (array). # Recursively apply (array).
foreach ($Class as $key => $value) { foreach ($Class as $key => $value) {
if (is_object($value) && get_class($value) === 'stdClass') { if (is_object($value) && get_class($value) === 'stdClass') {
$Class[$key] = object_to_array($value); $Class[$key] = object_to_array($value);
} }
} }
return $Class; return $Class;
} }
   
# Convert an Array to stdClass. http://www.php.net/manual/en/language.types.object.php#102735 # Convert an Array to stdClass. http://www.php.net/manual/en/language.types.object.php#102735
   
function array_to_object(array $array) { function array_to_object(array $array) {
# Iterate through our array looking for array values. # Iterate through our array looking for array values.
# If found recurvisely call itself. # If found recurvisely call itself.
foreach ($array as $key => $value) { foreach ($array as $key => $value) {
if (is_array($value)) { if (is_array($value)) {
$array[$key] = array_to_object($value); $array[$key] = array_to_object($value);
} }
} }
   
# Typecast to (object) will automatically convert array -> stdClass # Typecast to (object) will automatically convert array -> stdClass
return (object) $array; return (object) $array;
} }
   
function dept_to_portfolio($deptName) { function dept_to_portfolio($deptName) {
return trim(str_replace("Department of", "", str_replace("Department of the", "Department of", $deptName))); return trim(str_replace("Department of", "", str_replace("Department of the", "Department of", $deptName)));
} }
function phrase_to_tag ($phrase) { function phrase_to_tag ($phrase) {
return str_replace(" ","_",str_replace("'","",str_replace(",","",strtolower($phrase)))); return str_replace(" ","_",str_replace("'","",str_replace(",","",strtolower($phrase))));
} }
function local_url() { function local_url() {
return "http://" . $_SERVER['HTTP_HOST'] . rtrim(dirname($_SERVER['PHP_SELF']), '/\\') . "/"; return "http://" . $_SERVER['HTTP_HOST'] . rtrim(dirname($_SERVER['PHP_SELF']), '/\\') . "/";
} }
function GetDomain($url) function GetDomain($url)
{ {
$nowww = ereg_replace('www\.','',$url); $nowww = ereg_replace('www\.','',$url);
$domain = parse_url($nowww); $domain = parse_url($nowww);
if(!empty($domain["host"])) if(!empty($domain["host"]))
{ {
return $domain["host"]; return $domain["host"];
} else } else
{ {
return $domain["path"]; return $domain["path"];
} }
} }