# www.robotstxt.org/ | # www.robotstxt.org/ |
# http://code.google.com/web/controlcrawlindex/ | # http://code.google.com/web/controlcrawlindex/ |
User-agent: * | User-agent: * |
Disallow: /admin/ | Disallow: /admin/ |
Disallow: /viewDocument.php | |
Sitemap: http://disclosurelo.gs/sitemap.xml.php | Sitemap: http://disclosurelo.gs/sitemap.xml.php |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Disclosure-log scraper for a page whose log is a seven-column table.

    NOTE(review): this text was recovered from a two-column diff dump; the
    first column's version (seven columns, SharePoint-style wrapper id) is
    the one kept here - confirm against the real file.
    """

    def getTable(self, soup):
        """Return the ``table`` found under the page's rich-HTML wrapper.

        ``soup`` is presumably a parsed BeautifulSoup document - verify
        against the base class.
        """
        wrapper = soup.find(
            id="ctl00_PlaceHolderMain_PublishingPageContent__ControlWrapper_RichHtmlField")
        return wrapper.table

    def getColumnCount(self):
        """Return the number of cells expected in each table row (7)."""
        return 7

    def getColumns(self, columns):
        """Pick (id, date, title, description, notes) out of a 7-cell row.

        The fifth and sixth cells (link and deletion date) are dropped.
        """
        (doc_id, date, title, description, _link, _deldate, notes) = columns
        return (doc_id, date, title, description, notes)
if __name__ == '__main__':
    # Sanity-check the class hierarchy, then run the scrape.
    base_class = genericScrapers.GenericOAICDisclogScraper
    # Single-argument print(...) emits the same text as the Python 2
    # statement form "print 'Subclass:', value".
    print('Subclass: %s' % issubclass(ScraperImplementation, base_class))
    print('Instance: %s' % isinstance(ScraperImplementation(), base_class))
    ScraperImplementation().doScrape()
<?php
// Sitemap for the documents site. Emits, in order:
//   1. index.php,
//   2. every other *.php file in this directory (except this script and
//      viewDocument.php, which robots.txt disallows),
//   3. one agency.php URL per row of the agencies "byCanonicalName" view,
//   4. one view.php URL per row of the FOI documents "all" view.
include ('../include/common.inc.php');
// NOTE(review): $last_updated is not used anywhere in this script.
$last_updated = date('Y-m-d', @filemtime('cbrfeed.zip'));
header("Content-Type: text/xml");
echo "<?xml version='1.0' encoding='UTF-8'?>";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
echo " <url><loc>" . local_url() . "index.php</loc><priority>1.0</priority></url>\n";
foreach (scandir("./") as $file) {
    // Braces added: the unbraced "if" left an unmatched "}" behind.
    if (strpos($file, ".php") !== false
            && $file != "index.php"
            && $file != "sitemap.xml.php"
            && $file != "viewDocument.php") {
        echo " <url><loc>" . local_url() . "$file</loc><priority>0.6</priority></url>\n";
    }
}
$agenciesdb = $server->get_db('disclosr-agencies');
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
    $rows = $agenciesdb->get_view("app", "byCanonicalName")->rows;
    foreach ($rows as $row) {
        echo '<url><loc>' . local_url() . 'agency.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
    }
    // Drop the result set before the next query to keep peak memory down.
    unset($rows);
    $rows = null;
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
// The FOI documents view is fetched one key range at a time so only one
// slice is held in memory. Ranges are '0'-'1' ... '8'-'9', then
// '9'-'fffffffff'. Previously the loop ignored $number and re-fetched the
// whole view on every pass, emitting each document nine times.
// NOTE(review): assumes get_view() treats a two-element array as
// (startkey, endkey) and that view keys are strings - confirm against the
// Settee client.
$keyRanges = array();
foreach (range(0, 8) as $number) {
    $keyRanges[] = array((string) $number, (string) ($number + 1));
}
$keyRanges[] = array('9', 'fffffffff');
foreach ($keyRanges as $keyRange) {
    try {
        $rows = $foidocsdb->get_view("app", "all", $keyRange)->rows;
        foreach ($rows as $row) {
            echo '<url><loc>' . local_url() . 'view.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
        }
        unset($rows);
        $rows = null;
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
}
echo '</urlset>';
?>
<?php
// Shared bootstrap: sets the timezone, works out the relative path to the
// project root ($basePath), loads the CouchDB/template includes and the
// Requests library, and sets $ENV.
date_default_timezone_set("Australia/Sydney");
$basePath = "";
// Scripts served from a sub-directory, or from one of the production host
// names, reach the shared lib/ directory one level up.
// SERVER_NAME is checked with isset() here, as it already is further down,
// so a missing value does not raise a notice.
if (strstr($_SERVER['PHP_SELF'], "alaveteli/")
        || strstr($_SERVER['PHP_SELF'], "admin/")
        || strstr($_SERVER['PHP_SELF'], "lib/")
        || strstr($_SERVER['PHP_SELF'], "include/")
        || strstr($_SERVER['PHP_SELF'], "documents/")
        || (isset($_SERVER['SERVER_NAME']) && in_array(
            $_SERVER['SERVER_NAME'],
            array("disclosurelo.gs", "www.disclosurelo.gs", "direct.disclosurelo.gs"),
            true))
) {
    $basePath = "../";
}
include_once ('couchdb.inc.php');
include_once ('template.inc.php');
require_once $basePath.'lib/Requests/library/Requests.php';
Requests::register_autoloader();
$ENV = "DEV";
// The leading "false &&" disables this branch, so Amon is never loaded and
// $ENV stays "DEV".
// NOTE(review): the secret key is hard-coded in source; move it to
// configuration before re-enabling this block.
if (false && isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') {
    require $basePath."lib/amon-php/amon.php";
    Amon::config(array('address'=> 'http://127.0.0.1:2464',
        'protocol' => 'http',
        'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
    Amon::setup_exception_handler();
    $ENV = "PROD";
}
/**
 * Convert a stdClass object to an array, recursing into stdClass properties.
 * Adapted from http://www.php.net/manual/en/language.types.object.php#102735
 *
 * Only property values that are themselves exactly stdClass are converted;
 * every other value (scalars, arrays, other objects) is copied unchanged.
 *
 * @param stdClass $Class object to convert
 * @return array property name => value, in the object's property order
 */
function object_to_array(stdClass $Class) {
    $result = array();
    foreach ((array) $Class as $name => $item) {
        $isPlainObject = is_object($item) && get_class($item) === 'stdClass';
        $result[$name] = $isPlainObject ? object_to_array($item) : $item;
    }
    return $result;
}
/**
 * Convert an array to a stdClass object, recursing into nested arrays.
 * Adapted from http://www.php.net/manual/en/language.types.object.php#102735
 *
 * Every array value (at any depth) becomes a stdClass; other values are
 * copied unchanged.
 *
 * @param array $array array to convert
 * @return stdClass object whose properties are the array's keys
 */
function array_to_object(array $array) {
    $converted = array();
    foreach ($array as $name => $item) {
        $converted[$name] = is_array($item) ? array_to_object($item) : $item;
    }
    // The (object) cast turns the array into a stdClass.
    return (object) $converted;
}
/**
 * Strip the "Department of" / "Department of the" prefix from a department
 * name and trim the result, e.g. "Department of the Treasury" -> "Treasury".
 *
 * @param string $deptName department name
 * @return string the name without the prefix, trimmed
 */
function dept_to_portfolio($deptName) {
    // Collapse "Department of the" first so one removal handles both forms.
    $normalised = str_replace("Department of the", "Department of", $deptName);
    $withoutPrefix = str_replace("Department of", "", $normalised);
    return trim($withoutPrefix);
}
/**
 * Turn a phrase into a tag: lower-case it, drop commas and apostrophes,
 * and replace spaces with underscores.
 *
 * @param string $phrase free-text phrase
 * @return string the tag form, e.g. "Hello, World's End" -> "hello_worlds_end"
 */
function phrase_to_tag ($phrase) {
    // str_replace applies the pairs in order: "," -> "", "'" -> "", " " -> "_".
    $search = array(",", "'", " ");
    $replace = array("", "", "_");
    return str_replace($search, $replace, strtolower($phrase));
}
/**
 * Build the absolute http:// URL of the directory containing the running
 * script, with a trailing slash, from HTTP_HOST and PHP_SELF.
 *
 * @return string e.g. "http://example.org/docs/"
 */
function local_url() {
    // Strip trailing "/" or "\" so a script in the web root does not
    // produce a double slash.
    $directory = rtrim(dirname($_SERVER['PHP_SELF']), '/\\');
    return "http://" . $_SERVER['HTTP_HOST'] . $directory . "/";
}
/**
 * Return the host part of a URL with every "www." removed.
 *
 * When the input has no scheme (e.g. "www.example.com"), parse_url() reports
 * it as a path, so the path component is returned instead.
 *
 * @param string $url URL or bare host name
 * @return string|null the host, else the path, else null when neither
 *                     component can be parsed
 */
function GetDomain($url)
{
    // str_replace replaces the ereg_replace('www\.', ...) call: the ereg
    // functions were removed in PHP 7, and the pattern was a literal string.
    $nowww = str_replace('www.', '', $url);
    $domain = parse_url($nowww);
    // parse_url() returns false for seriously malformed URLs.
    if (!is_array($domain)) {
        return null;
    }
    if (!empty($domain["host"])) {
        return $domain["host"];
    }
    // Guard the lookup so a URL without a path yields null without a notice.
    return isset($domain["path"]) ? $domain["path"] : null;
}
# www.robotstxt.org/ | # www.robotstxt.org/ |
# www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449 | # www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449 |
User-agent: * | User-agent: * |
Disallow: /admin/ | Disallow: /admin/ |
Disallow: /viewDocument.php | |
Sitemap: http://orgs.disclosurelo.gs/sitemap.xml.php | Sitemap: http://orgs.disclosurelo.gs/sitemap.xml.php |
<?php
// Sitemap for the organisations site. Emits index.php, every other *.php
// file in this directory (except this script and viewDocument.php, which
// robots.txt disallows), then one getAgency.php URL per row of the
// agencies "byCanonicalName" view.
include ('include/common.inc.php');
// NOTE(review): $last_updated is not used anywhere in this script.
$last_updated = date('Y-m-d', @filemtime('cbrfeed.zip'));
header("Content-Type: text/xml");
echo "<?xml version='1.0' encoding='UTF-8'?>";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
echo " <url><loc>" . local_url() . "index.php</loc><priority>1.0</priority></url>\n";
foreach (scandir("./") as $file) {
    if (strpos($file, ".php") !== false
            && $file != "index.php"
            && $file != "sitemap.xml.php"
            && $file != "viewDocument.php") {
        echo " <url><loc>" . local_url() . "$file</loc><priority>0.3</priority></url>\n";
    }
}
$db = $server->get_db('disclosr-agencies');
try {
    $rows = $db->get_view("app", "byCanonicalName")->rows;
    foreach ($rows as $row) {
        echo '<url><loc>' . local_url() . 'getAgency.php?id=' . $row->value->_id . "</loc><priority>0.6</priority></url>\n";
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo '</urlset>';
?>