<?php

require_once '../include/common.inc.php';

$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$accounts = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}

function extractCSVAccounts($url, $nameField, $accountField, $filter = null) {
    global $accounts, $nametoid;
    $request = Requests::get($url);
    echo $url . PHP_EOL;
    $Data = str_getcsv($request->body, "\n"); // split the CSV body into rows
    $headers = Array();
    foreach ($Data as $num => $line) {
        $Row = str_getcsv($line, ",");
        if ($num == 0) {
            $headers = $Row;
            print_r($headers);
        } else {
            $nameIndex = array_search($nameField, $headers);
            $accountIndex = array_search($accountField, $headers);
            if (isset($Row[$nameIndex])) {
                $agencyName = $Row[$nameIndex];
                if (!array_key_exists(trim($agencyName), $nametoid)) {
                    echo "$agencyName missing" . PHP_EOL;
                } else {
                    echo $agencyName . PHP_EOL;
                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName]
                        = 'http://www.righttoknow.org.au/body/' . $Row[$accountIndex];
                }
            } else {
                echo "error finding any agency " . $line . PHP_EOL;
            }
        }
    }
}

extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv", "Agency", "URL name");
print_r($accounts);
/* foreach ($accounts as $id => $accountTypes) {
    echo $id . "<br>" . PHP_EOL;
    $doc = object_to_array($db->get($id));
    // print_r($doc);
    foreach ($accountTypes as $accountType => $accounts) {
        if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
            $doc["has" . $accountType] = Array();
        }
        $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
    }
    $db->save($doc);
} */
?>
<?php

include_once('../include/common.inc.php');
include_header('Webserver and Accessibility');

echo "<table>
    <tr><th>name</th><th>FOI documents URL</th><th>webserver</th><th>accessibility errors</th></tr>";
$agenciesdb = $server->get_db('disclosr-agencies');
$docsdb = $server->get_db('disclosr-documents');
try {
    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
    if ($rows) {
        foreach ($rows as $row) {
            echo "<tr><td>" . $row->value->name . "</td>\n";
            echo "<td>";
            if (isset($row->value->FOIDocumentsURL)) {
                echo '<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
                    . $row->value->FOIDocumentsURL . '</a>';
            }
            echo "</td>\n";
            echo "<td></td><td></td>\n"; // webserver and accessibility columns not yet populated
            echo "</tr>\n";
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo "</table>";
include_footer();
?>
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
import abc


class GenericOAICDisclogScraper(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getAgencyID(self):
        """ disclosr agency id """
        return

    @abc.abstractmethod
    def getURL(self):
        """ disclog URL """
        return

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        hash = scrape.mkhash(url + id.string)
                        links = []
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(url, atag['href']))
                        doc = foidocsdb.get(hash)
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + string
                        if doc is None:
                            print "saving"
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(),
                                   "links": links, 'docID': id.string, "date": date.string,
                                   "description": descriptiontxt, "title": title.string, "notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"
                    elif len(row.find_all('th')) == 5:
                        print "header row"
                    else:
                        print "ERROR number of columns incorrect"
                        print row
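
# --- example subclass (a sketch, not part of the original file) ---------------
# Shows how a concrete disclosure-log scraper plugs into the abstract class
# above. The agency id and URL are hypothetical placeholders; getColumns
# reorders this agency's (id, date, title, description, notes) table layout
# into the (id, date, description, title, notes) order doScrape expects.
class ExampleDisclogScraper(GenericOAICDisclogScraper):
    def getAgencyID(self):
        return "exampleAgencyID"  # hypothetical disclosr agency id

    def getURL(self):
        return "http://www.example.gov.au/foi/disclosure-log.html"  # hypothetical disclog URL

    def getColumns(self, columns):
        (id, date, title, description, notes) = columns
        return (id, date, description, title, notes)


if __name__ == '__main__':
    ExampleDisclogScraper().doScrape()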
google-site-verification: google676a414ad086cefb.html

australian disclosure logs

are you looking for more information about:
contracts
gov orgs
lobbyists

1/1/11 title (Dept dfggdfgdf)
description:
source link:
documents:
#1 title link

rss feed here
<?php

// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class
include("lib/FeedWriter.php");
include_once('../include/common.inc.php'); // path assumed to match the other scripts

//Creating an instance of FeedWriter class.
$TestFeed = new FeedWriter(RSS2);

//Setting the channel elements
//Use wrapper functions for common channel elements
$TestFeed->setTitle('Last Modified - All');
$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
$TestFeed->setDescription('This is a test of creating an RSS 2.0 feed with Universal Feed Writer');

//Retrieving information from the database
//(assumed: the byLastModified view lives in the agencies database, as in the other scripts)
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byLastModified")->rows;
//print_r($rows);
foreach ($rows as $row) {
    //Create an empty FeedItem
    $newItem = $TestFeed->createNewItem();
    //Add elements to the feed item
    $newItem->setTitle($row->value->name);
    $newItem->setLink($row->id);
    $newItem->setDate(date("c", $row->value->metadata->lastModified));
    $newItem->setDescription($row->value->name);
    //Now add the feed item
    $TestFeed->addItem($newItem);
}

//OK. Everything is done. Now generate the feed.
//(genarateFeed is the method name as spelled in the FeedWriter library.)
$TestFeed->genarateFeed();
?>
python scrape.py
pause
#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import hashlib
from urlparse import urljoin
import time
import os
import mimetypes
import urllib
import urlparse


def mkhash(input):
    # md5 hex digest of the (byte-string) input
    return hashlib.md5(input).hexdigest()


def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.
    >>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url

    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return ''  # bad UTF-8 chars in URL

    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed

    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)

    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    netloc = netloc.encode('idna')

    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]


def fullurl(url, href):
    href = href.replace(" ", "%20")
    href = re.sub('#.*$', '', href)
    return urljoin(url, href)
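
# --- illustrative checks (not in the original file) ----------------------------
# Small self-checks for the URL helpers above; the example URLs are hypothetical.
def _test_url_helpers():
    assert canonurl('www.example.com/a page') == 'http://www.example.com/a%20page'
    assert fullurl('http://example.com/foi/', 'docs/report 1.pdf#page=2') == 'http://example.com/foi/docs/report%201.pdf'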
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl


def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
    url = canonurl(url)
    hash = mkhash(url)
    print "Fetching %s" % url
    # reject anything canonurl rejected, plus non-HTTP pseudo-links,
    # before building the request
    if url == None or url == "" or url.startswith("mailto") or url.startswith("javascript") or url.startswith("#"):
        print "Not a valid HTTP url"
        return (None, None, None)
    req = urllib2.Request(url)
    doc = docsdb.get(hash)
    if doc == None:
        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName}
    else:
        if ('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 60 * 24 * 14:
            # fetched within the last 14 days (time.time() is in seconds): serve the cached copy
            print "Uh oh, trying to scrape URL again too soon!"
            last_attachment_fname = doc["_attachments"].keys()[-1]
            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
            return (doc['url'], doc['mime_type'], last_attachment.read())
        if scrape_again == False:
            print "Not scraping this URL again as requested"
            return (None, None, None)

    time.sleep(3)  # wait 3 seconds to give the webserver time to recover
    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
    # if there is a previous version stored in couchdb, load caching helper tags
    if 'etag' in doc:
        req.add_header("If-None-Match", doc['etag'])
    if 'last_modified' in doc:
        req.add_header("If-Modified-Since", doc['last_modified'])

    opener = urllib2.build_opener(NotModifiedHandler())
    try:
        url_handle = opener.open(req)
        doc['url'] = url_handle.geturl()  # may have followed a redirect to a new url
        headers = url_handle.info()  # the addinfourls have the .info() too
        doc['etag'] = headers.getheader("ETag")
        doc['last_modified'] = headers.getheader("Last-Modified")
        doc['date'] = headers.getheader("Date")
        doc['page_scraped'] = time.time()
        doc['web_server'] = headers.getheader("Server")
        doc['via'] = headers.getheader("Via")
        doc['powered_by'] = headers.getheader("X-Powered-By")
        doc['file_size'] = headers.getheader("Content-Length")
        content_type = headers.getheader("Content-Type")
        if content_type != None:
            doc['mime_type'] = content_type.split(";")[0]
        else:
            (guessed_type, encoding) = mimetypes.guess_type(url)
            doc['mime_type'] = guessed_type
        if hasattr(url_handle, 'code'):
            if url_handle.code == 304:
                print "the web page has not been modified"
                return (None, None, None)
            else:
                content = url_handle.read()
                docsdb.save(doc)
                doc = docsdb.get(hash)  # need to get a _rev
                # store the page content as an attachment named epoch-filename
                docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
                return (doc['url'], doc['mime_type'], content)
    except urllib2.URLError as e:
        error = ""
        if hasattr(e, 'reason'):
            error = "error %s in downloading %s" % (str(e.reason), url)
        elif hasattr(e, 'code'):
            error = "error %s in downloading %s" % (e.code, url)
        print error
        doc['error'] = error
        docsdb.save(doc)
        return (None, None, None)
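
# --- usage sketch (not in the original file) -----------------------------------
# A minimal example of calling fetchURL directly against a local CouchDB.
# The database name matches the one used elsewhere in this codebase; the
# target URL and agency id below are hypothetical placeholders.
def _example_fetch():
    couch = couchdb.Server('http://127.0.0.1:5984/')
    docsdb = couch['disclosr-documents']
    (url, mime_type, content) = fetchURL(docsdb, "http://www.example.gov.au/foi/disclosure-log.html",
                                         "foidocuments", "exampleAgencyID")
    if content != None:
        print "fetched %d bytes of %s from %s" % (len(content), mime_type, url)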
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
    (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID)
    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
    if content != None and depth > 0 and url not in badURLs:
        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(content)
            # strip out navigation/chrome elements before following links
            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
            for nav in navIDs:
                print "Removing ele