<?php
// Build a lookup of agency name -> CouchDB document id from the
// disclosr-agencies "byName" view; used below to match Right To Know
// CSV rows back to agency documents.
require_once '../include/common.inc.php';
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$accounts = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}
/**
 * Download a CSV of agency names and Right To Know URL slugs and record the
 * RTK profile URL for each agency into the global $accounts map, keyed by the
 * agency's CouchDB id (looked up via the global $nametoid map).
 *
 * @param string $url          CSV source URL
 * @param string $nameField    header name of the agency-name column
 * @param string $accountField header name of the RTK URL-slug column
 * @param mixed  $filter       accepted for interface compatibility; unused.
 *                             Given a default so the three-argument call
 *                             below is valid (a missing required argument is
 *                             fatal on PHP 8).
 */
function extractCSVAccounts($url, $nameField, $accountField, $filter = null) {
    global $accounts, $nametoid;
    $request = Requests::get($url);
    echo $url;
    $Data = str_getcsv($request->body, "\n"); // split into rows
    $headers = Array();
    foreach ($Data as $num => $line) {
        $Row = str_getcsv($line, ",");
        if ($num == 0) {
            // first row carries the column headings
            $headers = $Row;
            print_r($headers);
        } else {
            // Resolve the column index once; array_search returns false when
            // the header is absent (previously false was silently cast to
            // index 0 when used directly as an array subscript).
            $nameIdx = array_search($nameField, $headers);
            if ($nameIdx !== false && isset($Row[$nameIdx])) {
                $agencyName = $Row[$nameIdx];
                if (!array_key_exists(trim($agencyName), $nametoid)) {
                    echo "$agencyName missing" . PHP_EOL;
                } else {
                    echo $Row[$nameIdx] . PHP_EOL;
                    $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/' . $Row[array_search($accountField, $headers)];
                }
            } else {
                echo "error finding any agency" . $line . PHP_EOL;
            }
        }
    }
}
// Fetch the Right To Know list of all authorities and record each agency's
// RTK profile URL. NOTE(review): the function declares a fourth $filter
// parameter; ensure it has a default value so this three-argument call is
// valid on PHP 8.
extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name");
print_r($accounts);
// The save-back step below is disabled; kept for reference.
/* foreach ($accounts as $id => $accountTypes) {
    echo $id . "<br>" . PHP_EOL;
    $doc = object_to_array($db->get($id));
    // print_r($doc);
    foreach ($accountTypes as $accountType => $accounts) {
        if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
            $doc["has" . $accountType] = Array();
        }
        $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts));
    }
    $db->save($doc);
}*/
?>
<?php
// Export agencies from the disclosr-agencies CouchDB database as a CSV in an
// Alaveteli/Right To Know import format: one row per agency that has an FOI
// email address and no "status" flag (e.g. suspended/defunct).
include_once("../include/common.inc.php");
setlocale(LC_CTYPE, 'C');
// CSV column headings, in output order.
$headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
$db = $server->get_db('disclosr-agencies');
// Map department document id -> portfolio tag (used for tag_string below).
$tag = Array();
try {
    $rows = $db->get_view("app", "byDeptStateName", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $tag[$row->id] = phrase_to_tag(dept_to_portfolio($row->key));
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}
// Map agency name -> FOI email address.
// NOTE(review): built here but never read below - confirm before removing.
$foiEmail = Array();
try {
    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $foiEmail[$row->key] = $row->value;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}
// Stream the CSV straight to the HTTP response body.
$fp = fopen('php://output', 'w');
if ($fp && $db) {
    header('Content-Type: text/csv; charset=utf-8');
    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
    header('Pragma: no-cache');
    header('Expires: 0');
    fputcsv($fp, $headers);
    try {
        $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
        //print_r($rows);
        foreach ($agencies as $agency) {
            // print_r($agency);
            // Only export live agencies that actually have an FOI email.
            if (isset($agency->value->foiEmail) && $agency->value->foiEmail != "null" && !isset($agency->value->status)) {
                $row = Array();
                $row["#id"] = $agency->id;
                $row["name"] = trim($agency->value->name);
                $row["request_email"] = (isset($agency->value->foiEmail) ? $agency->value->foiEmail : "");
                $row["short_name"] = (isset($agency->value->shortName) ? $agency->value->shortName : "");
                $row["notes"] = (isset($agency->value->description) ? $agency->value->description : "");
                // Collect subsidiary FOI bodies and statutory positions so
                // requesters know this agency also answers for them.
                $otherBodies = Array();
                if (isset($agency->value->foiBodies)) {
                    $otherBodies = array_merge($otherBodies, $agency->value->foiBodies);
                }
                if (isset($agency->value->positions)) {
                    $otherBodies = array_merge($otherBodies, $agency->value->positions);
                }
                if (count($otherBodies) > 0) {
                    $row["notes"] .= "<br/> This department also responds to requests for information held by ".implode(",",$otherBodies);
                }
                $row["publication_scheme"] = (isset($agency->value->infoPublicationSchemeURL) ? $agency->value->infoPublicationSchemeURL : "");
                $row["home_page"] = (isset($agency->value->website) ? $agency->value->website : "");
                // Departments tag themselves; other bodies inherit the parent
                // department's portfolio tag. NOTE(review): assumes parentOrg
                // is set and present in $tag - emits a notice otherwise.
                if ($agency->value->orgType == "FMA-DepartmentOfState") {
                    $row["tag_string"] = $tag[$agency->value->_id];
                } else {
                    $row["tag_string"] = $tag[$agency->value->parentOrg];
                }
                $row["tag_string"] .= " " . $agency->value->orgType;
                $row["tag_string"] .= " federal";
                fputcsv($fp, array_values($row));
            }
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
    die;
}
?>
import sys,os | |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | |
import scrape | |
from bs4 import BeautifulSoup | |
import abc | |
class GenericOAICDisclogScraper(object):
    # Abstract base class (Python 2) for scraping an agency's OAIC-style FOI
    # disclosure log - an HTML table with five columns - into the
    # disclosr-foidocuments CouchDB database. Subclasses supply the agency id,
    # the disclosure-log URL and any column reordering.
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getAgencyID(self):
        """ disclosr agency id """
        return

    @abc.abstractmethod
    def getURL(self):
        """ disclog URL"""
        return

    @abc.abstractmethod
    def getColumns(self,columns):
        """ rearranges columns if required """
        return

    def doScrape(self):
        # Fetch the disclosure-log page and store one CouchDB document per
        # 5-column table row, skipping rows already saved.
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content != None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                # NOTE(review): only the first <table> on the page is scanned.
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        # Document key: md5 of disclog URL + row id, so
                        # re-running the scraper does not duplicate rows.
                        hash = scrape.mkhash(url+id.string)
                        links = []
                        for atag in row.find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(url,atag['href']))
                        doc = foidocsdb.get(hash)
                        # Flatten the description cell's text nodes into one string.
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + string
                        if doc == None:
                            print "saving"
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
                                "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"
                    elif len(row.find_all('th')) == 5:
                        print "header row"
                    else:
                        print "ERROR number of columns incorrect"
                        print row
google-site-verification: google676a414ad086cefb.html | |
australian disclosure logs | |
are you looking for more information about: | |
contracts | |
gov orgs | |
lobbyists | |
1/1/11 title (Dept dfggdfgdf) | |
description: | |
source link: | |
documents: | |
#1 title link | |
rss feed here |
<?php
// Generate an RSS 2.0 feed of agencies ordered by last-modified time using
// the Universal Feed Generator (FeedWriter) library.
// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class
// NOTE(review): $db is used below but nothing in this file includes
// common.inc.php or otherwise defines it - confirm the including script
// sets it up, otherwise this fails at the get_view() call.
include("lib/FeedWriter.php");
//Creating an instance of FeedWriter class.
$TestFeed = new FeedWriter(RSS2);
//Setting the channel elements
//Use wrapper functions for common channelelements
$TestFeed->setTitle('Last Modified - All');
$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
//Retriving informations from database
$rows = $db->get_view("app", "byLastModified")->rows;
//print_r($rows);
foreach ($rows as $row) {
    //Create an empty FeedItem
    $newItem = $TestFeed->createNewItem();
    //Add elements to the feed item
    // NOTE(review): other scripts in this project access view rows as objects
    // ($row->value); the array access used here may need verification.
    $newItem->setTitle($row['name']);
    $newItem->setLink($row['id']);
    $newItem->setDate(date("c", $row['metadata']['lastModified']));
    $newItem->setDescription($row['name']);
    //Now add the feed item
    $TestFeed->addItem($newItem);
}
//OK. Everything is done. Now genarate the feed.
// ("genarateFeed" is the actual, misspelled, method name in the library.)
$TestFeed->genarateFeed();
?>
REM Run the document scraper, then pause so the console window stays open.
python scrape.py
pause
#http://packages.python.org/CouchDB/client.html | |
import couchdb | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
import re | |
import hashlib | |
from urlparse import urljoin | |
import time | |
import os | |
import mimetypes | |
import re | |
import urllib | |
import urlparse | |
def mkhash(input):
    """Return the hex MD5 digest of *input*, encoded as a byte string."""
    digest = hashlib.md5(input)
    return digest.hexdigest().encode("utf-8")
def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.

    Python 2 only (uses the urlparse module and urllib.quote). Adds a missing
    http:// scheme, IDNA-encodes the host, %-encodes path/query/fragment and
    truncates the result to 4096 characters.

    >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url
    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return ''  # bad UTF-8 chars in URL
    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed
    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)
    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    netloc = netloc.encode('idna')
    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]
def fullurl(url,href):
    """Resolve *href* against base *url*, with spaces %-encoded and any
    fragment (#...) removed."""
    cleaned = re.sub('#.*$', '', href.replace(" ", "%20"))
    return urljoin(url, cleaned)
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    # urllib2 handler that converts an HTTP 304 (Not Modified) response into a
    # normal response object with .code == 304, instead of raising an error,
    # so fetchURL can detect cache hits.
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
    """Fetch *url* with ETag/Last-Modified caching backed by CouchDB.

    Page metadata is stored in *docsdb* keyed by the md5 of the canonical URL,
    and the page body is stored as a timestamped attachment. Returns a tuple
    (final url, mime type, content); returns cached content if the page was
    scraped recently, and (None, None, None) on invalid URL, HTTP 304,
    suppressed re-scrape, or download error.
    """
    url = canonurl(url)
    hash = mkhash(url)
    req = urllib2.Request(url)
    print "Fetching %s" % url
    # NOTE(review): this validity check runs after canonurl()/Request(), so
    # several of these bad inputs would already have failed above.
    if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
        print "Not a valid HTTP url"
        return (None,None,None)
    doc = docsdb.get(hash)
    if doc == None:
        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
    else:
        # Serve the last stored attachment if the page was scraped within the
        # cache window. NOTE(review): 60*24*14*1000 seconds is ~233 days; if
        # "14 days" was intended the constant should be 60*60*24*14.
        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
            print "Uh oh, trying to scrape URL again too soon!"
            last_attachment_fname = doc["_attachments"].keys()[-1]
            last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
            return (doc['url'],doc['mime_type'],last_attachment.read())
        if scrape_again == False:
            print "Not scraping this URL again as requested"
            return (None,None,None)
    time.sleep(3) # wait 3 seconds to give webserver time to recover
    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
    #if there is a previous version stored in couchdb, load caching helper tags
    if doc.has_key('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.has_key('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])
    opener = urllib2.build_opener(NotModifiedHandler())
    try:
        url_handle = opener.open(req)
        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
        headers = url_handle.info() # the addinfourls have the .info() too
        doc['etag'] = headers.getheader("ETag")
        doc['last_modified'] = headers.getheader("Last-Modified")
        doc['date'] = headers.getheader("Date")
        doc['page_scraped'] = time.time()
        doc['web_server'] = headers.getheader("Server")
        doc['via'] = headers.getheader("Via")
        doc['powered_by'] = headers.getheader("X-Powered-By")
        doc['file_size'] = headers.getheader("Content-Length")
        content_type = headers.getheader("Content-Type")
        if content_type != None:
            doc['mime_type'] = content_type.split(";")[0]
        else:
            # No Content-Type header: guess from the URL's file extension.
            (type,encoding) = mimetypes.guess_type(url)
            doc['mime_type'] = type
        if hasattr(url_handle, 'code'):
            if url_handle.code == 304:
                print "the web page has not been modified"
                return (None,None,None)
            else:
                content = url_handle.read()
                docsdb.save(doc)
                doc = docsdb.get(hash) # need to get a _rev
                #store as attachment epoch-filename
                docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
                return (doc['url'], doc['mime_type'], content)
    except urllib2.URLError as e:
        # Record the failure on the document so repeated errors are visible.
        error = ""
        if hasattr(e, 'reason'):
            error = "error %s in downloading %s" % (str(e.reason), url)
        elif hasattr(e, 'code'):
            error = "error %s in downloading %s" % (e.code, url)
        print error
        doc['error'] = error
        docsdb.save(doc)
        return (None,None,None)
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): | |
(url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) | |
badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] | |
if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": | |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": | |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | |
soup = BeautifulSoup(content) | |
navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) | |
for nav in navIDs: | |
print "Removing element", nav['id'] | |
nav.extract() | |
navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) | |
for nav in navClasses: | |
print "Removing element", nav['class'] | |
nav.extract() | |
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) | |
linkurls = set([]) | |
for link in links: | |
if link.has_key("href"): | |
if link['href'].startswith("http"): | |
# lets not do external links for now | |
# linkurls.add(link['href']) | |
None | |
if link['href'].startswith("mailto"): | |
# not http | |
None | |
if link['href'].startswith("javascript"): | |
# not http | |
None | |
else: | |
# remove anchors and spaces in urls | |
linkurls.add(fullurl(url,link['href'])) | |
for linkurl in linkurls: | |
#print linkurl | |
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) | |
#couch = couchdb.Server('http://192.168.1.148:5984/')
# Connect to the CouchDB server holding the agency and document databases.
couch = couchdb.Server('http://192.168.1.148:5984/')
# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']
if __name__ == "__main__": | |
for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? | |
agency = agencydb.get(row.id) | |
print agency['name'] | |
for key in agency.keys(): | |
if key == "FOIDocumentsURL" and "status" not in agency.keys: | |
scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) | |
if key == 'website' and False: | |
scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) | |
if key.endswith('URL') and False: | |
print key | |
depth = 1 | |
if 'scrapeDepth' in agency.keys(): | |
depth = agency['scrapeDepth'] | |
scrapeAndStore(docsdb, agency[key],depth,key,agency['_id']) | |
agency['metadata']['lastScraped'] = time.time() | |
agencydb.save(agency) | |
import sys,os | |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | |
import genericScrapers | |
#RSS feed not detailed | |
#http://www.doughellmann.com/PyMOTW/abc/ | |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    # Disclosure-log scraper for the APVMA; the table columns already match
    # the expected (id, date, description, title, notes) order.
    def getAgencyID(self):
        # disclosr-agencies document id for the APVMA
        return "3cd40b1240e987cbcd3f0e67054ce259"

    def getURL(self):
        return "http://www.apvma.gov.au/about/foi/disclosure/index.php"

    def getColumns(self,columns):
        # Columns need no rearranging; unpack and return in the same order.
        (id, date, description, title, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    # Sanity checks that the subclass satisfies the abstract base, then scrape.
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
http://www.ipaustralia.gov.au/about-us/freedom-of-information/foi-disclosure-log/?page=35&sessionId=3644188 |
import sys,os | |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | |
import scrape | |
# Handle to the CouchDB database holding scraped FOI disclosure documents.
foidocsdb = scrape.couch['disclosr-foidocuments']
#rss feed has only one entry
http://www.daff.gov.au/about/foi/ips/disclosure-log | |
import sys,os | |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | |
import scrape | |
foidocsdb = scrape.couch['disclosr-foidocuments'] | |
import feedparser | |
# Exploratory snippet: parse the DEEWR disclosure-log RSS feed and show the
# first entry (the feed reportedly carries only one entry).
feed = feedparser.parse( "http://foi.deewr.gov.au/disclosure-log/rss")
print feed.entries[0]
#foreach feed.entries
http://www.awm.gov.au/about/AWM_Disclosure_Log.pdf |
www.finance.gov.au/foi/disclosure-log/foi-rss.xml |
http://www.righttoknow.org.au/feed/search/%20(latest_status:successful%20OR%20latest_status:partially_successful) |
REM Run the APVMA disclosure-log scraper, then pause to keep the window open.
python 3cd40b1240e987cbcd3f0e67054ce259.py
pause
<?php
// Full-text search page over disclosr-documents via the CouchDB-Lucene
// (_fti) "by_all" index; results link document ids to their source URLs.
include_once('include/common.inc.php');
include_header('Search');
?>
<div class="foundation-header">
    <h1><a href="search.php">Search</a></h1>
</div>
<form>
    <!-- Escape the reflected query value to prevent XSS via ?q= -->
    <input type="text" name="q" value="<?php if (isset($_REQUEST['q'])) echo htmlspecialchars($_REQUEST['q'], ENT_QUOTES, 'UTF-8'); ?>"/>
    <input type="submit"/>
</form>
<?php
if (isset($_REQUEST['q'])) {
    // URL-encode the raw query before embedding it in the Lucene request URL
    // (previously spaces/special characters broke or altered the request).
    $request = Requests::get($serverAddr . "disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=" . urlencode($_REQUEST['q']));
    $results = json_decode($request->body);
    $db = $server->get_db('disclosr-documents');
    foreach ($results->rows as $result) {
        //print_r($result);
        //$row = $db->get($result->id);
        // Escape stored values before echoing them back into the page.
        echo htmlspecialchars($result->doc->_id) . " " . htmlspecialchars($result->doc->url) . "<br>" . PHP_EOL;
    }
}
include_footer();
?>
<?php | <?php |
include_once('include/common.inc.php'); | include_once('include/common.inc.php'); |
/**
 * Render one agency document field, either as a read-only table row
 * ($mode == "view") or as an editable form control ($mode == "edit").
 *
 * @param string $key   field name from the CouchDB agency document
 * @param mixed  $value field value (scalar or array of scalars)
 * @param string $mode  "view" or "edit"
 *
 * Relies on the global $schemas for field labels/descriptions and RDFa
 * attributes, and on the global $db to populate the parentOrg selector.
 */
function displayValue($key, $value, $mode) {
    global $db, $schemas;
    if ($mode == "view") {
        // Internal fields (leading underscore) and metadata are never shown.
        if (strpos($key, "_") === 0 || $key == "metadata") return;
        echo "<tr>";
        echo "<td>";
        // Only schema-defined fields get a label/description; the guard fixes
        // undefined-index notices for ad-hoc fields added via the edit form.
        if (isset($schemas['agency']["properties"][$key])) {
            echo $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small>";
        }
        echo "</td><td>";
        if (is_array($value)) {
            // Arrays render as an ordered list, one <li> per element.
            echo "<ol>";
            foreach ($value as $subkey => $subvalue) {
                echo "<li ";
                if (isset($schemas['agency']["properties"][$key]['x-property'])) {
                    echo ' property="' . $schemas['agency']["properties"][$key]['x-property'] . '" ';
                } if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
                    echo ' itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '" ';
                }
                echo " >";
                echo "$subvalue</li>";
            }
            echo "</ol></td></tr>";
        } else {
            if (isset($schemas['agency']["properties"][$key]['x-property'])) {
                echo '<span property="' . $schemas['agency']["properties"][$key]['x-property'] . '">';
            } else {
                echo "<span>";
            }
            // URL-ish fields render as links.
            if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
                echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>$value</a>";
            } else {
                echo "$value</span>";
            }
        }
        // NOTE(review): for array values this emits a second </td></tr> after
        // the one above - preserved here for output compatibility.
        echo "</td></tr>";
    }
    if ($mode == "edit") {
        if (is_array($value)) {
            // Arrays are edited as a fieldset of one text input per element.
            echo '<div class="row">
                <div class="seven columns">
                    <fieldset>
                        <h5>' . $key . '</h5>';
            foreach ($value as $subkey => $subvalue) {
                echo "<label>$subkey</label><input class='input-text' type='text' id='$key$subkey' name='$key" . '[' . $subkey . "]' value='$subvalue'/></tr>";
            }
            echo "</fieldset>
                </div>
            </div>";
        } else {
            if (strpos($key, "_") === 0) {
                // Internal fields ride along as hidden inputs so saving
                // preserves _id/_rev.
                echo "<input type='hidden' id='$key' name='$key' value='$value'/>";
            } else if ($key == "parentOrg") {
                // parentOrg is chosen from the list of departments.
                echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>";
                $rows = $db->get_view("app", "byDeptStateName")->rows;
                //print_r($rows);
                foreach ($rows as $row) {
                    echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>";
                }
                echo " </select>";
            } else {
                echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
                if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
                    echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$value'>view</a>";
                }
                if ($key == 'abn') {
                    echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
                }
            }
        }
    }
    //
}
/**
 * Ensure every schema-defined agency property exists on $row so the edit form
 * renders an input for each; array fields get blank trailing slots so new
 * values can be appended.
 *
 * @param array $row agency document as an associative array
 * @return array the row with defaults filled in
 */
function addDefaultFields($row) {
    global $schemas;
    foreach (array_keys($schemas['agency']['properties']) as $fieldName) {
        $fieldType = $schemas['agency']['properties'][$fieldName]['type'];
        if (!isset($row[$fieldName])) {
            // Missing fields start empty: "" for strings, [""] for arrays.
            if ($fieldType == "string") {
                $row[$fieldName] = "";
            }
            if ($fieldType == "array") {
                $row[$fieldName] = Array("");
            }
        } else if ($fieldType == "array") {
            if (is_array($row[$fieldName])) {
                // Pad existing arrays with three blank entry slots.
                $row[$fieldName][] = "";
                $row[$fieldName][] = "";
                $row[$fieldName][] = "";
            } else {
                // Promote a lone scalar to an array, then pad with blanks.
                $row[$fieldName] = Array($row[$fieldName], "", "");
            }
        }
    }
    return $row;
}
$db = $server->get_db('disclosr-agencies'); | $db = $server->get_db('disclosr-agencies'); |
if (isset($_REQUEST['id'])) { | if (isset($_REQUEST['id'])) { |
//get an agency record as json/html, search by name/abn/id | //get an agency record as json/html, search by name/abn/id |
// by name = startkey="Ham"&endkey="Ham\ufff0" | // by name = startkey="Ham"&endkey="Ham\ufff0" |
// edit? | // edit? |
$obj = $db->get($_REQUEST['id']); | $obj = $db->get($_REQUEST['id']); |
include_header($obj->name); | include_header($obj->name); |
//print_r($row); | //print_r($row); |
if (sizeof($_POST) > 0) { | if (sizeof($_POST) > 0) { |
//print_r($_POST); | //print_r($_POST); |
foreach ($_POST as $postkey => $postvalue) { | foreach ($_POST as $postkey => $postvalue) { |
if ($postvalue == "") { | if ($postvalue == "") { |
unset($_POST[$postkey]); | unset($_POST[$postkey]); |
} | } |
if (is_array($postvalue)) { | if (is_array($postvalue)) { |
if (count($postvalue) == 1 && $postvalue[0] == "") { | if (count($postvalue) == 1 && $postvalue[0] == "") { |
unset($_POST[$postkey]); | unset($_POST[$postkey]); |
} else { | } else { |
foreach ($_POST[$postkey] as $key => &$value) { | foreach ($_POST[$postkey] as $key => &$value) { |
if ($value == "") { | if ($value == "") { |
unset($_POST[$postkey][$key]); | unset($_POST[$postkey][$key]); |
} | } |
} | } |
} | } |
} | } |
} | } |
// Persist the edited document only if the revision the user submitted is
// still the latest one in couchdb (optimistic-concurrency check).
if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
echo "Edited version was latest version, continue saving";
$newdoc = $_POST;
$newdoc['metadata']['lastModified'] = time();
$obj = $db->save($newdoc);
} else {
// Another editor saved a newer revision while this form was open; refuse.
echo "ALERT doc revised by someone else while editing. Document not saved.";
}
// Closes a block opened before this excerpt (single-agency request branch).
}
// NOTE(review): $mode is hard-coded to "view", so every $mode == "edit"
// branch below is currently dead code — confirm whether edit mode was
// intentionally disabled.
$mode = "view";
// NOTE(review): $obj is only assigned by the save above (or earlier, outside
// this excerpt) — presumably the fetched document; verify against the caller.
$rowArray = object_to_array($obj);
ksort($rowArray);
if ($mode == "edit") {
// Edit mode pre-populates any missing standard fields for the form.
$row = addDefaultFields($rowArray);
} else {
$row = $rowArray;
}
if ($mode == "view") {
// View mode renders the document as a schema.org/RDFa-annotated table.
echo '<div itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization" about="#' . $row['_id'] . '"><table width="100%">';
echo '<tr> <td colspan="2"><h3 itemprop="name">' . $row['name'] . "</h3></td></tr>";
echo "<tr><th>Field Name</th><th>Field Value</th></tr>";
}
if ($mode == "edit") {
?>
<input id="addfield" type="button" value="Add Field"/>
<script>
window.onload = function() {
$(document).ready(function() {
// put all your jQuery goodness in here.
// "Add Field" inserts a labelled text input (named after the prompt
// response) just before the submit button, so new document fields can
// be added ad hoc.
// http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/
$('#addfield').click(function() {
var field_name=window.prompt("fieldname?","");
if (field_name !="") {
$('#submitbutton').before($('<span></span>')
.append("<label>"+field_name+"</label>")
.append("<input class='input-text' type='text' id='"+field_name+"' name='"+field_name+"'/>")
);
}
});
});
};
</script>
<form id="editform" class="nice" method="post">
<?php
}
// Render every field of the document; displayValue() adapts to view/edit mode.
foreach ($row as $key => $value) {
echo displayValue($key, $value, $mode);
}
if ($mode == "view") {
echo "</table></div>";
}
if ($mode == "edit") {
echo '<input id="submitbutton" type="submit"/></form>';
}
// No specific agency requested: list every agency by canonical name.
} else {
// show all list
include_header('Agencies');
try {
/* $rows = $db->get_view("app", "showNamesABNs")->rows;
//print_r($rows);
foreach ($rows as $row) {
// print_r($row);
echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
(isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
. '</a></li>';
} */
$rows = $db->get_view("app", "byCanonicalName")->rows;
//print_r($rows);
echo '<ul>';
// Each entry links to this page with the agency's couchdb _id, annotated
// with schema.org / FOAF RDFa for machine readability.
foreach ($rows as $row) {
// print_r($row);
echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="getAgency.php?id=' . $row->value->_id . '">
<a href="getAgency.php?id=' . $row->value->_id . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">' .
$row->value->name
. '</span></a></li>';
}
echo "</ul>";
} catch (SetteeRestClientException $e) {
// Settee (couchdb client) errors are reported via the shared handler.
setteErrorHandler($e);
}
}
include_footer();
?>
<?php
// Shared bootstrap: timezone, relative base path for includes, couchdb and
// template helpers, the Requests HTTP library, and Amon error reporting.
date_default_timezone_set("Australia/Sydney");
$basePath = "";
// Scripts executed from a subdirectory need "../" to reach shared resources.
if (strstr($_SERVER['PHP_SELF'], "alaveteli/")
|| strstr($_SERVER['PHP_SELF'], "admin/")
|| strstr($_SERVER['PHP_SELF'], "lib/")
|| strstr($_SERVER['PHP_SELF'], "include/"))
$basePath = "../";
include_once ('couchdb.inc.php');
include_once ('template.inc.php');
require_once $basePath.'lib/Requests/library/Requests.php';
Requests::register_autoloader();
require $basePath."lib/amon-php/amon.php";
// NOTE(review): this secret key is committed in source control — move it to
// an environment variable or untracked config file and rotate the key.
Amon::config(array('address'=> 'http://127.0.0.1:2464',
'protocol' => 'http',
'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw"));
// Route uncaught exceptions to Amon.
Amon::setup_exception_handler();
# Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735
# Recursively converts a stdClass instance (and any nested stdClass values)
# into a plain associative array; non-stdClass values pass through unchanged.
function object_to_array(stdClass $Class) {
    # Typecasting to (array) converts the top level of stdClass -> array.
    $result = (array) $Class;
    # Walk the former properties and recurse into any remaining stdClass values.
    foreach (array_keys($result) as $property) {
        $value = $result[$property];
        if (is_object($value) && get_class($value) === 'stdClass') {
            $result[$property] = object_to_array($value);
        }
    }
    return $result;
}
# Convert an Array to stdClass. http://www.php.net/manual/en/language.types.object.php#102735
# Recursively converts an associative array (and any nested arrays) into
# stdClass objects; scalar values pass through unchanged.
function array_to_object(array $array) {
    # Recurse into nested arrays first; array_map preserves string keys
    # when given a single array.
    $array = array_map(function ($value) {
        return is_array($value) ? array_to_object($value) : $value;
    }, $array);
    # Typecast to (object) converts the top level array -> stdClass.
    return (object) $array;
}
# Derive a portfolio name from a department name by stripping the leading
# "Department of [the]" prefix, e.g. "Department of the Treasury" -> "Treasury".
function dept_to_portfolio($deptName) {
    # Fold "Department of the" into "Department of" first so one strip handles both.
    $normalised = str_replace("Department of the", "Department of", $deptName);
    $stripped = str_replace("Department of", "", $normalised);
    return trim($stripped);
}
# Turn a phrase into a tag: lowercase, drop commas and apostrophes, and
# replace spaces with underscores (replacements applied in that order).
function phrase_to_tag ($phrase) {
    $search  = array(",", "'", " ");
    $replace = array("",  "",  "_");
    return str_replace($search, $replace, strtolower($phrase));
}
# Build the absolute base URL (with trailing slash) of the directory that
# contains the currently executing script, e.g. "http://host/admin/".
function local_url() {
    $host = $_SERVER['HTTP_HOST'];
    # Trim trailing slashes/backslashes so the directory joins cleanly.
    $directory = rtrim(dirname($_SERVER['PHP_SELF']), '/\\');
    return "http://" . $host . $directory . "/";
}
# Extract the domain (minus any "www.") from a URL.
# Returns the host for scheme-qualified URLs; for schemeless input such as
# "example.com/page", parse_url() yields only a path, which is returned as-is.
function GetDomain($url)
{
    // ereg_replace() was deprecated in PHP 5.3 and removed in PHP 7;
    // preg_replace() with the equivalent PCRE pattern behaves identically
    // (strips every literal "www." occurrence).
    $nowww = preg_replace('/www\./', '', $url);
    $domain = parse_url($nowww);
    if(!empty($domain["host"]))
    {
        return $domain["host"];
    } else
    {
        return $domain["path"];
    }
}
<?php
// RSS 2.0 feed of the most recently modified agency documents.
// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class
include("lib/FeedWriter.php");
//Creating an instance of FeedWriter class.
$TestFeed = new FeedWriter(RSS2);
//Setting the channel elements
//Use wrapper functions for common channel elements
$TestFeed->setTitle('Last Modified - All');
$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
//Retrieving information from database
// NOTE(review): $db is not initialised anywhere in this file as shown —
// presumably an include that sets up the couchdb connection is missing;
// verify before deploying.
$rows = $db->get_view("app", "byLastModified")->rows;
//print_r($rows);
foreach ($rows as $row) {
//Create an empty FeedItem
$newItem = $TestFeed->createNewItem();
//Add elements to the feed item
// NOTE(review): other views in this codebase are accessed as objects
// ($row->value->name), but array access is used here — confirm the
// byLastModified view rows really support array-style access.
$newItem->setTitle($row['name']);
$newItem->setLink($row['id']);
$newItem->setDate(date("c", $row['metadata']['lastModified']));
$newItem->setDescription($row['name']);
//Now add the feed item
$TestFeed->addItem($newItem);
}
//OK. Everything is done. Now generate the feed.
$TestFeed->genarateFeed();
?>
#http://packages.python.org/CouchDB/client.html | |
import couchdb | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
import re | |
import hashlib | |
from urlparse import urljoin | |
import time | |
import os | |
import mimetypes | |
import re | |
import urllib | |
import urlparse | |
def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.

    Python 2 code (urlparse/urllib modules, str.encode('idna')).

    >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url

    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return ''  # bad UTF-8 chars in URL

    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed

    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)

    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    # IDN-encode the hostname so non-ASCII domains become punycode
    netloc = netloc.encode('idna')

    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    """urllib2 handler that turns an HTTP 304 Not Modified response into a
    normal return value (with .code set to 304) instead of raising."""

    def http_error_304(self, req, fp, code, message, headers):
        # Wrap the response so callers can inspect .code == 304 themselves.
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True): | |
url = canonurl(url) | |
hash = hashlib.md5(url).hexdigest().encode("utf-8") | |
req = urllib2.Request(url) | |
print "Fetching %s" % url | |
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": | |
print "Not a valid HTTP url" | |
return (None,None,None) | |
doc = docsdb.get(hash) | |
if doc == None: | |
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} | |
else: | |
if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 999999): | |
print "Uh oh, trying to scrape URL again too soon!" | |
last_attachment_fname = doc["_attachments"].keys()[-1] | |
last_attachment = docsdb.get_attachment(doc,last_attachment_fname) | |
return (doc['url'],doc['mime_type'],last_attachment) | |
if scrape_again == False: | |
print "Not scraping this URL again as requested" | |
return (None,None,None) | |
time.sleep(3) # wait 3 seconds to give webserver time to recover | |
req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") | |
#if there is a previous version stored in couchdb, load caching helper tags | |
if doc.has_key('etag'): | |
req.add_header("If-None-Match", doc['etag']) | |
if doc.has_key('last_modified'): | |
req.add_header("If-Modified-Since", doc['last_modified']) | |
opener = urllib2.build_opener(NotModifiedHandler()) | |
try: | |
url_handle = opener.open(req) | |
doc['url'] = url_handle.geturl() # may have followed a redirect to a new url | |
headers = url_handle.info() # the addinfourls have the .info() too | |
doc['etag'] = headers.getheader("ETag") | |
doc['last_modified'] = headers.getheader("Last-Modified") | |
doc['date'] = headers.getheader("Date") | |
doc['page_scraped'] = time.time() | |
doc['web_server'] = headers.getheader("Server") | |
doc['via'] = headers.getheader("Via") | |
doc['powered_by'] = headers.getheader("X-Powered-By") | |
doc['file_size'] = headers.getheader("Content-Length") | |
content_type = headers.getheader("Content-Type") | |
if content_type != None: | |
doc['mime_type'] = content_type.split(";")[0] | |
else: | |
(type,encoding) = mimetypes.guess_type(url) | |
doc['mime_type'] = type | |
if hasattr(url_handle, 'code'): | |
if url_handle.code == 304: | |
print "the web page has not been modified" | |
return (None,None,None) | |
else: | |
content = url_handle.read() | |
docsdb.save(doc) | |
doc = docsdb.get(hash) # need to get a _rev | |
docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) | |
return (doc['url'], doc['mime_type'], content) | |
#store as attachment epoch-filename | |
except urllib2.URLError as e: | |
error = "" | |
if hasattr(e, 'reason'): | |
error = "error %s in downloading %s" % (str(e.reason), url) | |
elif hasattr(e, 'code'): | |
error = "error %s in downloading %s" % (e.code, url) | |
print error | |
doc['error'] = error | |
docsdb.save(doc) | |
return (None,None,None) | |
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): | |
(url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) | |
badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] | |
if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": | |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": | |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | |
soup = BeautifulSoup(content) | |
navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) | |
for nav in navIDs: | |
print "Removing element", nav['id'] | |
nav.extract() | |
navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) | |
for nav in navClasses: | |
print "Removing element", nav['class'] | |
nav.extract() | |
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) | |
linkurls = set([]) | |
for link in links: | |
if link.has_key("href"): | |
if link['href'].startswith("http"): | |
# lets not do external links for now | |
# linkurls.add(link['href']) | |
None | |
if link['href'].startswith("mailto"): | |
# not http | |
None | |
if link['href'].startswith("javascript"): | |
# not http | |
None | |
else: | |
# remove anchors and spaces in urls | |
link['href'] = link['href'].replace(" ","%20") | |
link['href'] = re.sub('#.*$','',link['href']) | |
linkurls.add(urljoin(url,link['href'])) | |
for linkurl in linkurls: | |
#print linkurl | |
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) | |
# Driver: connect to the local couchdb server and scrape every agency the
# app/getScrapeRequired view returns, then stamp each agency with the time
# it was last scraped.
couch = couchdb.Server('http://127.0.0.1:5984/')

# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']

for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
    agency = agencydb.get(row.id)
    print agency['name']
    for key in agency.keys():
        if key == 'website':
            # The homepage itself is scraped without following links (depth 0).
            scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
        if key.endswith('URL'):
            # Any field named *URL is scraped one level deep by default;
            # an agency may override this via its scrapeDepth field.
            print key
            depth = 1
            if 'scrapeDepth' in agency.keys():
                depth = agency['scrapeDepth']
            scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
    # Record when this agency was last scraped so the view can skip it.
    agency['metadata']['lastScraped'] = time.time()
    agencydb.save(agency)
<?php
// Full-text search over scraped documents via couchdb-lucene.
include_once('include/common.inc.php');
include_header('Search');
?>
<div class="foundation-header">
    <h1><a href="search.php">Search</a></h1>
</div>
<form>
    <!-- SECURITY FIX: the raw query was previously echoed straight into this
         attribute, allowing reflected XSS; escape it for HTML output. -->
    <input type="text" name="q" value="<?php if (isset($_REQUEST['q'])) echo htmlspecialchars($_REQUEST['q'], ENT_QUOTES, 'UTF-8'); ?>"/>
    <input type="submit"/>
</form>
<?php
if (isset($_REQUEST['q'])) {
    // urlencode() the user query so reserved characters (&, =, spaces, ...)
    // cannot mangle or inject into the couchdb-lucene request URL; the raw
    // query was previously interpolated verbatim.
    $request = Requests::get($serverAddr . "disclosr-documents/_fti/_design/lucene/by_all?include_docs=true&q=" . urlencode($_REQUEST['q']));
    $results = json_decode($request->body);
    $db = $server->get_db('disclosr-documents');
    foreach ($results->rows as $result) {
        //print_r($result);
        //$row = $db->get($result->id);
        // Escape stored values on output in case a scraped URL contains markup.
        echo htmlspecialchars($result->doc->_id, ENT_QUOTES, 'UTF-8') . " "
            . htmlspecialchars($result->doc->url, ENT_QUOTES, 'UTF-8') . "<br>" . PHP_EOL;
    }
}
include_footer();
?>