";d.p.dom.innerHTML=
+a;return d}sigma.classes.Cascade.call(this);var d=this;this.instance=b;this.monitoring=!1;this.p={fps:40,dom:h,globalProbes:{"Time (ms)":sigma.chronos.getExecutionTime,Queue:sigma.chronos.getQueuedTasksCount,Tasks:sigma.chronos.getTasksCount,FPS:sigma.chronos.getFPS},localProbes:{"Nodes count":function(){return d.instance.graph.nodes.length},"Edges count":function(){return d.instance.graph.edges.length}}};this.activate=function(){if(!d.monitoring)d.monitoring=window.setInterval(e,1E3/d.p.fps);return d};
+this.desactivate=function(){if(d.monitoring)window.clearInterval(d.monitoring),d.monitoring=null,d.p.dom.innerHTML="";return d}}function j(b){function h(b){if(a.p.mouseEnabled&&(e(a.mouseX,a.mouseY,a.ratio*(0<(void 0!=b.wheelDelta&&b.wheelDelta||void 0!=b.detail&&-b.detail)?a.p.zoomMultiply:1/a.p.zoomMultiply)),a.p.blockScroll))b.preventDefault?b.preventDefault():b.returnValue=!1}function e(b,c,e){if(!a.isMouseDown&&(window.clearInterval(a.interpolationID),n=void 0!=e,i=a.stageX,j=b,k=a.stageY,l=
+c,f=e||a.ratio,f=Math.min(Math.max(f,a.p.minRatio),a.p.maxRatio),u=a.p.directZooming?1-(n?a.p.zoomDelta:a.p.dragDelta):0,a.ratio!=f||a.stageX!=j||a.stageY!=l))d(),a.interpolationID=window.setInterval(d,50),a.dispatch("startinterpolate")}function d(){u+=n?a.p.zoomDelta:a.p.dragDelta;u=Math.min(u,1);var b=sigma.easing.quadratic.easeout(u),c=a.ratio;a.ratio=c*(1-b)+f*b;n?(a.stageX=j+(a.stageX-j)*a.ratio/c,a.stageY=l+(a.stageY-l)*a.ratio/c):(a.stageX=i*(1-b)+j*b,a.stageY=k*(1-b)+l*b);a.dispatch("interpolate");
+if(1<=u)window.clearInterval(a.interpolationID),b=a.ratio,n?(a.ratio=f,a.stageX=j+(a.stageX-j)*a.ratio/b,a.stageY=l+(a.stageY-l)*a.ratio/b):(a.stageX=j,a.stageY=l),a.dispatch("stopinterpolate")}sigma.classes.Cascade.call(this);sigma.classes.EventDispatcher.call(this);var a=this;this.p={minRatio:1,maxRatio:32,marginRatio:1,zoomDelta:0.1,dragDelta:0.3,zoomMultiply:2,directZooming:!1,blockScroll:!0,inertia:1.1,mouseEnabled:!0};var g=0,c=0,i=0,k=0,f=1,j=0,l=0,s=0,q=0,z=0,m=0,u=0,n=!1;this.stageY=this.stageX=
+0;this.ratio=1;this.mouseY=this.mouseX=0;this.isMouseDown=!1;b.addEventListener("DOMMouseScroll",h,!0);b.addEventListener("mousewheel",h,!0);b.addEventListener("mousemove",function(b){a.mouseX=void 0!=b.offsetX&&b.offsetX||void 0!=b.layerX&&b.layerX||void 0!=b.clientX&&b.clientX;a.mouseY=void 0!=b.offsetY&&b.offsetY||void 0!=b.layerY&&b.layerY||void 0!=b.clientY&&b.clientY;if(a.isMouseDown){var d=a.mouseX-g+i,f=a.mouseY-c+k;if(d!=a.stageX||f!=a.stageY)q=s,m=z,s=d,z=f,a.stageX=d,a.stageY=f,a.dispatch("drag")}a.dispatch("move");
+b.preventDefault?b.preventDefault():b.returnValue=!1},!0);b.addEventListener("mousedown",function(b){if(a.p.mouseEnabled)a.isMouseDown=!0,a.dispatch("mousedown"),i=a.stageX,k=a.stageY,g=a.mouseX,c=a.mouseY,q=s=a.stageX,m=z=a.stageY,a.dispatch("startdrag"),b.preventDefault?b.preventDefault():b.returnValue=!1},!0);document.addEventListener("mouseup",function(b){if(a.p.mouseEnabled&&a.isMouseDown)a.isMouseDown=!1,a.dispatch("mouseup"),(i!=a.stageX||k!=a.stageY)&&e(a.stageX+a.p.inertia*(a.stageX-q),a.stageY+
+a.p.inertia*(a.stageY-m)),b.preventDefault?b.preventDefault():b.returnValue=!1},!0);this.checkBorders=function(){return a};this.interpolate=e}function m(b,h,e,d,a,g,c){function i(a){var b=d,c="fixed"==f.p.labelSize?f.p.defaultLabelSize:f.p.labelSizeRatio*a.displaySize;b.font=(f.p.hoverFontStyle||f.p.fontStyle||"")+" "+c+"px "+(f.p.hoverFont||f.p.font||"");b.fillStyle="node"==f.p.labelHoverBGColor?a.color||f.p.defaultNodeColor:f.p.defaultHoverLabelBGColor;b.beginPath();if(f.p.labelHoverShadow)b.shadowOffsetX=
+0,b.shadowOffsetY=0,b.shadowBlur=4,b.shadowColor=f.p.labelHoverShadowColor;sigma.tools.drawRoundRect(b,Math.round(a.displayX-c/2-2),Math.round(a.displayY-c/2-2),Math.round(b.measureText(a.label).width+1.5*a.displaySize+c/2+4),Math.round(c+4),Math.round(c/2+2),"left");b.closePath();b.fill();b.shadowOffsetX=0;b.shadowOffsetY=0;b.shadowBlur=0;b.beginPath();b.fillStyle="node"==f.p.nodeBorderColor?a.color||f.p.defaultNodeColor:f.p.defaultNodeBorderColor;b.arc(Math.round(a.displayX),Math.round(a.displayY),
+a.displaySize+f.p.borderSize,0,2*Math.PI,!0);b.closePath();b.fill();b.beginPath();b.fillStyle="node"==f.p.nodeHoverColor?a.color||f.p.defaultNodeColor:f.p.defaultNodeHoverColor;b.arc(Math.round(a.displayX),Math.round(a.displayY),a.displaySize,0,2*Math.PI,!0);b.closePath();b.fill();b.fillStyle="node"==f.p.labelHoverColor?a.color||f.p.defaultNodeColor:f.p.defaultLabelHoverColor;b.fillText(a.label,Math.round(a.displayX+1.5*a.displaySize),Math.round(a.displayY+c/2-3));return f}function k(a){if(isNaN(a.x)||
+isNaN(a.y))throw Error("A node's coordinate is not a number (id: "+a.id+")");return!a.hidden&&a.displayX+a.displaySize>-j/3&&a.displayX-a.displaySize<4*j/3&&a.displayY+a.displaySize>-l/3&&a.displayY-a.displaySize<4*l/3}sigma.classes.Cascade.call(this);var f=this;this.p={labelColor:"default",defaultLabelColor:"#000",labelHoverBGColor:"default",defaultHoverLabelBGColor:"#fff",labelHoverShadow:!0,labelHoverShadowColor:"#000",labelHoverColor:"default",defaultLabelHoverColor:"#000",labelActiveBGColor:"default",
+defaultActiveLabelBGColor:"#fff",labelActiveShadow:!0,labelActiveShadowColor:"#000",labelActiveColor:"default",defaultLabelActiveColor:"#000",labelSize:"fixed",defaultLabelSize:12,labelSizeRatio:2,labelThreshold:6,font:"Arial",hoverFont:"",activeFont:"",fontStyle:"",hoverFontStyle:"",activeFontStyle:"",edgeColor:"source",defaultEdgeColor:"#aaa",defaultEdgeType:"line",defaultNodeColor:"#aaa",nodeHoverColor:"node",defaultNodeHoverColor:"#fff",nodeActiveColor:"node",defaultNodeActiveColor:"#fff",borderSize:0,
+nodeBorderColor:"node",defaultNodeBorderColor:"#fff",edgesSpeed:200,nodesSpeed:200,labelsSpeed:200};var j=g,l=c;this.currentLabelIndex=this.currentNodeIndex=this.currentEdgeIndex=0;this.task_drawLabel=function(){for(var b=a.nodes.length,c=0;c++=f.p.labelThreshold){var k="fixed"==f.p.labelSize?f.p.defaultLabelSize:f.p.labelSizeRatio*d.displaySize;h.font=
+f.p.fontStyle+k+"px "+f.p.font;h.fillStyle="node"==f.p.labelColor?d.color||f.p.defaultNodeColor:f.p.defaultLabelColor;h.fillText(d.label,Math.round(d.displayX+1.5*d.displaySize),Math.round(d.displayY+k/2-3))}}else f.currentLabelIndex++;return f.currentLabelIndex(b*=2)?0.5*b*b:-0.5*(--b*(b-2)-1)};sigma.tools.drawRoundRect=function(b,h,e,d,a,g,c){var g=g?g:0,i=c?c:[],i="string"==typeof i?i.split(" "):i,c=g&&(0<=i.indexOf("topleft")||0<=i.indexOf("top")||0<=i.indexOf("left")),j=g&&(0<=i.indexOf("topright")||0<=i.indexOf("top")||0<=i.indexOf("right")),f=g&&(0<=i.indexOf("bottomleft")||0<=i.indexOf("bottom")||0<=i.indexOf("left")),i=g&&(0<=i.indexOf("bottomright")||
+0<=i.indexOf("bottom")||0<=i.indexOf("right"));b.moveTo(h,e+g);c?b.arcTo(h,e,h+g,e,g):b.lineTo(h,e);j?(b.lineTo(h+d-g,e),b.arcTo(h+d,e,h+d,e+g,g)):b.lineTo(h+d,e);i?(b.lineTo(h+d,e+a-g),b.arcTo(h+d,e+a,h+d-g,e+a,g)):b.lineTo(h+d,e+a);f?(b.lineTo(h+g,e+a),b.arcTo(h,e+a,h,e+a-g,g)):b.lineTo(h,e+a);b.lineTo(h,e+g)};sigma.tools.getRGB=function(b,g){var b=b.toString(),e={r:0,g:0,b:0};if(3<=b.length&&"#"==b.charAt(0)){var d=b.length-1;6==d?e={r:parseInt(b.charAt(1)+b.charAt(2),16),g:parseInt(b.charAt(3)+
+b.charAt(4),16),b:parseInt(b.charAt(5)+b.charAt(5),16)}:3==d&&(e={r:parseInt(b.charAt(1)+b.charAt(1),16),g:parseInt(b.charAt(2)+b.charAt(2),16),b:parseInt(b.charAt(3)+b.charAt(3),16)})}g&&(e=[e.r,e.g,e.b]);return e};sigma.tools.rgbToHex=function(b,g,e){return sigma.tools.toHex(b)+sigma.tools.toHex(g)+sigma.tools.toHex(e)};sigma.tools.toHex=function(b){b=parseInt(b,10);if(isNaN(b))return"00";b=Math.max(0,Math.min(b,255));return"0123456789ABCDEF".charAt((b-b%16)/16)+"0123456789ABCDEF".charAt(b%16)};
+sigma.publicPrototype=p.prototype})();
--- /dev/null
+++ b/lib/phpquery
--- a/lib/springy
+++ /dev/null
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -17,6 +17,7 @@
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
+ "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, mandated by the Senate" ),
"budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
"grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
@@ -49,6 +50,8 @@
"items" => Array("type" => "string")),
"hasRestrictiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright",
"items" => Array("type" => "string")),
+ "hasPermissiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Permissive Licence", "description" => "Has any page licenced under terms more permissive than Crown Copyright but not clear CCBY",
+ "items" => Array("type" => "string")),
"hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
"items" => Array("type" => "string")),
),
--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
print "Fetching %s" % url
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
print "Not a valid HTTP url"
- return (None,None)
+ return (None,None,None)
doc = docsdb.get(hash)
if doc == None:
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
@@ -86,13 +86,14 @@
print "Uh oh, trying to scrape URL again too soon!"
last_attachment_fname = doc["_attachments"].keys()[-1]
last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
- return (doc['mime_type'],last_attachment)
+ return (doc['url'],doc['mime_type'],last_attachment)
if scrape_again == False:
print "Not scraping this URL again as requested"
- return (None,None)
+ return (None,None,None)
time.sleep(3) # wait 3 seconds to give webserver time to recover
+ req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
#if there is a previous version stored in couchdb, load caching helper tags
if doc.has_key('etag'):
req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
opener = urllib2.build_opener(NotModifiedHandler())
try:
url_handle = opener.open(req)
+ doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
headers = url_handle.info() # the addinfourls have the .info() too
doc['etag'] = headers.getheader("ETag")
doc['last_modified'] = headers.getheader("Last-Modified")
doc['date'] = headers.getheader("Date")
doc['page_scraped'] = time.time()
doc['web_server'] = headers.getheader("Server")
+ doc['via'] = headers.getheader("Via")
doc['powered_by'] = headers.getheader("X-Powered-By")
doc['file_size'] = headers.getheader("Content-Length")
content_type = headers.getheader("Content-Type")
@@ -119,13 +122,13 @@
if hasattr(url_handle, 'code'):
if url_handle.code == 304:
print "the web page has not been modified"
- return (None,None)
+ return (None,None,None)
else:
content = url_handle.read()
docsdb.save(doc)
doc = docsdb.get(hash) # need to get a _rev
docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
- return (doc['mime_type'], content)
+ return (doc['url'], doc['mime_type'], content)
#store as attachment epoch-filename
except urllib2.URLError as e:
error = ""
@@ -136,21 +139,22 @@
print error
doc['error'] = error
docsdb.save(doc)
- return (None,None)
+ return (None,None,None)
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
- (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
- if content != None and depth > 0:
+ (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+ badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+ if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
# http://www.crummy.com/software/BeautifulSoup/documentation.html
soup = BeautifulSoup(content)
- navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+ navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
for nav in navIDs:
print "Removing element", nav['id']
nav.extract()
- navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+ navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
for nav in navClasses:
print "Removing element", nav['class']
nav.extract()
@@ -169,7 +173,10 @@
# not http
None
else:
- linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+ # remove anchors and spaces in urls
+ link['href'] = link['href'].replace(" ","%20")
+ link['href'] = re.sub('#.*$','',link['href'])
+ linkurls.add(urljoin(url,link['href']))
for linkurl in linkurls:
#print linkurl
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
--- a/unimplemented/foundation.html
+++ /dev/null
@@ -1,137 +1,1 @@
-
-
-
-
-
-
-
-
-
-
-
-
- Welcome to Foundation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Welcome to Foundation
-
This is version 2.1.4 released on December 19, 2011
-
-
-
-
-
-
-
The Grid
-
-
-
-
-
-
This is a twelve column section in a row. Each of these includes a div.panel element so you can see where the columns are - it's not required at all for the grid.
We're stoked you want to try Foundation! To get going, this file (index.html) includes some basic styles you can modify, play around with, or totally destroy to get going.
-
-
Other Resources
-
Once you've exhausted the fun in this document, you should check out:
@foundationzurb Ping us on Twitter if you have questions. If you build something with this we'd love to see it (and send you a totally boss sticker).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/unimplemented/humans.txt
+++ /dev/null
@@ -1,8 +1,1 @@
-/* Foundation was made by ZURB, an interaction design and design strategy firm in Campbell, CA */
-/* zurb.com */
-/* humanstxt.org */
-/* SITE */
- Standards: HTML5, CSS3
- Components: jQuery, Orbit, Reveal
- Software: Coda, Textmate, Git
--- a/unimplemented/lastUpdated.php
+++ /dev/null
@@ -1,2 +1,1 @@
-for each agency, record when last changed (number of days too) and show a couple of URLs that were in that change
--- a/unimplemented/validation.php
+++ /dev/null
--- a/unimplemented/webservers.php
+++ /dev/null
@@ -1,1 +1,1 @@
-for each agency, find a scrapped document and read the webserver off it
+