varnish config dec 2013
varnish config dec 2013

file:a/README.rst -> file:b/README.rst
This CKAN Extension customises a CKAN instance for the hosting of data.gov.au. This CKAN Extension customises a CKAN instance for the hosting of data.gov.au.
   
It comprises: It comprises:
   
* A CKAN Extension "plugin" at ``ckanext/datagovau/plugin.py`` which, when * A CKAN Extension "plugin" at ``ckanext/datagovau/plugin.py`` which, when
loaded, overrides various settings in the core ``ini``-file to provide: loaded, overrides various settings in the core ``ini``-file to provide:
* A path to local customisations of the core templates * A path to local customisations of the core templates to include AGLS/Dublin Core minimum metadata
* A custom Package edit form that defaults to cc-by licence * A custom Package edit form that defaults to cc-by licence
* A custom n3/rdf output format * A custom n3/rdf output format
  * Replaces links with http/https protocol independent versions
  * Provides HTML to users to embed data previews on their own website
   
* A cut down licenses.json file * A cut down licenses.json file
   
Installation Installation
============ ============
   
To install this package, from your CKAN virtualenv, run the following from your CKAN base folder (e.g. ``pyenv/``):: To install this package, from your CKAN virtualenv, run the following from your CKAN base folder (e.g. ``pyenv/``)::
   
pip install -e git+https://github.com/okfn/ckanext-datagovau#egg=ckanext-datagovau pip install -e git+https://github.com/okfn/ckanext-datagovau#egg=ckanext-datagovau
   
Then activate it by setting ``ckan.plugins = datagovau`` in your main ``ini``-file. Then activate it by setting ``ckan.plugins = datagovau`` in your main ``ini``-file.
   
To add the cut down licenses.json set ``licenses_group_url = http://%(ckan.site_url)/licenses.json`` To add the cut down licenses.json set ``licenses_group_url = http://%(ckan.site_url)/licenses.json``
or copy ``ckanext/datagovau/theme/public/licenses.json`` to the same folder as your CKAN config ini file or copy ``ckanext/datagovau/theme/public/licenses.json`` to the same folder as your CKAN config ini file
and set ``licenses_group_url = file://%(here)s/licenses.json`` and set ``licenses_group_url = file://%(here)s/licenses.json``
   
   
   
# This is a basic VCL configuration file for varnish. See the vcl(7) # This is a basic VCL configuration file for varnish. See the vcl(7)
# man page for details on VCL syntax and semantics. # man page for details on VCL syntax and semantics.
# #
# Default backend definition. Set this to point to your content # Default backend definition. Set this to point to your content
# server. # server.
# #
backend default { backend default {
.host = "127.0.0.1"; .host = "127.0.0.1";
.port = "8080"; .port = "8080";
} }
  backend geoserver {
  .host = "172.31.18.207";
  .port = "8983";
  }
   
sub vcl_fetch { sub vcl_fetch {
set beresp.grace = 1h; set beresp.grace = 1h;
  unset beresp.http.Server;
if (beresp.http.content-type ~ "(text|application)") { # These status codes should always pass through and never cache.
  if ( beresp.status >= 500 ) {
  set beresp.ttl = 0s;
  }
  if (beresp.http.content-type ~ "(text|javascript|json|xml|html)") {
set beresp.do_gzip = true; set beresp.do_gzip = true;
} }
  # CKAN cache headers are used by Varnish cache, but should not be propagated to
  # the Internet. Tell browsers and proxies not to cache. This means Varnish always
  # gets the responsibility to serve the right content at all times.
  if (beresp.http.Cache-Control ~ "max-age") {
  unset beresp.http.set-cookie;
  set beresp.http.Cache-Control = "no-cache";
  }
   
  # Encourage assets to be cached by proxies and browsers
  # JS and CSS may be gzipped depending on headers
  # see https://developers.google.com/speed/docs/best-practices/caching
  if (req.url ~ "\.(css|js)") {
  set beresp.http.Vary = "Accept-Encoding";
  }
   
  # Encourage assets to be cached by proxies and browsers for 1 day
  if (req.url ~ "\.(png|gif|jpg|swf|css|js)") {
  unset beresp.http.set-cookie;
  set beresp.http.Cache-Control = "public, max-age=86400";
  set beresp.ttl = 1d;
  }
   
  # Encourage CKAN vendor assets (which are versioned) to be cached by
  # proxies and browsers for 1 year
  if (req.url ~ "^/scripts/vendor/") {
  unset beresp.http.set-cookie;
  set beresp.http.Cache-Control = "public, max-age=31536000";
  set beresp.ttl = 12m;
  }
  }
  sub vcl_recv {
  if (req.http.user-agent ~ "Ezooms" || req.http.user-agent ~ "Ahrefs") {
  error 403;
  }
  if (req.url ~ "^/geoserver/") {
  set req.backend = geoserver;
  } else {
  set req.backend = default;
  #redirect secure traffic to https
  if ( (req.http.Cookie ~ "auth_tkt" || req.http.Cookie ~ "ckan" || req.url ~ "user/(reset|login)") && req.http.X-Forwarded-Proto !~ "(?i)https") {
  set req.http.x-Redir-Url = "https://data.gov.au" + req.url;
  error 753 req.http.x-Redir-Url;
  }
  # remove locale links
  if (req.url ~ "/((?!js)..|.._..|sr_Latn)/") {
  set req.http.x-Redir-Url = regsub(req.url, "/((?!js)..|.._..|sr_Latn)/", "/");
  error 751 req.http.x-Redir-Url;
  }
  # rewrite broken resources
  if (req.url ~ "leaflet") {
  set req.url = regsub(req.url, "fanstatic/ckanext-spatial/:version:2013-09-13T02:32:17.87/:bundle:js/vendor/leaflet/images", "js/vendor/leaflet/images");
  }
  # remove old hostnames
  if (req.http.host ~ "data.australia.gov.au") {
  set req.http.x-Redir-Url = "http://data.gov.au" + req.url;
  error 751 req.http.x-Redir-Url;
  }
   
  if (req.url ~ "^/_tracking") {
  // exclude web spiders from statistics
  if (req.http.user-agent ~ "Googlebot" || req.http.user-agent ~ "baidu" || req.http.user-agent ~ "bing") {
  error 200;
  } else {
  return (pass);
  }
  }
if (req.url ~ "\.(png|gif|jpg|jpeg|swf|css|js|woff|eot)$") { if (req.url ~ "\.(png|gif|jpg|jpeg|swf|css|js|woff|eot)$") {
unset beresp.http.set-cookie; //Varnish to deliver content from cache even if the request otherwise indicates that the request should be passed
  return(lookup);
} }
} }
sub vcl_recv { // Remove has_js and Google Analytics cookies. Evan added sharethis cookies
if (req.url ~ "^/_tracking") { set req.http.Cookie = regsuball(req.http.Cookie, "(^|;\s*)(__[a-z]+|has_js|cookie-agreed-en|_csoot|_csuid|_chartbeat2)=[^;]*", "");
return (pass);  
} // Remove a ";" prefix, if present.
if (req.url ~ "\.(png|gif|jpg|jpeg|swf|css|js|woff|eot)$") { set req.http.Cookie = regsub(req.http.Cookie, "^;\s*", "");
return(lookup); // Remove empty cookies.
} if (req.http.Cookie ~ "^\s*$") {
if (req.url ~ "/(..|.._..)/") { unset req.http.Cookie;
set req.url = regsub(req.url, "/((?!js)..|.._..)/", "/"); }
}  
if (req.http.Cookie) { remove req.http.X-Forwarded-For;
set req.http.Cookie = regsuball(req.http.Cookie, "(^|; ) *__utm.=[^;]+;? *", "\1"); # removes all cookies named __utm? (utma, utmb...) - tracking thing set req.http.X-Forwarded-For = req.http.X-Real-IP;
   
if (req.http.Cookie == "") {  
remove req.http.Cookie;  
}  
}  
} }
sub vcl_hash { sub vcl_hash {
# http://serverfault.com/questions/112531/ignoring-get-parameters-in-varnish-vcl # http://serverfault.com/questions/112531/ignoring-get-parameters-in-varnish-vcl
set req.url = regsub(req.url, "(?:(.com|.au))/((?!js)..|.._..)/", "/"); set req.url = regsub(req.url, "(?:(.com|.au))/((?!js)..|.._..|sr_Latn)/", "/");
hash_data(req.url); hash_data(req.url);
if (req.http.host) { if (req.http.host) {
hash_data(req.http.host); hash_data(req.http.host);
} else { } else {
hash_data(server.ip); hash_data(server.ip);
} }
return (hash); if (req.http.Cookie) {
  hash_data(req.http.Cookie);
  }
} }
sub vcl_deliver { sub vcl_deliver {
if (!resp.http.Vary) { if (!resp.http.Vary) {
set resp.http.Vary = "Accept-Encoding"; set resp.http.Vary = "Accept-Encoding";
} else if (resp.http.Vary !~ "(?i)Accept-Encoding") { } else if (resp.http.Vary !~ "(?i)Accept-Encoding") {
set resp.http.Vary = resp.http.Vary + ",Accept-Encoding"; set resp.http.Vary = resp.http.Vary + ",Accept-Encoding";
} }
if (obj.hits > 0) { remove resp.http.X-Varnish;
set resp.http.X-Cache = "HIT"; remove resp.http.Via;
} else { remove resp.http.Age;
set resp.http.X-Cache = "MISS"; remove resp.http.X-Powered-By;
} if (req.url ~ "^/geoserver/") {
  set resp.http.Access-Control-Allow-Origin = "*";
  set resp.http.Access-Control-Allow-Methods = "GET, POST, PUT, DELETE";
  set resp.http.Access-Control-Allow-Headers = "Origin, X-Requested-With, Content-Type, Accept";
  }
} }
  sub vcl_error {
  remove obj.http.Server;
  if (obj.status == 751) {
  set obj.http.Location = obj.response;
  set obj.status = 301;
  return (deliver);
  }
  if (obj.status == 753) {
  set obj.http.Location = obj.response;
  set obj.status = 301;
  return (deliver);
  }
  }
# #
# Below is a commented-out copy of the default VCL logic. If you # Below is a commented-out copy of the default VCL logic. If you
# redefine any of these subroutines, the built-in logic will be # redefine any of these subroutines, the built-in logic will be
# appended to your code. # appended to your code.
# sub vcl_recv { # sub vcl_recv {
# if (req.restarts == 0) { # if (req.restarts == 0) {
# if (req.http.x-forwarded-for) { # if (req.http.x-forwarded-for) {
# set req.http.X-Forwarded-For = # set req.http.X-Forwarded-For =
# req.http.X-Forwarded-For + ", " + client.ip; # req.http.X-Forwarded-For + ", " + client.ip;
# } else { # } else {
# set req.http.X-Forwarded-For = client.ip; # set req.http.X-Forwarded-For = client.ip;
# } # }
# } # }
# if (req.request != "GET" && # if (req.request != "GET" &&
# req.request != "HEAD" && # req.request != "HEAD" &&
# req.request != "PUT" && # req.request != "PUT" &&
# req.request != "POST" && # req.request != "POST" &&
# req.request != "TRACE" && # req.request != "TRACE" &&
# req.request != "OPTIONS" && # req.request != "OPTIONS" &&
# req.request != "DELETE") { # req.request != "DELETE") {
# /* Non-RFC2616 or CONNECT which is weird. */ # /* Non-RFC2616 or CONNECT which is weird. */
# return (pipe); # return (pipe);
# } # }
# if (req.request != "GET" && req.request != "HEAD") { # if (req.request != "GET" && req.request != "HEAD") {
# /* We only deal with GET and HEAD by default */ # /* We only deal with GET and HEAD by default */
# return (pass); # return (pass);
# } # }
# if (req.http.Authorization || req.http.Cookie) { # if (req.http.Authorization || req.http.Cookie) {
# /* Not cacheable by default */ # /* Not cacheable by default */
# return (pass); # return (pass);
# } # }
# return (lookup); # return (lookup);
# } # }
# #
# sub vcl_pipe { # sub vcl_pipe {
# # Note that only the first request to the backend will have # # Note that only the first request to the backend will have
# # X-Forwarded-For set. If you use X-Forwarded-For and want to # # X-Forwarded-For set. If you use X-Forwarded-For and want to
# # have it set for all requests, make sure to have: # # have it set for all requests, make sure to have:
# # set bereq.http.connection = "close"; # # set bereq.http.connection = "close";
# # here. It is not set by default as it might break some broken web # # here. It is not set by default as it might break some broken web
# # applications, like IIS with NTLM authentication. # # applications, like IIS with NTLM authentication.
# return (pipe); # return (pipe);
# } # }
# #
# sub vcl_pass { # sub vcl_pass {
# return (pass); # return (pass);
# } # }
# #
# sub vcl_hash { # sub vcl_hash {
# hash_data(req.url); # hash_data(req.url);
# if (req.http.host) { # if (req.http.host) {
# hash_data(req.http.host); # hash_data(req.http.host);
# } else { # } else {
# hash_data(server.ip); # hash_data(server.ip);
# } # }
# return (hash); # return (hash);
# } # }
# #
# sub vcl_hit { # sub vcl_hit {
# return (deliver); # return (deliver);
# } # }
# #
# sub vcl_miss { # sub vcl_miss {
# return (fetch); # return (fetch);
# } # }
# #
# sub vcl_fetch { # sub vcl_fetch {
# if (beresp.ttl <= 0s || # if (beresp.ttl <= 0s ||
# beresp.http.Set-Cookie || # beresp.http.Set-Cookie ||
# beresp.http.Vary == "*") { # beresp.http.Vary == "*") {
# /* # /*
# * Mark as "Hit-For-Pass" for the next 2 minutes # * Mark as "Hit-For-Pass" for the next 2 minutes
# */ # */
# set beresp.ttl = 120 s; # set beresp.ttl = 120 s;
# return (hit_for_pass); # return (hit_for_pass);
# } # }
# return (deliver); # return (deliver);
# } # }
# #
# sub vcl_deliver { # sub vcl_deliver {
# return (deliver); # return (deliver);
# } # }
# #
# sub vcl_error { # sub vcl_error {
# set obj.http.Content-Type = "text/html; charset=utf-8"; # set obj.http.Content-Type = "text/html; charset=utf-8";
# set obj.http.Retry-After = "5"; # set obj.http.Retry-After = "5";
# synthetic {" # synthetic {"
# <?xml version="1.0" encoding="utf-8"?> # <?xml version="1.0" encoding="utf-8"?>
# <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" # <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
# "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> # "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
# <html> # <html>
# <head> # <head>
# <title>"} + obj.status + " " + obj.response + {"</title> # <title>"} + obj.status + " " + obj.response + {"</title>
# </head> # </head>
# <body> # <body>
# <h1>Error "} + obj.status + " " + obj.response + {"</h1> # <h1>Error "} + obj.status + " " + obj.response + {"</h1>
# <p>"} + obj.response + {"</p> # <p>"} + obj.response + {"</p>
# <h3>Guru Meditation:</h3> # <h3>Guru Meditation:</h3>
# <p>XID: "} + req.xid + {"</p> # <p>XID: "} + req.xid + {"</p>
# <hr> # <hr>
# <p>Varnish cache server</p> # <p>Varnish cache server</p>
# </body> # </body>
# </html> # </html>
# "}; # "};
# return (deliver); # return (deliver);
# } # }
# #
# sub vcl_init { # sub vcl_init {
# return (ok); # return (ok);
# } # }
# #
# sub vcl_fini { # sub vcl_fini {
# return (ok); # return (ok);
# } # }
   
import urllib import urllib
import json import json
from pprint import pprint from pprint import pprint
import logging import logging
import ckan.logic as logic import ckan.logic as logic
import hashlib import hashlib
import threading import threading
from ckan.common import _, c, request, response from ckan.common import _, c, request, response
from pylons import config from pylons import config
  from webob.multidict import UnicodeMultiDict
  from paste.util.multidict import MultiDict
   
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
   
from ckan.controllers.api import ApiController from ckan.controllers.api import ApiController
   
class DGAApiController(ApiController): class DGAApiController(ApiController):
   
def _post_analytics(self,user,request_obj_type,request_function,request_id): def _post_analytics(self,user,request_obj_type,request_function,request_id):
if (config.get('googleanalytics.id') != None): if (config.get('googleanalytics.id') != None):
data = urllib.urlencode({ data = urllib.urlencode({
"v":1, "v":1,
"tid":config.get('googleanalytics.id'), "tid":config.get('googleanalytics.id'),
"cid":hashlib.md5(user).hexdigest(), "cid":hashlib.md5(user).hexdigest(),
"t":"event", "t":"event",
"dh":c.environ['HTTP_HOST'], "dh":c.environ['HTTP_HOST'],
"dp":c.environ['PATH_INFO'], "dp":c.environ['PATH_INFO'],
"dr":c.environ.get('HTTP_REFERER',''), "dr":c.environ.get('HTTP_REFERER',''),
"ec":"CKAN API Request", "ec":"CKAN API Request",
"ea":request_obj_type+request_function, "ea":request_obj_type+request_function,
"el":request_id, "el":request_id,
}) })
log.debug("Sending API Analytics Data: "+data) log.debug("Sending API Analytics Data: "+data)
# send analytics asynchronously # send analytics asynchronously
threading.Thread(target=urllib.urlopen,args=("http://www.google-analytics.com/collect", data)).start() threading.Thread(target=urllib.urlopen,args=("http://www.google-analytics.com/collect", data)).start()
   
def action(self, logic_function, ver=None): def action(self, logic_function, ver=None):
try: try:
function = logic.get_action(logic_function) function = logic.get_action(logic_function)
except Exception,e: except Exception,e:
log.debug(e) log.debug(e)
pass pass
try: try:
side_effect_free = getattr(function, 'side_effect_free', False) side_effect_free = getattr(function, 'side_effect_free', False)
request_data = self._get_request_data(try_url_params=side_effect_free) request_data = self._get_request_data(try_url_params=side_effect_free)
if isinstance(request_data, dict): if isinstance(request_data, dict):
id = request_data.get('id','') id = request_data.get('id','')
if 'q' in request_data.keys(): if 'q' in request_data.keys():
id = request_data['q'] id = request_data['q']