tool to check datagov resources
[disclosr.git] / documents /
blob:a/documents/ -> blob:b/documents/
--- a/documents/
+++ b/documents/
@@ -1,1 +1,79 @@
+# Third-party CouchDB client; `couch` is the server handle the rest of the
+# script indexes to reach a database.
+import couchdb
+# NOTE(review): the server URL is an empty string here -- confirm the
+# intended address before running.
+couch = couchdb.Server('')
+#couch = couchdb.Server('')
+import urllib
+import urlparse
+import httplib2
+import csv
+import ssl
+# SSL context with certificate verification switched off (CERT_NONE).
+# NOTE(review): `context` is not referenced by any other code visible in
+# this file section -- confirm whether it is still needed.
+context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+context.verify_mode = ssl.CERT_NONE
+def url_fix(s, charset='utf-8'):
+    """Sometimes you get an URL by a user that just isn't a real
+    URL because it contains unsafe characters like ' ' and so on.  This
+    function can fix some of the problems in a similar way browsers
+    handle data entered by the user:
+    :param charset: The target charset for the URL if the url was
+                    given as unicode string.
+    """
+    if isinstance(s, unicode):
+        s = s.encode(charset, 'ignore')
+    if not urlparse.urlparse(s).scheme:
+   	s = "http://"+s
+    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
+    path = urllib.quote(path, '/%')
+    qs = urllib.quote_plus(qs, ':&=')
+    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
+    'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
+    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
+                      'zetta', 'iotta'),
+    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
+    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
+                'zebi', 'yobi'),
+docsdb = couch['disclosr-documents']
+out = csv.writer(open("output.csv","w"), delimiter=',',quoting=csv.QUOTE_ALL)
+if __name__ == "__main__":
+    for doc in docsdb.view('app/datasets'):
+        if doc.value['url'] != "" and doc.value['agencyID'] != "qld":
+            # Collect the package metadata.
+            pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_',
+                              doc.value['url'].replace("", '').replace('/', '')[:100]);
+            if 'Download' in doc.value['metadata'].keys() and len(doc.value['metadata']['Download']) > 0:
+                        for resource in doc.value['metadata']['Download']:
+                            #
+                            # (KML/KMZ) / (Shapefile) /(Other)
+                            format = "plain"
+                            if resource['format'] == '(XML)':
+                                format = 'xml'
+                            if resource['format'] == '(CSV/XLS)':
+                                format = 'csv'
+                            if resource['format'] == '(Shapefile)':
+                                format = 'shp'
+                            if resource['format'] == '(KML/KMZ)':
+                                format = 'kml'
+                            name = resource['href']
+                            if 'name' in resource.keys():
+                                name = resource['name']
+			    if resource['href'].startswith("ftp"):
+				    out.writerow([pkg_name, url_fix(resource['href']), name,format, "ftp", ""])
+			    else:
+				    try:
+					h = httplib2.Http(disable_ssl_certificate_validation=True)
+  				        resp = h.request(url_fix(resource['href']), 'HEAD')
+					content_type = resp[0]['content-type'] if 'content-type' in resp[0].keys() else ""
+					out.writerow([pkg_name, url_fix(resource['href']), name,format, resp[0]['status'], content_type])
+				    except httplib2.ServerNotFoundError:
+					out.writerow([pkg_name, url_fix(resource['href']), name,format, "500","badurl"])
+	    else:
+		out.writerow([pkg_name])