# update semantic markup, add publicbodies.org csv export
# [disclosr.git] / documents / datagov-resourcereport.py
import couchdb
# CouchDB server holding the scraped "disclosr" documents (local instance).
couch = couchdb.Server('http://127.0.0.1:5984/')
# Alternate LAN CouchDB instance, kept commented out for convenience.
#couch = couchdb.Server('http://192.168.1.113:5984/')
 
import urllib
import urlparse
import httplib2
import httplib
import csv
 
 
def url_fix(s, charset='utf-8'):
    """Normalise a possibly malformed URL so it can be fetched.

    Percent-encodes unsafe characters (such as spaces) in the path and
    query string, and prepends "http://" when no scheme is present --
    roughly what a browser does with a user-typed address.

    :param charset: encoding applied first when *s* is a unicode string.
    """
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    # Bare host names ("example.com/x") get an explicit http scheme.
    if not urlparse.urlparse(s).scheme:
        s = "http://" + s
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    safe_path = urllib.quote(path, '/%')
    safe_qs = urllib.quote_plus(qs, ':&=')
    return urlparse.urlunsplit((scheme, netloc, safe_path, safe_qs, anchor))
 
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
# Unit-symbol tables for bytes-to-human size formatting (from the recipe above).
# NOTE(review): SYMBOLS is unused in this file as shown — presumably kept for a
# size-formatting helper elsewhere; confirm before removing.
SYMBOLS = {
    # NOTE(review): the first three symbols are two letters (KB/MB/GB) while
    # the rest are single letters — inconsistent with the upstream recipe,
    # which uses single letters ('K', 'M', 'G', ...) throughout.
    'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
    # 'iotta' (sic, should be 'yotta') is copied verbatim from the recipe.
    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                      'zetta', 'iotta'),
    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                'zebi', 'yobi'),
}
 
 
docsdb = couch['disclosr-documents']
# Report rows are written as fully-quoted CSV.
# NOTE(review): the file handle is opened at import time and never explicitly
# closed; rows are flushed when the interpreter exits.
out = csv.writer(open("output.csv", "w"), delimiter=',', quoting=csv.QUOTE_ALL)

# Map the dataset page's "(...)" download labels to short format tags.
# http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
# Unlisted labels (e.g. "(Other)") fall back to "plain".
FORMAT_TAGS = {
    '(XML)': 'xml',
    '(CSV/XLS)': 'csv',
    '(Shapefile)': 'shp',
    '(KML/KMZ)': 'kml',
}

# Characters permitted in a CKAN-style package name.
PKG_NAME_CHARS = '0123456789abcdefghijklmnopqrstuvwxyz-_'


def _ascii(value):
    """Return *value* encoded as ASCII, silently dropping other characters."""
    return value.encode('ascii', 'ignore')


if __name__ == "__main__":
    # Emit one CSV row per download resource of every scraped dataset:
    # package name, fixed URL, resource name, format tag, HTTP status, content type.
    for doc in docsdb.view('app/datasets'):
        # Skip the data.gov.au landing page itself and Queensland datasets.
        if doc.value['url'] == "http://data.gov.au/data/" or doc.value['agencyID'] == "qld":
            continue
        # Derive a package name from the dataset URL slug (truncate, then
        # keep only CKAN-safe characters).
        slug = doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]
        pkg_name = filter(lambda ch: ch in PKG_NAME_CHARS, slug)
        downloads = doc.value['metadata'].get('Download', [])
        if not downloads:
            # Dataset with no downloadable resources: record the name only.
            out.writerow([_ascii(pkg_name)])
            continue
        for resource in downloads:
            fmt = FORMAT_TAGS.get(resource['format'], 'plain')
            name = resource.get('name', resource['href'])
            url = url_fix(resource['href'])
            if resource['href'].startswith("ftp"):
                # HEAD requests are HTTP-only; just note the scheme for FTP links.
                out.writerow([pkg_name, url, name, fmt, "ftp", ""])
                continue
            try:
                h = httplib2.Http(disable_ssl_certificate_validation=True)
                resp = h.request(url, 'HEAD')
                status = resp[0]['status']
                extra = resp[0].get('content-type', "")
            except (httplib2.ServerNotFoundError,
                    httplib2.RelativeURIError,
                    httplib.InvalidURL):
                # Unresolvable or malformed URL: flag the row rather than
                # aborting the whole report run.
                status, extra = "500", "badurl"
            out.writerow([_ascii(pkg_name), _ascii(url), _ascii(name), fmt, status, extra])