From: Alex Sadleir Date: Sat, 26 Jan 2013 05:55:32 +0000 Subject: fix wcag validation tester X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=68ede6a1b270221488b738fcd15b393eac40a3d7 --- fix wcag validation tester Former-commit-id: 7194cc6a6482caa4c20ac597289ff96b748223c3 --- --- a/admin/refreshDesignDoc.php +++ b/admin/refreshDesignDoc.php @@ -40,7 +40,8 @@ $obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; $obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; $obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; -$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; +$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; +$docdb->save($obj, true); --- a/admin/validation.py +++ b/admin/validation.py @@ -5,13 +5,13 @@ import re from tidylib import tidy_document -couch = couchdb.Server('http://127.0.0.1:5984/') +couch = couchdb.Server('http://192.168.1.113:5984/') # select database docsdb = couch['disclosr-documents'] def f(x): - invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized") + invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized|noscript|audio") valid = re.compile(r"line") return (not invalid.search(x)) and valid.search(x) and x != '' --- a/documents/scrape.py +++ b/documents/scrape.py @@ -10,6 +10,8 @@ import mimetypes import urllib import urlparse +import socket + def mkhash(input): return hashlib.md5(input).hexdigest().encode("utf-8") @@ -97,7 +99,7 @@ return (None,None,None) doc = docsdb.get(hash) if doc == None: - doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} + doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName, 'type': 'website'} else: if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000): print "Uh oh, trying to scrape URL again too soon!"+hash @@ -118,7 +120,9 @@ opener = urllib2.build_opener(NotModifiedHandler()) try: - url_handle = opener.open(req) + #default_timeout = 12 + #socket.setdefaulttimeout(default_timeout) + url_handle = opener.open(req,None,3) doc['url'] = url_handle.geturl() # may have followed a redirect to a new url headers = url_handle.info() # the addinfourls have the .info() too doc['etag'] = headers.getheader("ETag") @@ -151,7 +155,7 @@ return (doc['url'], doc['mime_type'], content) #store as attachment epoch-filename - except urllib2.URLError as e: + except (urllib2.URLError, socket.timeout) as e: print "error!" error = "" if hasattr(e, 'reason'): @@ -202,20 +206,23 @@ scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) #couch = couchdb.Server('http://192.168.1.148:5984/') -couch = couchdb.Server('http://127.0.0.1:5984/') +couch = couchdb.Server('http://192.168.1.113:5984/') +#couch = couchdb.Server('http://127.0.0.1:5984/') # select database agencydb = couch['disclosr-agencies'] docsdb = couch['disclosr-documents'] if __name__ == "__main__": - for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? + for row in agencydb.view('app/all'): #not recently scraped agencies view? agency = agencydb.get(row.id) print agency['name'] for key in agency.keys(): - if key == "FOIDocumentsURL" and "status" not in agency.keys: + if key == "FOIDocumentsURL" and "status" not in agency.keys() and False: scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) - if key == 'website' and False: + if key == 'website' and True: scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) + if "metadata" not in agency.keys(): + agency['metadata'] = {} agency['metadata']['lastScraped'] = time.time() if key.endswith('URL') and False: print key