Fix WCAG validation tester: store getValidationRequired as a proper map view, widen the tidylib noise filter, and add socket timeouts to the scraper
--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -40,7 +40,8 @@
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
-$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
+$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
+$docdb->save($obj, true);
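Note: the dropped ->map had left getValidationRequired stored as a bare string, so CouchDB had no map function to execute; a view must be an object with a "map" member, and the design document has to be saved back for the change to take effect. A minimal sketch of the corrected view shape, written against the couchdb-python client used elsewhere in this commit (the _design/app id and local server URL are assumptions):

    import couchdb

    couch = couchdb.Server('http://127.0.0.1:5984/')
    docsdb = couch['disclosr-documents']

    # Each view is an object whose "map" key holds the function source.
    design = docsdb.get('_design/app') or {'_id': '_design/app', 'views': {}}
    design['views']['getValidationRequired'] = {
        'map': 'function(doc) {\n'
               '  if (doc.mime_type == "text/html"\n'
               '      && typeof(doc.validation) == "undefined") {\n'
               '    emit(doc._id, doc._attachments);\n'
               '  }\n'
               '}'
    }
    docsdb.save(design)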
--- a/admin/validation.py
+++ b/admin/validation.py
@@ -5,13 +5,13 @@
import re
from tidylib import tidy_document
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
# select database
docsdb = couch['disclosr-documents']
def f(x):
- invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized")
+ invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized|noscript|audio")
valid = re.compile(r"line")
return (not invalid.search(x)) and valid.search(x) and x != ''
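Note: the widened invalid pattern now also drops tidylib's noscript and audio advisories before errors are counted. A minimal sketch of how f is meant to be applied to tidylib output, with illustrative sample markup:

    from tidylib import tidy_document

    html = "<html><body><img src='logo.gif'></body></html>"  # illustrative input
    document, errors = tidy_document(html)
    # keep only messages that reference a line and match none of the noise patterns
    real_errors = [line for line in errors.split("\n") if f(line)]
    print len(real_errors), "validation messages kept"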
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -10,6 +10,8 @@
import mimetypes
import urllib
import urlparse
+import socket
+
def mkhash(input):
return hashlib.md5(input).hexdigest().encode("utf-8")
@@ -97,7 +99,7 @@
return (None,None,None)
doc = docsdb.get(hash)
if doc == None:
- doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+ doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName, 'type': 'website'}
else:
if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
print "Uh oh, trying to scrape URL again too soon!"+hash
@@ -118,7 +120,9 @@
opener = urllib2.build_opener(NotModifiedHandler())
try:
- url_handle = opener.open(req)
+ #default_timeout = 12
+ #socket.setdefaulttimeout(default_timeout)
+ url_handle = opener.open(req,None,3)
doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
headers = url_handle.info() # the addinfourls have the .info() too
doc['etag'] = headers.getheader("ETag")
@@ -151,7 +155,7 @@
return (doc['url'], doc['mime_type'], content)
#store as attachment epoch-filename
- except urllib2.URLError as e:
+ except (urllib2.URLError, socket.timeout) as e:
print "error!"
error = ""
if hasattr(e, 'reason'):
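The third argument to opener.open() is a per-request timeout in seconds, and socket.timeout is caught alongside urllib2.URLError because a timeout on an already-open connection can surface as socket.timeout directly rather than being wrapped in URLError. A minimal Python 2 sketch of the pattern, with an illustrative URL:

    import socket
    import urllib2

    opener = urllib2.build_opener()
    req = urllib2.Request("http://example.com/")  # illustrative URL
    try:
        url_handle = opener.open(req, None, 3)  # data=None, timeout=3 seconds
        print url_handle.geturl()
    except (urllib2.URLError, socket.timeout) as e:
        print "error!", getattr(e, 'reason', e)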
@@ -202,20 +206,23 @@
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
#couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
+#couch = couchdb.Server('http://127.0.0.1:5984/')
# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']
if __name__ == "__main__":
- for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+ for row in agencydb.view('app/all'): #not recently scraped agencies view?
agency = agencydb.get(row.id)
print agency['name']
for key in agency.keys():
- if key == "FOIDocumentsURL" and "status" not in agency.keys:
+ if key == "FOIDocumentsURL" and "status" not in agency.keys() and False:
scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
- if key == 'website' and False:
+ if key == 'website' and True:
scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+ if "metadata" not in agency.keys():
+ agency['metadata'] = {}
agency['metadata']['lastScraped'] = time.time()
if key.endswith('URL') and False:
print key
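The loop now walks the app/all view with website scraping switched on, and it fixes the agency.keys membership test, which previously named the bound method without calling it. The new metadata guard matters because agencies that have never been scraped lack a metadata dict, so agency['metadata']['lastScraped'] would raise a KeyError. A minimal sketch of that guard as a standalone helper (the function name is hypothetical; agency documents are plain dicts from couchdb-python):

    import time

    def touch_last_scraped(agencydb, agency):
        # create the metadata dict on first scrape instead of raising a KeyError
        if 'metadata' not in agency:
            agency['metadata'] = {}
        agency['metadata']['lastScraped'] = time.time()
        agencydb.save(agency)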