--- a/documents/scrape.py +++ b/documents/scrape.py @@ -12,7 +12,6 @@ import urlparse import socket - def mkhash(input): return hashlib.md5(input).hexdigest().encode("utf-8") @@ -105,7 +104,7 @@ if doc == None: doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'} else: - if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000): + if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14): print "Uh oh, trying to scrape URL again too soon!" + hash last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) @@ -209,8 +208,8 @@ scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID) #couch = couchdb.Server('http://192.168.1.148:5984/') -couch = couchdb.Server('http://192.168.1.113:5984/') -#couch = couchdb.Server('http://127.0.0.1:5984/') +#couch = couchdb.Server('http://192.168.1.113:5984/') +couch = couchdb.Server('http://127.0.0.1:5984/') # select database agencydb = couch['disclosr-agencies'] docsdb = couch['disclosr-documents']