--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -12,6 +12,11 @@
 import urlparse
 import socket
 
+#couch = couchdb.Server('http://192.168.1.148:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
+#couch = couchdb.Server('http://127.0.0.1:5984/')
+
+
 def mkhash(input):
     return hashlib.md5(input).hexdigest().encode("utf-8")
 
@@ -104,14 +109,11 @@
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
     else:
-        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14):
+        if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
             print "Uh oh, trying to scrape URL again too soon!" + hash
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
             content = last_attachment
-            return (doc['url'], doc['mime_type'], content.read())
-        if scrape_again == False:
-            print "Not scraping this URL again as requested"
             return (doc['url'], doc['mime_type'], content.read())
 
     req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
@@ -207,9 +209,6 @@
             #print linkurl
             scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
 
-#couch = couchdb.Server('http://192.168.1.148:5984/')
-#couch = couchdb.Server('http://192.168.1.113:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']
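
Note on the combined freshness check: time.time() returns seconds, so 60 * 24 * 14 is only about 5.6 hours; if a 14-day window was intended, it would need a further factor of 60. The sketch below restates the early-return under that assumption and adds a guard for documents that have never stored an attachment, which the in-place version would KeyError on when scrape_again is False before any fetch. The cached_response helper and the RE_SCRAPE_WINDOW name are illustrative only, not part of scrape.py.

    import time

    RE_SCRAPE_WINDOW = 60 * 60 * 24 * 14  # 14 days in seconds (assumed intent)

    def cached_response(docsdb, doc, scrape_again):
        # Serve the stored copy when the page was fetched recently or the
        # caller asked not to re-scrape; return None to signal a fresh fetch.
        recently_scraped = ('page_scraped' in doc and
                            time.time() - doc['page_scraped'] < RE_SCRAPE_WINDOW)
        if recently_scraped or scrape_again == False:
            if '_attachments' not in doc:
                return None  # nothing cached yet; fall through to a real fetch
            last_attachment_fname = doc["_attachments"].keys()[-1]
            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
            return (doc['url'], doc['mime_type'], last_attachment.read())
        return None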