From: Maxious Date: Mon, 28 Jan 2013 05:55:22 +0000 Subject: fix scraper interval X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=d2926ead06866b7facc4342e9553dad93ab8e52c --- fix scraper interval Former-commit-id: 3028cb06efbf3c9d8a9f903943fab75d89156535 --- --- a/documents/scrape.py +++ b/documents/scrape.py @@ -105,7 +105,7 @@ if doc == None: doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'} else: - if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000): + if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14): print "Uh oh, trying to scrape URL again too soon!" + hash last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) @@ -209,8 +209,8 @@ scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID) #couch = couchdb.Server('http://192.168.1.148:5984/') -couch = couchdb.Server('http://192.168.1.113:5984/') -#couch = couchdb.Server('http://127.0.0.1:5984/') +#couch = couchdb.Server('http://192.168.1.113:5984/') +couch = couchdb.Server('http://127.0.0.1:5984/') # select database agencydb = couch['disclosr-agencies'] docsdb = couch['disclosr-documents'] --- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py +++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py @@ -26,8 +26,8 @@ ScraperImplementation().doScrape() except Exception, err: sys.stderr.write('ERROR: %s\n' % str(err)) - print ‘Error Reason: ‘, err.__doc__ - print ‘Exception: ‘, err.__class__ + print "Error Reason: ", err.__doc__ + print "Exception: ", err.__class__ print traceback.format_exc() if amon_available: data = { --- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py +++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py @@ -14,5 +14,3 @@ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) ScraperImplementation().doScrape() -www.finance.gov.au/foi/disclosure-log/foi-rss.xml -