fix scraper interval

Drop the stray "* 1000" factor from the page_scraped re-scrape check in
documents/scrape.py (time.time() returns seconds), switch couch back to the
local CouchDB at 127.0.0.1, and run scraped descriptions and titles through
remove_control_chars before saving. Also fix the curly quotes in one
scraper's error prints and a "1012" -> "2012" typo in getDate, remove an
unused getTable override, and drop a stray URL line from another scraper.


Former-commit-id: 3028cb06efbf3c9d8a9f903943fab75d89156535
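
For reference, a minimal sketch of the re-scrape guard after this change
(names are hypothetical; it assumes 'page_scraped' stores a time.time()
timestamp in seconds, which is how scrape.py compares it):

    import time

    # 60 * 24 * 14 = 20,160 s (~5.6 hours); the old "* 1000" stretched this to ~233 days
    RESCRAPE_WINDOW = 60 * 24 * 14

    def scraped_too_recently(doc):
        # skip the URL if it was fetched within the window
        return 'page_scraped' in doc and (time.time() - doc['page_scraped']) < RESCRAPE_WINDOW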

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -71,8 +71,8 @@
                     content.split('\n'))
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-                , 'url': self.getURL(), 'docID': dochash,
-                   "date": edate, "title": "Disclosure Log Updated", "description": description, "diff": diff}
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description), "diff": diff}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -103,8 +103,8 @@
             print "saving " + dochash
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-                , 'url': self.getURL(), 'docID': dochash,
-                   "date": edate, "title": "Disclosure Log Updated", "description": description}
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -201,7 +201,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -105,7 +105,7 @@
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
     else:
-        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000):
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14):
             print "Uh oh, trying to scrape URL again too soon!" + hash
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
@@ -209,8 +209,8 @@
                     scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://192.168.1.113:5984/')
-#couch = couchdb.Server('http://127.0.0.1:5984/')
+#couch = couchdb.Server('http://192.168.1.113:5984/')
+couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']

--- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
@@ -26,8 +26,8 @@
 	ScraperImplementation().doScrape()
     except Exception, err:
         sys.stderr.write('ERROR: %s\n' % str(err))
-	print ‘Error Reason: ‘, err.__doc__
-	print ‘Exception: ‘, err.__class__
+	print "Error Reason: ", err.__doc__
+	print "Exception: ", err.__class__
 	print traceback.format_exc()
 	if amon_available:
                data = {

--- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -6,8 +6,6 @@
 
 #http://www.doughellmann.com/PyMOTW/abc/
 class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-        def getTable(self,soup):
-                return soup.find(_class = "article-content").table
         def getColumnCount(self):
                 return 5
         def getColumns(self,columns):

--- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
+++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
@@ -21,6 +21,7 @@
     			if i < 2:
 				title = title + string
 			i = i+1
+		title = self.remove_control_chars(title)
                 doc.update({'title': title})
 		print title
                 return

--- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
+++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
@@ -14,5 +14,3 @@
     print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper)
     ScraperImplementation().doScrape()
 
-www.finance.gov.au/foi/disclosure-log/foi-rss.xml
-