Scraper fixes: sanitise control characters out of saved descriptions and titles, fix "Octber"/"1012" date typos before parsing, and drop a broken getTable() override.
Former-commit-id: 7c09d22a257167842febb35ef0a1605548e871c2
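
Several hunks below wrap user-visible strings in self.remove_control_chars() before the document is saved to CouchDB. That helper lives elsewhere in genericScrapers.py and is not shown in this diff; as a rough idea of what such a sanitiser does (a sketch under that assumption, not the repo's actual code):

    def remove_control_chars(self, text):
        # Drop ASCII control characters (0x00-0x1F and 0x7F) that would
        # otherwise leak into the JSON documents saved to CouchDB.
        return "".join(c for c in text if ord(c) >= 32 and ord(c) != 127)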
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -72,7 +72,7 @@
edate = date.today().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
, 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
foidocsdb.save(doc)
else:
print "already saved"
@@ -104,7 +104,7 @@
edate = date.today().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
, 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
foidocsdb.save(doc)
else:
print "already saved"
@@ -202,7 +202,7 @@
def getDate(self, content, entry, doc):
date = ''.join(content.stripped_strings).strip()
(a, b, c) = date.partition("(")
- date = self.remove_control_chars(a.replace("Octber", "October"))
+ date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
print date
edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
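
Note: the getDate() change chains a second typo replacement before the string reaches dateutil's parse(). Roughly, with an invented sample cell (real values come from the scraped table):

    from dateutil.parser import parse
    raw = "5 Octber 1012 (documents released)"
    cleaned = raw.partition("(")[0].replace("Octber", "October").replace("1012", "2012")
    print(parse(cleaned, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d"))  # 2012-10-05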
--- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -6,8 +6,6 @@
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getTable(self,soup):
- return soup.find(_class = "article-content").table
def getColumnCount(self):
return 5
def getColumns(self,columns):
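
Note: the deleted getTable() override passed `_class`, which BeautifulSoup treats as a filter on an attribute literally named "_class"; the lookup finds nothing and the trailing .table then fails. Removing the override falls back to the base class's table lookup. If an override were still wanted, a working version would look roughly like this (a sketch, not code from the repo):

    def getTable(self, soup):
        # Filter on the CSS class via an attrs dict, which BeautifulSoup
        # understands, then take the table inside that element.
        return soup.find(attrs={"class": "article-content"}).table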
--- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
+++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
@@ -21,6 +21,7 @@
if i < 2:
title = title + string
i = i+1
+ title = self.remove_control_chars(title)
doc.update({'title': title})
print title
return