From: Alex Sadleir Date: Mon, 28 Jan 2013 03:42:46 +0000 Subject: Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=55128c50497d43a7e333b17bf392e6c9d5b465b3 --- Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr Former-commit-id: 088e2ae92a5b5e95f28dcf62ec219255cd70e1b1 --- --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -71,8 +71,8 @@ content.split('\n')) edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description, "diff": diff} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description), "diff": diff} foidocsdb.save(doc) else: print "already saved" @@ -103,8 +103,8 @@ print "saving " + dochash edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)} foidocsdb.save(doc) else: print "already saved" @@ -201,7 +201,7 @@ def getDate(self, content, entry, doc): date = ''.join(content.stripped_strings).strip() (a, b, c) = date.partition("(") - date = self.remove_control_chars(a.replace("Octber", "October")) + date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")) print date edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate --- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py +++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py @@ -6,8 +6,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(_class = "article-content").table def getColumnCount(self): return 5 def getColumns(self,columns): --- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py +++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py @@ -21,6 +21,7 @@ if i < 2: title = title + string i = i+1 + title = self.remove_control_chars(title) doc.update({'title': title}) print title return