--- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -9,6 +9,7 @@ import dateutil from dateutil.parser import * from datetime import * +import codecs class GenericDisclogScraper(object): __metaclass__ = abc.ABCMeta @@ -92,7 +93,8 @@ return table.find_all('tr') def getDate(self, content, entry, doc): date = ''.join(content.stripped_strings).strip() - date = str.replace("Octber","October",date) + (a,b,c) = date.partition("(") + date = self.remove_control_chars(a.replace("Octber","October")) print date edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate @@ -119,7 +121,7 @@ columns = row.find_all('td') if len(columns) == self.getColumnCount(): (id, date, title, description, notes) = self.getColumns(columns) - print ''.join(id.stripped_strings) + print self.remove_control_chars(''.join(id.stripped_strings)) if id.string == None: hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings)))) else: