--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -199,12 +199,31 @@ def getRows(self, table):
         return table.find_all('tr')
 
 
+    def getDocHash(self, id, date, url):
+        """Hash url + id text, or url + date text when id.string is None."""
+        if id.string is None:
+            print "no id, using date as hash"
+            return scrape.mkhash(
+                self.remove_control_chars(
+                    url + (''.join(date.stripped_strings))))
+        else:
+            return scrape.mkhash(
+                self.remove_control_chars(
+                    url + (''.join(id.stripped_strings))))
+
     def getDate(self, content, entry, doc):
-        date = ''.join(content.stripped_strings).strip()
-        (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
-        print date
-        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        strdate = ''.join(content.stripped_strings).strip()
+        (a, b, c) = strdate.partition("(")
+        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January"))
+        print strdate
+        try:
+            edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        except ValueError:
+            # Unparseable date: log both forms and fall back to today's date.
+            # NOTE(review): needs module-level `sys` and `datetime.date` - confirm.
+            print >> sys.stderr, "ERROR date invalid %s " % strdate
+            print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
+            edate = date.today().strftime("%Y-%m-%d")
         print edate
         doc.update({'date': edate})
         return
@@ -235,14 +254,7 @@
                          description, notes) = self.getColumns(columns)
                         print self.remove_control_chars(
                             ''.join(id.stripped_strings))
-                        if id.string is None:
-                            dochash = scrape.mkhash(
-                                self.remove_control_chars(
-                                    url + (''.join(date.stripped_strings))))
-                        else:
-                            dochash = scrape.mkhash(
-                                self.remove_control_chars(
-                                    url + (''.join(id.stripped_strings))))
+                        dochash = self.getDocHash(id, date, url)
                         doc = foidocsdb.get(dochash)
 
                         if doc is None:
@@ -267,8 +279,7 @@
                                      'Summary of FOIrequest received by agency/minister',
                                      'Summary of FOI request received', 'Description of FOI Request',
                                      "FOI request", 'Results 1 to 67 of 67']
-                        if doc['title'] not in badtitles\
-                        and doc['description'] != '':
+                        if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
                             print "saving"
                             foidocsdb.save(doc)
                         else: