--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
@@ -16,16 +16,20 @@
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
              self.getURL(), "foidocuments", self.getAgencyID())
-
-        d = pq(content.read())
+
+        d = pq(content)
         d.make_links_absolute(base_url = self.getURL())
         for table in d('table').items():
                 title= table('thead').text()
-                print title
+                print self.remove_control_chars(title)
                 (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
                 links = table('a').map(lambda i, e: pq(e).attr('href'))
                 description = descA+" "+descB
-                edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+                try:
+                        edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+                except ValueError:
+                        edate = date.today().strftime("%Y-%m-%d")
+                        pass
                 print edate
                 dochash = scrape.mkhash(self.remove_control_chars(title))
                 doc = foidocsdb.get(dochash)
@@ -35,7 +39,7 @@
                         doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                         , 'url': self.getURL(), 'docID': dochash,
                         "links": links,
-                        "date": edate, "notes": notes, "title": "Disclosure Log Updated", "description": description}
+                        "date": edate, "notes": notes, "title": title, "description": description}
                         #print doc
                         foidocsdb.save(doc)
                 else: