--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -13,6 +13,8 @@
 from datetime import *
 import codecs
 
+import difflib
+
 from StringIO import StringIO
 
 from pdfminer.pdfparser import PDFDocument, PDFParser
@@ -49,6 +51,31 @@
         """ do the scraping """
         return
 
+class GenericHTMLDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        content = rcontent
+        dochash = scrape.mkhash(content)
+        doc = foidocsdb.get(dochash)
+        if doc is None:
+            print "saving " + dochash
+            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
+            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
+            if last_attach != None:
+                html_diff = difflib.HtmlDiff()
+                description = description + "\nChanges: "
+                description = description + html_diff.make_table(last_attach.read().split('\n'),
+                    content.split('\n'))
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
 
 class GenericPDFDisclogScraper(GenericDisclogScraper):
 
@@ -62,7 +89,7 @@
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
             laparams=laparams)
         fp = StringIO()
-        fp.write(content.read())
+        fp.write(content)
 
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
             check_extractable=True)
@@ -77,7 +104,7 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -175,7 +202,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
        print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate
@@ -196,10 +223,9 @@
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
         if content is not None:
-            if mime_type is "text/html"\
-            or mime_type is "application/xhtml+xml"\
-            or mime_type is"application/xml":
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                 # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
@@ -217,11 +243,11 @@
                     dochash = scrape.mkhash(
                         self.remove_control_chars(
                             url + (''.join(id.stripped_strings))))
-                    doc = foidocsdb.get(hash)
+                    doc = foidocsdb.get(dochash)
                     if doc is None:
-                        print "saving " + hash
-                        doc = {'_id': hash,
+                        print "saving " + dochash
+                        doc = {'_id': dochash,
                             'agencyID': self.getAgencyID(),
                             'url': self.getURL(),
                             'docID': (''.join(id.stripped_strings))}
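
Note on the new GenericHTMLDisclogScraper: when a previously fetched copy of the page is available, it appends an HTML table of line-level changes built with the standard-library difflib module. A minimal standalone sketch of that mechanism, using invented sample lines rather than real disclosure-log content:

    import difflib

    old_lines = "FOI 12/01 released\nFOI 12/02 pending".split('\n')
    new_lines = "FOI 12/01 released\nFOI 12/02 released\nFOI 12/03 pending".split('\n')

    # make_table() returns an HTML <table> fragment highlighting added,
    # removed and changed lines, which doScrape() appends to the description.
    change_table = difflib.HtmlDiff().make_table(old_lines, new_lines)
    print(change_table[:60])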
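
The class also relies on hashing the fetched page body so that an unchanged page maps to an existing CouchDB document and is skipped. The exact behaviour of scrape.mkhash() is not shown in this diff; the sketch below assumes it is roughly equivalent to a hex digest of the content:

    import hashlib

    def mkhash_like(content):
        # Assumed stand-in for scrape.mkhash(); the real helper may
        # normalise or encode its input differently.
        return hashlib.sha1(content.encode('utf-8')).hexdigest()

    page = "<html><body>sample disclosure log body</body></html>"
    dochash = mkhash_like(page)
    # foidocsdb.get(dochash) returning None means this exact content has not
    # been saved before, so a new "Disclosure Log Updated" record is written.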
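
The fp.write(content.read()) to fp.write(content) change in GenericPDFDisclogScraper suggests fetchURL returns the body as a plain string rather than a file-like object, so it is written directly into the StringIO buffer handed to pdfminer. A small sketch of that wrapping, assuming Python 2's StringIO as imported at the top of the module and a hypothetical body value:

    from StringIO import StringIO

    content = "%PDF-1.4 sample bytes"   # hypothetical fetched body, already a string
    fp = StringIO()
    fp.write(content)                   # the old content.read() fails: a string has no read()
    fp.seek(0)                          # rewind before a consumer reads the buffer back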
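
The extra .replace("1012","2012") in getDate() follows the same pattern as the existing "Octber" fix: normalise known typos in the agency's dates before handing the string to dateutil. A short sketch of that step in isolation (the sample string is invented):

    from dateutil.parser import parse

    raw = "3 Octber 1012 (Amended)"
    (a, b, c) = raw.partition("(")
    cleaned = a.replace("Octber", "October").replace("1012", "2012")
    print(parse(cleaned, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d"))   # 2012-10-03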
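
The mime_type comparison is changed from is to == because is tests object identity rather than string equality, so it can fail even when the characters match. A quick illustration with a hypothetical value:

    mime_type = "/".join(["text", "html"])   # equal content, but a distinct string object
    print(mime_type == "text/html")          # True: compares the characters
    print(mime_type is "text/html")          # usually False: compares object identity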
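
The hash to dochash renames in the table scraper matter because hash was never assigned in that scope: the old code was picking up Python's built-in hash() function, so building the document id or the "saving" message from it could not work. A small demonstration:

    dochash = "0f3a9c"                  # the intended per-row content hash
    print("saving " + dochash)          # works as expected

    # The old code used the bare name `hash`, which resolves to the builtin:
    print(hash)                         # <built-in function hash>
    # and "saving " + hash raises TypeError: a str and a function cannot be concatenated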