--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -13,11 +13,9 @@
 from datetime import *
 import codecs
+import difflib
+
 from StringIO import StringIO
-
-from docx import *
-from lxml import etree
-import zipfile
 
 from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
@@ -39,20 +37,45 @@
         """ disclosr agency id """
         if self.agencyID is None:
             self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
-            return self.agencyID
+        return self.agencyID
 
     def getURL(self):
         """ disclog URL"""
         if self.disclogURL is None:
             agency = scrape.agencydb.get(self.getAgencyID())
             self.disclogURL = agency['FOIDocumentsURL']
-            return self.disclogURL
+        return self.disclogURL
 
     @abc.abstractmethod
     def doScrape(self):
         """ do the scraping """
         return
 
+class GenericHTMLDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        content = rcontent
+        dochash = scrape.mkhash(content)
+        doc = foidocsdb.get(dochash)
+        if doc is None:
+            print "saving " + dochash
+            description = "This log may have been updated, but as it was not in a table the last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries."
+            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
+            if last_attach is not None:
+                html_diff = difflib.HtmlDiff()
+                description = description + "\nChanges: "
+                description = description + html_diff.make_table(last_attach.read().split('\n'),
+                    content.split('\n'))
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
 
 class GenericPDFDisclogScraper(GenericDisclogScraper):
 
@@ -62,14 +85,15 @@
             self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
-        outfp = StringIO.StringIO()
+        outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
             laparams=laparams)
-        fp = StringIO.StringIO()
+        fp = StringIO()
         fp.write(content)
-        description = output.getvalue()
+        process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
+        description = outfp.getvalue()
         fp.close()
         device.close()
         outfp.close()
@@ -77,11 +101,10 @@
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -103,17 +126,16 @@
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
         ## Print our documnts test with two newlines under each paragraph
-        description = '\n\n'.join(newparatextlist)
+        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -180,7 +202,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012", "2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate
@@ -201,10 +223,9 @@
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
         if content is not None:
-            if mime_type is "text/html"\
-            or mime_type is "application/xhtml+xml"\
-            or mime_type is "application/xml":
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                 # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
@@ -222,11 +243,11 @@
                     dochash = scrape.mkhash(
                         self.remove_control_chars(
                             url + (''.join(id.stripped_strings))))
-                    doc = foidocsdb.get(hash)
+                    doc = foidocsdb.get(dochash)
                     if doc is None:
-                        print "saving " + hash
-                        doc = {'_id': hash,
+                        print "saving " + dochash
+                        doc = {'_id': dochash,
                         'agencyID': self.getAgencyID(),
                         'url': self.getURL(),
                         'docID': (''.join(id.stripped_strings))}
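
Note on the new GenericHTMLDisclogScraper: it stores the output of difflib.HtmlDiff().make_table() in the saved document's description. A minimal sketch of that call in isolation (Python 2 to match the codebase; the before/after log snapshots here are made-up sample data, not real disclosure log content):

    import difflib

    # Hypothetical before/after snapshots of a non-tabular disclosure log page
    old_lines = "FOI 12/01 released\nFOI 12/02 pending".split('\n')
    new_lines = "FOI 12/01 released\nFOI 12/02 released\nFOI 12/03 pending".split('\n')

    # make_table() returns an HTML <table> with added/changed/deleted rows
    # marked up, which is what the scraper appends to the description field
    print difflib.HtmlDiff().make_table(old_lines, new_lines)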
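The main fix in the PDF hunk is that process_pdf() was never being called, so description was read from an undefined output buffer. A self-contained sketch of the corrected pdfminer pipeline, mirroring the patched code ("sample.pdf" is a placeholder path, not a file from the repo):

    from StringIO import StringIO
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    # Load raw PDF bytes into an in-memory file object, as the scraper does
    fp = StringIO()
    fp.write(open("sample.pdf", "rb").read())  # placeholder input

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())

    # process_pdf() drives the interpreter over every page; the extracted
    # text accumulates in outfp, so getvalue() must come before close()
    process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
    print outfp.getvalue()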
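On the mime_type change: `is` tests object identity, not equality, so `mime_type is "text/html"` only matched when CPython happened to intern both strings; switching to `==` is the real fix, not a style change. A quick illustration:

    a = "text/html"
    b = "".join(["text/", "html"])  # equal value, distinct object
    print a == b   # True  - value equality, what the scraper needs
    print a is b   # False - identity, why the old check could silently fail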