--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -15,10 +15,6 @@
 from StringIO import StringIO
-from docx import *
-from lxml import etree
-import zipfile
-
 from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 from pdfminer.pdfdevice import PDFDevice, TagExtractor
@@ -39,14 +35,14 @@
         """ disclosr agency id """
         if self.agencyID is None:
             self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
-        return self.agencyID
+        return self.agencyID
 
     def getURL(self):
         """ disclog URL"""
         if self.disclogURL is None:
             agency = scrape.agencydb.get(self.getAgencyID())
             self.disclogURL = agency['FOIDocumentsURL']
-        return self.disclogURL
+        return self.disclogURL
 
     @abc.abstractmethod
     def doScrape(self):
@@ -62,14 +58,15 @@
             self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
-        outfp = StringIO.StringIO()
+        outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
-        fp = StringIO.StringIO()
-        fp.write(content)
-        description = output.getvalue()
+        fp = StringIO()
+        fp.write(content.read())
+        process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
+        description = outfp.getvalue()
         fp.close()
         device.close()
         outfp.close()
@@ -77,11 +74,10 @@
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -103,17 +99,16 @@
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
         ## Print our documnts test with two newlines under each paragraph
-        description = '\n\n'.join(newparatextlist)
+        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = time().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -201,10 +196,9 @@
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
         if content is not None:
-            if mime_type is "text/html"\
-                    or mime_type is "application/xhtml+xml"\
-                    or mime_type is"application/xml":
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                 # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
@@ -222,11 +216,11 @@
                     dochash = scrape.mkhash(
                         self.remove_control_chars(
                             url + (''.join(id.stripped_strings))))
-                    doc = foidocsdb.get(hash)
+                    doc = foidocsdb.get(dochash)
                     if doc is None:
-                        print "saving " + hash
-                        doc = {'_id': hash,
+                        print "saving " + dochash
+                        doc = {'_id': dochash,
                         'agencyID': self.getAgencyID(),
                         'url': self.getURL(), 'docID':
                         (''.join(id.stripped_strings))}
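
For reference, the corrected pdfminer call sequence introduced in the @@ -62,14 +58,15 @@ hunk can be exercised on its own roughly as follows. This is a minimal sketch only: the helper name extract_pdf_text and the file name disclog.pdf are illustrative, and the real scraper feeds the response returned by scrape.fetchURL rather than a local file.

# Minimal sketch (Python 2, legacy pdfminer API) of the text-extraction
# pattern used in the PDF disclog scraper above. extract_pdf_text and
# disclog.pdf are hypothetical names for illustration only.
from StringIO import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def extract_pdf_text(pdf_bytes):
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO()
    device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
    fp = StringIO()
    fp.write(pdf_bytes)
    # Walk every page (empty page-number set); the TextConverter device
    # accumulates the extracted text in outfp.
    process_pdf(rsrcmgr, device, fp, set(), caching=True,
                check_extractable=True)
    description = outfp.getvalue()
    fp.close()
    device.close()
    outfp.close()
    return description


if __name__ == '__main__':
    print extract_pdf_text(open('disclog.pdf', 'rb').read())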