beginning of docx/pdf scrapers
diff --git a/documents/genericScrapers.py b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -11,6 +11,19 @@
 from datetime import *
 import codecs
 
+from StringIO import StringIO
+
+from docx import *
+from lxml import etree
+import zipfile
+
+from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfdevice import PDFDevice, TagExtractor
+from pdfminer.converter import TextConverter
+from pdfminer.cmapdb import CMapDB
+from pdfminer.layout import LAParams
+
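+# Note: the imports above assume the old standalone docx.py module (which
+# exposes getdocumenttext via "from docx import *") and a pre-2013 pdfminer
+# (which still provides process_pdf and keeps PDFDocument in
+# pdfminer.pdfparser); both APIs changed in later releases.
+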
 class GenericDisclogScraper(object):
         __metaclass__ = abc.ABCMeta
 	agencyID = None
@@ -35,11 +48,78 @@
 		""" do the scraping """
 		return
 
-	@abc.abstractmethod
-        def getDescription(self, content, entry, doc):
-                """ get description"""
-		return
-
+class GenericPDFDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+
+        # Run the fetched PDF through pdfminer's text converter.
+        laparams = LAParams()
+        rsrcmgr = PDFResourceManager(caching=True)
+        outfp = StringIO()
+        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
+            laparams=laparams)
+        fp = StringIO(content)
+        process_pdf(rsrcmgr, device, fp, set(), caching=True,
+            check_extractable=True)
+        description = outfp.getvalue()
+        fp.close()
+        device.close()
+        outfp.close()
+
+        # Hash the extracted text so an unchanged log is only saved once.
+        hash = scrape.mkhash(description)
+        doc = foidocsdb.get(hash)
+        if doc is None:
+            print "saving " + hash
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': hash, 'agencyID': self.getAgencyID(),
+                'url': self.getURL(), 'docID': hash, 'date': edate,
+                'title': "Disclosure Log Updated",
+                'description': description}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
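+
+# A minimal usage sketch (illustrative only; the class name, URL and
+# agency ID below are made up): a concrete scraper just points the
+# generic class at a PDF disclosure log.
+#
+#   class ExamplePDFScraper(GenericPDFDisclogScraper):
+#       def getURL(self):
+#           return "http://www.example.gov.au/foi/disclosurelog.pdf"
+#       def getAgencyID(self):
+#           return "example"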
+
+
+class GenericDOCXDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+
+        # A .docx file is a zip archive; the body text lives in
+        # word/document.xml.
+        mydoc = zipfile.ZipFile(StringIO(content))
+        xmlcontent = mydoc.read('word/document.xml')
+        document = etree.fromstring(xmlcontent)
+
+        # Fetch all the text out of the document we just parsed.
+        paratextlist = getdocumenttext(document)
+
+        # Encode each paragraph explicitly as UTF-8.
+        newparatextlist = []
+        for paratext in paratextlist:
+            newparatextlist.append(paratext.encode("utf-8"))
+
+        # Join the document's text with a blank line between paragraphs.
+        description = '\n\n'.join(newparatextlist)
+
+        # Hash the extracted text so an unchanged log is only saved once.
+        hash = scrape.mkhash(description)
+        doc = foidocsdb.get(hash)
+        if doc is None:
+            print "saving " + hash
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': hash, 'agencyID': self.getAgencyID(),
+                'url': self.getURL(), 'docID': hash, 'date': edate,
+                'title': "Disclosure Log Updated",
+                'description': description}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
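+
+# For reference, the shape of the document saved to CouchDB (field values
+# below are illustrative; the _id/docID hash comes from scrape.mkhash over
+# the extracted text):
+#
+#   {"_id": "<hash>", "agencyID": "example",
+#    "url": "http://www.example.gov.au/foi/disclosurelog.docx",
+#    "docID": "<hash>", "date": "2012-11-26",
+#    "title": "Disclosure Log Updated", "description": "..."}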
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):