better date parser
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -9,6 +9,20 @@
 import dateutil
 from dateutil.parser import *
 from datetime import *
+import codecs
+
+from StringIO import StringIO
+
+from docx import *
+from lxml import etree
+import zipfile
+
+from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfdevice import PDFDevice, TagExtractor
+from pdfminer.converter import TextConverter
+from pdfminer.cmapdb import CMapDB
+from pdfminer.layout import LAParams
 
 class GenericDisclogScraper(object):
         __metaclass__ = abc.ABCMeta
@@ -34,11 +48,78 @@
 		""" do the scraping """
 		return
 
-	@abc.abstractmethod
-        def getDescription(self, content, entry, doc):
-                """ get description"""
-		return
-
+class GenericPDFDisclogScraper(GenericDisclogScraper):
+    """Scraper for a disclosure log that is published as one PDF file."""
+
+    def doScrape(self):
+        """Fetch the PDF, extract its text and save it if it is new."""
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(
+            scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+
+        rsrcmgr = PDFResourceManager(caching=True)
+        # StringIO names the class here (from StringIO import StringIO).
+        outfp = StringIO()
+        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
+                               laparams=LAParams())
+        fp = StringIO(content)
+        try:
+            process_pdf(rsrcmgr, device, fp, set(), caching=True,
+                        check_extractable=True)
+            # Read the text after conversion and before outfp is closed.
+            description = outfp.getvalue()
+        finally:
+            fp.close()
+            device.close()
+            outfp.close()
+        # The record id is the hash of the text: unchanged PDFs are skipped.
+        dochash = scrape.mkhash(description)
+        doc = foidocsdb.get(dochash)
+        if doc is not None:
+            print "already saved"
+            return
+        print "saving " + dochash
+        edate = datetime.now().strftime("%Y-%m-%d")
+        foidocsdb.save({'_id': dochash, 'agencyID': self.getAgencyID(),
+                        'url': self.getURL(), 'docID': dochash, "date": edate,
+                        "title": "Disclosure Log Updated",
+                        "description": description})
+
+
+class GenericDOCXDisclogScraper(GenericDisclogScraper):
+    """Scraper for a disclosure log that is published as one DOCX file."""
+
+    def doScrape(self):
+        """Fetch the DOCX, extract its text and save it if it is new."""
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(
+            scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+
+        # A .docx file is a zip archive; open the fetched content in memory.
+        mydoc = zipfile.ZipFile(StringIO(content))
+        try:
+            xmlcontent = mydoc.read('word/document.xml')
+        finally:
+            mydoc.close()
+        document = etree.fromstring(xmlcontent)
+
+        # Fetch all the text out of the document we just parsed
+        paratextlist = getdocumenttext(document)
+        # Join the UTF-8 encoded paragraphs with two newlines between them
+        description = '\n\n'.join(
+            paratext.encode("utf-8") for paratext in paratextlist)
+        # The record id is the hash of the text: unchanged files are skipped.
+        dochash = scrape.mkhash(description)
+        doc = foidocsdb.get(dochash)
+        if doc is not None:
+            print "already saved"
+            return
+        print "saving " + dochash
+        edate = datetime.now().strftime("%Y-%m-%d")
+        foidocsdb.save({'_id': dochash, 'agencyID': self.getAgencyID(),
+                        'url': self.getURL(), 'docID': dochash, "date": edate,
+                        "title": "Disclosure Log Updated",
+                        "description": description})
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
@@ -55,7 +136,7 @@
 		  	doc = foidocsdb.get(hash)
 			#print doc
 			if doc == None:
-                        	print "saving"
+                        	print "saving "+ hash
 				edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
                                 doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
                                 "date": edate,"title": entry.title}
@@ -84,14 +165,28 @@
                 doc.update({'description': descriptiontxt})
 		return
         def getTitle(self, content, entry, doc):
-                doc.update({'title': content.string})
+                doc.update({'title': (''.join(content.stripped_strings))})
 		return
 	def getTable(self, soup):
 		return soup.table
+	def getRows(self, table):
+		return table.find_all('tr')
 	def getDate(self, content, entry, doc):
-		edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+		date = ''.join(content.stripped_strings).strip()
+		(a,b,c) = date.partition("(")
+		date = self.remove_control_chars(a.replace("Octber","October"))
+		print date
+		edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
 		print edate
 		doc.update({'date': edate})
+		return
+	def getLinks(self, content, entry, doc):
+                links = []
+                for atag in entry.find_all("a"):
+                       	if atag.has_key('href'):
+                               	links.append(scrape.fullurl(content,atag['href']))
+                if links != []:
+	                doc.update({'links': links})
 		return
 
 	def doScrape(self):
@@ -102,31 +197,26 @@
 			# http://www.crummy.com/software/BeautifulSoup/documentation.html
 				soup = BeautifulSoup(content)
 				table = self.getTable(soup)
-				for row in table.find_all('tr'):
+				for row in self.getRows(table):
 					columns = row.find_all('td')
 					if len(columns) == self.getColumnCount():
-						(id, date, description, title, notes) = self.getColumns(columns)
-						print id.string
+						(id, date, title, description, notes) = self.getColumns(columns)
+						print self.remove_control_chars(''.join(id.stripped_strings))
 						if id.string == None:
-							hash = scrape.mkhash(self.remove_control_chars(url+date.string))
+							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
 						else:
-							hash = scrape.mkhash(self.remove_control_chars(url+id.string))
-						links = []
-						for atag in row.find_all("a"):
-							if atag.has_key('href'):
-								links.append(scrape.fullurl(url,atag['href']))
+							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
 						doc = foidocsdb.get(hash)
 							
 						if doc == None:
-							print "saving"
-							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
-                                			if links != []:
-                                        			doc.update({'links': links})
+							print "saving " +hash
+							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))}
+							self.getLinks(self.getURL(),row,doc)
                                 			self.getTitle(title,row, doc)
                                 			self.getDate(date,row, doc)
 							self.getDescription(description,row, doc)
 							if notes != None:
-                                        			doc.update({ 'notes': notes.string})
+                                        			doc.update({ 'notes': (''.join(notes.stripped_strings))})
 							foidocsdb.save(doc)
 						else:
 							print "already saved "+hash