better date parser
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -9,6 +9,7 @@
 import dateutil
 from dateutil.parser import *
 from datetime import *
+import codecs
 
 class GenericDisclogScraper(object):
         __metaclass__ = abc.ABCMeta
@@ -55,7 +56,7 @@
 		  	doc = foidocsdb.get(hash)
 			#print doc
 			if doc == None:
-                        	print "saving"
+                        	print "saving "+ hash
 				edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
                                 doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
                                 "date": edate,"title": entry.title}
@@ -84,14 +85,28 @@
                 doc.update({'description': descriptiontxt})
 		return
         def getTitle(self, content, entry, doc):
-                doc.update({'title': content.string})
+                doc.update({'title': (''.join(content.stripped_strings))})
 		return
 	def getTable(self, soup):
 		return soup.table
+	def getRows(self, table):
+		return table.find_all('tr')
 	def getDate(self, content, entry, doc):
-		edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+		date = ''.join(content.stripped_strings).strip()
+		(a,b,c) = date.partition("(")
+		date = self.remove_control_chars(a.replace("Octber","October"))
+		print date
+		edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
 		print edate
 		doc.update({'date': edate})
+		return
+	def getLinks(self, content, entry, doc):
+                links = []
+                for atag in entry.find_all("a"):
+                       	if atag.has_key('href'):
+                               	links.append(scrape.fullurl(content,atag['href']))
+                if links != []:
+	                doc.update({'links': links})
 		return
 
 	def doScrape(self):
@@ -102,31 +117,26 @@
 			# http://www.crummy.com/software/BeautifulSoup/documentation.html
 				soup = BeautifulSoup(content)
 				table = self.getTable(soup)
-				for row in table.find_all('tr'):
+				for row in self.getRows(table):
 					columns = row.find_all('td')
 					if len(columns) == self.getColumnCount():
-						(id, date, description, title, notes) = self.getColumns(columns)
-						print id.string
+						(id, date, title, description, notes) = self.getColumns(columns)
+						print self.remove_control_chars(''.join(id.stripped_strings))
 						if id.string == None:
-							hash = scrape.mkhash(self.remove_control_chars(url+date.string))
+							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
 						else:
-							hash = scrape.mkhash(self.remove_control_chars(url+id.string))
-						links = []
-						for atag in row.find_all("a"):
-							if atag.has_key('href'):
-								links.append(scrape.fullurl(url,atag['href']))
+							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
 						doc = foidocsdb.get(hash)
 							
 						if doc == None:
-							print "saving"
-							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
-                                			if links != []:
-                                        			doc.update({'links': links})
+							print "saving " +hash
+							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))}
+							self.getLinks(self.getURL(),row,doc)
                                 			self.getTitle(title,row, doc)
                                 			self.getDate(date,row, doc)
 							self.getDescription(description,row, doc)
 							if notes != None:
-                                        			doc.update({ 'notes': notes.string})
+                                        			doc.update({ 'notes': (''.join(notes.stripped_strings))})
 							foidocsdb.save(doc)
 						else:
 							print "already saved "+hash