Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -72,7 +72,8 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description), "diff": diff}
+            "date": edate, "title": "Disclosure Log Updated", 
+	    "description":  self.remove_control_chars(description), "diff": self.remove_control_chars(diff)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -199,11 +200,16 @@
         return table.find_all('tr')
 
     def getDate(self, content, entry, doc):
-        date = ''.join(content.stripped_strings).strip()
-        (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janurary","January"))
-        print date
-        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        strdate = ''.join(content.stripped_strings).strip()
+        (a, b, c) = strdate.partition("(")
+        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")replace("Janrurary", "January").replace("1012","2012"))
+        print strdate
+        try:
+		edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+	except ValueError:
+		print >> sys.stderr, "ERROR date invalid %s " % strdate
+		print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
+		edate = date.today().strftime("%Y-%m-%d")  
         print edate
         doc.update({'date': edate})
         return
@@ -266,8 +272,7 @@
                                          'Summary of FOIrequest received by agency/minister',
                                          'Summary of FOI request received', 'Description of    FOI Request',
                                          "FOI request", 'Results 1 to 67 of 67']
-                            if doc['title'] not in badtitles\
-                            and doc['description'] != '':
+                            if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
                                 print "saving"
                                 foidocsdb.save(doc)
                         else:
@@ -277,6 +282,6 @@
                         print "header row"
 
                     else:
-                        print "ERROR number of columns incorrect"
+                        print >> sys.stderr, "ERROR number of columns incorrect"
                         print row