Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -198,11 +198,24 @@
 
     def getRows(self, table):
         return table.find_all('tr')
+    def findColumns(self, row):
+        return row.find_all('td')  # every <td> descendant of this row
+
+    def getDocHash(self, id, date, url):
+        """Hash url + the id text, or the date text if id.string is None."""
+        if id.string is None:
+            print "no id, using date as hash"
+            source = date  # fall back to the date cell's text
+        else:
+            source = id
+        return scrape.mkhash(
+            self.remove_control_chars(
+                url + (''.join(source.stripped_strings))))
 
     def getDate(self, content, entry, doc):
         strdate = ''.join(content.stripped_strings).strip()
         (a, b, c) = strdate.partition("(")
-        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")replace("Janrurary", "January").replace("1012","2012"))
+        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January").replace("1012","2012"))
         print strdate
         try:
 		edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
@@ -234,20 +247,13 @@
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
-                    columns = row.find_all('td')
+                    columns = self.findColumns(row)
                     if len(columns) is self.getColumnCount():
                         (id, date, title,
                          description, notes) = self.getColumns(columns)
                         print self.remove_control_chars(
                             ''.join(id.stripped_strings))
-                        if id.string is None:
-                            dochash = scrape.mkhash(
-                                self.remove_control_chars(
-                                    url + (''.join(date.stripped_strings))))
-                        else:
-                            dochash = scrape.mkhash(
-                                self.remove_control_chars(
-                                    url + (''.join(id.stripped_strings))))
+                        dochash = self.getDocHash(id,date,url)
                         doc = foidocsdb.get(dochash)
 
                         if doc is None: