scraper fixes
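Replace the built-in _sum/_count reducers in the CouchDB design document with explicit JavaScript reduce functions and drop the getStatistics view; strip control characters from scraped titles and descriptions before saving; correct the "1012" year typo when parsing dates; remove the getTable override from one scraper.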


Former-commit-id: 7c09d22a257167842febb35ef0a1605548e871c2

--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -25,9 +25,9 @@
 $obj->_id = "_design/" . urlencode("app");
 $obj->language = "javascript";
 $obj->views->web_server->map = "function(doc) {\n  emit(doc.web_server, 1);\n}";
-$obj->views->web_server->reduce = "_sum";
+$obj->views->web_server->reduce = "function (key, values, rereduce) {\n    return sum(values);\n}";
 $obj->views->byAgency->map = "function(doc) {\n  emit(doc.agencyID, 1);\n}";
-$obj->views->byAgency->reduce = "_sum";
+$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n    return sum(values);\n}";
 $obj->views->byURL->map = "function(doc) {\n  emit(doc.url, doc);\n}";
 $obj->views->agency->map = "function(doc) {\n  emit(doc.agencyID, doc);\n}";
 $obj->views->byWebServer->map = "function(doc) {\n  emit(doc.web_server, doc);\n}";
@@ -106,17 +106,6 @@
     emit(null, [doc._rev].concat(doc._conflicts));
   }
 }";
-$obj->views->getStatistics->map = 
-"function(doc) {
-  if (doc.statistics) {
-	for (var statisticSet in doc.statistics)  {
-for (var statisticPeriod in doc.statistics[statisticSet])  {
-    emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
-}
-}
-  }
-}";
-$obj->views->getStatistics->reduce = '_sum';
 // http://stackoverflow.com/questions/646628/javascript-startswith
 $obj->views->score->map = 'if(!String.prototype.startsWith){
     String.prototype.startsWith = function (str) {
@@ -162,7 +151,9 @@
   emit("total", 1);
   }
 }';
-$obj->views->scoreHas->reduce = '_sum';
+$obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
+    return sum(values);
+}';
 $obj->views->fieldNames->map = '
 function(doc) {
 for(var propName in doc) {
@@ -170,7 +161,9 @@
 	}
   
 }';
-$obj->views->fieldNames->reduce = '_count';
+$obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
+    return values.length;
+}';
 // allow safe updates (even if slightly slower due to extra: rev-detection check).
 $db->save($obj, true);
 ?>

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -72,7 +72,7 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -104,7 +104,7 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -202,7 +202,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate

--- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -6,8 +6,6 @@
 
 #http://www.doughellmann.com/PyMOTW/abc/
 class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-        def getTable(self,soup):
-                return soup.find(_class = "article-content").table
         def getColumnCount(self):
                 return 5
         def getColumns(self,columns):

--- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
+++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
@@ -21,6 +21,7 @@
     			if i < 2:
 				title = title + string
 			i = i+1
+		title = self.remove_control_chars(title)
                 doc.update({'title': title})
 		print title
                 return