From: maxious Date: Sun, 27 Jan 2013 05:58:09 +0000 Subject: scraper fixes X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=ffec37058d392034b3ecacdb55c1ab4b0a2e58f6 --- scraper fixes Former-commit-id: 7c09d22a257167842febb35ef0a1605548e871c2 --- --- a/admin/refreshDesignDoc.php +++ b/admin/refreshDesignDoc.php @@ -25,9 +25,9 @@ $obj->_id = "_design/" . urlencode("app"); $obj->language = "javascript"; $obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}"; -$obj->views->web_server->reduce = "_sum"; +$obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; $obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}"; -$obj->views->byAgency->reduce = "_sum"; +$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; $obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; $obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; $obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; @@ -106,17 +106,6 @@ emit(null, [doc._rev].concat(doc._conflicts)); } }"; -$obj->views->getStatistics->map = -"function(doc) { - if (doc.statistics) { - for (var statisticSet in doc.statistics) { -for (var statisticPeriod in doc.statistics[statisticSet]) { - emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']); -} -} - } -}"; -$obj->views->getStatistics->reduce = '_sum'; // http://stackoverflow.com/questions/646628/javascript-startswith $obj->views->score->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { @@ -162,7 +151,9 @@ emit("total", 1); } }'; -$obj->views->scoreHas->reduce = '_sum'; +$obj->views->scoreHas->reduce = 'function (key, values, rereduce) { + return sum(values); +}'; $obj->views->fieldNames->map = ' function(doc) { for(var propName in doc) { @@ -170,7 +161,9 @@ } }'; -$obj->views->fieldNames->reduce = '_count'; +$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { + return values.length; +}'; // allow safe updates (even if slightly slower due to extra: rev-detection check). $db->save($obj, true); ?> --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -72,7 +72,7 @@ edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)} foidocsdb.save(doc) else: print "already saved" @@ -104,7 +104,7 @@ edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)} foidocsdb.save(doc) else: print "already saved" @@ -202,7 +202,7 @@ def getDate(self, content, entry, doc): date = ''.join(content.stripped_strings).strip() (a, b, c) = date.partition("(") - date = self.remove_control_chars(a.replace("Octber", "October")) + date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")) print date edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate --- a/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py +++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py @@ -6,8 +6,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(_class = "article-content").table def getColumnCount(self): return 5 def getColumns(self,columns): --- a/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py +++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py @@ -21,6 +21,7 @@ if i < 2: title = title + string i = i+1 + title = self.remove_control_chars(title) doc.update({'title': title}) print title return