From: Maxious Date: Sat, 09 Mar 2013 07:32:06 +0000 Subject: error reporting features X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=bc1dab010b441f449bbf69713a38b04281caf83b --- error reporting features Former-commit-id: 4458096bdd46a0e420126ab910fbf68cbdd986f0 --- --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -72,7 +72,8 @@ edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description), "diff": diff} + "date": edate, "title": "Disclosure Log Updated", + "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)} foidocsdb.save(doc) else: print "already saved" @@ -277,6 +278,6 @@ print "header row" else: - print "ERROR number of columns incorrect" + print >> sys.stderr, "ERROR number of columns incorrect" print row --- a/documents/runScrapers.sh +++ b/documents/runScrapers.sh @@ -1,10 +1,20 @@ -for f in scrapers/*.py; - do echo "Processing $f file.."; - python $f; +echo "" > /tmp/disclosr-error +for f in scrapers/*.py; do + echo "Processing $f file.."; + md5=`md5sum /tmp/disclosr-error` + python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; + md52=`md5sum /tmp/disclosr-error` + if [ "$md5" != "$md52" ]; then + echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; + fi if [ "$?" -ne "0" ]; then echo "error"; - sleep 2; + sleep 1; fi done +if [ -s /tmp/disclosr-error ] ; then + echo "emailling logs.."; + mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; +fi --- a/documents/scrape.py +++ b/documents/scrape.py @@ -7,14 +7,15 @@ from urlparse import urljoin import time import os +import sys import mimetypes import urllib import urlparse import socket #couch = couchdb.Server('http://192.168.1.148:5984/') -couch = couchdb.Server('http://192.168.1.113:5984/') -#couch = couchdb.Server('http://127.0.0.1:5984/') +#couch = couchdb.Server('http://192.168.1.113:5984/') +couch = couchdb.Server('http://127.0.0.1:5984/') def mkhash(input): @@ -103,7 +104,7 @@ req = urllib2.Request(url) print "Fetching %s (%s)" % (url, hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": - print "Not a valid HTTP url" + print >> sys.stderr, "Not a valid HTTP url" return (None, None, None) doc = docsdb.get(hash) if doc == None: @@ -159,13 +160,13 @@ #store as attachment epoch-filename except (urllib2.URLError, socket.timeout) as e: - print "error!" + print >> sys.stderr,"error!" error = "" if hasattr(e, 'reason'): error = "error %s in downloading %s" % (str(e.reason), url) elif hasattr(e, 'code'): error = "error %s in downloading %s" % (e.code, url) - print error + print >> sys.stderr, error doc['error'] = error docsdb.save(doc) return (None, None, None) --- a/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py +++ b/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py @@ -18,13 +18,13 @@ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(htcontent) - for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'): + rowtitle = soup.find(class_ = "wc-title").find("h1").string + if rowtitle != None: + description = rowtitle + ": " + for row in soup.find(class_ ="wc-content").find_all('td'): if row != None: - rowtitle = row.find('th').string - if rowtitle != None: - description = description + "\n" + rowtitle + ": " - for text in row.find('td').stripped_strings: - description = description + text + for text in row.stripped_strings: + description = description + text + "\n" for atag in row.find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) @@ -37,7 +37,7 @@ def getColumnCount(self): return 2 def getTable(self,soup): - return soup.find(class_ = "ms-rteTable-GreyAlternating") + return soup.find(class_ = "ms-rteTable-default") def getColumns(self,columns): (date, title) = columns return (title, date, title, title, None) --- a/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py +++ b/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py @@ -10,7 +10,7 @@ (id, date, title, description, notes) = columns return (id, date, title, description, notes) def getTable(self,soup): - return soup.find(id = "content").table + return soup.find("table") if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)