From: Alex Sadleir Date: Thu, 11 Apr 2013 12:24:20 +0000 Subject: Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=5948d2a9216855b7a214dd4f5fcb82ed6af548d3 --- Merge branch 'master' of ssh://apples.lambdacomplex.org/git/disclosr Conflicts: documents/genericScrapers.py documents/runScrapers.sh Former-commit-id: a6f8697ed080934b51ab7b63a3d4428ff5ccdb2b --- --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -72,7 +72,8 @@ edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description), "diff": diff} + "date": edate, "title": "Disclosure Log Updated", + "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)} foidocsdb.save(doc) else: print "already saved" @@ -199,11 +200,16 @@ return table.find_all('tr') def getDate(self, content, entry, doc): - date = ''.join(content.stripped_strings).strip() - (a, b, c) = date.partition("(") - date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janurary","January")) - print date - edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + strdate = ''.join(content.stripped_strings).strip() + (a, b, c) = strdate.partition("(") + strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012")replace("Janrurary", "January").replace("1012","2012")) + print strdate + try: + edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + except ValueError: + print >> sys.stderr, "ERROR date invalid %s " % strdate + print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip() + edate = date.today().strftime("%Y-%m-%d") print edate doc.update({'date': edate}) return @@ -266,8 +272,7 @@ 'Summary of FOIrequest received by agency/minister', 'Summary of FOI request received', 'Description of FOI Request', "FOI request", 'Results 1 to 67 of 67'] - if doc['title'] not in badtitles\ - and doc['description'] != '': + if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '': print "saving" foidocsdb.save(doc) else: @@ -277,6 +282,6 @@ print "header row" else: - print "ERROR number of columns incorrect" + print >> sys.stderr, "ERROR number of columns incorrect" print row --- a/documents/runScrapers.sh +++ b/documents/runScrapers.sh @@ -1,13 +1,21 @@ -rm /tmp/disclosr-error +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +cd $DIR +echo "" > /tmp/disclosr-error for f in scrapers/*.py; do echo "Processing $f file.."; - python $f 2>/tmp/disclosr-error; + md5=`md5sum /tmp/disclosr-error` + python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; + md52=`md5sum /tmp/disclosr-error` + if [ "$md5" != "$md52" ]; then + echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; + fi if [ "$?" -ne "0" ]; then echo "error"; - sleep 2; + sleep 1; fi done if [ -s /tmp/disclosr-error ] ; then + echo "emailling logs.."; mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; fi --- a/documents/scrape.py +++ b/documents/scrape.py @@ -7,6 +7,7 @@ from urlparse import urljoin import time import os +import sys import mimetypes import urllib import urlparse @@ -89,7 +90,7 @@ def getLastAttachment(docsdb, url): hash = mkhash(url) doc = docsdb.get(hash) - if doc != None: + if doc != None and "_attachments" in doc.keys(): last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) return last_attachment @@ -103,7 +104,7 @@ req = urllib2.Request(url) print "Fetching %s (%s)" % (url, hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": - print "Not a valid HTTP url" + print >> sys.stderr, "Not a valid HTTP url" return (None, None, None) doc = docsdb.get(hash) if doc == None: @@ -111,10 +112,15 @@ else: if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): print "Uh oh, trying to scrape URL again too soon!" + hash - last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc, last_attachment_fname) - content = last_attachment - return (doc['url'], doc['mime_type'], content.read()) + if "_attachments" in doc.keys(): + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) + content = last_attachment.read() + mime_type = doc['mime_type'] + else: + content = None + mime_type = None + return (doc['url'], mime_type, content) req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags @@ -159,13 +165,13 @@ #store as attachment epoch-filename except (urllib2.URLError, socket.timeout) as e: - print "error!" + print >> sys.stderr,"error!" error = "" if hasattr(e, 'reason'): error = "error %s in downloading %s" % (str(e.reason), url) elif hasattr(e, 'code'): error = "error %s in downloading %s" % (e.code, url) - print error + print >> sys.stderr, error doc['error'] = error docsdb.save(doc) return (None, None, None) --- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py +++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py @@ -42,7 +42,6 @@ 'data': {'request': '', 'session': '', 'more': ''} } - - amonpy.exception(data) + #amonpy.exception(data) pass --- a/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py +++ b/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py @@ -18,13 +18,13 @@ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(htcontent) - for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'): + rowtitle = soup.find(class_ = "wc-title").find("h1").string + if rowtitle != None: + description = rowtitle + ": " + for row in soup.find(class_ ="wc-content").find_all('td'): if row != None: - rowtitle = row.find('th').string - if rowtitle != None: - description = description + "\n" + rowtitle + ": " - for text in row.find('td').stripped_strings: - description = description + text + for text in row.stripped_strings: + description = description + text + "\n" for atag in row.find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) @@ -37,7 +37,7 @@ def getColumnCount(self): return 2 def getTable(self,soup): - return soup.find(class_ = "ms-rteTable-GreyAlternating") + return soup.find(class_ = "ms-rteTable-default") def getColumns(self,columns): (date, title) = columns return (title, date, title, title, None) --- a/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py +++ b/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py @@ -7,7 +7,7 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getTable(self,soup): - return soup.find(id = "inner_content") + return soup.find(class_="tborder") def getColumnCount(self): return 2 def getColumns(self,columns): --- a/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py +++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py @@ -8,40 +8,14 @@ from datetime import * #http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(class_ = "inner-column").table - def getRows(self,table): - return table.tbody.find_all('tr',recursive=False) +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): def getColumnCount(self): - return 3 - def getColumns(self,columns): - (date, title, description) = columns - return (date, date, title, description, None) - def getDate(self, content, entry, doc): - i = 0 - date = "" - for string in content.stripped_strings: - if i ==1: - date = string - i = i+1 - edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") - print edate - doc.update({'date': edate}) - return - def getTitle(self, content, entry, doc): - i = 0 - title = "" - for string in content.stripped_strings: - if i < 2: - title = title + string - i = i+1 - doc.update({'title': title}) - #print title - return + return 0 if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) +#http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx +#http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericHTMLDisclogScraper) ScraperImplementation().doScrape() --- a/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py +++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py @@ -6,8 +6,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(id = "content_div_50269").table def getColumns(self,columns): (id, date, title, description, notes) = columns return (id, date, title, description, notes) --- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py +++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py @@ -21,11 +21,15 @@ d.make_links_absolute(base_url = self.getURL()) for table in d('table').items(): title= table('thead').text() - print title + print self.remove_control_chars(title) (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text()) links = table('a').map(lambda i, e: pq(e).attr('href')) description = descA+" "+descB - edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + try: + edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + except ValueError: + edate = date.today().strftime("%Y-%m-%d") + pass print edate dochash = scrape.mkhash(self.remove_control_chars(title)) doc = foidocsdb.get(dochash) --- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py +++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py @@ -18,10 +18,10 @@ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(htcontent) - for text in soup.find(id="divFullWidthColumn").stripped_strings: + for text in soup.find(class_ = "mainContent").stripped_strings: description = description + text.encode('ascii', 'ignore') - for atag in soup.find(id="divFullWidthColumn").find_all("a"): + for atag in soup.find(id="SortingTable").find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) --- a/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py +++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py @@ -7,11 +7,11 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getTable(self,soup): - return soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2").table + return soup.find(id="int-content").table def getColumnCount(self): - return 4 + return 3 def getColumns(self,columns): - (blank,id, title,date) = columns + (id, title,date) = columns return (id, date, title, title, None) if __name__ == '__main__': --- /dev/null +++ b/documents/scrapers/b0ca7fddcd1c965787daea47f2d32e0a.py @@ -1,1 +1,17 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py +++ b/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py @@ -10,7 +10,7 @@ (id, date, title, description, notes) = columns return (id, date, title, description, notes) def getTable(self,soup): - return soup.find(class_ = "content") + return soup.find(class_ = "simpletable") if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- a/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py +++ b/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py @@ -10,7 +10,7 @@ (id, date, title, description, notes) = columns return (id, date, title, description, notes) def getTable(self,soup): - return soup.find(id = "content").table + return soup.find("table") if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)