From: maxious Date: Sat, 17 Dec 2011 09:11:38 +0000 Subject: remove sag copy X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=600e43dee78240df60eb2d35a203501564215331 --- remove sag copy --- --- a/sag +++ /dev/null --- a/scrape.py +++ b/scrape.py @@ -1,6 +1,8 @@ #http://packages.python.org/CouchDB/client.html import couchdb import urllib2 +from BeautifulSoup import BeautifulSoup +import re couch = couchdb.Server() # Assuming localhost:5984 # If your CouchDB server is running elsewhere, set it up like this: @@ -43,9 +45,15 @@ print "the web page has not been modified" else: print "error %s in downloading %s", url_handle.code, URL - #record/alert error + #record/alert error to error database #do scraping + html = ? + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(html) +links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) +for link in links: + print link['href'] #for each unique link #if html mimetype # go down X levels,