--- a/scrape.py
+++ b/scrape.py
@@ -1,6 +1,8 @@
 #http://packages.python.org/CouchDB/client.html
 import couchdb
 import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
 couch = couchdb.Server() # Assuming localhost:5984
 # If your CouchDB server is running elsewhere, set it up like this:
@@ -43,9 +45,15 @@
     print "the web page has not been modified"
 else:
     print "error %s in downloading %s", url_handle.code, URL
-    #record/alert error
+    #record/alert error to error database
 
 #do scraping
+html = url_handle.read()  # body of the fetched page
+# http://www.crummy.com/software/BeautifulSoup/documentation.html
+soup = BeautifulSoup(html)
+links = soup.findAll('a')  # e.g. soup.findAll('a', id=re.compile("^p-"))
+for link in links:
+    print link['href']
 #for each unique link
 #if html mimetype
 # go down X levels,
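
The trailing comments sketch the next step: follow each unique link, descend only into HTML responses, and stop after a fixed number of levels. A minimal Python 2 sketch of that idea follows; the crawl() helper, the max_depth parameter, and the urljoin handling of relative links are illustrative assumptions, not part of the patch.

# Rough sketch of the crawl described by the trailing comments --
# hypothetical helper, not part of scrape.py.
import urllib2
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup

def crawl(start_url, max_depth):
    seen = set()  # track unique links so each page is fetched only once

    def visit(url, level):
        if level > max_depth or url in seen:
            return
        seen.add(url)
        handle = urllib2.urlopen(url)
        # only descend into HTML responses
        if handle.info().gettype() != 'text/html':
            return
        soup = BeautifulSoup(handle.read())
        for link in soup.findAll('a', href=True):
            next_url = urljoin(url, link['href'])  # resolve relative links
            print next_url
            visit(next_url, level + 1)

    visit(start_url, 0)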