remove sag copy
directory:
a/sag
(deleted)
--- a/sag
+++ /dev/null
--- a/scrape.py
+++ b/scrape.py
@@ -1,6 +1,8 @@
#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
couch = couchdb.Server() # Assuming localhost:5984
# If your CouchDB server is running elsewhere, set it up like this:
@@ -43,9 +45,15 @@
print "the web page has not been modified"
else:
print "error %s in downloading %s", url_handle.code, URL
- #record/alert error
+ #record the error in the error database and raise an alert
#do scraping
+ html = ?
+ # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ soup = BeautifulSoup(html)
+links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
+for link in links:
+ print link['href']
#for each unique link
#if html mimetype
# go down X levels,