Add sag as a git submodule (switch its URL to https) and start BeautifulSoup link scraping in scrape.py
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
[submodule "sag"]
path = sag
- url = git://github.com/sbisbee/sag.git
+ url = https://github.com/sbisbee/sag.git
--- a/scrape.py
+++ b/scrape.py
@@ -1,6 +1,8 @@
#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
+from BeautifulSoup import BeautifulSoup
+import re
couch = couchdb.Server() # Assuming localhost:5984
# If your CouchDB server is running elsewhere, set it up like this:
@@ -43,9 +45,15 @@
print "the web page has not been modified"
else:
print "error %s in downloading %s", url_handle.code, URL
- #record/alert error
+ #record/alert error to error database
#do scraping
+ html = url_handle.read()
+ # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ soup = BeautifulSoup(html)
+ links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
+ for link in links:
+     print link['href']
#for each unique link
#if html mimetype
# go down X levels,