--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -12,6 +12,10 @@
 import urllib
 import urlparse
 
+def mkhash(data):
+    """Return the hex md5 digest of *data*, encoded as a UTF-8 byte string."""
+    return hashlib.md5(data).hexdigest().encode("utf-8")
+
 def canonurl(url):
     r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
     if the URL looks invalid.
@@ -63,6 +67,12 @@
     url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
     return url[:4096]
 
+def fullurl(url, href):
+    """Resolve *href* against *url*, percent-encoding spaces and stripping any #fragment."""
+    href = href.replace(" ", "%20")
+    href = re.sub('#.*$', '', href)
+    return urljoin(url, href)
+
 #http://diveintopython.org/http_web_services/etags.html
 class NotModifiedHandler(urllib2.BaseHandler):
     def http_error_304(self, req, fp, code, message, headers):
@@ -72,7 +82,7 @@
 
 def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
     url = canonurl(url)
-    hash = hashlib.md5(url).hexdigest().encode("utf-8")
+    hash = mkhash(url)
     req = urllib2.Request(url)
     print "Fetching %s" % url
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
@@ -174,9 +184,7 @@
                 None
             else:
                 # remove anchors and spaces in urls
-                link['href'] = link['href'].replace(" ","%20")
-                link['href'] = re.sub('#.*$','',link['href'])
-                linkurls.add(urljoin(url,link['href']))
+                linkurls.add(fullurl(url,link['href']))
     for linkurl in linkurls:
         #print linkurl
         scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)