|
"""Beautiful Soup |
|
Elixir and Tonic |
|
"The Screen-Scraper's Friend" |
|
http://www.crummy.com/software/BeautifulSoup/ |
|
|
|
Beautiful Soup parses a (possibly invalid) XML or HTML document into a |
|
tree representation. It provides methods and Pythonic idioms that make |
|
it easy to navigate, search, and modify the tree. |
|
|
|
A well-formed XML/HTML document yields a well-formed data |
|
structure. An ill-formed XML/HTML document yields a correspondingly |
|
ill-formed data structure. If your document is only locally |
|
well-formed, you can use this library to find and process the |
|
well-formed part of it. The BeautifulSoup class |
|
|
|
Beautiful Soup works with Python 2.2 and up. It has no external |
|
dependencies, but you'll have more success at converting data to UTF-8 |
|
if you also install these three packages: |
|
|
|
* chardet, for auto-detecting character encodings |
|
http://chardet.feedparser.org/ |
|
* cjkcodecs and iconv_codec, which add more encodings to the ones supported |
|
by stock Python. |
|
http://cjkpython.i18n.org/ |
|
|
|
Beautiful Soup defines classes for two main parsing strategies: |
|
|
|
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
|
language that kind of looks like XML. |
|
|
|
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid |
|
or invalid. This class has web browser-like heuristics for |
|
obtaining a sensible parse tree in the face of common HTML errors. |
|
|
|
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting |
|
the encoding of an HTML or XML document, and converting it to |
|
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. |
|
|
|
For more than you ever wanted to know about Beautiful Soup, see the |
|
documentation: |
|
http://www.crummy.com/software/BeautifulSoup/documentation.html |
|
|
|
""" |
|
from __future__ import generators |
|
|
|
__author__ = "Leonard Richardson (leonardr@segfault.org)" |
|
__version__ = "3.0.4" |
|
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" |
|
__license__ = "PSF" |
|
|
|
from sgmllib import SGMLParser, SGMLParseError |
|
import codecs |
|
import types |
|
import re |
|
import sgmllib |
|
try: |
|
from htmlentitydefs import name2codepoint |
|
except ImportError: |
|
name2codepoint = {} |
|
|
|
#This hack makes Beautiful Soup able to parse XML with namespaces |
|
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') |
|
|
|
DEFAULT_OUTPUT_ENCODING = "utf-8" |
|
|
|
# First, the classes that represent markup elements. |
|
|
|
class PageElement: |
|
"""Contains the navigational information for some part of the page |
|
(either a tag or a piece of text)""" |
|
|
|
def setup(self, parent=None, previous=None): |
|
"""Sets up the initial relations between this element and |
|
other elements.""" |
|
self.parent = parent |
|
self.previous = previous |
|
self.next = None |
|
self.previousSibling = None |
|
self.nextSibling = None |
|
if self.parent and self.parent.contents: |
|
self.previousSibling = self.parent.contents[-1] |
|
self.previousSibling.nextSibling = self |
|
|
|
def replaceWith(self, replaceWith): |
|
oldParent = self.parent |
|
myIndex = self.parent.contents.index(self) |
|
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: |
|
# We're replacing this element with one of its siblings. |
|
index = self.parent.contents.index(replaceWith) |
|
if index and index < myIndex: |
|
# Furthermore, it comes before this element. That |
|
# means that when we extract it, the index of this |
|
# element will change. |
|
myIndex = myIndex - 1 |
|
self.extract() |
|
oldParent.insert(myIndex, replaceWith) |
|
|
|
def extract(self): |
|
"""Destructively rips this element out of the tree.""" |
|
if self.parent: |
|
try: |
|
self.parent.contents.remove(self) |
|
except ValueError: |
|
pass |
|
|
|
#Find the two elements that would be next to each other if |
|
#this element (and any children) hadn't been parsed. Connect |
|
#the two. |
|
lastChild = self._lastRecursiveChild() |
|
nextElement = lastChild.next |
|
|
|
if self.previous: |
|
self.previous.next = nextElement |
|
if nextElement: |
|
nextElement.previous = self.previous |
|
self.previous = None |
|
lastChild.next = None |
|
|
|
self.parent = None |
|
if self.previousSibling: |
|
self.previousSibling.nextSibling = self.nextSibling |
|
if self.nextSibling: |
|
self.nextSibling.previousSibling = self.previousSibling |
|
self.previousSibling = self.nextSibling = None |
|
|
|
def _lastRecursiveChild(self): |
|
"Finds the last element beneath this object to be parsed." |
|
lastChild = self |
|
while hasattr(lastChild, 'contents') and lastChild.contents: |
|
lastChild = lastChild.contents[-1] |
|
return lastChild |
|
|
|
def insert(self, position, newChild): |
|
if (isinstance(newChild, basestring) |
|
or isinstance(newChild, unicode)) \ |
|
and not isinstance(newChild, NavigableString): |
|
newChild = NavigableString(newChild) |
|
|
|
position = min(position, len(self.contents)) |
|
if hasattr(newChild, 'parent') and newChild.parent != None: |
|
# We're 'inserting' an element that's already one |
|
# of this object's children. |
|
if newChild.parent == self: |
|
index = self.find(newChild) |
|
if index and index < position: |
|
# Furthermore we're moving it further down the |
|
# list of this object's children. That means that |
|
# when we extract this element, our target index |
|
# will jump down one. |
|
position = position - 1 |
|
newChild.extract() |
|
|
|
newChild.parent = self |
|
previousChild = None |
|
if position == 0: |
|
newChild.previousSibling = None |
|
newChild.previous = self |
|
else: |
|
previousChild = self.contents[position-1] |
|
newChild.previousSibling = previousChild |
|
newChild.previousSibling.nextSibling = newChild |
|
newChild.previous = previousChild._lastRecursiveChild() |
|
if newChild.previous: |
|
newChild.previous.next = newChild |
|
|
|
newChildsLastElement = newChild._lastRecursiveChild() |
|
|
|
if position >= len(self.contents): |
|
newChild.nextSibling = None |
|
|
|
parent = self |
|
parentsNextSibling = None |
|
while not parentsNextSibling: |
|
parentsNextSibling = parent.nextSibling |
|
parent = parent.parent |
|
if not parent: # This is the last element in the document. |
|
break |
|
if parentsNextSibling: |
|
newChildsLastElement.next = parentsNextSibling |
|
else: |
|
newChildsLastElement.next = None |
|
else: |
|
nextChild = self.contents[position] |
|
newChild.nextSibling = nextChild |
|
if newChild.nextSibling: |
|
newChild.nextSibling.previousSibling = newChild |
|
newChildsLastElement.next = nextChild |
|
|
|
if newChildsLastElement.next: |
|
newChildsLastElement.next.previous = newChildsLastElement |
|
self.contents.insert(position, newChild) |
|
|
|
def findNext(self, name=None, attrs={}, text=None, **kwargs): |
|
"""Returns the first item that matches the given criteria and |
|
appears after this Tag in the document.""" |
|
return self._findOne(self.findAllNext, name, attrs, text, **kwargs) |
|
|
|
def findAllNext(self, name=None, attrs={}, text=None, limit=None, |
|
**kwargs): |
|
"""Returns all items that match the given criteria and appear |
|
before after Tag in the document.""" |
|
return self._findAll(name, attrs, text, limit, self.nextGenerator) |
|
|
|
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): |
|
"""Returns the closest sibling to this Tag that matches the |
|
given criteria and appears after this Tag in the document.""" |
|
return self._findOne(self.findNextSiblings, name, attrs, text, |
|
**kwargs) |
|
|
|
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, |
|
**kwargs): |
|
"""Returns the siblings of this Tag that match the given |
|
criteria and appear after this Tag in the document.""" |
|
return self._findAll(name, attrs, text, limit, |
|
self.nextSiblingGenerator, **kwargs) |
|
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x |
|
|
|
def findPrevious(self, name=None, attrs={}, text=None, **kwargs): |
|
"""Returns the first item that matches the given criteria and |
|
appears before this Tag in the document.""" |
|
return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) |
|
|
|
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, |
|
**kwargs): |
|
"""Returns all items that match the given criteria and appear |
|
before this Tag in the document.""" |
|
return self._findAll(name, attrs, text, limit, self.previousGenerator, |
|
**kwargs) |
|
fetchPrevious = findAllPrevious # Compatibility with pre-3.x |
|
|
|
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): |
|
"""Returns the closest sibling to this Tag that matches the |
|
given criteria and appears before this Tag in the document.""" |
|
return self._findOne(self.findPreviousSiblings, name, attrs, text, |
|
**kwargs) |
|
|
|
def findPreviousSiblings(self, name=None, attrs={}, text=None, |
|
limit=None, **kwargs): |
|
"""Returns the siblings of this Tag that match the given |
|
criteria and appear before this Tag in the document.""" |
|
return self._findAll(name, attrs, text, limit, |
|
self.previousSiblingGenerator, **kwargs) |
|
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x |
|
|
|
def findParent(self, name=None, attrs={}, **kwargs): |
|
"""Returns the closest parent of this Tag that matches the given |
|
criteria.""" |
|
# NOTE: We can't use _findOne because findParents takes a different |
|
# set of arguments. |
|
r = None |
|
l = self.findParents(name, attrs, 1) |
|
if l: |
|
r = l[0] |
|
return r |
|
|
|
def findParents(self, name=None, attrs={}, limit=None, **kwargs): |
|
"""Returns the parents of this Tag that match the given |
|
criteria.""" |
|
|
|
return self._findAll(name, attrs, None, limit, self.parentGenerator, |
|
**kwargs) |
|
fetchParents = findParents # Compatibility with pre-3.x |
|
|
|
#These methods do the real heavy lifting. |
|
|
|
def _findOne(self, method, name, attrs, text, **kwargs): |
|
r = None |
|
l = method(name, attrs, text, 1, **kwargs) |
|
if l: |
|
r = l[0] |
|
return r |
|
|
|
def _findAll(self, name, attrs, text, limit, generator, **kwargs): |
|
"Iterates over a generator looking for things that match." |
|
|
|
if isinstance(name, SoupStrainer): |
|
strainer = name |
|
else: |
|
# Build a SoupStrainer |
|
strainer = SoupStrainer(name, attrs, text, **kwargs) |
|
results = ResultSet(strainer) |
|
g = generator() |
|
while True: |
|
try: |
|
i = g.next() |
|
except StopIteration: |
|
break |
|
if i: |
|
found = strainer.search(i) |
|
if found: |
|
results.append(found) |
|
if limit and len(results) >= limit: |
|
break |
|
return results |
|
|
|
#These Generators can be used to navigate starting from both |
|
#NavigableStrings and Tags. |
|
def nextGenerator(self): |
|
i = self |
|
while i: |
|
i = i.next |
|
yield i |
|
|
|
def nextSiblingGenerator(self): |
|
i = self |
|
while i: |
|
i = i.nextSibling |
|
yield i |
|
|
|
def previousGenerator(self): |
|
i = self |
|
while i: |
|
i = i.previous |
|
yield i |
|
|
|
def previousSiblingGenerator(self): |
|
i = self |
|
while i: |
|
i = i.previousSibling |
|
yield i |
|
|
|
def parentGenerator(self): |
|
i = self |
|
while i: |
|
i = i.parent |
|
yield i |
|
|
|
# Utility methods |
|
def substituteEncoding(self, str, encoding=None): |
|
encoding = encoding or "utf-8" |
|
return str.replace("%SOUP-ENCODING%", encoding) |
|
|
|
def toEncoding(self, s, encoding=None): |
|
"""Encodes an object to a string in some encoding, or to Unicode. |
|
.""" |
|
if isinstance(s, unicode): |
|
if encoding: |
|
s = s.encode(encoding) |
|
elif isinstance(s, str): |
|
if encoding: |
|
s = s.encode(encoding) |
|
else: |
|
s = unicode(s) |
|
else: |
|
if encoding: |
|
s = self.toEncoding(str(s), encoding) |
|
else: |
|
s = unicode(s) |
|
return s |
|
|
|
class NavigableString(unicode, PageElement): |
|
|
|
def __getattr__(self, attr): |
|
"""text.string gives you text. This is for backwards |
|
compatibility for Navigable*String, but for CData* it lets you |
|
get the string without the CData wrapper.""" |
|
if attr == 'string': |
|
return self |
|
else: |
|
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) |
|
|
|
def __unicode__(self): |
|
return self.__str__(None) |
|
|
|