More trip planner testing with colors
[busui.git] / labs / openlayers / tools / BeautifulSoup.py
blob:a/labs/openlayers/tools/BeautifulSoup.py -> blob:b/labs/openlayers/tools/BeautifulSoup.py
  """Beautiful Soup
  Elixir and Tonic
  "The Screen-Scraper's Friend"
  http://www.crummy.com/software/BeautifulSoup/
   
  Beautiful Soup parses a (possibly invalid) XML or HTML document into a
  tree representation. It provides methods and Pythonic idioms that make
  it easy to navigate, search, and modify the tree.
   
  A well-formed XML/HTML document yields a well-formed data
  structure. An ill-formed XML/HTML document yields a correspondingly
  ill-formed data structure. If your document is only locally
  well-formed, you can use this library to find and process the
  well-formed part of it. The BeautifulSoup class
   
  Beautiful Soup works with Python 2.2 and up. It has no external
  dependencies, but you'll have more success at converting data to UTF-8
  if you also install these three packages:
   
  * chardet, for auto-detecting character encodings
  http://chardet.feedparser.org/
  * cjkcodecs and iconv_codec, which add more encodings to the ones supported
  by stock Python.
  http://cjkpython.i18n.org/
   
  Beautiful Soup defines classes for two main parsing strategies:
   
  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  language that kind of looks like XML.
   
  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  or invalid. This class has web browser-like heuristics for
  obtaining a sensible parse tree in the face of common HTML errors.
   
  Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
  the encoding of an HTML or XML document, and converting it to
  Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
   
  For more than you ever wanted to know about Beautiful Soup, see the
  documentation:
  http://www.crummy.com/software/BeautifulSoup/documentation.html
   
  """
  from __future__ import generators
   
  __author__ = "Leonard Richardson (leonardr@segfault.org)"
  __version__ = "3.0.4"
  __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
  __license__ = "PSF"
   
  from sgmllib import SGMLParser, SGMLParseError
  import codecs
  import types
  import re
  import sgmllib
  try:
  from htmlentitydefs import name2codepoint
  except ImportError:
  name2codepoint = {}
   
  #This hack makes Beautiful Soup able to parse XML with namespaces
  sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
   
  DEFAULT_OUTPUT_ENCODING = "utf-8"
   
  # First, the classes that represent markup elements.
   
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # We are being appended: the parent's current last child
            # becomes our previous sibling.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with replaceWith, which may
        be one of this element's existing siblings."""
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            # BUGFIX: was "if index and index < myIndex", which skipped
            # the adjustment when the replacement sibling sat at index 0.
            if index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                # Already missing from the parent's contents; continue so
                # the next/previous links still get unhooked below.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild (a Tag or string) into this element's
        contents at the given position (clamped to the current length),
        rewiring the next/previous and sibling links on both sides."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                # BUGFIX: was "index = self.find(newChild)", which returns
                # a matching element (or None) rather than a list position;
                # use the child's real index into contents.
                index = self.contents.index(newChild)
                # BUGFIX: was "if index and index < position", which
                # skipped the adjustment when moving the first child.
                if index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Walk up the tree looking for the element that follows this
            # one in document order, if any.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # BUGFIX: **kwargs was dropped here (unlike every sibling finder),
        # so attribute shortcuts like findAllNext(id="nav") were ignored.
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # BUGFIX: **kwargs was accepted but never forwarded, so attribute
        # shortcuts passed to findParent were silently ignored.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a findAll-style method with limit=1 and unwraps the
        one-element result list, or returns None on no match."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            # Caller supplied a prebuilt matcher; use it directly.
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    # NOTE: each generator yields a trailing None once the chain runs
    # out; consumers (_findAll) skip falsy items.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in str with the given
        encoding name (defaulting to utf-8). The parameter name 'str'
        shadows the builtin but is kept for interface compatibility."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify it first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
   
  class NavigableString(unicode, PageElement):
   
  def __getattr__(self, attr):
  """text.string gives you text. This is for backwards
  compatibility for Navigable*String, but for CData* it lets you
  get the string without the CData wrapper."""
  if attr == 'string':
  return self
  else:
  raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
   
    def __unicode__(self):
        # Delegate to __str__ with encoding=None; presumably __str__
        # returns the raw unicode text when given no encoding — __str__
        # is defined further down this class, outside the visible span.
        return self.__str__(None)