gitphp 0.2.9.1 :: busui.git/blobdiff

"""Beautiful Soup

Elixir and Tonic

"The Screen-Scraper's Friend"

http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses a (possibly invalid) XML or HTML document into a

tree representation. It provides methods and Pythonic idioms that make

it easy to navigate, search, and modify the tree.

A well-formed XML/HTML document yields a well-formed data

structure. An ill-formed XML/HTML document yields a correspondingly

ill-formed data structure. If your document is only locally

well-formed, you can use this library to find and process the

well-formed part of it. The BeautifulSoup class

Beautiful Soup works with Python 2.2 and up. It has no external

dependencies, but you'll have more success at converting data to UTF-8

if you also install these three packages:

* chardet, for auto-detecting character encodings

http://chardet.feedparser.org/

* cjkcodecs and iconv_codec, which add more encodings to the ones supported

by stock Python.

http://cjkpython.i18n.org/

Beautiful Soup defines classes for two main parsing strategies:

* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific

language that kind of looks like XML.

* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid

or invalid. This class has web browser-like heuristics for

obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup also defines a class (UnicodeDammit) for autodetecting

the encoding of an HTML or XML document, and converting it to

Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.

For more than you ever wanted to know about Beautiful Soup, see the

documentation:

http://www.crummy.com/software/BeautifulSoup/documentation.html

"""

from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"

__version__ = "3.0.4"

__license__ = "PSF"

from sgmllib import SGMLParser, SGMLParseError

import codecs

import types

import re

import sgmllib

try:

from htmlentitydefs import name2codepoint

except ImportError:

name2codepoint = {}

#This hack makes Beautiful Soup able to parse XML with namespaces

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

DEFAULT_OUTPUT_ENCODING = "utf-8"

# First, the classes that represent markup elements.

class PageElement:

"""Contains the navigational information for some part of the page

(either a tag or a piece of text)"""

def setup(self, parent=None, previous=None):

"""Sets up the initial relations between this element and

other elements."""

self.parent = parent

self.previous = previous

self.next = None

self.previousSibling = None

self.nextSibling = None

if self.parent and self.parent.contents:

self.previousSibling = self.parent.contents[-1]

self.previousSibling.nextSibling = self

def replaceWith(self, replaceWith):

oldParent = self.parent

myIndex = self.parent.contents.index(self)

if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:

# We're replacing this element with one of its siblings.

index = self.parent.contents.index(replaceWith)

if index and index < myIndex:

# Furthermore, it comes before this element. That

# means that when we extract it, the index of this

# element will change.

myIndex = myIndex - 1

self.extract()

oldParent.insert(myIndex, replaceWith)

def extract(self):

"""Destructively rips this element out of the tree."""

if self.parent:

try:

self.parent.contents.remove(self)

except ValueError:

pass

#Find the two elements that would be next to each other if

#this element (and any children) hadn't been parsed. Connect

#the two.

lastChild = self._lastRecursiveChild()

nextElement = lastChild.next

if self.previous:

self.previous.next = nextElement

if nextElement:

nextElement.previous = self.previous

self.previous = None

lastChild.next = None

self.parent = None

if self.previousSibling:

self.previousSibling.nextSibling = self.nextSibling

if self.nextSibling:

self.nextSibling.previousSibling = self.previousSibling

self.previousSibling = self.nextSibling = None

def _lastRecursiveChild(self):

"Finds the last element beneath this object to be parsed."

lastChild = self

while hasattr(lastChild, 'contents') and lastChild.contents:

lastChild = lastChild.contents[-1]

return lastChild

def insert(self, position, newChild):

if (isinstance(newChild, basestring)

or isinstance(newChild, unicode)) \

and not isinstance(newChild, NavigableString):

newChild = NavigableString(newChild)

position = min(position, len(self.contents))

if hasattr(newChild, 'parent') and newChild.parent != None:

# We're 'inserting' an element that's already one

# of this object's children.

if newChild.parent == self:

index = self.find(newChild)

if index and index < position:

# Furthermore we're moving it further down the

# list of this object's children. That means that

# when we extract this element, our target index

# will jump down one.

position = position - 1

newChild.extract()

newChild.parent = self

previousChild = None

if position == 0:

newChild.previousSibling = None

newChild.previous = self

else:

previousChild = self.contents[position-1]

newChild.previousSibling = previousChild

newChild.previousSibling.nextSibling = newChild

newChild.previous = previousChild._lastRecursiveChild()

if newChild.previous:

newChild.previous.next = newChild

newChildsLastElement = newChild._lastRecursiveChild()

if position >= len(self.contents):

newChild.nextSibling = None

parent = self

parentsNextSibling = None

while not parentsNextSibling:

parentsNextSibling = parent.nextSibling

parent = parent.parent

if not parent: # This is the last element in the document.

break

if parentsNextSibling:

newChildsLastElement.next = parentsNextSibling

else:

newChildsLastElement.next = None

else:

nextChild = self.contents[position]

newChild.nextSibling = nextChild

if newChild.nextSibling:

newChild.nextSibling.previousSibling = newChild

newChildsLastElement.next = nextChild

if newChildsLastElement.next:

newChildsLastElement.next.previous = newChildsLastElement

self.contents.insert(position, newChild)

def findNext(self, name=None, attrs={}, text=None, **kwargs):

"""Returns the first item that matches the given criteria and

appears after this Tag in the document."""

return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

def findAllNext(self, name=None, attrs={}, text=None, limit=None,

**kwargs):

"""Returns all items that match the given criteria and appear

before after Tag in the document."""

return self._findAll(name, attrs, text, limit, self.nextGenerator)

def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):

"""Returns the closest sibling to this Tag that matches the

given criteria and appears after this Tag in the document."""

return self._findOne(self.findNextSiblings, name, attrs, text,

**kwargs)

def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,

**kwargs):

"""Returns the siblings of this Tag that match the given

criteria and appear after this Tag in the document."""

return self._findAll(name, attrs, text, limit,

self.nextSiblingGenerator, **kwargs)

fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

def findPrevious(self, name=None, attrs={}, text=None, **kwargs):

"""Returns the first item that matches the given criteria and

appears before this Tag in the document."""

return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,

**kwargs):

"""Returns all items that match the given criteria and appear

before this Tag in the document."""

return self._findAll(name, attrs, text, limit, self.previousGenerator,

**kwargs)

fetchPrevious = findAllPrevious # Compatibility with pre-3.x

def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):

"""Returns the closest sibling to this Tag that matches the

given criteria and appears before this Tag in the document."""

return self._findOne(self.findPreviousSiblings, name, attrs, text,

**kwargs)

def findPreviousSiblings(self, name=None, attrs={}, text=None,

limit=None, **kwargs):

"""Returns the siblings of this Tag that match the given

criteria and appear before this Tag in the document."""

return self._findAll(name, attrs, text, limit,

self.previousSiblingGenerator, **kwargs)

fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

def findParent(self, name=None, attrs={}, **kwargs):

"""Returns the closest parent of this Tag that matches the given

criteria."""

# NOTE: We can't use _findOne because findParents takes a different

# set of arguments.

r = None

l = self.findParents(name, attrs, 1)

if l:

r = l[0]

return r

def findParents(self, name=None, attrs={}, limit=None, **kwargs):

"""Returns the parents of this Tag that match the given

criteria."""

return self._findAll(name, attrs, None, limit, self.parentGenerator,

**kwargs)

fetchParents = findParents # Compatibility with pre-3.x

#These methods do the real heavy lifting.

def _findOne(self, method, name, attrs, text, **kwargs):

r = None

l = method(name, attrs, text, 1, **kwargs)

if l:

r = l[0]

return r

def _findAll(self, name, attrs, text, limit, generator, **kwargs):

"Iterates over a generator looking for things that match."

if isinstance(name, SoupStrainer):

strainer = name

else:

# Build a SoupStrainer

strainer = SoupStrainer(name, attrs, text, **kwargs)

results = ResultSet(strainer)

g = generator()

while True:

try:

i = g.next()

except StopIteration:

break

if i:

found = strainer.search(i)

if found:

results.append(found)

if limit and len(results) >= limit:

break

return results

#These Generators can be used to navigate starting from both

#NavigableStrings and Tags.

def nextGenerator(self):

i = self

while i:

i = i.next

yield i

def nextSiblingGenerator(self):

i = self

while i:

i = i.nextSibling

yield i

def previousGenerator(self):

i = self

while i:

i = i.previous

yield i

def previousSiblingGenerator(self):

i = self

while i:

i = i.previousSibling

yield i

def parentGenerator(self):

i = self

while i:

i = i.parent

yield i

# Utility methods

def substituteEncoding(self, str, encoding=None):

encoding = encoding or "utf-8"

return str.replace("%SOUP-ENCODING%", encoding)

def toEncoding(self, s, encoding=None):

"""Encodes an object to a string in some encoding, or to Unicode.

."""

if isinstance(s, unicode):

if encoding:

s = s.encode(encoding)

elif isinstance(s, str):

if encoding:

s = s.encode(encoding)

else:

s = unicode(s)

else:

if encoding:

s = self.toEncoding(str(s), encoding)

else:

s = unicode(s)

return s

class NavigableString(unicode, PageElement):

def __getattr__(self, attr):

"""text.string gives you text. This is for backwards

compatibility for Navigable*String, but for CData* it lets you

get the string without the CData wrapper."""

if attr == 'string':

return self

else:

raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

def __unicode__(self):

return self.__str__(None)