"""
Manipulate HTML or XHTML documents.
Version 1.1.1. This source code has been placed in the
public domain by Connelly Barnes.
Features:
- Translate HTML back and forth to data structures.
This allows you to read and write HTML documents
programmably, with much flexibility.
- Extract and modify URLs in an HTML document.
- Compatible with Python 2.0 - 2.5.
See the L{examples} for a quick start.
"""
__version__ = '1.1.1'
__all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
'urljoin', 'URLMatch']
# Define True and False for Python < 2.2.
import sys
if sys.version_info[:3] < (2, 2, 0):
exec "True = 1; False = 0"
# -------------------------------------------------------------------
# Globals
# -------------------------------------------------------------------
import re
import shlex
import string
import urllib
import urlparse
import types
# Translate text between these strings as plain text (not HTML).
_IGNORE_TAGS = [('script', '/script'),
('style', '/style')]
# Special tags where we have to look for _END_X as part of the
# HTML/XHTML parsing rules.
_BEGIN_COMMENT = ''
_BEGIN_CDATA = ''
# Mime types that can be parsed as HTML or HTML-like.
_HTML_MIMETYPES = ['text/html', 'application/xhtml',
'application/xhtml+xml', 'text/xml',
'application/xml']
# Mime types that can be parsed as CSS.
_CSS_MIMETYPES = ['text/css']
# -------------------------------------------------------------------
# HTML <-> Data structure
# -------------------------------------------------------------------
def tagextract(doc):
"""
Convert HTML to data structure.
Returns a list. HTML tags become C{(name, keyword_dict)} tuples
within the list, while plain text becomes strings within the
list. All tag names are lowercased and stripped of whitespace.
Tags which end with forward slashes have a single forward slash
placed at the end of their name, to indicate that they are XML
unclosed tags.
Example:
>>> tagextract('
foo