#! /usr/bin/env python """ mw2html - Mediawiki to static HTML I use this to create a personal website from a local mediawiki installation. No search functionality. Hacks the Monobook skin and the produced HTML. Connelly Barnes 2005. Public domain. Reworked by Andre Pinto 2009. Improved performance. Improved filtering. Improved usability. Customized for Audacity's manual wiki. Minor tweaks (for Audacity) By James Crook, Nov 2009. ... """ __version__ = '0.1.0.0' import re import sys import getopt import random import urllib import textwrap import urlparse import os, os.path import errno import hashlib import httplib #import pdb from time import strftime try: set except: from sets import Set as set try: import htmldata except: print 'Requires Python htmldata module:' print ' http://www.connellybarnes.com/code/htmldata/' sys.exit() config = None MOVE_HREF = 'movehref' MADE_BY_COMMENT = '' INDEX_HTML = 'index.html' QHELP_HTML = 'quick_help.html' url_filename_cache = {} redir_cache = {} wrote_file_set = set() sidebar_html = '' footer_text = '' counter = 0 errors = 0 conn = None headers = {"User-Agent": "mw2html.py/Audacity"} domain = '' MONOBOOK_SKIN = 'monobook' # Constant identifier for Monobook. class Config: """ Instances contain all options passed at the command line. """ def __init__(self, rooturl, outdir, flatten=True, index=None, clean=True, sidebar=None, hack_skin=True, made_by=True, overwrite=False, footer=None, skin=MONOBOOK_SKIN, move_href=True, remove_png=True, remove_history=True, limit_parent=False, special_mode=False, debug=False, no_images=False): self.rooturl = rooturl self.outdir = os.path.abspath(outdir) self.flatten = flatten self.index = index self.clean = clean self.sidebar = sidebar self.hack_skin = hack_skin self.made_by = made_by self.overwrite = overwrite self.footer = footer self.skin = skin self.move_href = move_href if self.sidebar is not None: self.sidebar = os.path.abspath(self.sidebar) if self.footer is not None: self.footer = os.path.abspath(self.footer) self.remove_png = remove_png self.remove_history = remove_history self.limit_parent = limit_parent self.special_mode = special_mode self.debug = debug self.no_images = no_images def get_domain(u): """ Get domain of URL. """ url = normalize_url(u) #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='') L = list(urlparse.urlparse(url)) return L[1] def normalize_url(url, lower=True): # url normalization - only for local comparison operations, use original url for online requests url = split_section(url)[0] if lower: url = url.lower() if url.startswith('http://'): url = url[len('http://'):] if url.startswith('www.'): url = url[len('www.'):] url = url.strip('/') url = 'http://' + url urlparse.urljoin(config.rooturl, url) return url def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0): # find tag limits - start_string must be an unique identifier within doc i1 = doc.find(filter_string, start_point) if i1 == -1: return (-1, -1) aux = doc.rfind(start_tag, start_point, i1 + len(filter_string)) # we've found the filter_string but it has not the start_tag, so we return a different value # telling the script to keep searching starting on the end of the filter_string found if aux == -1: return (-2, i1 + len(filter_string)) i1 = aux sdiv = i1 ediv = i1 + len(start_tag) while(sdiv < ediv and sdiv != -1): sdiv = doc.find(start_tag, sdiv + len(start_tag)) ediv = doc.find(end_tag , ediv + len(end_tag)) return (i1, ediv) def clean_tag(doc, filter_string, end_tag, start_tag): #clean tagged text function start_point = 0 while True: (start1, start2) = find_tag_limits(doc, filter_string, end_tag, start_tag, start_point) if start1 == -1 or start2 == -1: return doc if start1 == -2: start_point = start2 continue end1 = doc.find('>', start1) + 1; end2 = start2 + len(end_tag); doc = doc[:start1] + doc[end1:start2] + doc[end2:] def remove_tag(doc, start_string, end_tag, start_tag): #remove tagged text function while True: (i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag) if i1 == -1 or i2 == -1: return doc doc = doc[:i1] + doc[i2 + len(end_tag):] def monobook_fix_html(doc, page_url): """ Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output. """ global config if config.made_by: doc = doc.replace('', '', '