diff --git a/scripts/mw2html_audacity/mw2html.py b/scripts/mw2html_audacity/mw2html.py index f97c0d927..15e6222be 100644 --- a/scripts/mw2html_audacity/mw2html.py +++ b/scripts/mw2html_audacity/mw2html.py @@ -51,6 +51,7 @@ config = None MOVE_HREF = 'movehref' MADE_BY_COMMENT = '' INDEX_HTML = 'index.html' +QHELP_HTML = 'quick_help.html' url_filename_cache = {} redir_cache = {} wrote_file_set = set() @@ -235,7 +236,6 @@ def pre_html_transform(doc, url): modified HTML document. """ global config - new_urls = [] if config.hack_skin: if config.skin == MONOBOOK_SKIN: @@ -292,7 +292,7 @@ def pos_html_transform(doc, url): f.close() # add static dump time - footer_html = footer_text.replace('%DATE%', strftime("%Y-%m-%d %H:%M:%S")) + footer_html = footer_text.replace('%DATE%', strftime("%Y-%m-%d")) # add online url footer_html = footer_html.replace('%ONLINEURL%', url) @@ -618,7 +618,9 @@ def url_to_filename(url): return url_filename_cache[nurl] #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='') - L = list(urlparse.urlparse(nurl)) + turl = re.sub(r'm/index.php\?title=', r'man/', nurl) + turl = re.sub(r'.css&[\S\s]+', r'.css', turl) + L = list(urlparse.urlparse(turl)) #this way the url will not create a folder outside of the maindomain droot = get_domain(config.rooturl) @@ -639,6 +641,10 @@ def url_to_filename(url): if lpath[-1]=='man': L[2] = INDEX_HTML + if lpath[-1].lower().startswith( 'quick_help'): + L[2] = QHELP_HTML + L[3] = '' + L[2] = L[2].strip('/') @@ -824,6 +830,10 @@ def should_follow(url): print url, 'is a file outside of scope with unknown extension' return False + # JKC: we do allow css from 'strange' places. + if '.css' in L[-1]: + return True + forbidden_parents = ['.php', '.html', '.htm'] for fp in forbidden_parents: if fp in L[-1]: @@ -852,7 +862,11 @@ def parse_html(doc, url): doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') - + + # in this code we change each absolute url in L + # into a relative one. + # we also kick-off zillions of subthreads to collect + # more pages. for item in L: u = item.url follow = should_follow(u) @@ -868,7 +882,7 @@ def parse_html(doc, url): # item.url = '' if config.debug: print 'NOT INCLUDED - ', u - + newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '')