Fixes to local-manual script Part 2.

This reinstates the logo and the 303.css stylesheet. The script now mostly works for most of the pages.
2025-09-19 09:30:52 +02:00 · 2017-08-22 22:57:46 +01:00 · 2017-08-22 22:57:46 +01:00 · 74c2af918c
commit 74c2af918c
parent e036c59f15
3 changed files with 2081 additions and 16 deletions
--- a/scripts/mw2html_audacity/303.css
+++ b/scripts/mw2html_audacity/303.css
--- a/scripts/mw2html_audacity/AudacityLogo.png
+++ b/scripts/mw2html_audacity/AudacityLogo.png
--- a/scripts/mw2html_audacity/mw2html.py
+++ b/scripts/mw2html_audacity/mw2html.py
@ -18,7 +18,7 @@ Minor tweaks (for Audacity) By James Crook, Nov 2009.
 ...
 """
-__version__ = '0.1.0.0'
+__version__ = '0.1.0.1'
 import re
 import sys
@ -34,6 +34,7 @@ import hashlib
 import httplib
 #import pdb
 from time import strftime
 from shutil import copyfile
 try:
    set
@ -47,6 +48,8 @@ except:
    print '  http://www.connellybarnes.com/code/htmldata/'
    sys.exit()
 config = None
 MOVE_HREF = 'movehref'
 MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->'
@ -121,6 +124,10 @@ def normalize_url(url, lower=True):
    if url.startswith('http://'):
        url = url[len('http://'):]
    # if url.startswith('https://'):
    #    url = url[len('https://'):]
    if url.startswith('www.'):
        url = url[len('www.'):]
@ -187,17 +194,22 @@ def monobook_fix_html(doc, page_url):
    if config.made_by:
        doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
-    doc = remove_tag(doc, '<div class="portlet" id="p-personal"', '</div>', '<div')
+    # Obsolete substitutions.
-    doc = remove_tag(doc, '<div id="p-search" class="portlet"', '</div>', '<div')
+    # doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
-    doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
+    # doc = remove_tag(doc, '<div id=\'catlinks\' class=\'catlinks catlinks-allhidden\'>', '</div>', '<div')
    doc = remove_tag(doc, '<div id=\'catlinks\' class=\'catlinks catlinks-allhidden\'>', '</div>', '<div')
    #James also remove the page/discussion/source/history/ div.
    doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
    doc = remove_tag(doc, '<div id="p-search" class="portlet"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="portlet" id="p-personal"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="editornote2"', '</div>', '<div')
    doc = remove_tag(doc, '<div id="p-cactions"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="generated-sidebar portlet" id="p-For_Editors"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="generated-sidebar portlet" id="p-ToDo"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="portlet" id="p-tb"', '</div>', '<div')
    #remove javascript.
    doc = remove_tag(doc, '<script', '</script>', '<script')
    #andre special mode
    if config.special_mode:
@ -262,7 +274,7 @@ def pre_html_transform(doc, url):
    return doc
-def pos_html_transform(doc, url):
+def pos_html_transform(doc, url,filename):
    global footer_text, config, sidebar_html
    url = normalize_url(url, False)
@ -272,7 +284,7 @@ def pos_html_transform(doc, url):
        sidebar_html = f.read()
        f.close()
-    doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
+    # doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
    # Remove empty links
    doc = clean_tag(doc, 'href=""', '</a>', '<a ');
@ -284,9 +296,15 @@ def pos_html_transform(doc, url):
        # Remove external javascript
        doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>', r'', doc)
    # Add back relevant stylesheet.
    doc = re.sub(r'</head>', '<link rel="stylesheet" href="../m/skins/monobook/main.css/303.css" media="screen" />\n</head>', doc, flags=re.DOTALL)
    # Add back relevant stylesheet.
    top_level_dir = config.outdir
    if( os.path.dirname(os.path.dirname( filename )) == config.outdir ):
        doc = re.sub(r'</head>', '<link rel="stylesheet" href="m/skins/monobook/main.css/303.css" media="screen" />\n</head>', doc, flags=re.DOTALL)
    else:
        doc = re.sub(r'</head>',
                 '<link rel="stylesheet" href="../m/skins/monobook/main.css/303.css" media="screen" />\n</head>', doc,
                 flags=re.DOTALL)
    # Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls)
    if config.footer is not None:
@ -357,8 +375,8 @@ def html_remove_translation_links(doc):
    The second version deals with links like /pt_PT and /zh_CN
    We are case sensitive, so as not to treat FAQ as a language code.
    """
-    doc = re.sub(r'<a href="[^"]+/[a-z]{2,3}[/"][\s\S]+?</a>', r'<!--Removed Translation Flag-->', doc)
+    doc = re.sub(r'<a href="[^"]+/[a-z]{2,3}[/"][\s\S]+?</a>', r'', doc)
-    doc = re.sub(r'<a href="[^"]+/[a-z]{2}_[A-Z]{2}[/"][\s\S]+?</a>', r'<!--Removed Translation Flag2-->', doc)
+    doc = re.sub(r'<a href="[^"]+/[a-z]{2}_[A-Z]{2}[/"][\s\S]+?</a>', r'', doc)
    return doc
 def monobook_hack_skin_html(doc):
@ -859,12 +877,13 @@ def should_follow(url):
    return True
-def parse_html(doc, url):
+def parse_html(doc, url, filename):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links we want to spider in the HTML.
    """
    global config
    global counter
    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'
@ -886,7 +905,7 @@ def parse_html(doc, url):
    # more pages.
    for item in L:
        u = item.url
-        follow = should_follow(u)
+        follow = should_follow(u) # and (counter < 10)
        if follow:
            if config.debug:
                print 'ACCEPTED   - ', u
@ -904,11 +923,10 @@ def parse_html(doc, url):
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
-    newdoc = pos_html_transform(newdoc, url)
+    newdoc = pos_html_transform(newdoc, url,filename)
    return (newdoc, new_urls)
 def run(out=sys.stdout):
    """
    Code interface.
@ -976,7 +994,7 @@ def run(out=sys.stdout):
        new_urls = []
        if filename.endswith('.html'):
-            (doc, new_urls) = parse_html(doc, url)
+            (doc, new_urls) = parse_html(doc, url, filename)
        elif filename.endswith('.css'):
            (doc, new_urls) = parse_css(doc, url)
@ -1013,6 +1031,26 @@ def run(out=sys.stdout):
    print errors, "errors not recovered"
    src_dir = os.path.dirname(os.path.realpath(__file__))
    src = os.path.join(src_dir, "AudacityLogo.png")
    subfile = r"alphamanual.audacityteam.org\m\resources\assets\AudacityLogo.png"
    dest = os.path.join(config.outdir, subfile)
    print "copying from", src, "to", dest
    directory = os.path.dirname(dest)
    if not os.path.exists(directory):
        os.makedirs(directory)
    copyfile(src,dest)
    src = os.path.join(src_dir, "303.css")
    subfile = r"alphamanual.audacityteam.org\m\skins\monobook\main.css\303.css"
    dest = os.path.join(config.outdir, subfile)
    print "copying from", src, "to", dest
    directory = os.path.dirname(dest)
    if not os.path.exists(directory):
        os.makedirs(directory)
    copyfile(src,dest)
 def usage():
    """
    Print command line options.