Fixes to local-manual script Part 2.

This reinstates the logo and the 303.css stylesheet. The script is now mostly working for most of the pages.
2026-01-09 14:17:10 +01:00 · 2017-08-22 22:57:46 +01:00
parent e036c59f15
commit 74c2af918c
3 changed files with 2081 additions and 16 deletions
--- a/scripts/mw2html_audacity/303.css
+++ b/scripts/mw2html_audacity/303.css
--- a/scripts/mw2html_audacity/AudacityLogo.png
+++ b/scripts/mw2html_audacity/AudacityLogo.png
--- a/scripts/mw2html_audacity/mw2html.py
+++ b/scripts/mw2html_audacity/mw2html.py
@@ -18,7 +18,7 @@ Minor tweaks (for Audacity) By James Crook, Nov 2009.
 ...
 """

-__version__ = '0.1.0.0'
+__version__ = '0.1.0.1'

 import re
 import sys
@@ -34,6 +34,7 @@ import hashlib
 import httplib
 #import pdb
 from time import strftime
+from shutil import copyfile

 try:
    set
@@ -47,6 +48,8 @@ except:
    print '  http://www.connellybarnes.com/code/htmldata/'
    sys.exit()

+
+
 config = None
 MOVE_HREF = 'movehref'
 MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->'
@@ -121,6 +124,10 @@ def normalize_url(url, lower=True):

    if url.startswith('http://'):
        url = url[len('http://'):]
+
+    # if url.startswith('https://'):
+    #    url = url[len('https://'):]
+
    if url.startswith('www.'):
        url = url[len('www.'):]

@@ -187,17 +194,22 @@ def monobook_fix_html(doc, page_url):
    if config.made_by:
        doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')

-    doc = remove_tag(doc, '<div class="portlet" id="p-personal"', '</div>', '<div')
-    doc = remove_tag(doc, '<div id="p-search" class="portlet"', '</div>', '<div')
-    doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
-    doc = remove_tag(doc, '<div id=\'catlinks\' class=\'catlinks catlinks-allhidden\'>', '</div>', '<div')
+    # Obsolete substitutions.
+    # doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
+    # doc = remove_tag(doc, '<div id=\'catlinks\' class=\'catlinks catlinks-allhidden\'>', '</div>', '<div')
    #James also remove the page/discussion/source/history/ div.
    doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
+    doc = remove_tag(doc, '<div id="p-search" class="portlet"', '</div>', '<div')
+    doc = remove_tag(doc, '<div class="portlet" id="p-personal"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="editornote2"', '</div>', '<div')
    doc = remove_tag(doc, '<div id="p-cactions"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="generated-sidebar portlet" id="p-For_Editors"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="generated-sidebar portlet" id="p-ToDo"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="portlet" id="p-tb"', '</div>', '<div')
+    #remove javascript.
+    doc = remove_tag(doc, '<script', '</script>', '<script')
+
+

    #andre special mode
    if config.special_mode:
@@ -262,7 +274,7 @@ def pre_html_transform(doc, url):

    return doc

-def pos_html_transform(doc, url):
+def pos_html_transform(doc, url,filename):
    global footer_text, config, sidebar_html
    url = normalize_url(url, False)

@@ -272,7 +284,7 @@ def pos_html_transform(doc, url):
        sidebar_html = f.read()
        f.close()

-    doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
+    # doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)

    # Remove empty links
    doc = clean_tag(doc, 'href=""', '</a>', '<a ');
@@ -284,9 +296,15 @@ def pos_html_transform(doc, url):
        # Remove external javascript
        doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>', r'', doc)

-    # Add back relevant stylesheet.
-    doc = re.sub(r'</head>', '<link rel="stylesheet" href="../m/skins/monobook/main.css/303.css" media="screen" />\n</head>', doc, flags=re.DOTALL)

+    # Add back relevant stylesheet.
+    top_level_dir = config.outdir
+    if( os.path.dirname(os.path.dirname( filename )) == config.outdir ):
+        doc = re.sub(r'</head>', '<link rel="stylesheet" href="m/skins/monobook/main.css/303.css" media="screen" />\n</head>', doc, flags=re.DOTALL)
+    else:
+        doc = re.sub(r'</head>',
+                 '<link rel="stylesheet" href="../m/skins/monobook/main.css/303.css" media="screen" />\n</head>', doc,
+                 flags=re.DOTALL)

    # Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls
    if config.footer is not None:
@@ -357,8 +375,8 @@ def html_remove_translation_links(doc):
    The second version deals with links like /pt_PT and /zh_CN
    We are case sensitive, so as not to treat FAQ as a language code.
    """
-    doc = re.sub(r'<a href="[^"]+/[a-z]{2,3}[/"][\s\S]+?</a>', r'<!--Removed Translation Flag-->', doc)
-    doc = re.sub(r'<a href="[^"]+/[a-z]{2}_[A-Z]{2}[/"][\s\S]+?</a>', r'<!--Removed Translation Flag2-->', doc)
+    doc = re.sub(r'<a href="[^"]+/[a-z]{2,3}[/"][\s\S]+?</a>', r'', doc)
+    doc = re.sub(r'<a href="[^"]+/[a-z]{2}_[A-Z]{2}[/"][\s\S]+?</a>', r'', doc)
    return doc

 def monobook_hack_skin_html(doc):
@@ -859,12 +877,13 @@ def should_follow(url):

    return True

-def parse_html(doc, url):
+def parse_html(doc, url, filename):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links we want to spider in the HTML.
    """
    global config
+    global counter

    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'
@@ -886,7 +905,7 @@ def parse_html(doc, url):
    # more pages.
    for item in L:
        u = item.url
-        follow = should_follow(u)
+        follow = should_follow(u) # and (counter < 10)
        if follow:
            if config.debug:
                print 'ACCEPTED   - ', u
@@ -904,11 +923,10 @@ def parse_html(doc, url):
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')

-    newdoc = pos_html_transform(newdoc, url)
+    newdoc = pos_html_transform(newdoc, url,filename)

    return (newdoc, new_urls)

-
 def run(out=sys.stdout):
    """
    Code interface.
@@ -976,7 +994,7 @@ def run(out=sys.stdout):
        new_urls = []

        if filename.endswith('.html'):
-            (doc, new_urls) = parse_html(doc, url)
+            (doc, new_urls) = parse_html(doc, url, filename)
        elif filename.endswith('.css'):
            (doc, new_urls) = parse_css(doc, url)

@@ -1013,6 +1031,26 @@ def run(out=sys.stdout):
    print errors, "errors not recovered"


+    src_dir = os.path.dirname(os.path.realpath(__file__))
+    src = os.path.join(src_dir, "AudacityLogo.png")
+    subfile = r"alphamanual.audacityteam.org\m\resources\assets\AudacityLogo.png"
+    dest = os.path.join(config.outdir, subfile)
+    print "copying from", src, "to", dest
+    directory = os.path.dirname(dest)
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+    copyfile(src,dest)
+    src = os.path.join(src_dir, "303.css")
+    subfile = r"alphamanual.audacityteam.org\m\skins\monobook\main.css\303.css"
+    dest = os.path.join(config.outdir, subfile)
+    print "copying from", src, "to", dest
+    directory = os.path.dirname(dest)
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+    copyfile(src,dest)
+
+
+
 def usage():
    """
    Print command line options.