diff --git a/scripts/mw2html_audacity/mw2html.py b/scripts/mw2html_audacity/mw2html.py
index 1f042988f..01a6a06ff 100644
--- a/scripts/mw2html_audacity/mw2html.py
+++ b/scripts/mw2html_audacity/mw2html.py
@@ -249,6 +249,8 @@ def pre_html_transform(doc, url):
doc = fix_move_href_tags(doc)
if config.remove_history:
doc = html_remove_image_history(doc)
+
+ doc = html_remove_translation_links(doc)
return doc
@@ -335,6 +337,16 @@ def html_remove_image_history(doc):
doc = re.sub(r'
Image links
[\s\S]+?', r'', doc)
return doc
+def html_remove_translation_links(doc):
+ """
+ Remove translation links (the international flags).
+ We identify them by a two-letter language code segment in the URL,
+ i.e. the pattern /[\s\S][\s\S][/"].
+ """
+ doc = re.sub(r'', r'', doc)
+ doc = re.sub(r'', r'', doc)
+ return doc
+
def monobook_hack_skin_html(doc):
"""
Hacks Monobook HTML output: use CSS ids for hacked skin.
@@ -624,6 +636,10 @@ def url_to_filename(url):
if L[4].startswith('title=') and L[2].endswith('index.php'):
L[4] = L[4][len('title='):]
L[2] = L[2][:-len('index.php')]
+
+ if lpath[-1]=='man':
+ L[2] += '/' + INDEX_HTML
+
L[2] = L[2].strip('/')
@@ -933,15 +949,8 @@ def run(out=sys.stdout):
elif filename.endswith('.css'):
(doc, new_urls) = parse_css(doc, url)
- # Enqueue URLs that we haven't yet spidered.
- for u in new_urls:
- if normalize_url(u) not in complete:
- # Strip off any #section link.
- if '#' in u:
- u = u[:u.index('#')]
- pending.add(u)
-
# Save document changes to disk
+ # The unmodified file already exists on disk.
update = False
text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
for ext in text_ext:
@@ -957,6 +966,14 @@ def run(out=sys.stdout):
if config.debug:
out.write(url + '\n => ' + filename + '\n\n')
n += 1
+
+ # Enqueue URLs that we haven't yet spidered.
+ for u in new_urls:
+ if normalize_url(u) not in complete:
+ # Strip off any #section link.
+ if '#' in u:
+ u = u[:u.index('#')]
+ pending.add(u)
conn.close()
print "connection to", domain, "closed."