diff --git a/scripts/mw2html_audacity/htmldata.py b/scripts/mw2html_audacity/htmldata.py
index c11927dee..f10f8060e 100644
--- a/scripts/mw2html_audacity/htmldata.py
+++ b/scripts/mw2html_audacity/htmldata.py
@@ -11,22 +11,19 @@ Features:
This allows you to read and write HTML documents
programmably, with much flexibility.
- Extract and modify URLs in an HTML document.
- - Compatible with Python 2.0 - 2.5.
+ - Compatible with Python 3 and later.
See the L{examples} for a quick start.
+Moved to Python 3 by Jack Thomson, May 2020.
+
"""
-__version__ = '1.1.1'
+__version__ = '1.1.2'
__all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
'urljoin', 'URLMatch']
-# Define True and False for Python < 2.2.
-import sys
-if sys.version_info[:3] < (2, 2, 0):
- exec "True = 1; False = 0"
-
# -------------------------------------------------------------------
# Globals
# -------------------------------------------------------------------
@@ -34,8 +31,8 @@ if sys.version_info[:3] < (2, 2, 0):
import re
import shlex
import string
-import urllib
-import urlparse
+import urllib.request, urllib.parse, urllib.error
+import urllib.parse
import types
# Translate text between these strings as plain text (not HTML).
@@ -164,7 +161,7 @@ def tagjoin(L):
else:
rslash = ''
tag_items = []
- items = d.items()
+ items = list(d.items())
items.sort()
for (key, value) in items:
if value != None:
@@ -189,7 +186,7 @@ def _enumerate(L):
Returns a list instead of an iterator.
"""
- return zip(range(len(L)), L)
+ return list(zip(list(range(len(L))), L))
def _ignore_tag_index(s, i):
"""
@@ -261,7 +258,7 @@ def _html_split(s):
found = False
in_quot1 = False
in_quot2 = False
- for i2 in xrange(i + 1, len(s)):
+ for i2 in range(i + 1, len(s)):
c2 = s[i2]
if c2 == '"' and not in_quot1:
in_quot2 = not in_quot2
@@ -521,7 +518,7 @@ def _test_tag_dict():
s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
(a, b, c) = _tag_dict(s)
assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
- for key in a.keys():
+ for key in list(a.keys()):
assert s[b[key][0]:b[key][1]] == key
if a[key] != None:
assert s[c[key][0]:c[key][1]] == a[key]
@@ -609,7 +606,7 @@ def _full_tag_extract(s):
(attrs, key_pos, value_pos) = _tag_dict(dtext)
# Correct offsets in key_pos and value_pos.
- for key in attrs.keys():
+ for key in list(attrs.keys()):
key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
key_pos[key][1] + Lstart[i] + dtext_offset)
value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
@@ -720,7 +717,7 @@ _URL_TAGS = ['a href', 'applet archive', 'applet code',
'script src', 'table background', 'tbody background',
'td background', 'tfoot background', 'th background',
'thead background', 'tr background']
-_URL_TAGS = map(lambda s: tuple(s.split()), _URL_TAGS)
+_URL_TAGS = [tuple(s.split()) for s in _URL_TAGS]
def _finditer(pattern, string):
@@ -862,7 +859,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
pass
else:
# Current item is a tag.
- if item.attrs.has_key('style'):
+ if 'style' in item.attrs:
# Process a stylesheet embedded in the 'style' attribute.
temp = urlextract(item.attrs['style'], siteurl, 'text/css')
# Offset indices and add to ans.
@@ -872,7 +869,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
ans += temp
for (a, b) in _URL_TAGS:
- if item.name.startswith(a) and b in item.attrs.keys():
+ if item.name.startswith(a) and b in list(item.attrs.keys()):
# Got one URL.
url = item.attrs[b]
# FIXME: Some HTML tag wants a URL list, look up which
@@ -893,7 +890,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
start_end_map = {}
filtered_ans = []
for item in ans:
- if not start_end_map.has_key((item.start, item.end)):
+ if (item.start, item.end) not in start_end_map:
start_end_map[(item.start, item.end)] = None
filtered_ans.append(item)
return filtered_ans
@@ -1090,7 +1087,7 @@ def examples():
the offending IP address.
"""
- print examples.__doc__
+ print(examples.__doc__)
class URLMatch:
"""
@@ -1137,7 +1134,7 @@ class URLMatch:
self.in_css = in_css
if siteurl != None:
- self.url = urlparse.urljoin(siteurl, self.url)
+ self.url = urllib.parse.urljoin(siteurl, self.url)
self.tag_attr = tag_attr
self.tag_attrs = tag_attrs
@@ -1154,15 +1151,15 @@ def _cast_to_str(arg, str_class):
"""
if _is_str(arg):
return str_class(arg)
- elif isinstance(arg, types.ListType):
+ elif isinstance(arg, list):
ans = []
for item in arg:
if _is_str(item):
ans.append(str_class(item))
- elif isinstance(item, types.TupleType) and len(item) == 2:
+ elif isinstance(item, tuple) and len(item) == 2:
(a, b) = item
b_prime = {}
- for (b_key, b_value) in b.items():
+ for (b_key, b_value) in list(b.items()):
if b_value is None:
b_prime[str_class(b_key)] = None
else:
@@ -1321,7 +1318,7 @@ def _test_tagextract(str_class=str):
L = _full_tag_extract(s)
for (i, item) in _enumerate(L):
if isinstance(item, _HTMLTag):
- for key in item.attrs.keys():
+ for key in list(item.attrs.keys()):
assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\
== key
if item.attrs[key] != None:
@@ -1460,7 +1457,7 @@ def _test_urlextract(str_class=str):
base = f('http://www.python.org/~guido/')
L = urlextract(s, base)
L2 = [x.url for x in L]
- assert L2 == [urlparse.urljoin(base, x) for x in ans]
+ assert L2 == [urllib.parse.urljoin(base, x) for x in ans]
# Test urljoin().
assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1
@@ -1485,17 +1482,6 @@ def _test_urlextract(str_class=str):
assert L2 == f(['foo', 'a.gif', 'bar.css', 'b.html'])
assert [s[x.start:x.end] == x.url for x in L].count(False) == 0
-def _python_has_unicode():
- """
- True iff Python was compiled with unicode().
- """
- try:
- unicode
- return True
- except:
- return False
-
-
# -------------------------------------------------------------------
# Unit Test Main Routine
# -------------------------------------------------------------------
@@ -1504,32 +1490,30 @@ def _test():
"""
Unit test main routine.
"""
- print 'Unit tests:'
+ print('Unit tests:')
_test_remove_comments()
- print ' _remove_comments: OK'
+ print(' _remove_comments: OK')
_test_shlex_split()
- print ' _shlex_split: OK'
+ print(' _shlex_split: OK')
_test_tag_dict()
- print ' _tag_dict: OK'
+ print(' _tag_dict: OK')
_test_tuple_replace()
- print ' _tuple_replace: OK'
+ print(' _tuple_replace: OK')
_test_tagextract()
- print ' tagextract*: OK'
+ print(' tagextract*: OK')
- if _python_has_unicode():
- _test_tagextract(unicode)
- print ' tagextract (unicode)*: OK'
+ _test_tagextract(str)
+ print(' tagextract (unicode)*: OK')
_test_urlextract()
- print ' urlextract*: OK'
+ print(' urlextract*: OK')
- if _python_has_unicode():
- _test_urlextract(unicode)
- print ' urlextract (unicode)*: OK'
+ _test_urlextract(str)
+ print(' urlextract (unicode)*: OK')
- print
- print '* The corresponding join method has been tested as well.'
+ print()
+ print('* The corresponding join method has been tested as well.')
if __name__ == '__main__':
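
For reference, a minimal sketch of how the ported API is driven under Python 3, mirroring the round trips asserted by _test_urlextract above; the sample HTML and base URL are invented for illustration:

import htmldata

doc = '<a href="quick_help.html">Quick Help</a> <a href="faq.html">FAQ</a>'
base = 'https://example.org/man/'

for match in htmldata.urlextract(doc, base):
    # match.url has been joined against base; match.start/match.end index
    # the original attribute value inside doc.
    print(match.url, doc[match.start:match.end])

# Without a siteurl the URLs are left untouched, so rejoining the matches
# reproduces the original document, as the unit tests assert.
assert htmldata.urljoin(doc, htmldata.urlextract(doc)) == doc
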
diff --git a/scripts/mw2html_audacity/mw2html.py b/scripts/mw2html_audacity/mw2html.py
index 39f327913..95c485260 100644
--- a/scripts/mw2html_audacity/mw2html.py
+++ b/scripts/mw2html_audacity/mw2html.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
"""
mw2html - Mediawiki to static HTML
@@ -15,37 +15,34 @@ Improved filtering.
Improved usability.
Customized for Audacity's manual wiki.
Minor tweaks (for Audacity) By James Crook, Nov 2009.
+Moved to Python 3 by Jack Thomson, May 2020.
...
"""
-__version__ = '0.1.0.2'
+__version__ = '0.1.0.3'
import re
import sys
import getopt
import random
-import urllib
+import urllib.request, urllib.parse, urllib.error
import textwrap
-import urlparse
+import urllib.parse
import os, os.path
import errno
import hashlib
-import httplib
-#import pdb
+import http.client
from time import strftime
from shutil import copyfile
-try:
- set
-except:
- from sets import Set as set
-
try:
import htmldata
except:
- print 'Requires Python htmldata module:'
- print ' http://www.connellybarnes.com/code/htmldata/'
+    print('Requires the Python 3 htmldata module:')
+ print(' https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
sys.exit()
@@ -111,7 +108,7 @@ def get_domain(u):
url = normalize_url(u)
#ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
- L = list(urlparse.urlparse(url))
+ L = list(urllib.parse.urlparse(url))
return L[1]
@@ -135,7 +132,7 @@ def normalize_url(url, lower=True):
url = 'https://' + url
- urlparse.urljoin(config.rooturl, url)
+ urllib.parse.urljoin(config.rooturl, url)
return url
@@ -283,7 +280,7 @@ def pos_html_transform(doc, url,filename):
# Add sidebar.html
if config.sidebar != None and sidebar_html == '':
- f = open(config.sidebar, 'rU')
+ f = open(config.sidebar, 'r')
sidebar_html = f.read()
f.close()
@@ -328,7 +325,7 @@ def pos_html_transform(doc, url,filename):
return doc
if footer_text == '':
- f = open(config.footer, 'rU')
+ f = open(config.footer, 'r')
footer_text = f.read()
f.close()
@@ -567,14 +564,14 @@ def url_open(url):
while redirect != '':
l_redir += [url]
- L = urlparse.urlparse(url)
+ L = urllib.parse.urlparse(url)
if L[1] != domain:
conn.close()
if L[1] == '': return(['',''])
- print "connection to", domain, "closed."
- conn = httplib.HTTPSConnection(L[1])
+ print("connection to", domain, "closed.")
+ conn = http.client.HTTPSConnection(L[1])
domain = L[1]
- print "connection to", domain, "opened."
+ print("connection to", domain, "opened.")
rel_url = url
pos = url.find(domain)
@@ -593,47 +590,47 @@ def url_open(url):
try:
conn.request("GET", rel_url,headers=headers)
r = conn.getresponse()
- print 'Status', r.status, r.reason, 'accessing', rel_url
+ print('Status', r.status, r.reason, 'accessing', rel_url)
if r.status == 404:
- print " it's not possible to recover this error."
+ print(" it's not possible to recover this error.")
errors += 1
return ('', '')
if r.status == 500:
- print " eventually this error might be recovered. let's try again."
- print ' reconnecting...'
- conn = httplib.HTTPSConnection(domain)
+            print(" this error may be recoverable. let's try again.")
+ print(' reconnecting...')
+ conn = http.client.HTTPSConnection(domain)
attempts += 1
continue
if r.status == 403:
- print " that shouldn't happen, but let's try again anyway."
- print ' reconnecting...'
- conn = httplib.HTTPSConnection(domain)
+ print(" that shouldn't happen, but let's try again anyway.")
+ print(' reconnecting...')
+ conn = http.client.HTTPSConnection(domain)
attempts += 1
continue
if attempts != 0:
recovered = True
if r.status != 200:
- print " Status other than 200, 404, 500, 403. It is: ", r.status
+ print(" Status other than 200, 404, 500, 403. It is: ", r.status)
success = True
- except httplib.HTTPException, e:
- print 'ERROR', e.__class__.__name__, 'while retrieving', url
+ except http.client.HTTPException as e:
+ print('ERROR', e.__class__.__name__, 'while retrieving', url)
conn.close
if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
- print "eventually this error might be recovered. let's try again."
- print 'reconnecting...'
- conn = httplib.HTTPSConnection(domain)
+            print("this error may be recoverable. let's try again.")
+ print('reconnecting...')
+ conn = http.client.HTTPSConnection(domain)
attempts += 1
else:
- print "it's not possible to recover this error."
+ print("it's not possible to recover this error.")
errors += 1
return ('', '')
if recovered:
- print "error recovered"
+ print("error recovered")
if not success:
- print "it was not possible to recover this error."
+ print("it was not possible to recover this error.")
errors += 1
return ('', '')
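
Condensed, the retry policy ported to http.client above looks roughly like the sketch below: reconnect and retry on 500/403 responses and on recoverable HTTPException errors, give up otherwise. The helper name and attempt limit are placeholders, not part of the patch:

import http.client

def fetch(domain, rel_url, headers, max_attempts=5):
    conn = http.client.HTTPSConnection(domain)
    for _ in range(max_attempts):
        try:
            conn.request("GET", rel_url, headers=headers)
            r = conn.getresponse()
            if r.status in (500, 403):
                conn = http.client.HTTPSConnection(domain)  # reconnect and retry
                continue
            if r.status == 404:
                return None                                 # unrecoverable
            return r.read()
        except http.client.HTTPException:
            # Simplified: url_open() only reconnects for specific subclasses
            # (BadStatusLine, IncompleteRead, ...) and gives up on the rest.
            conn = http.client.HTTPSConnection(domain)
    return None
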
@@ -666,7 +663,7 @@ def url_to_filename(url):
#ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
- L = list(urlparse.urlparse(turl))
+ L = list(urllib.parse.urlparse(turl))
#this way the url will not create a folder outside of the maindomain
droot = get_domain(config.rooturl)
@@ -697,10 +694,10 @@ def url_to_filename(url):
#don't sanitize / for path
L[0] = ''
- L[2] = urllib.quote_plus(L[2],'/')
- L[3] = urllib.quote_plus(L[3])
- L[4] = urllib.quote_plus(L[4])
- L[5] = urllib.quote_plus(L[5])
+ L[2] = urllib.parse.quote_plus(L[2],'/')
+ L[3] = urllib.parse.quote_plus(L[3])
+ L[4] = urllib.parse.quote_plus(L[4])
+ L[5] = urllib.parse.quote_plus(L[5])
# Local filename relative to outdir
# os.sep - O.S. directory separator
@@ -750,12 +747,11 @@ def url_to_filename(url):
wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
url_filename_cache[nurl] = ans
- mode = ['wb', 'w'][mimetype.startswith('text')]
# Make parent directory if it doesn't exist.
try:
os.makedirs(os.path.split(ans)[0])
- except OSError, e:
+ except OSError as e:
if e.errno != errno.EEXIST:
raise
@@ -765,7 +761,12 @@ def url_to_filename(url):
out.write('File already exists: ' + str(ans)) #@UndefinedVariable
sys.exit(1)
- f = open(ans, mode)
+ if mimetype.startswith('text'):
+ f = open(ans, 'w', encoding='utf8')
+ doc = str(doc)
+ else:
+ f = open(ans, 'wb')
+
f.write(doc)
f.close()
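
Pulled out as a standalone helper for clarity, the text/binary write split introduced in this hunk amounts to the following; the helper name is invented, and the patch keeps the logic inline:

def write_doc(path, doc, mimetype):
    # Text documents go through a utf-8 text handle; everything else
    # (images, archives) is written verbatim as bytes.
    if mimetype.startswith('text'):
        # str(doc) assumes doc already arrives as text here; a bytes
        # payload would need doc.decode('utf8') instead.
        with open(path, 'w', encoding='utf8') as f:
            f.write(str(doc))
    else:
        with open(path, 'wb') as f:
            f.write(doc)
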
@@ -790,7 +791,7 @@ def url_to_relative(url, cururl):
L1 = L1[1:]
L2 = L2[1:]
- rel_url = urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
+ rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
if rel_url == '':
return '#'
else:
@@ -842,28 +843,28 @@ def should_follow(url):
#if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
if droot != dn:
if config.debug:
- print url, 'not in the same domain'
+ print(url, 'not in the same domain')
return False
# False if multiple query fields or parameters found
if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
if config.debug:
- print url, 'with multiple query fields'
+ print(url, 'with multiple query fields')
return False
if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
if config.debug:
- print url, 'is a forbidden wiki page'
+ print(url, 'is a forbidden wiki page')
return False
if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
if config.debug:
- print url, 'is a image and you are in no-images mode'
+            print(url, 'is an image and you are in no-images mode')
return False
if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
if config.debug:
- print url, 'is a compressed file'
+ print(url, 'is a compressed file')
return False
@@ -874,7 +875,7 @@ def should_follow(url):
L = nurl.split('/')
if ('.' not in L[-1]):
if config.debug:
- print url, 'is a file outside of scope with unknown extension'
+ print(url, 'is a file outside of scope with unknown extension')
return False
# JKC: we do allow css from 'strange' places.
@@ -885,7 +886,7 @@ def should_follow(url):
for fp in forbidden_parents:
if fp in L[-1]:
if config.debug:
- print url, 'is a page outside of scope'
+ print(url, 'is a page outside of scope')
return False
return True
@@ -921,7 +922,7 @@ def parse_html(doc, url, filename):
follow = should_follow(u) #and (counter < 10)
if follow:
if config.debug:
- print 'ACCEPTED - ', u
+ print('ACCEPTED - ', u)
# Store url locally.
new_urls += [u]
item.url = url_to_relative(u, url)
@@ -930,7 +931,7 @@ def parse_html(doc, url, filename):
# if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
# item.url = ''
if config.debug:
- print 'NOT INCLUDED - ', u
+ print('NOT INCLUDED - ', u)
newdoc = htmldata.urljoin(doc, L)
newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '