diff --git a/scripts/mw2html_audacity/htmldata.py b/scripts/mw2html_audacity/htmldata.py
index c11927dee..f10f8060e 100644
--- a/scripts/mw2html_audacity/htmldata.py
+++ b/scripts/mw2html_audacity/htmldata.py
@@ -11,22 +11,19 @@ Features:
    This allows you to read and write HTML documents
    programmably, with much flexibility.
  - Extract and modify URLs in an HTML document.
- - Compatible with Python 2.0 - 2.5.
+ - Compatible with Python 3+
 
 See the L{examples} for a quick start.
 
+Moved to Python3 by Jack Thomson May 2020
+
 """
 
-__version__ = '1.1.1'
+__version__ = '1.1.2'
 
 __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
            'urljoin', 'URLMatch']
 
-# Define True and False for Python < 2.2.
-import sys
-if sys.version_info[:3] < (2, 2, 0):
-  exec "True = 1; False = 0"
-
 # -------------------------------------------------------------------
 # Globals
 # -------------------------------------------------------------------
@@ -34,8 +31,8 @@ if sys.version_info[:3] < (2, 2, 0):
 import re
 import shlex
 import string
-import urllib
-import urlparse
+import urllib.request, urllib.parse, urllib.error
+import urllib.parse
 import types
 
 # Translate text between these strings as plain text (not HTML).
@@ -164,7 +161,7 @@ def tagjoin(L):
         else:
           rslash = ''
         tag_items = []
-        items = d.items()
+        items = list(d.items())
         items.sort()
         for (key, value) in items:
           if value != None:
@@ -189,7 +186,7 @@ def _enumerate(L):
 
   Returns a list instead of an iterator.
   """
-  return zip(range(len(L)), L)
+  return list(zip(list(range(len(L))), L))
 
 def _ignore_tag_index(s, i):
   """
@@ -261,7 +258,7 @@ def _html_split(s):
         found = False
         in_quot1 = False
         in_quot2 = False
-        for i2 in xrange(i + 1, len(s)):
+        for i2 in range(i + 1, len(s)):
           c2 = s[i2]
           if c2 == '"' and not in_quot1:
             in_quot2 = not in_quot2
@@ -521,7 +518,7 @@ def _test_tag_dict():
   s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
   (a, b, c) = _tag_dict(s)
   assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
-  for key in a.keys():
+  for key in list(a.keys()):
     assert s[b[key][0]:b[key][1]] == key
     if a[key] != None:
       assert s[c[key][0]:c[key][1]] == a[key]
@@ -609,7 +606,7 @@ def _full_tag_extract(s):
       (attrs, key_pos, value_pos) = _tag_dict(dtext)
 
       # Correct offsets in key_pos and value_pos.
-      for key in attrs.keys():
+      for key in list(attrs.keys()):
         key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
                         key_pos[key][1] + Lstart[i] + dtext_offset)
         value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
@@ -720,7 +717,7 @@ _URL_TAGS = ['a href', 'applet archive', 'applet code',
              'script src', 'table background', 'tbody background',
              'td background', 'tfoot background', 'th background',
              'thead background', 'tr background']
-_URL_TAGS = map(lambda s: tuple(s.split()), _URL_TAGS)
+_URL_TAGS = [tuple(s.split()) for s in _URL_TAGS]
 
 
 def _finditer(pattern, string):
@@ -862,7 +859,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
         pass
       else:
         # Current item is a tag.
-        if item.attrs.has_key('style'):
+        if 'style' in item.attrs:
           # Process a stylesheet embedded in the 'style' attribute.
           temp = urlextract(item.attrs['style'], siteurl, 'text/css')
           # Offset indices and add to ans.
@@ -872,7 +869,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
           ans += temp
 
         for (a, b) in _URL_TAGS:
-          if item.name.startswith(a) and b in item.attrs.keys():
+          if item.name.startswith(a) and b in list(item.attrs.keys()):
             # Got one URL.
             url = item.attrs[b]
             # FIXME: Some HTML tag wants a URL list, look up which
@@ -893,7 +890,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
   start_end_map = {}
   filtered_ans = []
   for item in ans:
-    if not start_end_map.has_key((item.start, item.end)):
+    if (item.start, item.end) not in start_end_map:
       start_end_map[(item.start, item.end)] = None
       filtered_ans.append(item)
   return filtered_ans
@@ -1090,7 +1087,7 @@ def examples():
   the offending IP address.
 
   """
-  print examples.__doc__
+  print(examples.__doc__)
 
 class URLMatch:
   """
@@ -1137,7 +1134,7 @@ class URLMatch:
     self.in_css = in_css
 
     if siteurl != None:
-      self.url = urlparse.urljoin(siteurl, self.url)
+      self.url = urllib.parse.urljoin(siteurl, self.url)
 
     self.tag_attr = tag_attr
     self.tag_attrs = tag_attrs
@@ -1154,15 +1151,15 @@ def _cast_to_str(arg, str_class):
   """
   if _is_str(arg):
     return str_class(arg)
-  elif isinstance(arg, types.ListType):
+  elif isinstance(arg, list):
     ans = []
     for item in arg:
       if _is_str(item):
         ans.append(str_class(item))
-      elif isinstance(item, types.TupleType) and len(item) == 2:
+      elif isinstance(item, tuple) and len(item) == 2:
         (a, b) = item
         b_prime = {}
-        for (b_key, b_value) in b.items():
+        for (b_key, b_value) in list(b.items()):
           if b_value is None:
             b_prime[str_class(b_key)] = None
           else:
@@ -1321,7 +1318,7 @@ def _test_tagextract(str_class=str):
   L = _full_tag_extract(s)
   for (i, item) in _enumerate(L):
     if isinstance(item, _HTMLTag):
-      for key in item.attrs.keys():
+      for key in list(item.attrs.keys()):
         assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\
                == key
         if item.attrs[key] != None:
@@ -1460,7 +1457,7 @@ def _test_urlextract(str_class=str):
   base = f('http://www.python.org/~guido/')
   L = urlextract(s, base)
   L2 = [x.url for x in L]
-  assert L2 == [urlparse.urljoin(base, x) for x in ans]
+  assert L2 == [urllib.parse.urljoin(base, x) for x in ans]
 
   # Test urljoin().
   assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1
@@ -1485,17 +1482,6 @@ def _test_urlextract(str_class=str):
   assert L2 == f(['foo', 'a.gif', 'bar.css', 'b.html'])
   assert [s[x.start:x.end] == x.url for x in L].count(False) == 0
 
-def _python_has_unicode():
-  """
-  True iff Python was compiled with unicode().
-  """
-  try:
-    unicode
-    return True
-  except:
-    return False
-
-
 # -------------------------------------------------------------------
 # Unit Test Main Routine
 # -------------------------------------------------------------------
@@ -1504,32 +1490,30 @@ def _test():
   """
   Unit test main routine.
   """
-  print 'Unit tests:'
+  print('Unit tests:')
   _test_remove_comments()
-  print ' _remove_comments: OK'
+  print(' _remove_comments: OK')
   _test_shlex_split()
-  print ' _shlex_split: OK'
+  print(' _shlex_split: OK')
   _test_tag_dict()
-  print ' _tag_dict: OK'
+  print(' _tag_dict: OK')
   _test_tuple_replace()
-  print ' _tuple_replace: OK'
+  print(' _tuple_replace: OK')
   _test_tagextract()
-  print ' tagextract*: OK'
+  print(' tagextract*: OK')
 
-  if _python_has_unicode():
-    _test_tagextract(unicode)
-    print ' tagextract (unicode)*: OK'
+  _test_tagextract(str)
+  print(' tagextract (unicode)*: OK')
 
   _test_urlextract()
-  print ' urlextract*: OK'
+  print(' urlextract*: OK')
 
-  if _python_has_unicode():
-    _test_urlextract(unicode)
-    print ' urlextract (unicode)*: OK'
+  _test_urlextract(str)
+  print(' urlextract (unicode)*: OK')
 
-  print
-  print '* The corresponding join method has been tested as well.'
+  print()
+  print('* The corresponding join method has been tested as well.')
 
 
 if __name__ == '__main__':
diff --git a/scripts/mw2html_audacity/mw2html.py b/scripts/mw2html_audacity/mw2html.py
index 39f327913..95c485260 100644
--- a/scripts/mw2html_audacity/mw2html.py
+++ b/scripts/mw2html_audacity/mw2html.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 
 """
 mw2html - Mediawiki to static HTML
@@ -15,37 +15,34 @@ Improved filtering. Improved usability.
 Customized for Audacity's manual wiki.
 
 Minor tweaks (for Audacity) By James Crook, Nov 2009.
+Moved to Python3 by Jack Thomson, May 2020
 ...
 """
 
-__version__ = '0.1.0.2'
+__version__ = '0.1.0.3'
 
 import re
 import sys
 import getopt
 import random
-import urllib
+import urllib.request, urllib.parse, urllib.error
 import textwrap
-import urlparse
+import urllib.parse
 import os, os.path
+import htmldata
+
 import errno
 import hashlib
-import httplib
-#import pdb
+import http.client
 
 from time import strftime
 from shutil import copyfile
 
-try:
-    set
-except:
-    from sets import Set as set
-
 try:
     import htmldata
 except:
-    print 'Requires Python htmldata module:'
-    print ' http://www.connellybarnes.com/code/htmldata/'
+    print('Requires Python3 htmldata module:')
+    print(' https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
     sys.exit()
 
 
@@ -111,7 +108,7 @@ def get_domain(u):
     url = normalize_url(u)
 
     #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
-    L = list(urlparse.urlparse(url))
+    L = list(urllib.parse.urlparse(url))
 
     return L[1]
 
@@ -135,7 +132,7 @@ def normalize_url(url, lower=True):
 
     url = 'https://' + url
 
-    urlparse.urljoin(config.rooturl, url)
+    urllib.parse.urljoin(config.rooturl, url)
 
     return url
 
@@ -283,7 +280,7 @@ def pos_html_transform(doc, url,filename):
 
     # Add sidebar.html
     if config.sidebar != None and sidebar_html == '':
-        f = open(config.sidebar, 'rU')
+        f = open(config.sidebar, 'r')
         sidebar_html = f.read()
         f.close()
 
@@ -328,7 +325,7 @@ def pos_html_transform(doc, url,filename):
         return doc
 
     if footer_text == '':
-        f = open(config.footer, 'rU')
+        f = open(config.footer, 'r')
        footer_text = f.read()
        f.close()
 
@@ -567,14 +564,14 @@ def url_open(url):
     while redirect != '':
         l_redir += [url]
-        L = urlparse.urlparse(url)
+        L = urllib.parse.urlparse(url)
         if L[1] != domain:
             conn.close()
             if L[1] == '':
                 return(['',''])
-            print "connection to", domain, "closed."
-            conn = httplib.HTTPSConnection(L[1])
+            print("connection to", domain, "closed.")
+            conn = http.client.HTTPSConnection(L[1])
             domain = L[1]
-            print "connection to", domain, "opened."
+            print("connection to", domain, "opened.")
 
         rel_url = url
         pos = url.find(domain)
@@ -593,47 +590,47 @@ def url_open(url):
 
         try:
             conn.request("GET", rel_url,headers=headers)
             r = conn.getresponse()
-            print 'Status', r.status, r.reason, 'accessing', rel_url
+            print('Status', r.status, r.reason, 'accessing', rel_url)
             if r.status == 404:
-                print " it's not possible to recover this error."
+                print(" it's not possible to recover this error.")
                 errors += 1
                 return ('', '')
             if r.status == 500:
-                print " eventually this error might be recovered. let's try again."
-                print ' reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print(" eventually this error might be recovered. let's try again.")
+                print(' reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
                 continue
             if r.status == 403:
-                print " that shouldn't happen, but let's try again anyway."
-                print ' reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print(" that shouldn't happen, but let's try again anyway.")
+                print(' reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
                 continue
             if attempts != 0:
                 recovered = True
             if r.status != 200:
-                print " Status other than 200, 404, 500, 403. It is: ", r.status
+                print(" Status other than 200, 404, 500, 403. It is: ", r.status)
             success = True
-        except httplib.HTTPException, e:
-            print 'ERROR', e.__class__.__name__, 'while retrieving', url
+        except http.client.HTTPException as e:
+            print('ERROR', e.__class__.__name__, 'while retrieving', url)
             conn.close
             if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
-                print "eventually this error might be recovered. let's try again."
-                print 'reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print("eventually this error might be recovered. let's try again.")
+                print('reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
             else:
-                print "it's not possible to recover this error."
+                print("it's not possible to recover this error.")
                 errors += 1
                 return ('', '')
 
         if recovered:
-            print "error recovered"
+            print("error recovered")
 
         if not success:
-            print "it was not possible to recover this error."
+            print("it was not possible to recover this error.")
             errors += 1
             return ('', '')
 
@@ -666,7 +663,7 @@ def url_to_filename(url):
     #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
     turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
     turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
-    L = list(urlparse.urlparse(turl))
+    L = list(urllib.parse.urlparse(turl))
 
     #this way the url will not create a folder outside of the maindomain
     droot = get_domain(config.rooturl)
@@ -697,10 +694,10 @@ def url_to_filename(url):
 
     #don't sanitize / for path
     L[0] = ''
-    L[2] = urllib.quote_plus(L[2],'/')
-    L[3] = urllib.quote_plus(L[3])
-    L[4] = urllib.quote_plus(L[4])
-    L[5] = urllib.quote_plus(L[5])
+    L[2] = urllib.parse.quote_plus(L[2],'/')
+    L[3] = urllib.parse.quote_plus(L[3])
+    L[4] = urllib.parse.quote_plus(L[4])
+    L[5] = urllib.parse.quote_plus(L[5])
 
     # Local filename relative to outdir
     # os.sep - O.S. directory separator
@@ -750,12 +747,11 @@ def url_to_filename(url):
 
     wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
     url_filename_cache[nurl] = ans
-    mode = ['wb', 'w'][mimetype.startswith('text')]
 
     # Make parent directory if it doesn't exist.
     try:
         os.makedirs(os.path.split(ans)[0])
-    except OSError, e:
+    except OSError as e:
         if e.errno != errno.EEXIST:
             raise
 
@@ -765,7 +761,12 @@ def url_to_filename(url):
         out.write('File already exists: ' + str(ans))  #@UndefinedVariable
         sys.exit(1)
 
-    f = open(ans, mode)
+    if mimetype.startswith('text'):
+        f = open(ans, 'w', encoding='utf8')
+        doc = str(doc)
+    else:
+        f = open(ans, 'wb')
+
     f.write(doc)
     f.close()
 
@@ -790,7 +791,7 @@ def url_to_relative(url, cururl):
         L1 = L1[1:]
         L2 = L2[1:]
 
-    rel_url = urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
+    rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
     if rel_url == '':
         return '#'
     else:
@@ -842,28 +843,28 @@ def should_follow(url):
     #if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
     if droot != dn:
         if config.debug:
-            print url, 'not in the same domain'
+            print(url, 'not in the same domain')
         return False
 
     # False if multiple query fields or parameters found
     if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
         if config.debug:
-            print url, 'with multiple query fields'
+            print(url, 'with multiple query fields')
         return False
 
     if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
         if config.debug:
-            print url, 'is a forbidden wiki page'
+            print(url, 'is a forbidden wiki page')
         return False
 
     if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
         if config.debug:
-            print url, 'is a image and you are in no-images mode'
+            print(url, 'is a image and you are in no-images mode')
         return False
 
     if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
         if config.debug:
-            print url, 'is a compressed file'
+            print(url, 'is a compressed file')
         return False
 
 
@@ -874,7 +875,7 @@ def should_follow(url):
         L = nurl.split('/')
         if ('.' not in L[-1]):
             if config.debug:
-                print url, 'is a file outside of scope with unknown extension'
+                print(url, 'is a file outside of scope with unknown extension')
             return False
 
     # JKC: we do allow css from 'strange' places.
@@ -885,7 +886,7 @@ def should_follow(url):
     for fp in forbidden_parents:
         if fp in L[-1]:
             if config.debug:
-                print url, 'is a page outside of scope'
+                print(url, 'is a page outside of scope')
             return False
 
     return True
@@ -921,7 +922,7 @@ def parse_html(doc, url, filename):
         follow = should_follow(u) #and (counter < 10)
         if follow:
             if config.debug:
-                print 'ACCEPTED - ', u
+                print('ACCEPTED - ', u)
             # Store url locally.
             new_urls += [u]
             item.url = url_to_relative(u, url)
@@ -930,7 +931,7 @@ def parse_html(doc, url, filename):
#            if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
#                item.url = ''
             if config.debug:
-                print 'NOT INCLUDED - ', u
+                print('NOT INCLUDED - ', u)
 
     newdoc = htmldata.urljoin(doc, L)
     newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '