
Move mw2html and htmldata to python3

Authored by Jack Thomson on 2020-05-27 13:48:49 -04:00; committed by James Crook
parent 6541e808be
commit 56f02ce13e
2 changed files with 116 additions and 120 deletions

scripts/mw2html_audacity/htmldata.py

@ -11,22 +11,19 @@ Features:
This allows you to read and write HTML documents
programmably, with much flexibility.
- Extract and modify URLs in an HTML document.
- Compatible with Python 2.0 - 2.5.
- Compatible with Python 3+
See the L{examples} for a quick start.
Moved to Python3 by Jack Thomson May 2020
"""
__version__ = '1.1.1'
__version__ = '1.1.2'
__all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
'urljoin', 'URLMatch']
# Define True and False for Python < 2.2.
import sys
if sys.version_info[:3] < (2, 2, 0):
exec "True = 1; False = 0"
# -------------------------------------------------------------------
# Globals
# -------------------------------------------------------------------
@ -34,8 +31,8 @@ if sys.version_info[:3] < (2, 2, 0):
import re
import shlex
import string
import urllib
import urlparse
import urllib.request, urllib.parse, urllib.error
import urllib.parse
import types
# Translate text between these strings as plain text (not HTML).
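
The replaced imports follow the Python 3 standard-library reshuffle: the old urllib and urlparse modules were merged into the urllib package. A minimal sketch of the equivalents this module relies on (the URLs are placeholders, not taken from the diff):

import urllib.parse

base = 'http://example.com/docs/'                # placeholder URL
print(urllib.parse.urljoin(base, 'page.html'))   # was urlparse.urljoin
print(urllib.parse.urlparse(base).netloc)        # was urlparse.urlparse -> 'example.com'
print(urllib.parse.quote_plus('a b&c'))          # was urllib.quote_plus -> 'a+b%26c'
# urllib.urlopen lives on as urllib.request.urlopen
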
@ -164,7 +161,7 @@ def tagjoin(L):
else:
rslash = ''
tag_items = []
items = d.items()
items = list(d.items())
items.sort()
for (key, value) in items:
if value != None:
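
The list() wrapper is needed because dict.items() returns a view in Python 3, and views have no sort() method. A tiny illustration with an invented attribute dict; sorted() is an equivalent one-liner:

d = {'href': 'a.html', 'class': 'nav'}    # hypothetical tag attributes

items = list(d.items())   # view -> list, so .sort() works as it did in Python 2
items.sort()
assert items == sorted(d.items())
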
@ -189,7 +186,7 @@ def _enumerate(L):
Returns a list instead of an iterator.
"""
return zip(range(len(L)), L)
return list(zip(list(range(len(L))), L))
def _ignore_tag_index(s, i):
"""
@ -261,7 +258,7 @@ def _html_split(s):
found = False
in_quot1 = False
in_quot2 = False
for i2 in xrange(i + 1, len(s)):
for i2 in range(i + 1, len(s)):
c2 = s[i2]
if c2 == '"' and not in_quot1:
in_quot2 = not in_quot2
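
xrange() is gone in Python 3; range() is already lazy, and zip() and map() likewise return iterators instead of lists. That is why the surrounding hunks wrap them in list() or swap map(lambda ...) for a comprehension. A short illustration with made-up values:

s = 'abc'
assert list(range(1, len(s))) == [1, 2]                                # range() replaces xrange()
assert list(zip(range(len(s)), s)) == [(0, 'a'), (1, 'b'), (2, 'c')]   # zip() is lazy

tags = ['a href', 'img src']
assert [tuple(t.split()) for t in tags] == list(map(lambda t: tuple(t.split()), tags))
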
@ -521,7 +518,7 @@ def _test_tag_dict():
s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
(a, b, c) = _tag_dict(s)
assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
for key in a.keys():
for key in list(a.keys()):
assert s[b[key][0]:b[key][1]] == key
if a[key] != None:
assert s[c[key][0]:c[key][1]] == a[key]
@ -609,7 +606,7 @@ def _full_tag_extract(s):
(attrs, key_pos, value_pos) = _tag_dict(dtext)
# Correct offsets in key_pos and value_pos.
for key in attrs.keys():
for key in list(attrs.keys()):
key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
key_pos[key][1] + Lstart[i] + dtext_offset)
value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
@ -720,7 +717,7 @@ _URL_TAGS = ['a href', 'applet archive', 'applet code',
'script src', 'table background', 'tbody background',
'td background', 'tfoot background', 'th background',
'thead background', 'tr background']
_URL_TAGS = map(lambda s: tuple(s.split()), _URL_TAGS)
_URL_TAGS = [tuple(s.split()) for s in _URL_TAGS]
def _finditer(pattern, string):
@ -862,7 +859,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
pass
else:
# Current item is a tag.
if item.attrs.has_key('style'):
if 'style' in item.attrs:
# Process a stylesheet embedded in the 'style' attribute.
temp = urlextract(item.attrs['style'], siteurl, 'text/css')
# Offset indices and add to ans.
@ -872,7 +869,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
ans += temp
for (a, b) in _URL_TAGS:
if item.name.startswith(a) and b in item.attrs.keys():
if item.name.startswith(a) and b in list(item.attrs.keys()):
# Got one URL.
url = item.attrs[b]
# FIXME: Some HTML tag wants a URL list, look up which
@ -893,7 +890,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
start_end_map = {}
filtered_ans = []
for item in ans:
if not start_end_map.has_key((item.start, item.end)):
if (item.start, item.end) not in start_end_map:
start_end_map[(item.start, item.end)] = None
filtered_ans.append(item)
return filtered_ans
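
dict.has_key() was removed in Python 3; membership is spelled with the in operator, as the hunks above now do. A sketch with an invented attrs dict:

attrs = {'style': 'color: red', 'href': 'a.html'}   # hypothetical values
assert 'style' in attrs                             # replaces attrs.has_key('style')
assert 'background' not in attrs
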
@ -1090,7 +1087,7 @@ def examples():
the offending IP address.
"""
print examples.__doc__
print(examples.__doc__)
class URLMatch:
"""
@ -1137,7 +1134,7 @@ class URLMatch:
self.in_css = in_css
if siteurl != None:
self.url = urlparse.urljoin(siteurl, self.url)
self.url = urllib.parse.urljoin(siteurl, self.url)
self.tag_attr = tag_attr
self.tag_attrs = tag_attrs
@ -1154,15 +1151,15 @@ def _cast_to_str(arg, str_class):
"""
if _is_str(arg):
return str_class(arg)
elif isinstance(arg, types.ListType):
elif isinstance(arg, list):
ans = []
for item in arg:
if _is_str(item):
ans.append(str_class(item))
elif isinstance(item, types.TupleType) and len(item) == 2:
elif isinstance(item, tuple) and len(item) == 2:
(a, b) = item
b_prime = {}
for (b_key, b_value) in b.items():
for (b_key, b_value) in list(b.items()):
if b_value is None:
b_prime[str_class(b_key)] = None
else:
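
types.ListType and types.TupleType no longer exist in Python 3's types module, so isinstance() is used with the built-in list and tuple instead. Illustration only, not code from the commit:

def classify(arg):
    # mirrors the isinstance() checks in _cast_to_str, with hypothetical inputs
    if isinstance(arg, list):
        return 'list'
    if isinstance(arg, tuple) and len(arg) == 2:
        return 'pair'
    return 'other'

assert classify(['a', 'b']) == 'list'
assert classify(('key', 'value')) == 'pair'
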
@ -1321,7 +1318,7 @@ def _test_tagextract(str_class=str):
L = _full_tag_extract(s)
for (i, item) in _enumerate(L):
if isinstance(item, _HTMLTag):
for key in item.attrs.keys():
for key in list(item.attrs.keys()):
assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\
== key
if item.attrs[key] != None:
@ -1460,7 +1457,7 @@ def _test_urlextract(str_class=str):
base = f('http://www.python.org/~guido/')
L = urlextract(s, base)
L2 = [x.url for x in L]
assert L2 == [urlparse.urljoin(base, x) for x in ans]
assert L2 == [urllib.parse.urljoin(base, x) for x in ans]
# Test urljoin().
assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1
@ -1485,17 +1482,6 @@ def _test_urlextract(str_class=str):
assert L2 == f(['foo', 'a.gif', 'bar.css', 'b.html'])
assert [s[x.start:x.end] == x.url for x in L].count(False) == 0
def _python_has_unicode():
"""
True iff Python was compiled with unicode().
"""
try:
unicode
return True
except:
return False
# -------------------------------------------------------------------
# Unit Test Main Routine
# -------------------------------------------------------------------
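
The deleted _python_has_unicode() helper has no purpose in Python 3: the unicode builtin is gone and str is always a Unicode type, which is why the test runner below now simply calls the tests with str. A short reminder of the str/bytes split this assumes:

text = 'héllo'                 # str is Unicode text in Python 3
data = text.encode('utf-8')    # bytes
assert isinstance(text, str) and isinstance(data, bytes)
assert data.decode('utf-8') == text
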
@ -1504,32 +1490,30 @@ def _test():
"""
Unit test main routine.
"""
print 'Unit tests:'
print('Unit tests:')
_test_remove_comments()
print ' _remove_comments: OK'
print(' _remove_comments: OK')
_test_shlex_split()
print ' _shlex_split: OK'
print(' _shlex_split: OK')
_test_tag_dict()
print ' _tag_dict: OK'
print(' _tag_dict: OK')
_test_tuple_replace()
print ' _tuple_replace: OK'
print(' _tuple_replace: OK')
_test_tagextract()
print ' tagextract*: OK'
print(' tagextract*: OK')
if _python_has_unicode():
_test_tagextract(unicode)
print ' tagextract (unicode)*: OK'
_test_tagextract(str)
print(' tagextract (unicode)*: OK')
_test_urlextract()
print ' urlextract*: OK'
print(' urlextract*: OK')
if _python_has_unicode():
_test_urlextract(unicode)
print ' urlextract (unicode)*: OK'
_test_urlextract(str)
print(' urlextract (unicode)*: OK')
print
print '* The corresponding join method has been tested as well.'
print()
print('* The corresponding join method has been tested as well.')
if __name__ == '__main__':

scripts/mw2html_audacity/mw2html.py

@ -1,4 +1,4 @@
#! /usr/bin/env python
#! /usr/bin/env python3
"""
mw2html - Mediawiki to static HTML
@ -15,37 +15,34 @@ Improved filtering.
Improved usability.
Customized for Audacity's manual wiki.
Minor tweaks (for Audacity) By James Crook, Nov 2009.
Moved to Python3 by Jack Thomson, May 2020
...
"""
__version__ = '0.1.0.2'
__version__ = '0.1.0.3'
import re
import sys
import getopt
import random
import urllib
import urllib.request, urllib.parse, urllib.error
import textwrap
import urlparse
import urllib.parse
import os, os.path
import htmldata
import errno
import hashlib
import httplib
#import pdb
import http.client
from time import strftime
from shutil import copyfile
try:
set
except:
from sets import Set as set
try:
import htmldata
except:
print 'Requires Python htmldata module:'
print ' http://www.connellybarnes.com/code/htmldata/'
print('Requires Python3 htmldata module:')
print(' https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
sys.exit()
@ -111,7 +108,7 @@ def get_domain(u):
url = normalize_url(u)
#ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
L = list(urlparse.urlparse(url))
L = list(urllib.parse.urlparse(url))
return L[1]
@ -135,7 +132,7 @@ def normalize_url(url, lower=True):
url = 'https://' + url
urlparse.urljoin(config.rooturl, url)
urllib.parse.urljoin(config.rooturl, url)
return url
@ -283,7 +280,7 @@ def pos_html_transform(doc, url,filename):
# Add sidebar.html
if config.sidebar != None and sidebar_html == '':
f = open(config.sidebar, 'rU')
f = open(config.sidebar, 'r')
sidebar_html = f.read()
f.close()
@ -328,7 +325,7 @@ def pos_html_transform(doc, url,filename):
return doc
if footer_text == '':
f = open(config.footer, 'rU')
f = open(config.footer, 'r')
footer_text = f.read()
f.close()
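
The 'rU' mode disappears because universal-newline handling is the default for text-mode files in Python 3 (the 'U' flag was deprecated and later removed). A self-contained sketch using a temporary file rather than the sidebar/footer paths from the commit:

import os
import tempfile

with tempfile.NamedTemporaryFile('w', newline='\r\n', suffix='.html', delete=False) as tmp:
    tmp.write('line one\nline two\n')        # written to disk with CRLF endings

with open(tmp.name, 'r') as f:               # plain 'r': newlines come back as '\n'
    assert f.read() == 'line one\nline two\n'
os.remove(tmp.name)
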
@ -567,14 +564,14 @@ def url_open(url):
while redirect != '':
l_redir += [url]
L = urlparse.urlparse(url)
L = urllib.parse.urlparse(url)
if L[1] != domain:
conn.close()
if L[1] == '': return(['',''])
print "connection to", domain, "closed."
conn = httplib.HTTPSConnection(L[1])
print("connection to", domain, "closed.")
conn = http.client.HTTPSConnection(L[1])
domain = L[1]
print "connection to", domain, "opened."
print("connection to", domain, "opened.")
rel_url = url
pos = url.find(domain)
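
httplib was renamed to http.client in Python 3; the connection API itself is unchanged. A standalone sketch of the request pattern url_open() uses, with a placeholder host that is not taken from the commit:

import http.client

conn = http.client.HTTPSConnection('www.example.com')      # placeholder host
conn.request('GET', '/', headers={'User-Agent': 'mw2html'})
r = conn.getresponse()
print('Status', r.status, r.reason)
body = r.read()        # note: bytes, not str, in Python 3
conn.close()
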
@ -593,47 +590,47 @@ def url_open(url):
try:
conn.request("GET", rel_url,headers=headers)
r = conn.getresponse()
print 'Status', r.status, r.reason, 'accessing', rel_url
print('Status', r.status, r.reason, 'accessing', rel_url)
if r.status == 404:
print " it's not possible to recover this error."
print(" it's not possible to recover this error.")
errors += 1
return ('', '')
if r.status == 500:
print " eventually this error might be recovered. let's try again."
print ' reconnecting...'
conn = httplib.HTTPSConnection(domain)
print(" eventually this error might be recovered. let's try again.")
print(' reconnecting...')
conn = http.client.HTTPSConnection(domain)
attempts += 1
continue
if r.status == 403:
print " that shouldn't happen, but let's try again anyway."
print ' reconnecting...'
conn = httplib.HTTPSConnection(domain)
print(" that shouldn't happen, but let's try again anyway.")
print(' reconnecting...')
conn = http.client.HTTPSConnection(domain)
attempts += 1
continue
if attempts != 0:
recovered = True
if r.status != 200:
print " Status other than 200, 404, 500, 403. It is: ", r.status
print(" Status other than 200, 404, 500, 403. It is: ", r.status)
success = True
except httplib.HTTPException, e:
print 'ERROR', e.__class__.__name__, 'while retrieving', url
except http.client.HTTPException as e:
print('ERROR', e.__class__.__name__, 'while retrieving', url)
conn.close
if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
print "eventually this error might be recovered. let's try again."
print 'reconnecting...'
conn = httplib.HTTPSConnection(domain)
print("eventually this error might be recovered. let's try again.")
print('reconnecting...')
conn = http.client.HTTPSConnection(domain)
attempts += 1
else:
print "it's not possible to recover this error."
print("it's not possible to recover this error.")
errors += 1
return ('', '')
if recovered:
print "error recovered"
print("error recovered")
if not success:
print "it was not possible to recover this error."
print("it was not possible to recover this error.")
errors += 1
return ('', '')
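
The old except httplib.HTTPException, e: form is a syntax error in Python 3; the only accepted spelling is except ... as e:. A compact illustration using the same exception hierarchy:

import http.client

try:
    raise http.client.IncompleteRead(b'partial')      # contrived trigger
except http.client.HTTPException as e:                # Python 3 binds the exception with 'as'
    print('ERROR', e.__class__.__name__)
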
@ -666,7 +663,7 @@ def url_to_filename(url):
#ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
L = list(urlparse.urlparse(turl))
L = list(urllib.parse.urlparse(turl))
#this way the url will not create a folder outside of the maindomain
droot = get_domain(config.rooturl)
@ -697,10 +694,10 @@ def url_to_filename(url):
#don't sanitize / for path
L[0] = ''
L[2] = urllib.quote_plus(L[2],'/')
L[3] = urllib.quote_plus(L[3])
L[4] = urllib.quote_plus(L[4])
L[5] = urllib.quote_plus(L[5])
L[2] = urllib.parse.quote_plus(L[2],'/')
L[3] = urllib.parse.quote_plus(L[3])
L[4] = urllib.parse.quote_plus(L[4])
L[5] = urllib.parse.quote_plus(L[5])
# Local filename relative to outdir
# os.sep - O.S. directory separator
@ -750,12 +747,11 @@ def url_to_filename(url):
wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
url_filename_cache[nurl] = ans
mode = ['wb', 'w'][mimetype.startswith('text')]
# Make parent directory if it doesn't exist.
try:
os.makedirs(os.path.split(ans)[0])
except OSError, e:
except OSError as e:
if e.errno != errno.EEXIST:
raise
@ -765,7 +761,12 @@ def url_to_filename(url):
out.write('File already exists: ' + str(ans)) #@UndefinedVariable
sys.exit(1)
f = open(ans, mode)
if mimetype.startswith('text'):
f = open(ans, 'w', encoding='utf8')
doc = str(doc)
else:
f = open(ans, 'wb')
f.write(doc)
f.close()
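
http.client hands back bytes, so Python 3 forces a choice at write time: text files want a str plus an encoding, binary files want raw bytes, which is the split the new branch above introduces. A self-contained sketch of the same idea with invented content; it decodes the bytes rather than calling str() on them, purely as an illustrative alternative:

def save(path, payload, mimetype):
    # hypothetical helper mirroring the text/binary split in the diff
    if mimetype.startswith('text'):
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8', errors='replace')
        with open(path, 'w', encoding='utf8') as f:
            f.write(payload)
    else:
        with open(path, 'wb') as f:
            f.write(payload)

save('page.html', b'<html>hi</html>', 'text/html')
save('logo.png', b'\x89PNG\r\n', 'image/png')
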
@ -790,7 +791,7 @@ def url_to_relative(url, cururl):
L1 = L1[1:]
L2 = L2[1:]
rel_url = urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
if rel_url == '':
return '#'
else:
@ -842,28 +843,28 @@ def should_follow(url):
#if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
if droot != dn:
if config.debug:
print url, 'not in the same domain'
print(url, 'not in the same domain')
return False
# False if multiple query fields or parameters found
if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
if config.debug:
print url, 'with multiple query fields'
print(url, 'with multiple query fields')
return False
if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
if config.debug:
print url, 'is a forbidden wiki page'
print(url, 'is a forbidden wiki page')
return False
if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
if config.debug:
print url, 'is a image and you are in no-images mode'
print(url, 'is a image and you are in no-images mode')
return False
if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
if config.debug:
print url, 'is a compressed file'
print(url, 'is a compressed file')
return False
@ -874,7 +875,7 @@ def should_follow(url):
L = nurl.split('/')
if ('.' not in L[-1]):
if config.debug:
print url, 'is a file outside of scope with unknown extension'
print(url, 'is a file outside of scope with unknown extension')
return False
# JKC: we do allow css from 'strange' places.
@ -885,7 +886,7 @@ def should_follow(url):
for fp in forbidden_parents:
if fp in L[-1]:
if config.debug:
print url, 'is a page outside of scope'
print(url, 'is a page outside of scope')
return False
return True
@ -921,7 +922,7 @@ def parse_html(doc, url, filename):
follow = should_follow(u) #and (counter < 10)
if follow:
if config.debug:
print 'ACCEPTED - ', u
print('ACCEPTED - ', u)
# Store url locally.
new_urls += [u]
item.url = url_to_relative(u, url)
@ -930,7 +931,7 @@ def parse_html(doc, url, filename):
# if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
# item.url = ''
if config.debug:
print 'NOT INCLUDED - ', u
print('NOT INCLUDED - ', u)
newdoc = htmldata.urljoin(doc, L)
newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
@ -938,13 +939,19 @@ def parse_html(doc, url, filename):
newdoc = pos_html_transform(newdoc, url,filename)
# Remove byte artifacts in string
newdoc = newdoc.replace('\\n','\n')
newdoc = newdoc.replace('\\t', '\t')
newdoc = newdoc.strip('b')
newdoc = newdoc.strip('')
return (newdoc, new_urls)
def deploy_file( src, dest ):
src_dir = os.path.dirname(os.path.realpath(__file__))
src = os.path.join(src_dir, src)
dest = os.path.join(config.outdir, dest)
print "copying from", src, "to", dest
print("copying from", src, "to", dest)
directory = os.path.dirname(dest)
if not os.path.exists(directory):
os.makedirs(directory)
@ -957,7 +964,7 @@ def run(out=sys.stdout):
"""
global conn, domain, counter, redir_cache, config, headers
if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
if urllib.parse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
out.write('Please do not use robots with the Wikipedia site.\n')
out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
out.write('your local installation. See the Mediawiki site for more information.\n')
@ -971,8 +978,8 @@ def run(out=sys.stdout):
sys.exit(1)
domain = get_domain(config.rooturl)
conn = httplib.HTTPSConnection(domain)
print 'connection established to:', domain
conn = http.client.HTTPSConnection(domain)
print('connection established to:', domain)
complete = set()
pending = set([config.rooturl])
@ -986,7 +993,7 @@ def run(out=sys.stdout):
if nurl in complete:
if config.debug:
print url, 'already processed'
print(url, 'already processed')
continue
complete.add(nurl)
@ -997,7 +1004,7 @@ def run(out=sys.stdout):
if start:
start = False
aux_url = ''
for redir in redir_cache.iterkeys():
for redir in redir_cache.keys():
aux_url = normalize_url(redir)
url_filename_cache[aux_url] = filename
if aux_url not in complete:
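
dict.iterkeys() no longer exists in Python 3; iterating over .keys(), or over the dict itself, already yields keys lazily. Illustration with a made-up redirect cache:

redir_cache = {'http://old/a': 'http://new/a'}    # hypothetical entries
for redir in redir_cache.keys():                  # or simply: for redir in redir_cache
    print(redir)
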
@ -1009,10 +1016,16 @@ def run(out=sys.stdout):
continue
if not os.path.exists(filename):
print "ERROR: ", url, '\n'
print("ERROR: ", url, '\n')
continue
f = open(filename, 'r')
# These formats are encoded as text. Everything else is read as bytes
text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
if not filename.endswith(text_ext):
f = open(filename, 'rb')
else:
f = open(filename, 'r')
doc = f.read()
f.close()
new_urls = []
@ -1025,7 +1038,6 @@ def run(out=sys.stdout):
# Save document changes to disk
# The unmodified file already exists on disk.
update = False
text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
for ext in text_ext:
if filename.endswith(ext):
update = True
@ -1049,10 +1061,10 @@ def run(out=sys.stdout):
pending.add(u)
conn.close()
print "connection to", domain, "closed."
print("connection to", domain, "closed.")
out.write(str(n) + ' files saved\n')
print counter, "httplib requests done"
print errors, "errors not recovered"
print(counter, "httplib requests done")
print(errors, "errors not recovered")
# use / not \ so as to work on both windows and mac.
deploy_file( "AudacityLogo.png", r"alphamanual.audacityteam.org/m/resources/assets/AudacityLogo.png")
@ -1118,7 +1130,7 @@ def usage():
"""
print textwrap.dedent(usage_str.strip('\n'))
print(textwrap.dedent(usage_str.strip('\n')))
sys.exit(1)