
Move mw2html and htmldata to python3

Jack Thomson 2020-05-27 13:48:49 -04:00 committed by James Crook
parent 6541e808be
commit 56f02ce13e
2 changed files with 116 additions and 120 deletions

scripts/mw2html_audacity/htmldata.py

@@ -11,22 +11,19 @@ Features:
 This allows you to read and write HTML documents
 programmably, with much flexibility.
 - Extract and modify URLs in an HTML document.
-- Compatible with Python 2.0 - 2.5.
+- Compatible with Python 3+
 See the L{examples} for a quick start.
+Moved to Python3 by Jack Thomson May 2020
 """
-__version__ = '1.1.1'
+__version__ = '1.1.2'
 __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
            'urljoin', 'URLMatch']
-# Define True and False for Python < 2.2.
-import sys
-if sys.version_info[:3] < (2, 2, 0):
-    exec "True = 1; False = 0"
 # -------------------------------------------------------------------
 # Globals
 # -------------------------------------------------------------------
@@ -34,8 +31,8 @@ if sys.version_info[:3] < (2, 2, 0):
 import re
 import shlex
 import string
-import urllib
-import urlparse
+import urllib.request, urllib.parse, urllib.error
+import urllib.parse
 import types

 # Translate text between these strings as plain text (not HTML).
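Python 3 folds the old urllib and urlparse helpers into the urllib.parse submodule, which is why both imports above now resolve there. A minimal sketch of the calls this module relies on (the URLs are illustrative):

    from urllib.parse import urljoin, quote_plus

    urljoin('https://alphamanual.audacityteam.org/man/', 'Main_Page')
    # -> 'https://alphamanual.audacityteam.org/man/Main_Page'
    quote_plus('index.php?title=Main Page')
    # -> 'index.php%3Ftitle%3DMain+Page'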
@@ -164,7 +161,7 @@ def tagjoin(L):
         else:
             rslash = ''
         tag_items = []
-        items = d.items()
+        items = list(d.items())
         items.sort()
         for (key, value) in items:
             if value != None:
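The list() wrapper is needed because dict.items() returns a view object in Python 3, and views have no sort() method. A short sketch with a hypothetical attribute dict:

    d = {'href': '/quick_help.html', 'class': 'nav'}
    items = list(d.items())   # materialize the view before sorting in place
    items.sort()
    # equivalent and slightly more direct:
    items = sorted(d.items())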
@@ -189,7 +186,7 @@ def _enumerate(L):
     Returns a list instead of an iterator.
     """
-    return zip(range(len(L)), L)
+    return list(zip(list(range(len(L))), L))

 def _ignore_tag_index(s, i):
     """
@@ -261,7 +258,7 @@ def _html_split(s):
         found = False
         in_quot1 = False
         in_quot2 = False
-        for i2 in xrange(i + 1, len(s)):
+        for i2 in range(i + 1, len(s)):
             c2 = s[i2]
             if c2 == '"' and not in_quot1:
                 in_quot2 = not in_quot2
@@ -521,7 +518,7 @@ def _test_tag_dict():
     s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
     (a, b, c) = _tag_dict(s)
     assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
-    for key in a.keys():
+    for key in list(a.keys()):
         assert s[b[key][0]:b[key][1]] == key
         if a[key] != None:
             assert s[c[key][0]:c[key][1]] == a[key]
@@ -609,7 +606,7 @@ def _full_tag_extract(s):
             (attrs, key_pos, value_pos) = _tag_dict(dtext)
             # Correct offsets in key_pos and value_pos.
-            for key in attrs.keys():
+            for key in list(attrs.keys()):
                 key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
                                 key_pos[key][1] + Lstart[i] + dtext_offset)
                 value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
@@ -720,7 +717,7 @@ _URL_TAGS = ['a href', 'applet archive', 'applet code',
              'script src', 'table background', 'tbody background',
              'td background', 'tfoot background', 'th background',
              'thead background', 'tr background']
-_URL_TAGS = map(lambda s: tuple(s.split()), _URL_TAGS)
+_URL_TAGS = [tuple(s.split()) for s in _URL_TAGS]

 def _finditer(pattern, string):
@@ -862,7 +859,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
             pass
         else:
             # Current item is a tag.
-            if item.attrs.has_key('style'):
+            if 'style' in item.attrs:
                 # Process a stylesheet embedded in the 'style' attribute.
                 temp = urlextract(item.attrs['style'], siteurl, 'text/css')
                 # Offset indices and add to ans.
@@ -872,7 +869,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
                 ans += temp
             for (a, b) in _URL_TAGS:
-                if item.name.startswith(a) and b in item.attrs.keys():
+                if item.name.startswith(a) and b in list(item.attrs.keys()):
                     # Got one URL.
                     url = item.attrs[b]
                     # FIXME: Some HTML tag wants a URL list, look up which
@@ -893,7 +890,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
     start_end_map = {}
     filtered_ans = []
     for item in ans:
-        if not start_end_map.has_key((item.start, item.end)):
+        if (item.start, item.end) not in start_end_map:
             start_end_map[(item.start, item.end)] = None
             filtered_ans.append(item)
     return filtered_ans
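dict.has_key() no longer exists in Python 3; the membership operator covers both the 'style' check above and this duplicate-range filter. A sketch with a hypothetical attrs dict:

    attrs = {'style': 'color: red', 'href': '/index.html'}
    'style' in attrs          # replaces attrs.has_key('style')
    'href' in attrs.keys()    # also works, but the plain `in attrs` form is the idiomatic one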
@@ -1090,7 +1087,7 @@ def examples():
     the offending IP address.
     """
-    print examples.__doc__
+    print(examples.__doc__)

 class URLMatch:
     """
@@ -1137,7 +1134,7 @@ class URLMatch:
         self.in_css = in_css
         if siteurl != None:
-            self.url = urlparse.urljoin(siteurl, self.url)
+            self.url = urllib.parse.urljoin(siteurl, self.url)
         self.tag_attr = tag_attr
         self.tag_attrs = tag_attrs
@@ -1154,15 +1151,15 @@ def _cast_to_str(arg, str_class):
     """
     if _is_str(arg):
         return str_class(arg)
-    elif isinstance(arg, types.ListType):
+    elif isinstance(arg, list):
         ans = []
         for item in arg:
             if _is_str(item):
                 ans.append(str_class(item))
-            elif isinstance(item, types.TupleType) and len(item) == 2:
+            elif isinstance(item, tuple) and len(item) == 2:
                 (a, b) = item
                 b_prime = {}
-                for (b_key, b_value) in b.items():
+                for (b_key, b_value) in list(b.items()):
                     if b_value is None:
                         b_prime[str_class(b_key)] = None
                     else:
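types.ListType and types.TupleType were removed in Python 3; isinstance() now takes the built-in types directly. For example:

    isinstance(['a', ('b', {})], list)      # True, replaces isinstance(x, types.ListType)
    isinstance(('key', {'k': 'v'}), tuple)  # True, replaces isinstance(x, types.TupleType)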
@@ -1321,7 +1318,7 @@ def _test_tagextract(str_class=str):
     L = _full_tag_extract(s)
     for (i, item) in _enumerate(L):
         if isinstance(item, _HTMLTag):
-            for key in item.attrs.keys():
+            for key in list(item.attrs.keys()):
                 assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\
                        == key
                 if item.attrs[key] != None:
@@ -1460,7 +1457,7 @@ def _test_urlextract(str_class=str):
     base = f('http://www.python.org/~guido/')
     L = urlextract(s, base)
     L2 = [x.url for x in L]
-    assert L2 == [urlparse.urljoin(base, x) for x in ans]
+    assert L2 == [urllib.parse.urljoin(base, x) for x in ans]

     # Test urljoin().
     assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1
@@ -1485,17 +1482,6 @@ def _test_urlextract(str_class=str):
     assert L2 == f(['foo', 'a.gif', 'bar.css', 'b.html'])
     assert [s[x.start:x.end] == x.url for x in L].count(False) == 0

-def _python_has_unicode():
-    """
-    True iff Python was compiled with unicode().
-    """
-    try:
-        unicode
-        return True
-    except:
-        return False

 # -------------------------------------------------------------------
 # Unit Test Main Routine
 # -------------------------------------------------------------------
@@ -1504,32 +1490,30 @@ def _test():
     """
     Unit test main routine.
     """
-    print 'Unit tests:'
+    print('Unit tests:')
     _test_remove_comments()
-    print ' _remove_comments: OK'
+    print(' _remove_comments: OK')
     _test_shlex_split()
-    print ' _shlex_split: OK'
+    print(' _shlex_split: OK')
     _test_tag_dict()
-    print ' _tag_dict: OK'
+    print(' _tag_dict: OK')
     _test_tuple_replace()
-    print ' _tuple_replace: OK'
+    print(' _tuple_replace: OK')
     _test_tagextract()
-    print ' tagextract*: OK'
+    print(' tagextract*: OK')
-    if _python_has_unicode():
-        _test_tagextract(unicode)
-        print ' tagextract (unicode)*: OK'
+    _test_tagextract(str)
+    print(' tagextract (unicode)*: OK')
     _test_urlextract()
-    print ' urlextract*: OK'
+    print(' urlextract*: OK')
-    if _python_has_unicode():
-        _test_urlextract(unicode)
-        print ' urlextract (unicode)*: OK'
+    _test_urlextract(str)
+    print(' urlextract (unicode)*: OK')
-    print
-    print '* The corresponding join method has been tested as well.'
+    print()
+    print('* The corresponding join method has been tested as well.')

 if __name__ == '__main__':
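Two Python 3 changes drive this hunk: print is a function, and the separate unicode type no longer exists (every str is Unicode), so the _python_has_unicode() guard and the unicode test pass collapse into plain str. A brief illustration:

    print('Unit tests:')              # print is a function; the statement form is a SyntaxError
    print(type(u'caf\u00e9') is str)  # True: the u'' prefix is accepted but just means str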

scripts/mw2html_audacity/mw2html.py

@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 """
 mw2html - Mediawiki to static HTML
@@ -15,37 +15,34 @@ Improved filtering.
 Improved usability.
 Customized for Audacity's manual wiki.
 Minor tweaks (for Audacity) By James Crook, Nov 2009.
+Moved to Python3 by Jack Thomson, May 2020
 ...
 """

-__version__ = '0.1.0.2'
+__version__ = '0.1.0.3'

 import re
 import sys
 import getopt
 import random
-import urllib
+import urllib.request, urllib.parse, urllib.error
 import textwrap
-import urlparse
+import urllib.parse
 import os, os.path
+import htmldata
 import errno
 import hashlib
-import httplib
-#import pdb
+import http.client

 from time import strftime
 from shutil import copyfile

-try:
-    set
-except:
-    from sets import Set as set
 try:
     import htmldata
 except:
-    print 'Requires Python htmldata module:'
-    print '  http://www.connellybarnes.com/code/htmldata/'
+    print('Requires Python3 htmldata module:')
+    print('  https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
     sys.exit()
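This import block maps each Python 2 standard-library module onto its Python 3 home; a summary sketch of the renames used throughout this port:

    # Python 2 module      ->  Python 3 replacement used here
    #   urllib, urlparse   ->  urllib.request / urllib.parse / urllib.error
    #   httplib            ->  http.client
    import urllib.request, urllib.parse, urllib.error
    import http.client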
@@ -111,7 +108,7 @@ def get_domain(u):
     url = normalize_url(u)
     #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
-    L = list(urlparse.urlparse(url))
+    L = list(urllib.parse.urlparse(url))
     return L[1]
@@ -135,7 +132,7 @@ def normalize_url(url, lower=True):
         url = 'https://' + url
-    urlparse.urljoin(config.rooturl, url)
+    urllib.parse.urljoin(config.rooturl, url)
     return url
@@ -283,7 +280,7 @@ def pos_html_transform(doc, url,filename):
     # Add sidebar.html
     if config.sidebar != None and sidebar_html == '':
-        f = open(config.sidebar, 'rU')
+        f = open(config.sidebar, 'r')
         sidebar_html = f.read()
         f.close()
@@ -328,7 +325,7 @@ def pos_html_transform(doc, url,filename):
         return doc

     if footer_text == '':
-        f = open(config.footer, 'rU')
+        f = open(config.footer, 'r')
         footer_text = f.read()
         f.close()
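The 'U' (universal newline) flag is redundant in Python 3: text-mode files already translate '\r\n' and '\r' to '\n' by default, and the flag was later removed from the language entirely. A minimal sketch (the path is illustrative):

    with open('sidebar.html', 'r') as f:   # universal newlines are the default in text mode
        sidebar_html = f.read()            # '\r\n' and '\r' arrive as '\n'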
@@ -567,14 +564,14 @@ def url_open(url):
     while redirect != '':
         l_redir += [url]
-        L = urlparse.urlparse(url)
+        L = urllib.parse.urlparse(url)
         if L[1] != domain:
             conn.close()
             if L[1] == '': return(['',''])
-            print "connection to", domain, "closed."
-            conn = httplib.HTTPSConnection(L[1])
+            print("connection to", domain, "closed.")
+            conn = http.client.HTTPSConnection(L[1])
             domain = L[1]
-            print "connection to", domain, "opened."
+            print("connection to", domain, "opened.")
         rel_url = url
         pos = url.find(domain)
@@ -593,47 +590,47 @@ def url_open(url):
         try:
             conn.request("GET", rel_url,headers=headers)
             r = conn.getresponse()
-            print 'Status', r.status, r.reason, 'accessing', rel_url
+            print('Status', r.status, r.reason, 'accessing', rel_url)
             if r.status == 404:
-                print " it's not possible to recover this error."
+                print(" it's not possible to recover this error.")
                 errors += 1
                 return ('', '')
             if r.status == 500:
-                print " eventually this error might be recovered. let's try again."
-                print ' reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print(" eventually this error might be recovered. let's try again.")
+                print(' reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
                 continue
             if r.status == 403:
-                print " that shouldn't happen, but let's try again anyway."
-                print ' reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print(" that shouldn't happen, but let's try again anyway.")
+                print(' reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
                 continue
             if attempts != 0:
                 recovered = True
             if r.status != 200:
-                print " Status other than 200, 404, 500, 403. It is: ", r.status
+                print(" Status other than 200, 404, 500, 403. It is: ", r.status)
             success = True
-        except httplib.HTTPException, e:
-            print 'ERROR', e.__class__.__name__, 'while retrieving', url
+        except http.client.HTTPException as e:
+            print('ERROR', e.__class__.__name__, 'while retrieving', url)
             conn.close
             if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
-                print "eventually this error might be recovered. let's try again."
-                print 'reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print("eventually this error might be recovered. let's try again.")
+                print('reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
             else:
-                print "it's not possible to recover this error."
+                print("it's not possible to recover this error.")
                 errors += 1
                 return ('', '')
         if recovered:
-            print "error recovered"
+            print("error recovered")
         if not success:
-            print "it was not possible to recover this error."
+            print("it was not possible to recover this error.")
            errors += 1
            return ('', '')
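Besides the httplib to http.client rename, this hunk picks up the mandatory Python 3 exception syntax: `except Exc as e` replaces `except Exc, e`. A condensed sketch of the request/handle pattern, with an illustrative host and path:

    import http.client

    conn = http.client.HTTPSConnection('alphamanual.audacityteam.org')  # illustrative host
    try:
        conn.request('GET', '/man/Main_Page', headers={'User-Agent': 'mw2html'})
        r = conn.getresponse()
        body = r.read()                      # bytes in Python 3; decode if text is needed
    except http.client.HTTPException as e:   # 'except HTTPException, e:' is now a SyntaxError
        print('ERROR', e.__class__.__name__)
    finally:
        conn.close()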
@@ -666,7 +663,7 @@ def url_to_filename(url):
     #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
     turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
     turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
-    L = list(urlparse.urlparse(turl))
+    L = list(urllib.parse.urlparse(turl))

     #this way the url will not create a folder outside of the maindomain
     droot = get_domain(config.rooturl)
@@ -697,10 +694,10 @@ def url_to_filename(url):
     #don't sanitize / for path
     L[0] = ''
-    L[2] = urllib.quote_plus(L[2],'/')
-    L[3] = urllib.quote_plus(L[3])
-    L[4] = urllib.quote_plus(L[4])
-    L[5] = urllib.quote_plus(L[5])
+    L[2] = urllib.parse.quote_plus(L[2],'/')
+    L[3] = urllib.parse.quote_plus(L[3])
+    L[4] = urllib.parse.quote_plus(L[4])
+    L[5] = urllib.parse.quote_plus(L[5])

     # Local filename relative to outdir
     # os.sep - O.S. directory separator
@@ -750,12 +747,11 @@ def url_to_filename(url):
     wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
     url_filename_cache[nurl] = ans

-    mode = ['wb', 'w'][mimetype.startswith('text')]

     # Make parent directory if it doesn't exist.
     try:
         os.makedirs(os.path.split(ans)[0])
-    except OSError, e:
+    except OSError as e:
         if e.errno != errno.EEXIST:
             raise
@@ -765,7 +761,12 @@ def url_to_filename(url):
         out.write('File already exists: ' + str(ans)) #@UndefinedVariable
         sys.exit(1)

-    f = open(ans, mode)
+    if mimetype.startswith('text'):
+        f = open(ans, 'w', encoding='utf8')
+        doc = str(doc)
+    else:
+        f = open(ans, 'wb')
     f.write(doc)
     f.close()
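With bytes and str strictly separated in Python 3, a single open mode no longer fits every payload: text pages need a text-mode handle with an explicit encoding, while images and archives must stay binary. A sketch of the same split; it decodes instead of calling str(doc), which is one way to avoid the repr artifacts that parse_html cleans up later (an alternative sketch, not the patch's exact code):

    if mimetype.startswith('text'):
        with open(ans, 'w', encoding='utf8') as f:
            f.write(doc if isinstance(doc, str) else doc.decode('utf8'))
    else:
        with open(ans, 'wb') as f:    # images, archives and other binary payloads
            f.write(doc)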
@@ -790,7 +791,7 @@ def url_to_relative(url, cururl):
         L1 = L1[1:]
         L2 = L2[1:]

-    rel_url = urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
+    rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
     if rel_url == '':
         return '#'
     else:
@@ -842,28 +843,28 @@ def should_follow(url):
     #if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
     if droot != dn:
         if config.debug:
-            print url, 'not in the same domain'
+            print(url, 'not in the same domain')
         return False

     # False if multiple query fields or parameters found
     if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
         if config.debug:
-            print url, 'with multiple query fields'
+            print(url, 'with multiple query fields')
         return False

     if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
         if config.debug:
-            print url, 'is a forbidden wiki page'
+            print(url, 'is a forbidden wiki page')
         return False

     if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
         if config.debug:
-            print url, 'is a image and you are in no-images mode'
+            print(url, 'is a image and you are in no-images mode')
         return False

     if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
         if config.debug:
-            print url, 'is a compressed file'
+            print(url, 'is a compressed file')
         return False
@@ -874,7 +875,7 @@ def should_follow(url):
     L = nurl.split('/')
     if ('.' not in L[-1]):
         if config.debug:
-            print url, 'is a file outside of scope with unknown extension'
+            print(url, 'is a file outside of scope with unknown extension')
         return False

     # JKC: we do allow css from 'strange' places.
@@ -885,7 +886,7 @@ def should_follow(url):
     for fp in forbidden_parents:
         if fp in L[-1]:
             if config.debug:
-                print url, 'is a page outside of scope'
+                print(url, 'is a page outside of scope')
             return False

     return True
@@ -921,7 +922,7 @@ def parse_html(doc, url, filename):
         follow = should_follow(u) #and (counter < 10)
         if follow:
             if config.debug:
-                print 'ACCEPTED - ', u
+                print('ACCEPTED - ', u)
             # Store url locally.
             new_urls += [u]
             item.url = url_to_relative(u, url)
@@ -930,7 +931,7 @@ def parse_html(doc, url, filename):
             # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
             #     item.url = ''
             if config.debug:
-                print 'NOT INCLUDED - ', u
+                print('NOT INCLUDED - ', u)

     newdoc = htmldata.urljoin(doc, L)
     newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
@@ -938,13 +939,19 @@ def parse_html(doc, url, filename):
     newdoc = pos_html_transform(newdoc, url,filename)

+    # Remove byte artifacts in string
+    newdoc = newdoc.replace('\\n','\n')
+    newdoc = newdoc.replace('\\t', '\t')
+    newdoc = newdoc.strip('b')
+    newdoc = newdoc.strip('')
+
     return (newdoc, new_urls)

 def deploy_file( src, dest ):
     src_dir = os.path.dirname(os.path.realpath(__file__))
     src = os.path.join(src_dir, src)
     dest = os.path.join(config.outdir, dest)
-    print "copying from", src, "to", dest
+    print("copying from", src, "to", dest)
     directory = os.path.dirname(dest)
     if not os.path.exists(directory):
         os.makedirs(directory)
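The cleanup block added above exists because calling str() on a bytes object in Python 3 produces its repr rather than decoded text, leaving a leading b, quotes, and literal backslash escapes in the document. For example:

    raw = b'<p>Audacity Manual</p>\n'
    str(raw)             # "b'<p>Audacity Manual</p>\\n'"  (repr text: stray b, quotes, escaped \n)
    raw.decode('utf-8')  # '<p>Audacity Manual</p>\n'       (the actual document text)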
@@ -957,7 +964,7 @@ def run(out=sys.stdout):
     """
     global conn, domain, counter, redir_cache, config, headers

-    if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
+    if urllib.parse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
         out.write('Please do not use robots with the Wikipedia site.\n')
         out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
         out.write('your local installation. See the Mediawiki site for more information.\n')
@@ -971,8 +978,8 @@ def run(out=sys.stdout):
         sys.exit(1)

     domain = get_domain(config.rooturl)
-    conn = httplib.HTTPSConnection(domain)
-    print 'connection established to:', domain
+    conn = http.client.HTTPSConnection(domain)
+    print('connection established to:', domain)

     complete = set()
     pending = set([config.rooturl])
@@ -986,7 +993,7 @@ def run(out=sys.stdout):
         if nurl in complete:
             if config.debug:
-                print url, 'already processed'
+                print(url, 'already processed')
             continue
         complete.add(nurl)
@@ -997,7 +1004,7 @@ def run(out=sys.stdout):
         if start:
             start = False
             aux_url = ''
-            for redir in redir_cache.iterkeys():
+            for redir in redir_cache.keys():
                 aux_url = normalize_url(redir)
                 url_filename_cache[aux_url] = filename
                 if aux_url not in complete:
@@ -1009,10 +1016,16 @@ def run(out=sys.stdout):
                     continue

         if not os.path.exists(filename):
-            print "ERROR: ", url, '\n'
+            print("ERROR: ", url, '\n')
             continue

-        f = open(filename, 'r')
+        # These formats are encoded as text. Everything else is read as bytes
+        text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
+        if not filename.endswith(text_ext):
+            f = open(filename, 'rb')
+        else:
+            f = open(filename, 'r')
         doc = f.read()
         f.close()
         new_urls = []
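Reading follows the same text/bytes split as writing, and str.endswith() accepts a tuple, so one call classifies a filename against every text suffix:

    text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
    mode = 'r' if filename.endswith(text_ext) else 'rb'
    with open(filename, mode) as f:
        doc = f.read()   # str in text mode, bytes in binary mode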
@@ -1025,7 +1038,6 @@ def run(out=sys.stdout):
         # Save document changes to disk
         # The unmodified file already exists on disk.
         update = False
-        text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
         for ext in text_ext:
             if filename.endswith(ext):
                 update = True
@@ -1049,10 +1061,10 @@ def run(out=sys.stdout):
                     pending.add(u)

     conn.close()
-    print "connection to", domain, "closed."
+    print("connection to", domain, "closed.")
     out.write(str(n) + ' files saved\n')
-    print counter, "httplib requests done"
-    print errors, "errors not recovered"
+    print(counter, "httplib requests done")
+    print(errors, "errors not recovered")

     # use / not \ so as to work on both windows and mac.
     deploy_file( "AudacityLogo.png", r"alphamanual.audacityteam.org/m/resources/assets/AudacityLogo.png")
@@ -1118,7 +1130,7 @@ def usage():
     """
-    print textwrap.dedent(usage_str.strip('\n'))
+    print(textwrap.dedent(usage_str.strip('\n')))
     sys.exit(1)