Move mw2html and htmldata to python3

authored by Jack Thomson on 2020-05-27 13:48:49 -04:00
committed by James Crook
parent 6541e808be
commit 56f02ce13e
2 changed files with 116 additions and 120 deletions

@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 """
 mw2html - Mediawiki to static HTML
@@ -15,37 +15,34 @@ Improved filtering.
 Improved usability.
 Customized for Audacity's manual wiki.
 Minor tweaks (for Audacity) By James Crook, Nov 2009.
+Moved to Python3 by Jack Thomson, May 2020
 ...
 """
-__version__ = '0.1.0.2'
+__version__ = '0.1.0.3'
 import re
 import sys
 import getopt
 import random
-import urllib
+import urllib.request, urllib.parse, urllib.error
 import textwrap
-import urlparse
+import urllib.parse
 import os, os.path
+import htmldata
 import errno
 import hashlib
-import httplib
-#import pdb
+import http.client
 from time import strftime
 from shutil import copyfile
-try:
-    set
-except:
-    from sets import Set as set
 try:
     import htmldata
 except:
-    print 'Requires Python htmldata module:'
-    print ' http://www.connellybarnes.com/code/htmldata/'
+    print('Requires Python3 htmldata module:')
+    print(' https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
     sys.exit()
@@ -111,7 +108,7 @@ def get_domain(u):
     url = normalize_url(u)
     #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
-    L = list(urlparse.urlparse(url))
+    L = list(urllib.parse.urlparse(url))
     return L[1]
@@ -135,7 +132,7 @@ def normalize_url(url, lower=True):
         url = 'https://' + url
-    urlparse.urljoin(config.rooturl, url)
+    urllib.parse.urljoin(config.rooturl, url)
     return url
@@ -283,7 +280,7 @@ def pos_html_transform(doc, url,filename):
     # Add sidebar.html
     if config.sidebar != None and sidebar_html == '':
-        f = open(config.sidebar, 'rU')
+        f = open(config.sidebar, 'r')
         sidebar_html = f.read()
         f.close()
@@ -328,7 +325,7 @@ def pos_html_transform(doc, url,filename):
         return doc
     if footer_text == '':
-        f = open(config.footer, 'rU')
+        f = open(config.footer, 'r')
         footer_text = f.read()
         f.close()
@@ -567,14 +564,14 @@ def url_open(url):
     while redirect != '':
         l_redir += [url]
-        L = urlparse.urlparse(url)
+        L = urllib.parse.urlparse(url)
         if L[1] != domain:
             conn.close()
             if L[1] == '': return(['',''])
-            print "connection to", domain, "closed."
-            conn = httplib.HTTPSConnection(L[1])
+            print("connection to", domain, "closed.")
+            conn = http.client.HTTPSConnection(L[1])
             domain = L[1]
-            print "connection to", domain, "opened."
+            print("connection to", domain, "opened.")
         rel_url = url
         pos = url.find(domain)
@@ -593,47 +590,47 @@ def url_open(url):
         try:
             conn.request("GET", rel_url,headers=headers)
             r = conn.getresponse()
-            print 'Status', r.status, r.reason, 'accessing', rel_url
+            print('Status', r.status, r.reason, 'accessing', rel_url)
             if r.status == 404:
-                print " it's not possible to recover this error."
+                print(" it's not possible to recover this error.")
                 errors += 1
                 return ('', '')
             if r.status == 500:
-                print " eventually this error might be recovered. let's try again."
-                print ' reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print(" eventually this error might be recovered. let's try again.")
+                print(' reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
                 continue
             if r.status == 403:
-                print " that shouldn't happen, but let's try again anyway."
-                print ' reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print(" that shouldn't happen, but let's try again anyway.")
+                print(' reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
                 continue
             if attempts != 0:
                 recovered = True
             if r.status != 200:
-                print " Status other than 200, 404, 500, 403. It is: ", r.status
+                print(" Status other than 200, 404, 500, 403. It is: ", r.status)
             success = True
-        except httplib.HTTPException, e:
-            print 'ERROR', e.__class__.__name__, 'while retrieving', url
+        except http.client.HTTPException as e:
+            print('ERROR', e.__class__.__name__, 'while retrieving', url)
             conn.close
             if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
-                print "eventually this error might be recovered. let's try again."
-                print 'reconnecting...'
-                conn = httplib.HTTPSConnection(domain)
+                print("eventually this error might be recovered. let's try again.")
+                print('reconnecting...')
+                conn = http.client.HTTPSConnection(domain)
                 attempts += 1
             else:
-                print "it's not possible to recover this error."
+                print("it's not possible to recover this error.")
                 errors += 1
                 return ('', '')
     if recovered:
-        print "error recovered"
+        print("error recovered")
     if not success:
-        print "it was not possible to recover this error."
+        print("it was not possible to recover this error.")
         errors += 1
         return ('', '')
@@ -666,7 +663,7 @@ def url_to_filename(url):
     #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
     turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
     turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
-    L = list(urlparse.urlparse(turl))
+    L = list(urllib.parse.urlparse(turl))
     #this way the url will not create a folder outside of the maindomain
     droot = get_domain(config.rooturl)
@@ -697,10 +694,10 @@ def url_to_filename(url):
     #don't sanitize / for path
     L[0] = ''
-    L[2] = urllib.quote_plus(L[2],'/')
-    L[3] = urllib.quote_plus(L[3])
-    L[4] = urllib.quote_plus(L[4])
-    L[5] = urllib.quote_plus(L[5])
+    L[2] = urllib.parse.quote_plus(L[2],'/')
+    L[3] = urllib.parse.quote_plus(L[3])
+    L[4] = urllib.parse.quote_plus(L[4])
+    L[5] = urllib.parse.quote_plus(L[5])
     # Local filename relative to outdir
     # os.sep - O.S. directory separator
@@ -750,12 +747,11 @@ def url_to_filename(url):
     wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
     url_filename_cache[nurl] = ans
-    mode = ['wb', 'w'][mimetype.startswith('text')]
     # Make parent directory if it doesn't exist.
     try:
         os.makedirs(os.path.split(ans)[0])
-    except OSError, e:
+    except OSError as e:
         if e.errno != errno.EEXIST:
             raise
@@ -765,7 +761,12 @@ def url_to_filename(url):
         out.write('File already exists: ' + str(ans)) #@UndefinedVariable
         sys.exit(1)
-    f = open(ans, mode)
+    if mimetype.startswith('text'):
+        f = open(ans, 'w', encoding='utf8')
+        doc = str(doc)
+    else:
+        f = open(ans, 'wb')
     f.write(doc)
     f.close()
@@ -790,7 +791,7 @@ def url_to_relative(url, cururl):
         L1 = L1[1:]
         L2 = L2[1:]
-    rel_url = urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
+    rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
     if rel_url == '':
         return '#'
     else:
@@ -842,28 +843,28 @@ def should_follow(url):
     #if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
     if droot != dn:
         if config.debug:
-            print url, 'not in the same domain'
+            print(url, 'not in the same domain')
         return False
     # False if multiple query fields or parameters found
     if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
         if config.debug:
-            print url, 'with multiple query fields'
+            print(url, 'with multiple query fields')
         return False
     if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
         if config.debug:
-            print url, 'is a forbidden wiki page'
+            print(url, 'is a forbidden wiki page')
         return False
     if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
         if config.debug:
-            print url, 'is a image and you are in no-images mode'
+            print(url, 'is a image and you are in no-images mode')
         return False
     if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
         if config.debug:
-            print url, 'is a compressed file'
+            print(url, 'is a compressed file')
         return False
@@ -874,7 +875,7 @@ def should_follow(url):
     L = nurl.split('/')
     if ('.' not in L[-1]):
         if config.debug:
-            print url, 'is a file outside of scope with unknown extension'
+            print(url, 'is a file outside of scope with unknown extension')
         return False
     # JKC: we do allow css from 'strange' places.
@@ -885,7 +886,7 @@ def should_follow(url):
     for fp in forbidden_parents:
         if fp in L[-1]:
             if config.debug:
-                print url, 'is a page outside of scope'
+                print(url, 'is a page outside of scope')
             return False
     return True
@@ -921,7 +922,7 @@ def parse_html(doc, url, filename):
         follow = should_follow(u) #and (counter < 10)
         if follow:
             if config.debug:
-                print 'ACCEPTED - ', u
+                print('ACCEPTED - ', u)
             # Store url locally.
             new_urls += [u]
             item.url = url_to_relative(u, url)
@@ -930,7 +931,7 @@ def parse_html(doc, url, filename):
             # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
             # item.url = ''
             if config.debug:
-                print 'NOT INCLUDED - ', u
+                print('NOT INCLUDED - ', u)
     newdoc = htmldata.urljoin(doc, L)
     newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
@@ -938,13 +939,19 @@ def parse_html(doc, url, filename):
     newdoc = pos_html_transform(newdoc, url,filename)
+    # Remove byte artifacts in string
+    newdoc = newdoc.replace('\\n','\n')
+    newdoc = newdoc.replace('\\t', '\t')
+    newdoc = newdoc.strip('b')
+    newdoc = newdoc.strip('')
     return (newdoc, new_urls)
 def deploy_file( src, dest ):
     src_dir = os.path.dirname(os.path.realpath(__file__))
     src = os.path.join(src_dir, src)
     dest = os.path.join(config.outdir, dest)
-    print "copying from", src, "to", dest
+    print("copying from", src, "to", dest)
     directory = os.path.dirname(dest)
     if not os.path.exists(directory):
         os.makedirs(directory)
@@ -957,7 +964,7 @@ def run(out=sys.stdout):
"""
global conn, domain, counter, redir_cache, config, headers
if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
if urllib.parse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
out.write('Please do not use robots with the Wikipedia site.\n')
out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
out.write('your local installation. See the Mediawiki site for more information.\n')
@@ -971,8 +978,8 @@ def run(out=sys.stdout):
         sys.exit(1)
     domain = get_domain(config.rooturl)
-    conn = httplib.HTTPSConnection(domain)
-    print 'connection established to:', domain
+    conn = http.client.HTTPSConnection(domain)
+    print('connection established to:', domain)
     complete = set()
     pending = set([config.rooturl])
@@ -986,7 +993,7 @@ def run(out=sys.stdout):
         if nurl in complete:
             if config.debug:
-                print url, 'already processed'
+                print(url, 'already processed')
             continue
         complete.add(nurl)
@@ -997,7 +1004,7 @@ def run(out=sys.stdout):
         if start:
             start = False
             aux_url = ''
-            for redir in redir_cache.iterkeys():
+            for redir in redir_cache.keys():
                 aux_url = normalize_url(redir)
                 url_filename_cache[aux_url] = filename
                 if aux_url not in complete:
@@ -1009,10 +1016,16 @@ def run(out=sys.stdout):
             continue
         if not os.path.exists(filename):
-            print "ERROR: ", url, '\n'
+            print("ERROR: ", url, '\n')
             continue
-        f = open(filename, 'r')
+        # These formats are encoded as text. Everything else is read as bytes
+        text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
+        if not filename.endswith(text_ext):
+            f = open(filename, 'rb')
+        else:
+            f = open(filename, 'r')
         doc = f.read()
         f.close()
         new_urls = []
@@ -1025,7 +1038,6 @@ def run(out=sys.stdout):
         # Save document changes to disk
         # The unmodified file already exists on disk.
         update = False
-        text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
         for ext in text_ext:
             if filename.endswith(ext):
                 update = True
@@ -1049,10 +1061,10 @@ def run(out=sys.stdout):
                 pending.add(u)
     conn.close()
-    print "connection to", domain, "closed."
+    print("connection to", domain, "closed.")
     out.write(str(n) + ' files saved\n')
-    print counter, "httplib requests done"
-    print errors, "errors not recovered"
+    print(counter, "httplib requests done")
+    print(errors, "errors not recovered")
     # use / not \ so as to work on both windows and mac.
     deploy_file( "AudacityLogo.png", r"alphamanual.audacityteam.org/m/resources/assets/AudacityLogo.png")
@@ -1118,7 +1130,7 @@ def usage():
"""
print textwrap.dedent(usage_str.strip('\n'))
print(textwrap.dedent(usage_str.strip('\n')))
sys.exit(1)
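
For reference, a minimal standalone sketch (not part of the commit) of the Python 3 stdlib replacements this migration relies on: urllib.parse in place of urlparse/urllib, http.client in place of httplib, print() as a function, and explicit text vs. binary file modes. The page URL and output filename below are hypothetical.

    # Sketch only: Python 3 equivalents of the calls changed in this diff.
    import urllib.parse
    import http.client

    url = 'https://alphamanual.audacityteam.org/man/Main_Page'   # hypothetical page
    parts = urllib.parse.urlparse(url)                   # replaces urlparse.urlparse
    safe_path = urllib.parse.quote_plus(parts.path, '/') # replaces urllib.quote_plus

    conn = http.client.HTTPSConnection(parts.netloc)     # replaces httplib.HTTPSConnection
    conn.request('GET', safe_path)
    r = conn.getresponse()
    print('Status', r.status, r.reason)                  # print is a function in Python 3
    body = r.read()                                       # bytes, not str
    conn.close()

    # Text output needs an explicit encoding; everything else is written as bytes.
    with open('page.html', 'w', encoding='utf8') as f:    # hypothetical output file
        f.write(body.decode('utf8', errors='replace'))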