Mirror of https://github.com/cookiengineer/audacity (synced 2025-07-30 07:29:29 +02:00)

Move mw2html and htmldata to python3

parent 6541e808be
commit 56f02ce13e
htmldata.py
@@ -11,22 +11,19 @@ Features:
 This allows you to read and write HTML documents
 programmably, with much flexibility.
 - Extract and modify URLs in an HTML document.
-- Compatible with Python 2.0 - 2.5.
+- Compatible with Python 3+
 
 See the L{examples} for a quick start.
 
+Moved to Python3 by Jack Thomson May 2020
 
 """
 
-__version__ = '1.1.1'
+__version__ = '1.1.2'
 
 __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
 'urljoin', 'URLMatch']
 
-# Define True and False for Python < 2.2.
-import sys
-if sys.version_info[:3] < (2, 2, 0):
-exec "True = 1; False = 0"
 
 # -------------------------------------------------------------------
 # Globals
 # -------------------------------------------------------------------
@@ -34,8 +31,8 @@ if sys.version_info[:3] < (2, 2, 0):
 import re
 import shlex
 import string
-import urllib
-import urlparse
+import urllib.request, urllib.parse, urllib.error
+import urllib.parse
 import types
 
 # Translate text between these strings as plain text (not HTML).
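
Note: the import hunk above folds Python 2's separate urllib/urlparse modules into urllib.parse, which is why every urlparse.* and urllib.quote* call in this file changes. A minimal sketch of the affected call patterns (the URL is hypothetical, for illustration only):

    import urllib.parse

    base = 'https://example.org/docs/'           # hypothetical URL
    joined = urllib.parse.urljoin(base, '../img/logo.png')   # Python 2: urlparse.urljoin
    quoted = urllib.parse.quote_plus('a b&c')                # Python 2: urllib.quote_plus
    parts = urllib.parse.urlparse(joined)                    # Python 2: urlparse.urlparse
    print(parts.netloc, quoted)
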
@@ -164,7 +161,7 @@ def tagjoin(L):
 else:
 rslash = ''
 tag_items = []
-items = d.items()
+items = list(d.items())
 items.sort()
 for (key, value) in items:
 if value != None:
@@ -189,7 +186,7 @@ def _enumerate(L):
 
 Returns a list instead of an iterator.
 """
-return zip(range(len(L)), L)
+return list(zip(list(range(len(L))), L))
 
 def _ignore_tag_index(s, i):
 """
@@ -261,7 +258,7 @@ def _html_split(s):
 found = False
 in_quot1 = False
 in_quot2 = False
-for i2 in xrange(i + 1, len(s)):
+for i2 in range(i + 1, len(s)):
 c2 = s[i2]
 if c2 == '"' and not in_quot1:
 in_quot2 = not in_quot2
@@ -521,7 +518,7 @@ def _test_tag_dict():
 s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
 (a, b, c) = _tag_dict(s)
 assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
-for key in a.keys():
+for key in list(a.keys()):
 assert s[b[key][0]:b[key][1]] == key
 if a[key] != None:
 assert s[c[key][0]:c[key][1]] == a[key]
@@ -609,7 +606,7 @@ def _full_tag_extract(s):
 
 (attrs, key_pos, value_pos) = _tag_dict(dtext)
 # Correct offsets in key_pos and value_pos.
-for key in attrs.keys():
+for key in list(attrs.keys()):
 key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
 key_pos[key][1] + Lstart[i] + dtext_offset)
 value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
@@ -720,7 +717,7 @@ _URL_TAGS = ['a href', 'applet archive', 'applet code',
 'script src', 'table background', 'tbody background',
 'td background', 'tfoot background', 'th background',
 'thead background', 'tr background']
-_URL_TAGS = map(lambda s: tuple(s.split()), _URL_TAGS)
+_URL_TAGS = [tuple(s.split()) for s in _URL_TAGS]
 
 
 def _finditer(pattern, string):
@@ -862,7 +859,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
 pass
 else:
 # Current item is a tag.
-if item.attrs.has_key('style'):
+if 'style' in item.attrs:
 # Process a stylesheet embedded in the 'style' attribute.
 temp = urlextract(item.attrs['style'], siteurl, 'text/css')
 # Offset indices and add to ans.
@@ -872,7 +869,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
 ans += temp
 
 for (a, b) in _URL_TAGS:
-if item.name.startswith(a) and b in item.attrs.keys():
+if item.name.startswith(a) and b in list(item.attrs.keys()):
 # Got one URL.
 url = item.attrs[b]
 # FIXME: Some HTML tag wants a URL list, look up which
@@ -893,7 +890,7 @@ def urlextract(doc, siteurl=None, mimetype='text/html'):
 start_end_map = {}
 filtered_ans = []
 for item in ans:
-if not start_end_map.has_key((item.start, item.end)):
+if (item.start, item.end) not in start_end_map:
 start_end_map[(item.start, item.end)] = None
 filtered_ans.append(item)
 return filtered_ans
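
Note: the has_key removals above follow the standard Python 3 replacement: dict membership is tested with the in operator, and keys() is wrapped in list() where a snapshot of the keys is needed. A small sketch (names are illustrative only):

    attrs = {'style': 'color: red', 'href': '/index.html'}   # illustrative dict

    # Python 2: attrs.has_key('style')
    if 'style' in attrs:
        print('tag has inline CSS')

    # list(...) takes a snapshot, so the dict can be modified while iterating:
    for key in list(attrs.keys()):
        print(key, attrs[key])
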
@@ -1090,7 +1087,7 @@ def examples():
 the offending IP address.
 
 """
-print examples.__doc__
+print(examples.__doc__)
 
 class URLMatch:
 """
@@ -1137,7 +1134,7 @@ class URLMatch:
 self.in_css = in_css
 
 if siteurl != None:
-self.url = urlparse.urljoin(siteurl, self.url)
+self.url = urllib.parse.urljoin(siteurl, self.url)
 
 self.tag_attr = tag_attr
 self.tag_attrs = tag_attrs
@@ -1154,15 +1151,15 @@ def _cast_to_str(arg, str_class):
 """
 if _is_str(arg):
 return str_class(arg)
-elif isinstance(arg, types.ListType):
+elif isinstance(arg, list):
 ans = []
 for item in arg:
 if _is_str(item):
 ans.append(str_class(item))
-elif isinstance(item, types.TupleType) and len(item) == 2:
+elif isinstance(item, tuple) and len(item) == 2:
 (a, b) = item
 b_prime = {}
-for (b_key, b_value) in b.items():
+for (b_key, b_value) in list(b.items()):
 if b_value is None:
 b_prime[str_class(b_key)] = None
 else:
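
Note: types.ListType and types.TupleType do not exist in Python 3, so the hunk above checks against the built-in types directly. A short sketch of the same pattern (hypothetical helper, for illustration):

    def describe(arg):
        # Python 2: isinstance(arg, types.ListType) / isinstance(arg, types.TupleType)
        if isinstance(arg, list):
            return 'list of %d items' % len(arg)
        elif isinstance(arg, tuple) and len(arg) == 2:
            return 'pair'
        return 'something else'

    print(describe([1, 2, 3]), describe(('a', {})))
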
@@ -1321,7 +1318,7 @@ def _test_tagextract(str_class=str):
 L = _full_tag_extract(s)
 for (i, item) in _enumerate(L):
 if isinstance(item, _HTMLTag):
-for key in item.attrs.keys():
+for key in list(item.attrs.keys()):
 assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\
 == key
 if item.attrs[key] != None:
@@ -1460,7 +1457,7 @@ def _test_urlextract(str_class=str):
 base = f('http://www.python.org/~guido/')
 L = urlextract(s, base)
 L2 = [x.url for x in L]
-assert L2 == [urlparse.urljoin(base, x) for x in ans]
+assert L2 == [urllib.parse.urljoin(base, x) for x in ans]
 
 # Test urljoin().
 assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1
@@ -1485,17 +1482,6 @@ def _test_urlextract(str_class=str):
 assert L2 == f(['foo', 'a.gif', 'bar.css', 'b.html'])
 assert [s[x.start:x.end] == x.url for x in L].count(False) == 0
 
-def _python_has_unicode():
-"""
-True iff Python was compiled with unicode().
-"""
-try:
-unicode
-return True
-except:
-return False
-
-
 # -------------------------------------------------------------------
 # Unit Test Main Routine
 # -------------------------------------------------------------------
@@ -1504,32 +1490,30 @@ def _test():
 """
 Unit test main routine.
 """
-print 'Unit tests:'
+print('Unit tests:')
 _test_remove_comments()
-print ' _remove_comments: OK'
+print(' _remove_comments: OK')
 _test_shlex_split()
-print ' _shlex_split: OK'
+print(' _shlex_split: OK')
 _test_tag_dict()
-print ' _tag_dict: OK'
+print(' _tag_dict: OK')
 _test_tuple_replace()
-print ' _tuple_replace: OK'
+print(' _tuple_replace: OK')
 
 _test_tagextract()
-print ' tagextract*: OK'
+print(' tagextract*: OK')
 
-if _python_has_unicode():
-_test_tagextract(unicode)
-print ' tagextract (unicode)*: OK'
+_test_tagextract(str)
+print(' tagextract (unicode)*: OK')
 
 _test_urlextract()
-print ' urlextract*: OK'
+print(' urlextract*: OK')
 
-if _python_has_unicode():
-_test_urlextract(unicode)
-print ' urlextract (unicode)*: OK'
+_test_urlextract(str)
+print(' urlextract (unicode)*: OK')
 
-print
-print '* The corresponding join method has been tested as well.'
+print()
+print('* The corresponding join method has been tested as well.')
 
 
 if __name__ == '__main__':
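
Note: the remaining htmldata changes are mechanical: print statements become print() calls, and the _python_has_unicode()/unicode branches disappear because Python 3 has a single str type. A compressed sketch of both patterns:

    # Python 2: print 'Unit tests:'   and   _test_tagextract(unicode)
    print('Unit tests:')

    text = 'already unicode'      # every str is a Unicode string in Python 3
    assert isinstance(text, str)
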
mw2html.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 
 """
 mw2html - Mediawiki to static HTML
@@ -15,37 +15,34 @@ Improved filtering.
 Improved usability.
 Customized for Audacity's manual wiki.
 Minor tweaks (for Audacity) By James Crook, Nov 2009.
+Moved to Python3 by Jack Thomson, May 2020
 ...
 """
 
-__version__ = '0.1.0.2'
+__version__ = '0.1.0.3'
 
 import re
 import sys
 import getopt
 import random
-import urllib
+import urllib.request, urllib.parse, urllib.error
 import textwrap
-import urlparse
+import urllib.parse
 import os, os.path
 
+import htmldata
 
 import errno
 import hashlib
-import httplib
-#import pdb
+import http.client
 from time import strftime
 from shutil import copyfile
 
-try:
-set
-except:
-from sets import Set as set
 
 try:
 import htmldata
 except:
-print 'Requires Python htmldata module:'
-print ' http://www.connellybarnes.com/code/htmldata/'
+print('Requires Python3 htmldata module:')
+print(' https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
 sys.exit()
 
 
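
Note: httplib is renamed to http.client in Python 3, which is why every HTTPSConnection in this file changes. A minimal sketch of the request pattern mw2html relies on (host and path are hypothetical):

    import http.client

    conn = http.client.HTTPSConnection('example.org')    # hypothetical host
    conn.request('GET', '/index.html', headers={'User-Agent': 'mw2html'})
    resp = conn.getresponse()
    print('Status', resp.status, resp.reason)
    body = resp.read()      # bytes in Python 3, not str
    conn.close()
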
@@ -111,7 +108,7 @@ def get_domain(u):
 url = normalize_url(u)
 
 #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
-L = list(urlparse.urlparse(url))
+L = list(urllib.parse.urlparse(url))
 
 return L[1]
 
@@ -135,7 +132,7 @@ def normalize_url(url, lower=True):
 
 url = 'https://' + url
 
-urlparse.urljoin(config.rooturl, url)
+urllib.parse.urljoin(config.rooturl, url)
 
 return url
 
@@ -283,7 +280,7 @@ def pos_html_transform(doc, url,filename):
 
 # Add sidebar.html
 if config.sidebar != None and sidebar_html == '':
-f = open(config.sidebar, 'rU')
+f = open(config.sidebar, 'r')
 sidebar_html = f.read()
 f.close()
 
@@ -328,7 +325,7 @@ def pos_html_transform(doc, url,filename):
 return doc
 
 if footer_text == '':
-f = open(config.footer, 'rU')
+f = open(config.footer, 'r')
 footer_text = f.read()
 f.close()
 
@@ -567,14 +564,14 @@ def url_open(url):
 while redirect != '':
 l_redir += [url]
 
-L = urlparse.urlparse(url)
+L = urllib.parse.urlparse(url)
 if L[1] != domain:
 conn.close()
 if L[1] == '': return(['',''])
-print "connection to", domain, "closed."
-conn = httplib.HTTPSConnection(L[1])
+print("connection to", domain, "closed.")
+conn = http.client.HTTPSConnection(L[1])
 domain = L[1]
-print "connection to", domain, "opened."
+print("connection to", domain, "opened.")
 
 rel_url = url
 pos = url.find(domain)
@@ -593,47 +590,47 @@ def url_open(url):
 try:
 conn.request("GET", rel_url,headers=headers)
 r = conn.getresponse()
-print 'Status', r.status, r.reason, 'accessing', rel_url
+print('Status', r.status, r.reason, 'accessing', rel_url)
 if r.status == 404:
-print " it's not possible to recover this error."
+print(" it's not possible to recover this error.")
 errors += 1
 return ('', '')
 if r.status == 500:
-print " eventually this error might be recovered. let's try again."
-print ' reconnecting...'
-conn = httplib.HTTPSConnection(domain)
+print(" eventually this error might be recovered. let's try again.")
+print(' reconnecting...')
+conn = http.client.HTTPSConnection(domain)
 attempts += 1
 continue
 if r.status == 403:
-print " that shouldn't happen, but let's try again anyway."
-print ' reconnecting...'
-conn = httplib.HTTPSConnection(domain)
+print(" that shouldn't happen, but let's try again anyway.")
+print(' reconnecting...')
+conn = http.client.HTTPSConnection(domain)
 attempts += 1
 continue
 if attempts != 0:
 recovered = True
 if r.status != 200:
-print " Status other than 200, 404, 500, 403. It is: ", r.status
+print(" Status other than 200, 404, 500, 403. It is: ", r.status)
 success = True
 
-except httplib.HTTPException, e:
-print 'ERROR', e.__class__.__name__, 'while retrieving', url
+except http.client.HTTPException as e:
+print('ERROR', e.__class__.__name__, 'while retrieving', url)
 conn.close
 if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
-print "eventually this error might be recovered. let's try again."
-print 'reconnecting...'
-conn = httplib.HTTPSConnection(domain)
+print("eventually this error might be recovered. let's try again.")
+print('reconnecting...')
+conn = http.client.HTTPSConnection(domain)
 attempts += 1
 else:
-print "it's not possible to recover this error."
+print("it's not possible to recover this error.")
 errors += 1
 return ('', '')
 
 if recovered:
-print "error recovered"
+print("error recovered")
 
 if not success:
-print "it was not possible to recover this error."
+print("it was not possible to recover this error.")
 errors += 1
 return ('', '')
 
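
Note: besides the module rename, the except httplib.HTTPException, e: clause has to become except http.client.HTTPException as e:, since the comma form is a syntax error in Python 3. A sketch of the retry shape used in url_open() (names are illustrative):

    import http.client

    def fetch_once(conn, rel_url, headers):
        # Mirrors the try/except shape above; returns None on an HTTP-layer failure.
        try:
            conn.request('GET', rel_url, headers=headers)
            return conn.getresponse()
        except http.client.HTTPException as e:   # Python 2: except httplib.HTTPException, e:
            print('ERROR', e.__class__.__name__, 'while retrieving', rel_url)
            return None
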
@@ -666,7 +663,7 @@ def url_to_filename(url):
 #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
 turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
 turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
-L = list(urlparse.urlparse(turl))
+L = list(urllib.parse.urlparse(turl))
 
 #this way the url will not create a folder outside of the maindomain
 droot = get_domain(config.rooturl)
@@ -697,10 +694,10 @@ def url_to_filename(url):
 
 #don't sanitize / for path
 L[0] = ''
-L[2] = urllib.quote_plus(L[2],'/')
-L[3] = urllib.quote_plus(L[3])
-L[4] = urllib.quote_plus(L[4])
-L[5] = urllib.quote_plus(L[5])
+L[2] = urllib.parse.quote_plus(L[2],'/')
+L[3] = urllib.parse.quote_plus(L[3])
+L[4] = urllib.parse.quote_plus(L[4])
+L[5] = urllib.parse.quote_plus(L[5])
 
 # Local filename relative to outdir
 # os.sep - O.S. directory separator
@@ -750,12 +747,11 @@ def url_to_filename(url):
 wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
 url_filename_cache[nurl] = ans
 
-mode = ['wb', 'w'][mimetype.startswith('text')]
 
 # Make parent directory if it doesn't exist.
 try:
 os.makedirs(os.path.split(ans)[0])
-except OSError, e:
+except OSError as e:
 if e.errno != errno.EEXIST:
 raise
 
@@ -765,7 +761,12 @@ def url_to_filename(url):
 out.write('File already exists: ' + str(ans)) #@UndefinedVariable
 sys.exit(1)
 
-f = open(ans, mode)
+if mimetype.startswith('text'):
+f = open(ans, 'w', encoding='utf8')
+doc = str(doc)
+else:
+f = open(ans, 'wb')
 
 f.write(doc)
 f.close()
 
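
Note: with the mode = ['wb', 'w'][...] trick removed, the write path above branches explicitly: text mimetypes go through a UTF-8 text handle, everything else is written as raw bytes. A sketch of that split (the function name and path are hypothetical):

    def save(path, doc, mimetype):
        # Sketch of the branch in url_to_filename() above.
        if mimetype.startswith('text'):
            f = open(path, 'w', encoding='utf8')
            doc = str(doc)            # force a str, as the diff does
        else:
            f = open(path, 'wb')      # bytes are written untouched
        f.write(doc)
        f.close()
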
@@ -790,7 +791,7 @@ def url_to_relative(url, cururl):
 L1 = L1[1:]
 L2 = L2[1:]
 
-rel_url = urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
+rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
 if rel_url == '':
 return '#'
 else:
@@ -842,28 +843,28 @@ def should_follow(url):
 #if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
 if droot != dn:
 if config.debug:
-print url, 'not in the same domain'
+print(url, 'not in the same domain')
 return False
 
 # False if multiple query fields or parameters found
 if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
 if config.debug:
-print url, 'with multiple query fields'
+print(url, 'with multiple query fields')
 return False
 
 if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
 if config.debug:
-print url, 'is a forbidden wiki page'
+print(url, 'is a forbidden wiki page')
 return False
 
 if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
 if config.debug:
-print url, 'is a image and you are in no-images mode'
+print(url, 'is a image and you are in no-images mode')
 return False
 
 if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
 if config.debug:
-print url, 'is a compressed file'
+print(url, 'is a compressed file')
 return False
 
 
@@ -874,7 +875,7 @@ def should_follow(url):
 L = nurl.split('/')
 if ('.' not in L[-1]):
 if config.debug:
-print url, 'is a file outside of scope with unknown extension'
+print(url, 'is a file outside of scope with unknown extension')
 return False
 
 # JKC: we do allow css from 'strange' places.
@@ -885,7 +886,7 @@ def should_follow(url):
 for fp in forbidden_parents:
 if fp in L[-1]:
 if config.debug:
-print url, 'is a page outside of scope'
+print(url, 'is a page outside of scope')
 return False
 
 return True
@@ -921,7 +922,7 @@ def parse_html(doc, url, filename):
 follow = should_follow(u) #and (counter < 10)
 if follow:
 if config.debug:
-print 'ACCEPTED - ', u
+print('ACCEPTED - ', u)
 # Store url locally.
 new_urls += [u]
 item.url = url_to_relative(u, url)
@@ -930,7 +931,7 @@ def parse_html(doc, url, filename):
 # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
 # item.url = ''
 if config.debug:
-print 'NOT INCLUDED - ', u
+print('NOT INCLUDED - ', u)
 
 newdoc = htmldata.urljoin(doc, L)
 newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
@@ -938,13 +939,19 @@ def parse_html(doc, url, filename):
 
 newdoc = pos_html_transform(newdoc, url,filename)
 
+# Remove byte artifacts in string
+newdoc = newdoc.replace('\\n','\n')
+newdoc = newdoc.replace('\\t', '\t')
+newdoc = newdoc.strip('b')
+newdoc = newdoc.strip('')
+
 return (newdoc, new_urls)
 
 def deploy_file( src, dest ):
 src_dir = os.path.dirname(os.path.realpath(__file__))
 src = os.path.join(src_dir, src)
 dest = os.path.join(config.outdir, dest)
-print "copying from", src, "to", dest
+print("copying from", src, "to", dest)
 directory = os.path.dirname(dest)
 if not os.path.exists(directory):
 os.makedirs(directory)
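
Note: the new lines above undo artifacts (escaped \n and \t sequences, a stray leading b) that appear when a bytes payload has been converted with str(). A minimal sketch of the symptom, plus the usual alternative of decoding the bytes directly (this is an illustration, assuming UTF-8 content, not what the commit does):

    raw = b'<p>line one\nline two</p>'   # hypothetical response bytes
    mangled = str(raw)                   # "b'<p>line one\\nline two</p>'" - escapes leak into the text
    fixed = raw.decode('utf-8')          # yields the original markup
    print(mangled)
    print(fixed)
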
@@ -957,7 +964,7 @@ def run(out=sys.stdout):
 """
 global conn, domain, counter, redir_cache, config, headers
 
-if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
+if urllib.parse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
 out.write('Please do not use robots with the Wikipedia site.\n')
 out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
 out.write('your local installation. See the Mediawiki site for more information.\n')
@@ -971,8 +978,8 @@ def run(out=sys.stdout):
 sys.exit(1)
 
 domain = get_domain(config.rooturl)
-conn = httplib.HTTPSConnection(domain)
-print 'connection established to:', domain
+conn = http.client.HTTPSConnection(domain)
+print('connection established to:', domain)
 complete = set()
 pending = set([config.rooturl])
 
@@ -986,7 +993,7 @@ def run(out=sys.stdout):
 
 if nurl in complete:
 if config.debug:
-print url, 'already processed'
+print(url, 'already processed')
 continue
 
 complete.add(nurl)
@@ -997,7 +1004,7 @@ def run(out=sys.stdout):
 if start:
 start = False
 aux_url = ''
-for redir in redir_cache.iterkeys():
+for redir in redir_cache.keys():
 aux_url = normalize_url(redir)
 url_filename_cache[aux_url] = filename
 if aux_url not in complete:
@@ -1009,10 +1016,16 @@ def run(out=sys.stdout):
 continue
 
 if not os.path.exists(filename):
-print "ERROR: ", url, '\n'
+print("ERROR: ", url, '\n')
 continue
 
-f = open(filename, 'r')
+# These formats are encoded as text. Everything else is read as bytes
+text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
+
+if not filename.endswith(text_ext):
+f = open(filename, 'rb')
+else:
+f = open(filename, 'r')
 doc = f.read()
 f.close()
 new_urls = []
@@ -1025,7 +1038,6 @@ def run(out=sys.stdout):
 # Save document changes to disk
 # The unmodified file already exists on disk.
 update = False
-text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
 for ext in text_ext:
 if filename.endswith(ext):
 update = True
@@ -1049,10 +1061,10 @@ def run(out=sys.stdout):
 pending.add(u)
 
 conn.close()
-print "connection to", domain, "closed."
+print("connection to", domain, "closed.")
 out.write(str(n) + ' files saved\n')
-print counter, "httplib requests done"
-print errors, "errors not recovered"
+print(counter, "httplib requests done")
+print(errors, "errors not recovered")
 
 # use / not \ so as to work on both windows and mac.
 deploy_file( "AudacityLogo.png", r"alphamanual.audacityteam.org/m/resources/assets/AudacityLogo.png")
@@ -1118,7 +1130,7 @@ def usage():
 
 """
 
-print textwrap.dedent(usage_str.strip('\n'))
+print(textwrap.dedent(usage_str.strip('\n')))
 sys.exit(1)
 
 