mirror of
https://github.com/cookiengineer/audacity
synced 2025-09-18 17:10:55 +02:00
Avoid duplicate html files with names like 'index.htm'
This commit is contained in:
parent
9438fdf3e5
commit
eb904062ad
@ -189,7 +189,7 @@ def _enumerate(L):
|
|||||||
|
|
||||||
Returns a list instead of an iterator.
|
Returns a list instead of an iterator.
|
||||||
"""
|
"""
|
||||||
return zip(range(len(L)),L)
|
return zip(range(len(L)), L)
|
||||||
|
|
||||||
def _ignore_tag_index(s, i):
|
def _ignore_tag_index(s, i):
|
||||||
"""
|
"""
|
||||||
@ -199,9 +199,9 @@ def _ignore_tag_index(s, i):
|
|||||||
the index. Otherwise, return C{-1}.
|
the index. Otherwise, return C{-1}.
|
||||||
"""
|
"""
|
||||||
for (j, (a, b)) in _enumerate(_IGNORE_TAGS):
|
for (j, (a, b)) in _enumerate(_IGNORE_TAGS):
|
||||||
if s[i:i+len(a)+1].lower() == '<' + a:
|
if s[i:i + len(a) + 1].lower() == '<' + a:
|
||||||
return j
|
return j
|
||||||
return -1
|
return - 1
|
||||||
|
|
||||||
def _html_split(s):
|
def _html_split(s):
|
||||||
"""
|
"""
|
||||||
@ -233,7 +233,7 @@ def _html_split(s):
|
|||||||
c = s[i]
|
c = s[i]
|
||||||
if c == '<':
|
if c == '<':
|
||||||
# Left bracket, handle various cases.
|
# Left bracket, handle various cases.
|
||||||
if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
|
if s[i:i + len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
|
||||||
# HTML begin comment tag, '<!--'. Scan for '-->'.
|
# HTML begin comment tag, '<!--'. Scan for '-->'.
|
||||||
i2 = s.find(_END_COMMENT, i)
|
i2 = s.find(_END_COMMENT, i)
|
||||||
if i2 < 0:
|
if i2 < 0:
|
||||||
@ -242,9 +242,9 @@ def _html_split(s):
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# Append the comment.
|
# Append the comment.
|
||||||
L.append(s[i:i2+len(_END_COMMENT)])
|
L.append(s[i:i2 + len(_END_COMMENT)])
|
||||||
i = i2 + len(_END_COMMENT)
|
i = i2 + len(_END_COMMENT)
|
||||||
elif s[i:i+len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
|
elif s[i:i + len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
|
||||||
# XHTML begin CDATA tag. Scan for ']]>'.
|
# XHTML begin CDATA tag. Scan for ']]>'.
|
||||||
i2 = s.find(_END_CDATA, i)
|
i2 = s.find(_END_CDATA, i)
|
||||||
if i2 < 0:
|
if i2 < 0:
|
||||||
@ -253,7 +253,7 @@ def _html_split(s):
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# Append the CDATA.
|
# Append the CDATA.
|
||||||
L.append(s[i:i2+len(_END_CDATA)])
|
L.append(s[i:i2 + len(_END_CDATA)])
|
||||||
i = i2 + len(_END_CDATA)
|
i = i2 + len(_END_CDATA)
|
||||||
else:
|
else:
|
||||||
# Regular HTML tag. Scan for '>'.
|
# Regular HTML tag. Scan for '>'.
|
||||||
@ -261,19 +261,19 @@ def _html_split(s):
|
|||||||
found = False
|
found = False
|
||||||
in_quot1 = False
|
in_quot1 = False
|
||||||
in_quot2 = False
|
in_quot2 = False
|
||||||
for i2 in xrange(i+1, len(s)):
|
for i2 in xrange(i + 1, len(s)):
|
||||||
c2 = s[i2]
|
c2 = s[i2]
|
||||||
if c2 == '"' and not in_quot1:
|
if c2 == '"' and not in_quot1:
|
||||||
in_quot2 = not in_quot2
|
in_quot2 = not in_quot2
|
||||||
# Only turn on double quote if it's in a realistic place.
|
# Only turn on double quote if it's in a realistic place.
|
||||||
if in_quot2 and not in_quot1:
|
if in_quot2 and not in_quot1:
|
||||||
if i2 > 0 and s[i2-1] not in [' ', '\t', '=']:
|
if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
|
||||||
in_quot2 = False
|
in_quot2 = False
|
||||||
elif c2 == "'" and not in_quot2:
|
elif c2 == "'" and not in_quot2:
|
||||||
in_quot1 = not in_quot1
|
in_quot1 = not in_quot1
|
||||||
# Only turn on single quote if it's in a realistic place.
|
# Only turn on single quote if it's in a realistic place.
|
||||||
if in_quot1 and not in_quot2:
|
if in_quot1 and not in_quot2:
|
||||||
if i2 > 0 and s[i2-1] not in [' ', '\t', '=']:
|
if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
|
||||||
in_quot1 = False
|
in_quot1 = False
|
||||||
elif c2 == '>' and (not in_quot2 and not in_quot1):
|
elif c2 == '>' and (not in_quot2 and not in_quot1):
|
||||||
found = True
|
found = True
|
||||||
@ -285,7 +285,7 @@ def _html_split(s):
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# Append the tag.
|
# Append the tag.
|
||||||
L.append(s[i:i2+1])
|
L.append(s[i:i2 + 1])
|
||||||
i = i2 + 1
|
i = i2 + 1
|
||||||
|
|
||||||
# Check whether we found a special ignore tag, eg '<script>'
|
# Check whether we found a special ignore tag, eg '<script>'
|
||||||
@ -460,18 +460,18 @@ def _tag_dict(s):
|
|||||||
|
|
||||||
# Strip spaces.
|
# Strip spaces.
|
||||||
while k1 < k2 and s[k1] in string.whitespace: k1 += 1
|
while k1 < k2 and s[k1] in string.whitespace: k1 += 1
|
||||||
while k1 < k2 and s[k2-1] in string.whitespace: k2 -= 1
|
while k1 < k2 and s[k2 - 1] in string.whitespace: k2 -= 1
|
||||||
|
|
||||||
while v1 < v2 and s[v1] in string.whitespace: v1 += 1
|
while v1 < v2 and s[v1] in string.whitespace: v1 += 1
|
||||||
while v1 < v2 and s[v2-1] in string.whitespace: v2 -= 1
|
while v1 < v2 and s[v2 - 1] in string.whitespace: v2 -= 1
|
||||||
|
|
||||||
# Strip one pair of double quotes around value.
|
# Strip one pair of double quotes around value.
|
||||||
if v1 < v2 - 1 and s[v1] == '"' and s[v2-1] == '"':
|
if v1 < v2 - 1 and s[v1] == '"' and s[v2 - 1] == '"':
|
||||||
v1 += 1
|
v1 += 1
|
||||||
v2 -= 1
|
v2 -= 1
|
||||||
|
|
||||||
# Strip one pair of single quotes around value.
|
# Strip one pair of single quotes around value.
|
||||||
if v1 < v2 - 1 and s[v1] == "'" and s[v2-1] == "'":
|
if v1 < v2 - 1 and s[v1] == "'" and s[v2 - 1] == "'":
|
||||||
v1 += 1
|
v1 += 1
|
||||||
v2 -= 1
|
v2 -= 1
|
||||||
|
|
||||||
@ -512,12 +512,12 @@ def _test_tag_dict():
|
|||||||
assert _tag_dict(' \t\r \n\n \r\n ') == ({}, {}, {})
|
assert _tag_dict(' \t\r \n\n \r\n ') == ({}, {}, {})
|
||||||
assert _tag_dict('bgcolor=#ffffff text="#000000" blink') == \
|
assert _tag_dict('bgcolor=#ffffff text="#000000" blink') == \
|
||||||
({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
|
({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
|
||||||
{'bgcolor':(0,7), 'text':(16,20), 'blink':(31,36)},
|
{'bgcolor':(0, 7), 'text':(16, 20), 'blink':(31, 36)},
|
||||||
{'bgcolor':(8,15), 'text':(22,29), 'blink':(36,36)})
|
{'bgcolor':(8, 15), 'text':(22, 29), 'blink':(36, 36)})
|
||||||
assert _tag_dict("bgcolor='#ffffff'text='#000000' blink") == \
|
assert _tag_dict("bgcolor='#ffffff'text='#000000' blink") == \
|
||||||
({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
|
({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
|
||||||
{'bgcolor':(0,7), 'text':(17,21), 'blink':(32,37)},
|
{'bgcolor':(0, 7), 'text':(17, 21), 'blink':(32, 37)},
|
||||||
{'bgcolor':(9,16), 'text':(23,30), 'blink':(37,37)})
|
{'bgcolor':(9, 16), 'text':(23, 30), 'blink':(37, 37)})
|
||||||
s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
|
s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
|
||||||
(a, b, c) = _tag_dict(s)
|
(a, b, c) = _tag_dict(s)
|
||||||
assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
|
assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
|
||||||
@ -541,7 +541,7 @@ def _full_tag_extract(s):
|
|||||||
# Starting position of each L[i] in s.
|
# Starting position of each L[i] in s.
|
||||||
Lstart = [0] * len(L)
|
Lstart = [0] * len(L)
|
||||||
for i in range(1, len(L)):
|
for i in range(1, len(L)):
|
||||||
Lstart[i] = Lstart[i-1] + len(L[i-1])
|
Lstart[i] = Lstart[i - 1] + len(L[i - 1])
|
||||||
|
|
||||||
class NotTagError(Exception): pass
|
class NotTagError(Exception): pass
|
||||||
|
|
||||||
@ -590,7 +590,7 @@ def _full_tag_extract(s):
|
|||||||
(name, dtext) = (text, '')
|
(name, dtext) = (text, '')
|
||||||
else:
|
else:
|
||||||
name = text[:first_space]
|
name = text[:first_space]
|
||||||
dtext = text[first_space+1:len(text)]
|
dtext = text[first_space + 1:len(text)]
|
||||||
|
|
||||||
# Position of dtext relative to original text.
|
# Position of dtext relative to original text.
|
||||||
dtext_offset = len(name) + 1 + orig_offset # +1 for space.
|
dtext_offset = len(name) + 1 + orig_offset # +1 for space.
|
||||||
@ -610,10 +610,10 @@ def _full_tag_extract(s):
|
|||||||
(attrs, key_pos, value_pos) = _tag_dict(dtext)
|
(attrs, key_pos, value_pos) = _tag_dict(dtext)
|
||||||
# Correct offsets in key_pos and value_pos.
|
# Correct offsets in key_pos and value_pos.
|
||||||
for key in attrs.keys():
|
for key in attrs.keys():
|
||||||
key_pos[key] = (key_pos[key][0]+Lstart[i]+dtext_offset,
|
key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
|
||||||
key_pos[key][1]+Lstart[i]+dtext_offset)
|
key_pos[key][1] + Lstart[i] + dtext_offset)
|
||||||
value_pos[key] = (value_pos[key][0]+Lstart[i]+dtext_offset,
|
value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
|
||||||
value_pos[key][1]+Lstart[i]+dtext_offset)
|
value_pos[key][1] + Lstart[i] + dtext_offset)
|
||||||
|
|
||||||
pos = (Lstart[i], Lstart[i] + len(L[i]))
|
pos = (Lstart[i], Lstart[i] + len(L[i]))
|
||||||
|
|
||||||
@ -761,7 +761,7 @@ def _remove_comments(doc):
|
|||||||
ans += [doc[i:]]
|
ans += [doc[i:]]
|
||||||
break
|
break
|
||||||
ans += [doc[i:i2]]
|
ans += [doc[i:i2]]
|
||||||
i3 = doc.find('*/', i2+1)
|
i3 = doc.find('*/', i2 + 1)
|
||||||
if i3 < 0:
|
if i3 < 0:
|
||||||
i3 = len(doc) - 2
|
i3 = len(doc) - 2
|
||||||
ans += [' ' * (i3 - i2 + 2)]
|
ans += [' ' * (i3 - i2 + 2)]
|
||||||
@ -775,7 +775,7 @@ def _test_remove_comments():
|
|||||||
"""
|
"""
|
||||||
s = '/*d s kjlsdf */*//*/*//**/**/*//**/a' * 50
|
s = '/*d s kjlsdf */*//*/*//**/**/*//**/a' * 50
|
||||||
assert len(_remove_comments(s)) == len(s)
|
assert len(_remove_comments(s)) == len(s)
|
||||||
s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd'+'/*//**//'
|
s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd' + '/*//**//'
|
||||||
assert len(_remove_comments(s)) == len(s)
|
assert len(_remove_comments(s)) == len(s)
|
||||||
s = 'a/**/' * 50 + '/**//**/////***/****/*//**//*/' * 5
|
s = 'a/**/' * 50 + '/**//**/////***/****/*//**//*/' * 5
|
||||||
assert len(_remove_comments(s)) == len(s)
|
assert len(_remove_comments(s)) == len(s)
|
||||||
@ -916,8 +916,8 @@ def _tuple_replace(s, Lindices, Lreplace):
|
|||||||
Lindices.sort()
|
Lindices.sort()
|
||||||
if len(Lindices) != len(Lreplace):
|
if len(Lindices) != len(Lreplace):
|
||||||
raise ValueError('lists differ in length')
|
raise ValueError('lists differ in length')
|
||||||
for i in range(len(Lindices)-1):
|
for i in range(len(Lindices) - 1):
|
||||||
if Lindices[i][1] > Lindices[i+1][0]:
|
if Lindices[i][1] > Lindices[i + 1][0]:
|
||||||
raise ValueError('tuples overlap')
|
raise ValueError('tuples overlap')
|
||||||
if Lindices[i][1] < Lindices[i][0]:
|
if Lindices[i][1] < Lindices[i][0]:
|
||||||
raise ValueError('invalid tuple')
|
raise ValueError('invalid tuple')
|
||||||
@ -932,7 +932,7 @@ def _tuple_replace(s, Lindices, Lreplace):
|
|||||||
len1 = Lindices[i][1] - Lindices[i][0]
|
len1 = Lindices[i][1] - Lindices[i][0]
|
||||||
len2 = len(Lreplace[i])
|
len2 = len(Lreplace[i])
|
||||||
|
|
||||||
ans.append(s[j:Lindices[i][0]+offset])
|
ans.append(s[j:Lindices[i][0] + offset])
|
||||||
ans.append(Lreplace[i])
|
ans.append(Lreplace[i])
|
||||||
|
|
||||||
j = Lindices[i][1]
|
j = Lindices[i][1]
|
||||||
@ -943,12 +943,12 @@ def _test_tuple_replace():
|
|||||||
"""
|
"""
|
||||||
Unit test for L{_tuple_replace}.
|
Unit test for L{_tuple_replace}.
|
||||||
"""
|
"""
|
||||||
assert _tuple_replace('',[],[]) == ''
|
assert _tuple_replace('', [], []) == ''
|
||||||
assert _tuple_replace('0123456789',[],[]) == '0123456789'
|
assert _tuple_replace('0123456789', [], []) == '0123456789'
|
||||||
assert _tuple_replace('0123456789',[(4,5),(6,9)],['abc', 'def'])== \
|
assert _tuple_replace('0123456789', [(4, 5), (6, 9)], ['abc', 'def']) == \
|
||||||
'0123abc5def9'
|
'0123abc5def9'
|
||||||
assert _tuple_replace('01234567890123456789', \
|
assert _tuple_replace('01234567890123456789', \
|
||||||
[(1,9),(13,14),(16,18)],['abcd','efg','hijk']) == \
|
[(1, 9), (13, 14), (16, 18)], ['abcd', 'efg', 'hijk']) == \
|
||||||
'0abcd9012efg45hijk89'
|
'0abcd9012efg45hijk89'
|
||||||
|
|
||||||
def urljoin(s, L):
|
def urljoin(s, L):
|
||||||
@ -1196,7 +1196,7 @@ def _test_tagextract(str_class=str):
|
|||||||
'<img test="5%ff" /></body></html>\nBye!\n')
|
'<img test="5%ff" /></body></html>\nBye!\n')
|
||||||
doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
|
doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
|
||||||
'<test tag="5" content=6><is broken=False><yay>' +
|
'<test tag="5" content=6><is broken=False><yay>' +
|
||||||
'<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+
|
'<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
|
||||||
'<script language="JavaScript"><>!><!_!_!-->!_-></script>')
|
'<script language="JavaScript"><>!><!_!_!-->!_-></script>')
|
||||||
doc3 = f('\r\t< html >< tag> <!--comment--> <tag a = 5> ' +
|
doc3 = f('\r\t< html >< tag> <!--comment--> <tag a = 5> ' +
|
||||||
'<foo \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n>')
|
'<foo \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n>')
|
||||||
@ -1239,7 +1239,7 @@ def _test_tagextract(str_class=str):
|
|||||||
' what', '<style>', 'hi<><>>', '</style>',
|
' what', '<style>', 'hi<><>>', '</style>',
|
||||||
'<script language="Java">', '<aL><>><>>', '</script>', 'a'])
|
'<script language="Java">', '<aL><>><>>', '</script>', 'a'])
|
||||||
|
|
||||||
s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag '+
|
s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag ' +
|
||||||
'<tag <! <! -> <!-- </who< <who> tag> <huh-->-</style>' +
|
'<tag <! <! -> <!-- </who< <who> tag> <huh-->-</style>' +
|
||||||
'</style<style>')
|
'</style<style>')
|
||||||
assert s == f('').join(_html_split(s))
|
assert s == f('').join(_html_split(s))
|
||||||
@ -1282,7 +1282,7 @@ def _test_tagextract(str_class=str):
|
|||||||
doc2old = doc2
|
doc2old = doc2
|
||||||
doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
|
doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
|
||||||
'<test tag="5" content=6><is broken=False><yay>' +
|
'<test tag="5" content=6><is broken=False><yay>' +
|
||||||
'<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+
|
'<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
|
||||||
'<script language="JavaScript"><>!><!_!_!-->!_-></script>')
|
'<script language="JavaScript"><>!><!_!_!-->!_-></script>')
|
||||||
assert doc2old == doc2
|
assert doc2old == doc2
|
||||||
|
|
||||||
@ -1334,7 +1334,7 @@ def _test_tagextract(str_class=str):
|
|||||||
L = tagextract(doc4)
|
L = tagextract(doc4)
|
||||||
assert len(L) == n
|
assert len(L) == n
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
assert L[i] == f([('tag/',{'name':'5','value':'6afdjherknc4 cdk j',
|
assert L[i] == f([('tag/', {'name':'5', 'value':'6afdjherknc4 cdk j',
|
||||||
'a':'7', 'b':'8'})])[0]
|
'a':'7', 'b':'8'})])[0]
|
||||||
|
|
||||||
# -----------------------------------------------------------------
|
# -----------------------------------------------------------------
|
||||||
|
@ -28,6 +28,7 @@ import urllib
|
|||||||
import textwrap
|
import textwrap
|
||||||
import urlparse
|
import urlparse
|
||||||
import os, os.path
|
import os, os.path
|
||||||
|
|
||||||
import errno
|
import errno
|
||||||
import hashlib
|
import hashlib
|
||||||
import httplib
|
import httplib
|
||||||
@ -129,27 +130,27 @@ def normalize_url(url, lower=True):
|
|||||||
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point = 0):
|
def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0):
|
||||||
# find tag limits - start_string must be an unique identifier within doc
|
# find tag limits - start_string must be an unique identifier within doc
|
||||||
|
|
||||||
i1 = doc.find(filter_string, start_point)
|
i1 = doc.find(filter_string, start_point)
|
||||||
|
|
||||||
if i1 == -1:
|
if i1 == -1:
|
||||||
return (-1,-1)
|
return (-1, -1)
|
||||||
|
|
||||||
aux = doc.rfind(start_tag, start_point, i1+len(filter_string))
|
aux = doc.rfind(start_tag, start_point, i1 + len(filter_string))
|
||||||
|
|
||||||
# we've found the filter_string but it has not the start_tag, so we return a different value
|
# we've found the filter_string but it has not the start_tag, so we return a different value
|
||||||
# telling the script to keep searching starting on the end of the filter_string found
|
# telling the script to keep searching starting on the end of the filter_string found
|
||||||
if aux == -1:
|
if aux == -1:
|
||||||
return (-2, i1+len(filter_string))
|
return (-2, i1 + len(filter_string))
|
||||||
|
|
||||||
i1 = aux
|
i1 = aux
|
||||||
sdiv = i1
|
sdiv = i1
|
||||||
ediv = i1 + len(start_tag)
|
ediv = i1 + len(start_tag)
|
||||||
while(sdiv < ediv and sdiv != -1):
|
while(sdiv < ediv and sdiv != -1):
|
||||||
sdiv = doc.find(start_tag, sdiv+len(start_tag))
|
sdiv = doc.find(start_tag, sdiv + len(start_tag))
|
||||||
ediv = doc.find(end_tag , ediv+len(end_tag))
|
ediv = doc.find(end_tag , ediv + len(end_tag))
|
||||||
|
|
||||||
return (i1, ediv)
|
return (i1, ediv)
|
||||||
|
|
||||||
@ -163,9 +164,9 @@ def clean_tag(doc, filter_string, end_tag, start_tag):
|
|||||||
if start1 == -2:
|
if start1 == -2:
|
||||||
start_point = start2
|
start_point = start2
|
||||||
continue
|
continue
|
||||||
end1 = doc.find('>', start1)+1;
|
end1 = doc.find('>', start1) + 1;
|
||||||
end2 = start2 + len(end_tag);
|
end2 = start2 + len(end_tag);
|
||||||
doc = doc[:start1]+doc[end1:start2]+doc[end2:]
|
doc = doc[:start1] + doc[end1:start2] + doc[end2:]
|
||||||
|
|
||||||
def remove_tag(doc, start_string, end_tag, start_tag):
|
def remove_tag(doc, start_string, end_tag, start_tag):
|
||||||
#remove tagged text function
|
#remove tagged text function
|
||||||
@ -173,7 +174,7 @@ def remove_tag(doc, start_string, end_tag, start_tag):
|
|||||||
(i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag)
|
(i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag)
|
||||||
if i1 == -1 or i2 == -1:
|
if i1 == -1 or i2 == -1:
|
||||||
return doc
|
return doc
|
||||||
doc = doc[:i1]+doc[i2+len(end_tag):]
|
doc = doc[:i1] + doc[i2 + len(end_tag):]
|
||||||
|
|
||||||
def monobook_fix_html(doc, page_url):
|
def monobook_fix_html(doc, page_url):
|
||||||
"""
|
"""
|
||||||
@ -185,7 +186,7 @@ def monobook_fix_html(doc, page_url):
|
|||||||
doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
|
doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
|
||||||
|
|
||||||
doc = remove_tag(doc, '<div class="portlet" id="p-personal">', '</div>', '<div')
|
doc = remove_tag(doc, '<div class="portlet" id="p-personal">', '</div>', '<div')
|
||||||
doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>','<div')
|
doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>', '<div')
|
||||||
doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
|
doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
|
||||||
#James also remove the page/discussion/source/history/ div.
|
#James also remove the page/discussion/source/history/ div.
|
||||||
doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
|
doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
|
||||||
@ -193,20 +194,20 @@ def monobook_fix_html(doc, page_url):
|
|||||||
#andre special mode
|
#andre special mode
|
||||||
if config.special_mode:
|
if config.special_mode:
|
||||||
# Remove ul list
|
# Remove ul list
|
||||||
doc = remove_tag(doc,'<ul id="f-list">','</ul>', '<ul')
|
doc = remove_tag(doc, '<ul id="f-list">', '</ul>', '<ul')
|
||||||
|
|
||||||
# Remove link rel alternate and edit
|
# Remove link rel alternate and edit
|
||||||
doc = re.sub(r'<link rel="alternate"[\s\S]+?/>',r'',doc)
|
doc = re.sub(r'<link rel="alternate"[\s\S]+?/>', r'', doc)
|
||||||
doc = re.sub(r'<link rel="edit"[\s\S]+?/>',r'',doc)
|
doc = re.sub(r'<link rel="edit"[\s\S]+?/>', r'', doc)
|
||||||
|
|
||||||
# Remove print footer
|
# Remove print footer
|
||||||
doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>',r'',doc)
|
doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>', r'', doc)
|
||||||
|
|
||||||
# Remove noexport
|
# Remove noexport
|
||||||
doc = remove_tag(doc,'<div class="noexport"','</div>', '<div')
|
doc = remove_tag(doc, '<div class="noexport"', '</div>', '<div')
|
||||||
|
|
||||||
# Remove editornote
|
# Remove editornote
|
||||||
doc = remove_tag(doc,'<div class="editornote"','</div>', '<div')
|
doc = remove_tag(doc, '<div class="editornote"', '</div>', '<div')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Remove powered by MediaWiki logo
|
# Remove powered by MediaWiki logo
|
||||||
@ -261,24 +262,24 @@ def pos_html_transform(doc, url):
|
|||||||
sidebar_html = f.read()
|
sidebar_html = f.read()
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
doc = re.sub( r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
|
doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
|
||||||
|
|
||||||
# Remove empty links
|
# Remove empty links
|
||||||
doc = clean_tag(doc, 'href=""', '</a>', '<a ');
|
doc = clean_tag(doc, 'href=""', '</a>', '<a ');
|
||||||
|
|
||||||
if config.special_mode:
|
if config.special_mode:
|
||||||
# Remove external link rel stylesheet
|
# Remove external link rel stylesheet
|
||||||
doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>',r'',doc)
|
doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>', r'', doc)
|
||||||
|
|
||||||
# Remove external javascript
|
# Remove external javascript
|
||||||
doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>',r'',doc)
|
doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>', r'', doc)
|
||||||
|
|
||||||
# Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls
|
# Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls
|
||||||
if config.footer is not None:
|
if config.footer is not None:
|
||||||
s1 = '<div id="footer">'
|
s1 = '<div id="footer">'
|
||||||
|
|
||||||
# match correct divs
|
# match correct divs
|
||||||
(i1,i2) = find_tag_limits(doc, s1, '</div>', '<div')
|
(i1, i2) = find_tag_limits(doc, s1, '</div>', '<div')
|
||||||
|
|
||||||
if (i1 == -1):
|
if (i1 == -1):
|
||||||
return doc
|
return doc
|
||||||
@ -298,7 +299,7 @@ def pos_html_transform(doc, url):
|
|||||||
# keep MediaWiki credits
|
# keep MediaWiki credits
|
||||||
doc = doc[:i2] + footer_html + doc[i2:]
|
doc = doc[:i2] + footer_html + doc[i2:]
|
||||||
else:
|
else:
|
||||||
doc = doc[:i1+len(s1)] + footer_html + doc[i2:]
|
doc = doc[:i1 + len(s1)] + footer_html + doc[i2:]
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@ -308,9 +309,9 @@ def fix_move_href_tags(doc):
|
|||||||
"""
|
"""
|
||||||
while '<' + MOVE_HREF in doc:
|
while '<' + MOVE_HREF in doc:
|
||||||
i1 = doc.index('<' + MOVE_HREF)
|
i1 = doc.index('<' + MOVE_HREF)
|
||||||
i2 = doc.index('</' + MOVE_HREF, i1+1)
|
i2 = doc.index('</' + MOVE_HREF, i1 + 1)
|
||||||
i3 = doc.index('>', i2+1)
|
i3 = doc.index('>', i2 + 1)
|
||||||
(start, end) = (i1, i3+1)
|
(start, end) = (i1, i3 + 1)
|
||||||
tags = htmldata.tagextract(doc[start:end])
|
tags = htmldata.tagextract(doc[start:end])
|
||||||
assert tags[0][0] == MOVE_HREF
|
assert tags[0][0] == MOVE_HREF
|
||||||
assert tags[-1][0] == '/' + MOVE_HREF
|
assert tags[-1][0] == '/' + MOVE_HREF
|
||||||
@ -420,7 +421,7 @@ def post_css_transform(doc, url):
|
|||||||
def move_to_index_if_needed(ans):
|
def move_to_index_if_needed(ans):
|
||||||
global config
|
global config
|
||||||
if ans.endswith(config.index):
|
if ans.endswith(config.index):
|
||||||
ans = ans[:len(ans)-len(config.index)] + INDEX_HTML
|
ans = ans[:len(ans) - len(config.index)] + INDEX_HTML
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def file_exists_in_written_set(filename):
|
def file_exists_in_written_set(filename):
|
||||||
@ -456,11 +457,11 @@ def clean_filename(url, ans):
|
|||||||
|
|
||||||
# Replace % escape codes with underscores, dashes with underscores.
|
# Replace % escape codes with underscores, dashes with underscores.
|
||||||
while '%%' in ans:
|
while '%%' in ans:
|
||||||
ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%')+2:]
|
ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%') + 2:]
|
||||||
while '%25' in ans:
|
while '%25' in ans:
|
||||||
ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25')+5:]
|
ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25') + 5:]
|
||||||
while '%' in ans:
|
while '%' in ans:
|
||||||
ans = ans[:ans.index('%')] + '_' + ans[ans.index('%')+3:]
|
ans = ans[:ans.index('%')] + '_' + ans[ans.index('%') + 3:]
|
||||||
ans = ans.replace('-', '_')
|
ans = ans.replace('-', '_')
|
||||||
while '__' in ans:
|
while '__' in ans:
|
||||||
ans = ans.replace('__', '_')
|
ans = ans.replace('__', '_')
|
||||||
@ -512,15 +513,15 @@ def url_open(url):
|
|||||||
L = urlparse.urlparse(url)
|
L = urlparse.urlparse(url)
|
||||||
if L[1] != domain:
|
if L[1] != domain:
|
||||||
conn.close()
|
conn.close()
|
||||||
print "connection to",domain,"closed."
|
print "connection to", domain, "closed."
|
||||||
conn = httplib.HTTPConnection(L[1])
|
conn = httplib.HTTPConnection(L[1])
|
||||||
domain = L[1]
|
domain = L[1]
|
||||||
print "connection to",domain,"opened."
|
print "connection to", domain, "opened."
|
||||||
|
|
||||||
rel_url = url
|
rel_url = url
|
||||||
pos = url.find(domain)
|
pos = url.find(domain)
|
||||||
if pos != -1:
|
if pos != -1:
|
||||||
rel_url = url[pos+len(domain):]
|
rel_url = url[pos + len(domain):]
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
#number of attempts
|
#number of attempts
|
||||||
@ -534,11 +535,11 @@ def url_open(url):
|
|||||||
try:
|
try:
|
||||||
conn.request("GET", rel_url)
|
conn.request("GET", rel_url)
|
||||||
r = conn.getresponse()
|
r = conn.getresponse()
|
||||||
print 'Status',r.status,r.reason,'accessing',rel_url
|
print 'Status', r.status, r.reason, 'accessing', rel_url
|
||||||
if r.status == 404:
|
if r.status == 404:
|
||||||
print " it's not possible to recover this error."
|
print " it's not possible to recover this error."
|
||||||
errors += 1
|
errors += 1
|
||||||
return ('','')
|
return ('', '')
|
||||||
if r.status == 500:
|
if r.status == 500:
|
||||||
print " eventually this error might be recovered. let's try again."
|
print " eventually this error might be recovered. let's try again."
|
||||||
print ' reconnecting...'
|
print ' reconnecting...'
|
||||||
@ -554,11 +555,11 @@ def url_open(url):
|
|||||||
if attempts != 0:
|
if attempts != 0:
|
||||||
recovered = True
|
recovered = True
|
||||||
if r.status != 200:
|
if r.status != 200:
|
||||||
print " Status other than 200, 404, 500, 403. It is: ",r.status
|
print " Status other than 200, 404, 500, 403. It is: ", r.status
|
||||||
success = True
|
success = True
|
||||||
|
|
||||||
except httplib.HTTPException, e:
|
except httplib.HTTPException, e:
|
||||||
print 'ERROR',e.__class__.__name__,'while retrieving', url
|
print 'ERROR', e.__class__.__name__, 'while retrieving', url
|
||||||
conn.close
|
conn.close
|
||||||
if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
|
if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
|
||||||
print "eventually this error might be recovered. let's try again."
|
print "eventually this error might be recovered. let's try again."
|
||||||
@ -568,7 +569,7 @@ def url_open(url):
|
|||||||
else:
|
else:
|
||||||
print "it's not possible to recover this error."
|
print "it's not possible to recover this error."
|
||||||
errors += 1
|
errors += 1
|
||||||
return ('','')
|
return ('', '')
|
||||||
|
|
||||||
if recovered:
|
if recovered:
|
||||||
print "error recovered"
|
print "error recovered"
|
||||||
@ -616,7 +617,8 @@ def url_to_filename(url):
|
|||||||
lpath = L[2].split('/')
|
lpath = L[2].split('/')
|
||||||
if not '.' in lpath[-1]:
|
if not '.' in lpath[-1]:
|
||||||
# url ends with a directory name. Store it under index.html.
|
# url ends with a directory name. Store it under index.html.
|
||||||
L[2] += '/' + INDEX_HTML
|
# L[2] += '/' + INDEX_HTML
|
||||||
|
L[2]=L[2]
|
||||||
else:
|
else:
|
||||||
# 'title=' parsing
|
# 'title=' parsing
|
||||||
if L[4].startswith('title=') and L[2].endswith('index.php'):
|
if L[4].startswith('title=') and L[2].endswith('index.php'):
|
||||||
@ -692,7 +694,7 @@ def url_to_filename(url):
|
|||||||
# Not really needed since we checked that the directory
|
# Not really needed since we checked that the directory
|
||||||
# outdir didn't exist at the top of run(), but let's double check.
|
# outdir didn't exist at the top of run(), but let's double check.
|
||||||
if os.path.exists(ans) and not config.overwrite:
|
if os.path.exists(ans) and not config.overwrite:
|
||||||
out.write('File already exists: ' + str(ans))
|
out.write('File already exists: ' + str(ans)) #@UndefinedVariable
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
f = open(ans, mode)
|
f = open(ans, mode)
|
||||||
@ -780,7 +782,7 @@ def should_follow(url):
|
|||||||
print url, 'with multiple query fields'
|
print url, 'with multiple query fields'
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-' )):
|
if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
|
||||||
if config.debug:
|
if config.debug:
|
||||||
print url, 'is a forbidden wiki page'
|
print url, 'is a forbidden wiki page'
|
||||||
return False
|
return False
|
||||||
@ -806,7 +808,7 @@ def should_follow(url):
|
|||||||
print url, 'is a file outside of scope with unknown extension'
|
print url, 'is a file outside of scope with unknown extension'
|
||||||
return False
|
return False
|
||||||
|
|
||||||
forbidden_parents = ['.php','.html','.htm']
|
forbidden_parents = ['.php', '.html', '.htm']
|
||||||
for fp in forbidden_parents:
|
for fp in forbidden_parents:
|
||||||
if fp in L[-1]:
|
if fp in L[-1]:
|
||||||
if config.debug:
|
if config.debug:
|
||||||
@ -876,7 +878,7 @@ def run(out=sys.stdout):
|
|||||||
n = 0
|
n = 0
|
||||||
|
|
||||||
if not config.overwrite and os.path.exists(config.outdir):
|
if not config.overwrite and os.path.exists(config.outdir):
|
||||||
out.write('Error: Directory exists: ' + str(config.outdir) )
|
out.write('Error: Directory exists: ' + str(config.outdir))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
domain = get_domain(config.rooturl)
|
domain = get_domain(config.rooturl)
|
||||||
@ -941,7 +943,7 @@ def run(out=sys.stdout):
|
|||||||
|
|
||||||
# Save document changes to disk
|
# Save document changes to disk
|
||||||
update = False
|
update = False
|
||||||
text_ext = ( 'txt', 'html', 'rtf', 'css', 'sgml', 'xml' )
|
text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
|
||||||
for ext in text_ext:
|
for ext in text_ext:
|
||||||
if filename.endswith(ext):
|
if filename.endswith(ext):
|
||||||
update = True
|
update = True
|
||||||
@ -957,7 +959,7 @@ def run(out=sys.stdout):
|
|||||||
n += 1
|
n += 1
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
print "connection to",domain,"closed."
|
print "connection to", domain, "closed."
|
||||||
out.write(str(n) + ' files saved\n')
|
out.write(str(n) + ' files saved\n')
|
||||||
print counter, "httplib requests done"
|
print counter, "httplib requests done"
|
||||||
print errors, "errors not recovered"
|
print errors, "errors not recovered"
|
||||||
@ -1029,7 +1031,7 @@ def main():
|
|||||||
'no-hack-skin', 'no-made-by', 'left=',
|
'no-hack-skin', 'no-made-by', 'left=',
|
||||||
'top=', 'bottom=', 'index=', 'no-move-href',
|
'top=', 'bottom=', 'index=', 'no-move-href',
|
||||||
'no-remove-png', 'no-remove-history', 'limit-parent',
|
'no-remove-png', 'no-remove-history', 'limit-parent',
|
||||||
'special-mode','debug','no-images'])
|
'special-mode', 'debug', 'no-images'])
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
usage()
|
usage()
|
||||||
|
|
||||||
@ -1066,7 +1068,7 @@ def main():
|
|||||||
config.special_mode = True
|
config.special_mode = True
|
||||||
config.sidebar = 'sidebar.html'
|
config.sidebar = 'sidebar.html'
|
||||||
config.footer = 'footer.html'
|
config.footer = 'footer.html'
|
||||||
if opt in ['-d','--debug']:
|
if opt in ['-d', '--debug']:
|
||||||
config.debug = True
|
config.debug = True
|
||||||
if opt in ['-l', '--left']:
|
if opt in ['-l', '--left']:
|
||||||
config.sidebar = os.path.abspath(arg)
|
config.sidebar = os.path.abspath(arg)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user