Mirror of https://github.com/cookiengineer/audacity, synced 2025-04-30 15:49:41 +02:00

commit eb904062ad (parent 9438fdf3e5)

    Avoid duplicate html files with names like 'index.htm'
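The commit touches two files: the bundled htmldata module (tag and URL extraction; whitespace-only cleanup) and the MediaWiki crawler script. The one behavioral change is in url_to_filename, near the end of the diff: a URL whose path ends in a directory name is no longer rewritten to store under 'index.html', which could previously save the same page twice under names like 'Foo.html' and 'Foo/index.htm'. A minimal sketch of the old and new path mapping (hypothetical URLs and helper names; the real function also handles 'title=' query parsing, quoting, and extensions):

    import urlparse  # Python 2, as used by the script

    INDEX_HTML = 'index.html'

    def dir_path_before(url):
        # Before this commit: a path whose last segment has no '.'
        # gained a trailing '/index.html'.
        L = list(urlparse.urlparse(url))
        lpath = L[2].split('/')
        if '.' not in lpath[-1]:
            L[2] += '/' + INDEX_HTML
        return L[2]

    def dir_path_after(url):
        # After this commit: the path is left exactly as it arrived.
        return list(urlparse.urlparse(url))[2]

    # Hypothetical example:
    #   dir_path_before('http://wiki.example.org/man/Main_Page')
    #     -> '/man/Main_Page/index.html'
    #   dir_path_after('http://wiki.example.org/man/Main_Page')
    #     -> '/man/Main_Page'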
@@ -189,7 +189,7 @@ def _enumerate(L):
 
     Returns a list instead of an iterator.
     """
-    return zip(range(len(L)),L)
+    return zip(range(len(L)), L)
 
 def _ignore_tag_index(s, i):
     """
@@ -199,9 +199,9 @@ def _ignore_tag_index(s, i):
     the index. Otherwise, return C{-1}.
     """
     for (j, (a, b)) in _enumerate(_IGNORE_TAGS):
-        if s[i:i+len(a)+1].lower() == '<' + a:
+        if s[i:i + len(a) + 1].lower() == '<' + a:
             return j
-    return -1
+    return - 1
 
 def _html_split(s):
     """
@@ -233,7 +233,7 @@ def _html_split(s):
         c = s[i]
         if c == '<':
             # Left bracket, handle various cases.
-            if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
+            if s[i:i + len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
                 # HTML begin comment tag, '<!--'. Scan for '-->'.
                 i2 = s.find(_END_COMMENT, i)
                 if i2 < 0:
@@ -242,9 +242,9 @@ def _html_split(s):
                     break
                 else:
                     # Append the comment.
-                    L.append(s[i:i2+len(_END_COMMENT)])
+                    L.append(s[i:i2 + len(_END_COMMENT)])
                     i = i2 + len(_END_COMMENT)
-            elif s[i:i+len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
+            elif s[i:i + len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
                 # XHTML begin CDATA tag. Scan for ']]>'.
                 i2 = s.find(_END_CDATA, i)
                 if i2 < 0:
@@ -253,7 +253,7 @@ def _html_split(s):
                     break
                 else:
                     # Append the CDATA.
-                    L.append(s[i:i2+len(_END_CDATA)])
+                    L.append(s[i:i2 + len(_END_CDATA)])
                     i = i2 + len(_END_CDATA)
             else:
                 # Regular HTML tag. Scan for '>'.
@@ -261,19 +261,19 @@ def _html_split(s):
                 found = False
                 in_quot1 = False
                 in_quot2 = False
-                for i2 in xrange(i+1, len(s)):
+                for i2 in xrange(i + 1, len(s)):
                     c2 = s[i2]
                     if c2 == '"' and not in_quot1:
                         in_quot2 = not in_quot2
                         # Only turn on double quote if it's in a realistic place.
                         if in_quot2 and not in_quot1:
-                            if i2 > 0 and s[i2-1] not in [' ', '\t', '=']:
+                            if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
                                 in_quot2 = False
                     elif c2 == "'" and not in_quot2:
                         in_quot1 = not in_quot1
                         # Only turn on single quote if it's in a realistic place.
                         if in_quot1 and not in_quot2:
-                            if i2 > 0 and s[i2-1] not in [' ', '\t', '=']:
+                            if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
                                 in_quot1 = False
                     elif c2 == '>' and (not in_quot2 and not in_quot1):
                         found = True
@@ -285,7 +285,7 @@ def _html_split(s):
                     break
                 else:
                     # Append the tag.
-                    L.append(s[i:i2+1])
+                    L.append(s[i:i2 + 1])
                     i = i2 + 1
 
             # Check whether we found a special ignore tag, eg '<script>'
@@ -460,18 +460,18 @@ def _tag_dict(s):
 
         # Strip spaces.
         while k1 < k2 and s[k1] in string.whitespace: k1 += 1
-        while k1 < k2 and s[k2-1] in string.whitespace: k2 -= 1
+        while k1 < k2 and s[k2 - 1] in string.whitespace: k2 -= 1
 
         while v1 < v2 and s[v1] in string.whitespace: v1 += 1
-        while v1 < v2 and s[v2-1] in string.whitespace: v2 -= 1
+        while v1 < v2 and s[v2 - 1] in string.whitespace: v2 -= 1
 
         # Strip one pair of double quotes around value.
-        if v1 < v2 - 1 and s[v1] == '"' and s[v2-1] == '"':
+        if v1 < v2 - 1 and s[v1] == '"' and s[v2 - 1] == '"':
             v1 += 1
             v2 -= 1
 
         # Strip one pair of single quotes around value.
-        if v1 < v2 - 1 and s[v1] == "'" and s[v2-1] == "'":
+        if v1 < v2 - 1 and s[v1] == "'" and s[v2 - 1] == "'":
             v1 += 1
             v2 -= 1
 
@@ -512,12 +512,12 @@ def _test_tag_dict():
     assert _tag_dict(' \t\r \n\n \r\n ') == ({}, {}, {})
     assert _tag_dict('bgcolor=#ffffff text="#000000" blink') == \
         ({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
-         {'bgcolor':(0,7), 'text':(16,20), 'blink':(31,36)},
-         {'bgcolor':(8,15), 'text':(22,29), 'blink':(36,36)})
+         {'bgcolor':(0, 7), 'text':(16, 20), 'blink':(31, 36)},
+         {'bgcolor':(8, 15), 'text':(22, 29), 'blink':(36, 36)})
     assert _tag_dict("bgcolor='#ffffff'text='#000000' blink") == \
         ({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
-         {'bgcolor':(0,7), 'text':(17,21), 'blink':(32,37)},
-         {'bgcolor':(9,16), 'text':(23,30), 'blink':(37,37)})
+         {'bgcolor':(0, 7), 'text':(17, 21), 'blink':(32, 37)},
+         {'bgcolor':(9, 16), 'text':(23, 30), 'blink':(37, 37)})
     s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
     (a, b, c) = _tag_dict(s)
     assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
@@ -541,7 +541,7 @@ def _full_tag_extract(s):
     # Starting position of each L[i] in s.
     Lstart = [0] * len(L)
     for i in range(1, len(L)):
-        Lstart[i] = Lstart[i-1] + len(L[i-1])
+        Lstart[i] = Lstart[i - 1] + len(L[i - 1])
 
     class NotTagError(Exception): pass
 
@@ -590,7 +590,7 @@ def _full_tag_extract(s):
                 (name, dtext) = (text, '')
             else:
                 name = text[:first_space]
-                dtext = text[first_space+1:len(text)]
+                dtext = text[first_space + 1:len(text)]
 
             # Position of dtext relative to original text.
             dtext_offset = len(name) + 1 + orig_offset # +1 for space.
@@ -610,10 +610,10 @@ def _full_tag_extract(s):
             (attrs, key_pos, value_pos) = _tag_dict(dtext)
             # Correct offsets in key_pos and value_pos.
             for key in attrs.keys():
-                key_pos[key] = (key_pos[key][0]+Lstart[i]+dtext_offset,
-                                key_pos[key][1]+Lstart[i]+dtext_offset)
-                value_pos[key] = (value_pos[key][0]+Lstart[i]+dtext_offset,
-                                  value_pos[key][1]+Lstart[i]+dtext_offset)
+                key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
+                                key_pos[key][1] + Lstart[i] + dtext_offset)
+                value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
+                                  value_pos[key][1] + Lstart[i] + dtext_offset)
 
         pos = (Lstart[i], Lstart[i] + len(L[i]))
 
@@ -761,7 +761,7 @@ def _remove_comments(doc):
             ans += [doc[i:]]
             break
         ans += [doc[i:i2]]
-        i3 = doc.find('*/', i2+1)
+        i3 = doc.find('*/', i2 + 1)
         if i3 < 0:
             i3 = len(doc) - 2
         ans += [' ' * (i3 - i2 + 2)]
@@ -775,7 +775,7 @@ def _test_remove_comments():
     """
     s = '/*d s kjlsdf */*//*/*//**/**/*//**/a' * 50
     assert len(_remove_comments(s)) == len(s)
-    s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd'+'/*//**//'
+    s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd' + '/*//**//'
     assert len(_remove_comments(s)) == len(s)
     s = 'a/**/' * 50 + '/**//**/////***/****/*//**//*/' * 5
     assert len(_remove_comments(s)) == len(s)
@@ -916,8 +916,8 @@ def _tuple_replace(s, Lindices, Lreplace):
     Lindices.sort()
     if len(Lindices) != len(Lreplace):
         raise ValueError('lists differ in length')
-    for i in range(len(Lindices)-1):
-        if Lindices[i][1] > Lindices[i+1][0]:
+    for i in range(len(Lindices) - 1):
+        if Lindices[i][1] > Lindices[i + 1][0]:
             raise ValueError('tuples overlap')
         if Lindices[i][1] < Lindices[i][0]:
             raise ValueError('invalid tuple')
@@ -932,7 +932,7 @@ def _tuple_replace(s, Lindices, Lreplace):
         len1 = Lindices[i][1] - Lindices[i][0]
         len2 = len(Lreplace[i])
 
-        ans.append(s[j:Lindices[i][0]+offset])
+        ans.append(s[j:Lindices[i][0] + offset])
         ans.append(Lreplace[i])
 
         j = Lindices[i][1]
@@ -943,12 +943,12 @@ def _test_tuple_replace():
     """
     Unit test for L{_tuple_replace}.
     """
-    assert _tuple_replace('',[],[]) == ''
-    assert _tuple_replace('0123456789',[],[]) == '0123456789'
-    assert _tuple_replace('0123456789',[(4,5),(6,9)],['abc', 'def'])== \
+    assert _tuple_replace('', [], []) == ''
+    assert _tuple_replace('0123456789', [], []) == '0123456789'
+    assert _tuple_replace('0123456789', [(4, 5), (6, 9)], ['abc', 'def']) == \
         '0123abc5def9'
     assert _tuple_replace('01234567890123456789', \
-        [(1,9),(13,14),(16,18)],['abcd','efg','hijk']) == \
+        [(1, 9), (13, 14), (16, 18)], ['abcd', 'efg', 'hijk']) == \
         '0abcd9012efg45hijk89'
 
 def urljoin(s, L):
@@ -1196,7 +1196,7 @@ def _test_tagextract(str_class=str):
              '<img test="5%ff" /></body></html>\nBye!\n')
     doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
              '<test tag="5" content=6><is broken=False><yay>' +
-             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+
+             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
              '<script language="JavaScript"><>!><!_!_!-->!_-></script>')
     doc3 = f('\r\t< html >< tag> <!--comment--> <tag a = 5> ' +
              '<foo \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n>')
@@ -1239,7 +1239,7 @@ def _test_tagextract(str_class=str):
         ' what', '<style>', 'hi<><>>', '</style>',
         '<script language="Java">', '<aL><>><>>', '</script>', 'a'])
 
-    s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag '+
+    s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag ' +
           '<tag <! <! -> <!-- </who< <who> tag> <huh-->-</style>' +
           '</style<style>')
     assert s == f('').join(_html_split(s))
@@ -1282,7 +1282,7 @@ def _test_tagextract(str_class=str):
     doc2old = doc2
     doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
              '<test tag="5" content=6><is broken=False><yay>' +
-             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+
+             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
              '<script language="JavaScript"><>!><!_!_!-->!_-></script>')
     assert doc2old == doc2
 
@@ -1334,7 +1334,7 @@ def _test_tagextract(str_class=str):
     L = tagextract(doc4)
     assert len(L) == n
     for i in range(n):
-        assert L[i] == f([('tag/',{'name':'5','value':'6afdjherknc4 cdk j',
+        assert L[i] == f([('tag/', {'name':'5', 'value':'6afdjherknc4 cdk j',
                            'a':'7', 'b':'8'})])[0]
 
 # -----------------------------------------------------------------
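The hunks above are all whitespace-only changes to the bundled htmldata module, whose behavior is pinned by its self-tests, including the lossless round-trip asserted in _test_tagextract: _html_split partitions a document into tag and non-tag chunks without dropping a character. A standalone restatement of that invariant (a sketch, assuming htmldata.py from this directory is importable):

    from htmldata import _html_split

    # Joining the chunks must reproduce the input exactly, even
    # around comments and quoted attribute values.
    s = '<html><!-- note --><b class="x">hi</b></html>'
    assert s == ''.join(_html_split(s))

The remaining hunks are from the crawler script itself (an mw2html derivative; note the hunk numbering restarts at line 28).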
@@ -28,6 +28,7 @@ import urllib
 import textwrap
 import urlparse
 import os, os.path
 
+import errno
 import hashlib
 import httplib
@@ -129,27 +130,27 @@ def normalize_url(url, lower=True):
 
     return url
 
-def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point = 0):
+def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0):
     # find tag limits - start_string must be an unique identifier within doc
 
     i1 = doc.find(filter_string, start_point)
 
     if i1 == -1:
-        return (-1,-1)
+        return (-1, -1)
 
-    aux = doc.rfind(start_tag, start_point, i1+len(filter_string))
+    aux = doc.rfind(start_tag, start_point, i1 + len(filter_string))
 
     # we've found the filter_string but it has not the start_tag, so we return a different value
     # telling the script to keep searching starting on the end of the filter_string found
     if aux == -1:
-        return (-2, i1+len(filter_string))
+        return (-2, i1 + len(filter_string))
 
     i1 = aux
     sdiv = i1
     ediv = i1 + len(start_tag)
     while(sdiv < ediv and sdiv != -1):
-        sdiv = doc.find(start_tag, sdiv+len(start_tag))
-        ediv = doc.find(end_tag , ediv+len(end_tag))
+        sdiv = doc.find(start_tag, sdiv + len(start_tag))
+        ediv = doc.find(end_tag , ediv + len(end_tag))
 
     return (i1, ediv)
 
@@ -163,9 +164,9 @@ def clean_tag(doc, filter_string, end_tag, start_tag):
         if start1 == -2:
             start_point = start2
             continue
-        end1 = doc.find('>', start1)+1;
+        end1 = doc.find('>', start1) + 1;
         end2 = start2 + len(end_tag);
-        doc = doc[:start1]+doc[end1:start2]+doc[end2:]
+        doc = doc[:start1] + doc[end1:start2] + doc[end2:]
 
 def remove_tag(doc, start_string, end_tag, start_tag):
     #remove tagged text function
@@ -173,7 +174,7 @@ def remove_tag(doc, start_string, end_tag, start_tag):
         (i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag)
         if i1 == -1 or i2 == -1:
             return doc
-        doc = doc[:i1]+doc[i2+len(end_tag):]
+        doc = doc[:i1] + doc[i2 + len(end_tag):]
 
 def monobook_fix_html(doc, page_url):
     """
@@ -185,7 +186,7 @@ def monobook_fix_html(doc, page_url):
     doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
 
     doc = remove_tag(doc, '<div class="portlet" id="p-personal">', '</div>', '<div')
-    doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>','<div')
+    doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>', '<div')
     doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
     #James also remove the page/discussion/source/history/ div.
     doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
@@ -193,20 +194,20 @@ def monobook_fix_html(doc, page_url):
     #andre special mode
     if config.special_mode:
         # Remove ul list
-        doc = remove_tag(doc,'<ul id="f-list">','</ul>', '<ul')
+        doc = remove_tag(doc, '<ul id="f-list">', '</ul>', '<ul')
 
         # Remove link rel alternate and edit
-        doc = re.sub(r'<link rel="alternate"[\s\S]+?/>',r'',doc)
-        doc = re.sub(r'<link rel="edit"[\s\S]+?/>',r'',doc)
+        doc = re.sub(r'<link rel="alternate"[\s\S]+?/>', r'', doc)
+        doc = re.sub(r'<link rel="edit"[\s\S]+?/>', r'', doc)
 
         # Remove print footer
-        doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>',r'',doc)
+        doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>', r'', doc)
 
         # Remove noexport
-        doc = remove_tag(doc,'<div class="noexport"','</div>', '<div')
+        doc = remove_tag(doc, '<div class="noexport"', '</div>', '<div')
 
         # Remove editornote
-        doc = remove_tag(doc,'<div class="editornote"','</div>', '<div')
+        doc = remove_tag(doc, '<div class="editornote"', '</div>', '<div')
 
     else:
         # Remove powered by MediaWiki logo
@@ -261,24 +262,24 @@ def pos_html_transform(doc, url):
         sidebar_html = f.read()
         f.close()
 
-        doc = re.sub( r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
+        doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
 
     # Remove empty links
     doc = clean_tag(doc, 'href=""', '</a>', '<a ');
 
     if config.special_mode:
         # Remove external link rel stylesheet
-        doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>',r'',doc)
+        doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>', r'', doc)
 
         # Remove external javascript
-        doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>',r'',doc)
+        doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>', r'', doc)
 
     # Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls
     if config.footer is not None:
         s1 = '<div id="footer">'
 
         # match correct divs
-        (i1,i2) = find_tag_limits(doc, s1, '</div>', '<div')
+        (i1, i2) = find_tag_limits(doc, s1, '</div>', '<div')
 
         if (i1 == -1):
             return doc
@@ -298,7 +299,7 @@ def pos_html_transform(doc, url):
             # keep MediaWiki credits
             doc = doc[:i2] + footer_html + doc[i2:]
         else:
-            doc = doc[:i1+len(s1)] + footer_html + doc[i2:]
+            doc = doc[:i1 + len(s1)] + footer_html + doc[i2:]
 
     return doc
 
@@ -308,9 +309,9 @@ def fix_move_href_tags(doc):
     """
     while '<' + MOVE_HREF in doc:
         i1 = doc.index('<' + MOVE_HREF)
-        i2 = doc.index('</' + MOVE_HREF, i1+1)
-        i3 = doc.index('>', i2+1)
-        (start, end) = (i1, i3+1)
+        i2 = doc.index('</' + MOVE_HREF, i1 + 1)
+        i3 = doc.index('>', i2 + 1)
+        (start, end) = (i1, i3 + 1)
         tags = htmldata.tagextract(doc[start:end])
         assert tags[0][0] == MOVE_HREF
         assert tags[-1][0] == '/' + MOVE_HREF
@@ -420,7 +421,7 @@ def post_css_transform(doc, url):
 def move_to_index_if_needed(ans):
     global config
     if ans.endswith(config.index):
-        ans = ans[:len(ans)-len(config.index)] + INDEX_HTML
+        ans = ans[:len(ans) - len(config.index)] + INDEX_HTML
     return ans
 
 def file_exists_in_written_set(filename):
@@ -456,11 +457,11 @@ def clean_filename(url, ans):
 
     # Replace % escape codes with underscores, dashes with underscores.
     while '%%' in ans:
-        ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%')+2:]
+        ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%') + 2:]
     while '%25' in ans:
-        ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25')+5:]
+        ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25') + 5:]
     while '%' in ans:
-        ans = ans[:ans.index('%')] + '_' + ans[ans.index('%')+3:]
+        ans = ans[:ans.index('%')] + '_' + ans[ans.index('%') + 3:]
     ans = ans.replace('-', '_')
     while '__' in ans:
         ans = ans.replace('__', '_')
@@ -512,15 +513,15 @@ def url_open(url):
     L = urlparse.urlparse(url)
     if L[1] != domain:
         conn.close()
-        print "connection to",domain,"closed."
+        print "connection to", domain, "closed."
         conn = httplib.HTTPConnection(L[1])
         domain = L[1]
-        print "connection to",domain,"opened."
+        print "connection to", domain, "opened."
 
     rel_url = url
     pos = url.find(domain)
     if pos != -1:
-        rel_url = url[pos+len(domain):]
+        rel_url = url[pos + len(domain):]
 
     attempts = 0
     #number of attempts
@@ -534,11 +535,11 @@ def url_open(url):
         try:
             conn.request("GET", rel_url)
             r = conn.getresponse()
-            print 'Status',r.status,r.reason,'accessing',rel_url
+            print 'Status', r.status, r.reason, 'accessing', rel_url
            if r.status == 404:
                 print " it's not possible to recover this error."
                 errors += 1
-                return ('','')
+                return ('', '')
             if r.status == 500:
                 print " eventually this error might be recovered. let's try again."
                 print ' reconnecting...'
@@ -554,11 +555,11 @@ def url_open(url):
             if attempts != 0:
                 recovered = True
             if r.status != 200:
-                print " Status other than 200, 404, 500, 403. It is: ",r.status
+                print " Status other than 200, 404, 500, 403. It is: ", r.status
             success = True
 
         except httplib.HTTPException, e:
-            print 'ERROR',e.__class__.__name__,'while retrieving', url
+            print 'ERROR', e.__class__.__name__, 'while retrieving', url
             conn.close
             if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
                 print "eventually this error might be recovered. let's try again."
@@ -568,7 +569,7 @@ def url_open(url):
             else:
                 print "it's not possible to recover this error."
                 errors += 1
-                return ('','')
+                return ('', '')
 
     if recovered:
         print "error recovered"
@@ -616,7 +617,8 @@ def url_to_filename(url):
     lpath = L[2].split('/')
     if not '.' in lpath[-1]:
         # url ends with a directory name. Store it under index.html.
-        L[2] += '/' + INDEX_HTML
+        # L[2] += '/' + INDEX_HTML
+        L[2]=L[2]
     else:
         # 'title=' parsing
         if L[4].startswith('title=') and L[2].endswith('index.php'):
@@ -692,7 +694,7 @@ def url_to_filename(url):
     # Not really needed since we checked that the directory
     # outdir didn't exist at the top of run(), but let's double check.
     if os.path.exists(ans) and not config.overwrite:
-        out.write('File already exists: ' + str(ans))
+        out.write('File already exists: ' + str(ans)) #@UndefinedVariable
         sys.exit(1)
 
     f = open(ans, mode)
@@ -780,7 +782,7 @@ def should_follow(url):
             print url, 'with multiple query fields'
         return False
 
-    if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-' )):
+    if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
         if config.debug:
             print url, 'is a forbidden wiki page'
         return False
@@ -806,7 +808,7 @@ def should_follow(url):
             print url, 'is a file outside of scope with unknown extension'
         return False
 
-    forbidden_parents = ['.php','.html','.htm']
+    forbidden_parents = ['.php', '.html', '.htm']
     for fp in forbidden_parents:
         if fp in L[-1]:
             if config.debug:
@@ -876,7 +878,7 @@ def run(out=sys.stdout):
     n = 0
 
     if not config.overwrite and os.path.exists(config.outdir):
-        out.write('Error: Directory exists: ' + str(config.outdir) )
+        out.write('Error: Directory exists: ' + str(config.outdir))
         sys.exit(1)
 
     domain = get_domain(config.rooturl)
@@ -941,7 +943,7 @@ def run(out=sys.stdout):
 
             # Save document changes to disk
             update = False
-            text_ext = ( 'txt', 'html', 'rtf', 'css', 'sgml', 'xml' )
+            text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
             for ext in text_ext:
                 if filename.endswith(ext):
                     update = True
@@ -957,7 +959,7 @@ def run(out=sys.stdout):
             n += 1
 
     conn.close()
-    print "connection to",domain,"closed."
+    print "connection to", domain, "closed."
     out.write(str(n) + ' files saved\n')
     print counter, "httplib requests done"
     print errors, "errors not recovered"
@@ -1029,7 +1031,7 @@ def main():
                        'no-hack-skin', 'no-made-by', 'left=',
                        'top=', 'bottom=', 'index=', 'no-move-href',
                        'no-remove-png', 'no-remove-history', 'limit-parent',
-                       'special-mode','debug','no-images'])
+                       'special-mode', 'debug', 'no-images'])
     except getopt.GetoptError:
         usage()
 
@@ -1066,7 +1068,7 @@ def main():
             config.special_mode = True
             config.sidebar = 'sidebar.html'
             config.footer = 'footer.html'
-        if opt in ['-d','--debug']:
+        if opt in ['-d', '--debug']:
             config.debug = True
         if opt in ['-l', '--left']:
             config.sidebar = os.path.abspath(arg)
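One helper above deserves a worked example: find_tag_limits locates the closing tag that matches a start tag by advancing past inner start/end pairs in its while loop, which is how remove_tag and clean_tag survive nested <div>s. A sketch against a hypothetical document (assuming the function is in scope):

    # The inner <div>...</div> is skipped, so i2 lands on the
    # outer </div> (index 37 in this string), not the first one.
    doc = '<div id="footer"><div>inner</div>tail</div>rest'
    (i1, i2) = find_tag_limits(doc, '<div id="footer">', '</div>', '<div')
    assert (i1, i2) == (0, 37)
    assert doc[i2:i2 + len('</div>')] == '</div>'

Note that the returned end index points at the start of the closing tag; callers such as remove_tag then slice with i2 + len(end_tag).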