1
0
mirror of https://github.com/cookiengineer/audacity synced 2025-09-18 17:10:55 +02:00

Avoid duplicate html files with names like 'index.htm'

This commit is contained in:
james.k.crook@gmail.com 2011-04-12 15:04:58 +00:00
parent 9438fdf3e5
commit eb904062ad
2 changed files with 2624 additions and 2622 deletions

View File

@ -189,7 +189,7 @@ def _enumerate(L):
Returns a list instead of an iterator. Returns a list instead of an iterator.
""" """
return zip(range(len(L)),L) return zip(range(len(L)), L)
def _ignore_tag_index(s, i): def _ignore_tag_index(s, i):
""" """
@ -199,9 +199,9 @@ def _ignore_tag_index(s, i):
the index. Otherwise, return C{-1}. the index. Otherwise, return C{-1}.
""" """
for (j, (a, b)) in _enumerate(_IGNORE_TAGS): for (j, (a, b)) in _enumerate(_IGNORE_TAGS):
if s[i:i+len(a)+1].lower() == '<' + a: if s[i:i + len(a) + 1].lower() == '<' + a:
return j return j
return -1 return - 1
def _html_split(s): def _html_split(s):
""" """
@ -233,7 +233,7 @@ def _html_split(s):
c = s[i] c = s[i]
if c == '<': if c == '<':
# Left bracket, handle various cases. # Left bracket, handle various cases.
if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT): if s[i:i + len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
# HTML begin comment tag, '<!--'. Scan for '-->'. # HTML begin comment tag, '<!--'. Scan for '-->'.
i2 = s.find(_END_COMMENT, i) i2 = s.find(_END_COMMENT, i)
if i2 < 0: if i2 < 0:
@ -242,9 +242,9 @@ def _html_split(s):
break break
else: else:
# Append the comment. # Append the comment.
L.append(s[i:i2+len(_END_COMMENT)]) L.append(s[i:i2 + len(_END_COMMENT)])
i = i2 + len(_END_COMMENT) i = i2 + len(_END_COMMENT)
elif s[i:i+len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA): elif s[i:i + len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
# XHTML begin CDATA tag. Scan for ']]>'. # XHTML begin CDATA tag. Scan for ']]>'.
i2 = s.find(_END_CDATA, i) i2 = s.find(_END_CDATA, i)
if i2 < 0: if i2 < 0:
@ -253,7 +253,7 @@ def _html_split(s):
break break
else: else:
# Append the CDATA. # Append the CDATA.
L.append(s[i:i2+len(_END_CDATA)]) L.append(s[i:i2 + len(_END_CDATA)])
i = i2 + len(_END_CDATA) i = i2 + len(_END_CDATA)
else: else:
# Regular HTML tag. Scan for '>'. # Regular HTML tag. Scan for '>'.
@ -261,19 +261,19 @@ def _html_split(s):
found = False found = False
in_quot1 = False in_quot1 = False
in_quot2 = False in_quot2 = False
for i2 in xrange(i+1, len(s)): for i2 in xrange(i + 1, len(s)):
c2 = s[i2] c2 = s[i2]
if c2 == '"' and not in_quot1: if c2 == '"' and not in_quot1:
in_quot2 = not in_quot2 in_quot2 = not in_quot2
# Only turn on double quote if it's in a realistic place. # Only turn on double quote if it's in a realistic place.
if in_quot2 and not in_quot1: if in_quot2 and not in_quot1:
if i2 > 0 and s[i2-1] not in [' ', '\t', '=']: if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
in_quot2 = False in_quot2 = False
elif c2 == "'" and not in_quot2: elif c2 == "'" and not in_quot2:
in_quot1 = not in_quot1 in_quot1 = not in_quot1
# Only turn on single quote if it's in a realistic place. # Only turn on single quote if it's in a realistic place.
if in_quot1 and not in_quot2: if in_quot1 and not in_quot2:
if i2 > 0 and s[i2-1] not in [' ', '\t', '=']: if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
in_quot1 = False in_quot1 = False
elif c2 == '>' and (not in_quot2 and not in_quot1): elif c2 == '>' and (not in_quot2 and not in_quot1):
found = True found = True
@ -285,7 +285,7 @@ def _html_split(s):
break break
else: else:
# Append the tag. # Append the tag.
L.append(s[i:i2+1]) L.append(s[i:i2 + 1])
i = i2 + 1 i = i2 + 1
# Check whether we found a special ignore tag, eg '<script>' # Check whether we found a special ignore tag, eg '<script>'
@ -460,18 +460,18 @@ def _tag_dict(s):
# Strip spaces. # Strip spaces.
while k1 < k2 and s[k1] in string.whitespace: k1 += 1 while k1 < k2 and s[k1] in string.whitespace: k1 += 1
while k1 < k2 and s[k2-1] in string.whitespace: k2 -= 1 while k1 < k2 and s[k2 - 1] in string.whitespace: k2 -= 1
while v1 < v2 and s[v1] in string.whitespace: v1 += 1 while v1 < v2 and s[v1] in string.whitespace: v1 += 1
while v1 < v2 and s[v2-1] in string.whitespace: v2 -= 1 while v1 < v2 and s[v2 - 1] in string.whitespace: v2 -= 1
# Strip one pair of double quotes around value. # Strip one pair of double quotes around value.
if v1 < v2 - 1 and s[v1] == '"' and s[v2-1] == '"': if v1 < v2 - 1 and s[v1] == '"' and s[v2 - 1] == '"':
v1 += 1 v1 += 1
v2 -= 1 v2 -= 1
# Strip one pair of single quotes around value. # Strip one pair of single quotes around value.
if v1 < v2 - 1 and s[v1] == "'" and s[v2-1] == "'": if v1 < v2 - 1 and s[v1] == "'" and s[v2 - 1] == "'":
v1 += 1 v1 += 1
v2 -= 1 v2 -= 1
@ -512,12 +512,12 @@ def _test_tag_dict():
assert _tag_dict(' \t\r \n\n \r\n ') == ({}, {}, {}) assert _tag_dict(' \t\r \n\n \r\n ') == ({}, {}, {})
assert _tag_dict('bgcolor=#ffffff text="#000000" blink') == \ assert _tag_dict('bgcolor=#ffffff text="#000000" blink') == \
({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None}, ({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
{'bgcolor':(0,7), 'text':(16,20), 'blink':(31,36)}, {'bgcolor':(0, 7), 'text':(16, 20), 'blink':(31, 36)},
{'bgcolor':(8,15), 'text':(22,29), 'blink':(36,36)}) {'bgcolor':(8, 15), 'text':(22, 29), 'blink':(36, 36)})
assert _tag_dict("bgcolor='#ffffff'text='#000000' blink") == \ assert _tag_dict("bgcolor='#ffffff'text='#000000' blink") == \
({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None}, ({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
{'bgcolor':(0,7), 'text':(17,21), 'blink':(32,37)}, {'bgcolor':(0, 7), 'text':(17, 21), 'blink':(32, 37)},
{'bgcolor':(9,16), 'text':(23,30), 'blink':(37,37)}) {'bgcolor':(9, 16), 'text':(23, 30), 'blink':(37, 37)})
s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n' s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
(a, b, c) = _tag_dict(s) (a, b, c) = _tag_dict(s)
assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None} assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
@ -541,7 +541,7 @@ def _full_tag_extract(s):
# Starting position of each L[i] in s. # Starting position of each L[i] in s.
Lstart = [0] * len(L) Lstart = [0] * len(L)
for i in range(1, len(L)): for i in range(1, len(L)):
Lstart[i] = Lstart[i-1] + len(L[i-1]) Lstart[i] = Lstart[i - 1] + len(L[i - 1])
class NotTagError(Exception): pass class NotTagError(Exception): pass
@ -590,7 +590,7 @@ def _full_tag_extract(s):
(name, dtext) = (text, '') (name, dtext) = (text, '')
else: else:
name = text[:first_space] name = text[:first_space]
dtext = text[first_space+1:len(text)] dtext = text[first_space + 1:len(text)]
# Position of dtext relative to original text. # Position of dtext relative to original text.
dtext_offset = len(name) + 1 + orig_offset # +1 for space. dtext_offset = len(name) + 1 + orig_offset # +1 for space.
@ -610,10 +610,10 @@ def _full_tag_extract(s):
(attrs, key_pos, value_pos) = _tag_dict(dtext) (attrs, key_pos, value_pos) = _tag_dict(dtext)
# Correct offsets in key_pos and value_pos. # Correct offsets in key_pos and value_pos.
for key in attrs.keys(): for key in attrs.keys():
key_pos[key] = (key_pos[key][0]+Lstart[i]+dtext_offset, key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
key_pos[key][1]+Lstart[i]+dtext_offset) key_pos[key][1] + Lstart[i] + dtext_offset)
value_pos[key] = (value_pos[key][0]+Lstart[i]+dtext_offset, value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
value_pos[key][1]+Lstart[i]+dtext_offset) value_pos[key][1] + Lstart[i] + dtext_offset)
pos = (Lstart[i], Lstart[i] + len(L[i])) pos = (Lstart[i], Lstart[i] + len(L[i]))
@ -761,7 +761,7 @@ def _remove_comments(doc):
ans += [doc[i:]] ans += [doc[i:]]
break break
ans += [doc[i:i2]] ans += [doc[i:i2]]
i3 = doc.find('*/', i2+1) i3 = doc.find('*/', i2 + 1)
if i3 < 0: if i3 < 0:
i3 = len(doc) - 2 i3 = len(doc) - 2
ans += [' ' * (i3 - i2 + 2)] ans += [' ' * (i3 - i2 + 2)]
@ -775,7 +775,7 @@ def _test_remove_comments():
""" """
s = '/*d s kjlsdf */*//*/*//**/**/*//**/a' * 50 s = '/*d s kjlsdf */*//*/*//**/**/*//**/a' * 50
assert len(_remove_comments(s)) == len(s) assert len(_remove_comments(s)) == len(s)
s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd'+'/*//**//' s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd' + '/*//**//'
assert len(_remove_comments(s)) == len(s) assert len(_remove_comments(s)) == len(s)
s = 'a/**/' * 50 + '/**//**/////***/****/*//**//*/' * 5 s = 'a/**/' * 50 + '/**//**/////***/****/*//**//*/' * 5
assert len(_remove_comments(s)) == len(s) assert len(_remove_comments(s)) == len(s)
@ -916,8 +916,8 @@ def _tuple_replace(s, Lindices, Lreplace):
Lindices.sort() Lindices.sort()
if len(Lindices) != len(Lreplace): if len(Lindices) != len(Lreplace):
raise ValueError('lists differ in length') raise ValueError('lists differ in length')
for i in range(len(Lindices)-1): for i in range(len(Lindices) - 1):
if Lindices[i][1] > Lindices[i+1][0]: if Lindices[i][1] > Lindices[i + 1][0]:
raise ValueError('tuples overlap') raise ValueError('tuples overlap')
if Lindices[i][1] < Lindices[i][0]: if Lindices[i][1] < Lindices[i][0]:
raise ValueError('invalid tuple') raise ValueError('invalid tuple')
@ -932,7 +932,7 @@ def _tuple_replace(s, Lindices, Lreplace):
len1 = Lindices[i][1] - Lindices[i][0] len1 = Lindices[i][1] - Lindices[i][0]
len2 = len(Lreplace[i]) len2 = len(Lreplace[i])
ans.append(s[j:Lindices[i][0]+offset]) ans.append(s[j:Lindices[i][0] + offset])
ans.append(Lreplace[i]) ans.append(Lreplace[i])
j = Lindices[i][1] j = Lindices[i][1]
@ -943,12 +943,12 @@ def _test_tuple_replace():
""" """
Unit test for L{_tuple_replace}. Unit test for L{_tuple_replace}.
""" """
assert _tuple_replace('',[],[]) == '' assert _tuple_replace('', [], []) == ''
assert _tuple_replace('0123456789',[],[]) == '0123456789' assert _tuple_replace('0123456789', [], []) == '0123456789'
assert _tuple_replace('0123456789',[(4,5),(6,9)],['abc', 'def'])== \ assert _tuple_replace('0123456789', [(4, 5), (6, 9)], ['abc', 'def']) == \
'0123abc5def9' '0123abc5def9'
assert _tuple_replace('01234567890123456789', \ assert _tuple_replace('01234567890123456789', \
[(1,9),(13,14),(16,18)],['abcd','efg','hijk']) == \ [(1, 9), (13, 14), (16, 18)], ['abcd', 'efg', 'hijk']) == \
'0abcd9012efg45hijk89' '0abcd9012efg45hijk89'
def urljoin(s, L): def urljoin(s, L):
@ -1196,7 +1196,7 @@ def _test_tagextract(str_class=str):
'<img test="5%ff" /></body></html>\nBye!\n') '<img test="5%ff" /></body></html>\nBye!\n')
doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' + doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
'<test tag="5" content=6><is broken=False><yay>' + '<test tag="5" content=6><is broken=False><yay>' +
'<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+ '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
'<script language="JavaScript"><>!><!_!_!-->!_-></script>') '<script language="JavaScript"><>!><!_!_!-->!_-></script>')
doc3 = f('\r\t< html >< tag> <!--comment--> <tag a = 5> ' + doc3 = f('\r\t< html >< tag> <!--comment--> <tag a = 5> ' +
'<foo \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n>') '<foo \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n>')
@ -1239,7 +1239,7 @@ def _test_tagextract(str_class=str):
' what', '<style>', 'hi<><>>', '</style>', ' what', '<style>', 'hi<><>>', '</style>',
'<script language="Java">', '<aL><>><>>', '</script>', 'a']) '<script language="Java">', '<aL><>><>>', '</script>', 'a'])
s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag '+ s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag ' +
'<tag <! <! -> <!-- </who< <who> tag> <huh-->-</style>' + '<tag <! <! -> <!-- </who< <who> tag> <huh-->-</style>' +
'</style<style>') '</style<style>')
assert s == f('').join(_html_split(s)) assert s == f('').join(_html_split(s))
@ -1282,7 +1282,7 @@ def _test_tagextract(str_class=str):
doc2old = doc2 doc2old = doc2
doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' + doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
'<test tag="5" content=6><is broken=False><yay>' + '<test tag="5" content=6><is broken=False><yay>' +
'<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+ '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
'<script language="JavaScript"><>!><!_!_!-->!_-></script>') '<script language="JavaScript"><>!><!_!_!-->!_-></script>')
assert doc2old == doc2 assert doc2old == doc2
@ -1334,7 +1334,7 @@ def _test_tagextract(str_class=str):
L = tagextract(doc4) L = tagextract(doc4)
assert len(L) == n assert len(L) == n
for i in range(n): for i in range(n):
assert L[i] == f([('tag/',{'name':'5','value':'6afdjherknc4 cdk j', assert L[i] == f([('tag/', {'name':'5', 'value':'6afdjherknc4 cdk j',
'a':'7', 'b':'8'})])[0] 'a':'7', 'b':'8'})])[0]
# ----------------------------------------------------------------- # -----------------------------------------------------------------

View File

@ -28,6 +28,7 @@ import urllib
import textwrap import textwrap
import urlparse import urlparse
import os, os.path import os, os.path
import errno import errno
import hashlib import hashlib
import httplib import httplib
@ -129,27 +130,27 @@ def normalize_url(url, lower=True):
return url return url
def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point = 0): def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0):
# find tag limits - start_string must be an unique identifier within doc # find tag limits - start_string must be an unique identifier within doc
i1 = doc.find(filter_string, start_point) i1 = doc.find(filter_string, start_point)
if i1 == -1: if i1 == -1:
return (-1,-1) return (-1, -1)
aux = doc.rfind(start_tag, start_point, i1+len(filter_string)) aux = doc.rfind(start_tag, start_point, i1 + len(filter_string))
# we've found the filter_string but it has not the start_tag, so we return a different value # we've found the filter_string but it has not the start_tag, so we return a different value
# telling the script to keep searching starting on the end of the filter_string found # telling the script to keep searching starting on the end of the filter_string found
if aux == -1: if aux == -1:
return (-2, i1+len(filter_string)) return (-2, i1 + len(filter_string))
i1 = aux i1 = aux
sdiv = i1 sdiv = i1
ediv = i1 + len(start_tag) ediv = i1 + len(start_tag)
while(sdiv < ediv and sdiv != -1): while(sdiv < ediv and sdiv != -1):
sdiv = doc.find(start_tag, sdiv+len(start_tag)) sdiv = doc.find(start_tag, sdiv + len(start_tag))
ediv = doc.find(end_tag , ediv+len(end_tag)) ediv = doc.find(end_tag , ediv + len(end_tag))
return (i1, ediv) return (i1, ediv)
@ -163,9 +164,9 @@ def clean_tag(doc, filter_string, end_tag, start_tag):
if start1 == -2: if start1 == -2:
start_point = start2 start_point = start2
continue continue
end1 = doc.find('>', start1)+1; end1 = doc.find('>', start1) + 1;
end2 = start2 + len(end_tag); end2 = start2 + len(end_tag);
doc = doc[:start1]+doc[end1:start2]+doc[end2:] doc = doc[:start1] + doc[end1:start2] + doc[end2:]
def remove_tag(doc, start_string, end_tag, start_tag): def remove_tag(doc, start_string, end_tag, start_tag):
#remove tagged text function #remove tagged text function
@ -173,7 +174,7 @@ def remove_tag(doc, start_string, end_tag, start_tag):
(i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag) (i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag)
if i1 == -1 or i2 == -1: if i1 == -1 or i2 == -1:
return doc return doc
doc = doc[:i1]+doc[i2+len(end_tag):] doc = doc[:i1] + doc[i2 + len(end_tag):]
def monobook_fix_html(doc, page_url): def monobook_fix_html(doc, page_url):
""" """
@ -185,7 +186,7 @@ def monobook_fix_html(doc, page_url):
doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=') doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
doc = remove_tag(doc, '<div class="portlet" id="p-personal">', '</div>', '<div') doc = remove_tag(doc, '<div class="portlet" id="p-personal">', '</div>', '<div')
doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>','<div') doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>', '<div')
doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div') doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
#James also remove the page/discussion/source/history/ div. #James also remove the page/discussion/source/history/ div.
doc = remove_tag(doc, '<li id="ca-', '</li>', '<li') doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
@ -193,20 +194,20 @@ def monobook_fix_html(doc, page_url):
#andre special mode #andre special mode
if config.special_mode: if config.special_mode:
# Remove ul list # Remove ul list
doc = remove_tag(doc,'<ul id="f-list">','</ul>', '<ul') doc = remove_tag(doc, '<ul id="f-list">', '</ul>', '<ul')
# Remove link rel alternate and edit # Remove link rel alternate and edit
doc = re.sub(r'<link rel="alternate"[\s\S]+?/>',r'',doc) doc = re.sub(r'<link rel="alternate"[\s\S]+?/>', r'', doc)
doc = re.sub(r'<link rel="edit"[\s\S]+?/>',r'',doc) doc = re.sub(r'<link rel="edit"[\s\S]+?/>', r'', doc)
# Remove print footer # Remove print footer
doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>',r'',doc) doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>', r'', doc)
# Remove noexport # Remove noexport
doc = remove_tag(doc,'<div class="noexport"','</div>', '<div') doc = remove_tag(doc, '<div class="noexport"', '</div>', '<div')
# Remove editornote # Remove editornote
doc = remove_tag(doc,'<div class="editornote"','</div>', '<div') doc = remove_tag(doc, '<div class="editornote"', '</div>', '<div')
else: else:
# Remove powered by MediaWiki logo # Remove powered by MediaWiki logo
@ -261,24 +262,24 @@ def pos_html_transform(doc, url):
sidebar_html = f.read() sidebar_html = f.read()
f.close() f.close()
doc = re.sub( r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc) doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
# Remove empty links # Remove empty links
doc = clean_tag(doc, 'href=""', '</a>', '<a '); doc = clean_tag(doc, 'href=""', '</a>', '<a ');
if config.special_mode: if config.special_mode:
# Remove external link rel stylesheet # Remove external link rel stylesheet
doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>',r'',doc) doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>', r'', doc)
# Remove external javascript # Remove external javascript
doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>',r'',doc) doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>', r'', doc)
# Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls # Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls
if config.footer is not None: if config.footer is not None:
s1 = '<div id="footer">' s1 = '<div id="footer">'
# match correct divs # match correct divs
(i1,i2) = find_tag_limits(doc, s1, '</div>', '<div') (i1, i2) = find_tag_limits(doc, s1, '</div>', '<div')
if (i1 == -1): if (i1 == -1):
return doc return doc
@ -298,7 +299,7 @@ def pos_html_transform(doc, url):
# keep MediaWiki credits # keep MediaWiki credits
doc = doc[:i2] + footer_html + doc[i2:] doc = doc[:i2] + footer_html + doc[i2:]
else: else:
doc = doc[:i1+len(s1)] + footer_html + doc[i2:] doc = doc[:i1 + len(s1)] + footer_html + doc[i2:]
return doc return doc
@ -308,9 +309,9 @@ def fix_move_href_tags(doc):
""" """
while '<' + MOVE_HREF in doc: while '<' + MOVE_HREF in doc:
i1 = doc.index('<' + MOVE_HREF) i1 = doc.index('<' + MOVE_HREF)
i2 = doc.index('</' + MOVE_HREF, i1+1) i2 = doc.index('</' + MOVE_HREF, i1 + 1)
i3 = doc.index('>', i2+1) i3 = doc.index('>', i2 + 1)
(start, end) = (i1, i3+1) (start, end) = (i1, i3 + 1)
tags = htmldata.tagextract(doc[start:end]) tags = htmldata.tagextract(doc[start:end])
assert tags[0][0] == MOVE_HREF assert tags[0][0] == MOVE_HREF
assert tags[-1][0] == '/' + MOVE_HREF assert tags[-1][0] == '/' + MOVE_HREF
@ -420,7 +421,7 @@ def post_css_transform(doc, url):
def move_to_index_if_needed(ans): def move_to_index_if_needed(ans):
global config global config
if ans.endswith(config.index): if ans.endswith(config.index):
ans = ans[:len(ans)-len(config.index)] + INDEX_HTML ans = ans[:len(ans) - len(config.index)] + INDEX_HTML
return ans return ans
def file_exists_in_written_set(filename): def file_exists_in_written_set(filename):
@ -456,11 +457,11 @@ def clean_filename(url, ans):
# Replace % escape codes with underscores, dashes with underscores. # Replace % escape codes with underscores, dashes with underscores.
while '%%' in ans: while '%%' in ans:
ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%')+2:] ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%') + 2:]
while '%25' in ans: while '%25' in ans:
ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25')+5:] ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25') + 5:]
while '%' in ans: while '%' in ans:
ans = ans[:ans.index('%')] + '_' + ans[ans.index('%')+3:] ans = ans[:ans.index('%')] + '_' + ans[ans.index('%') + 3:]
ans = ans.replace('-', '_') ans = ans.replace('-', '_')
while '__' in ans: while '__' in ans:
ans = ans.replace('__', '_') ans = ans.replace('__', '_')
@ -512,15 +513,15 @@ def url_open(url):
L = urlparse.urlparse(url) L = urlparse.urlparse(url)
if L[1] != domain: if L[1] != domain:
conn.close() conn.close()
print "connection to",domain,"closed." print "connection to", domain, "closed."
conn = httplib.HTTPConnection(L[1]) conn = httplib.HTTPConnection(L[1])
domain = L[1] domain = L[1]
print "connection to",domain,"opened." print "connection to", domain, "opened."
rel_url = url rel_url = url
pos = url.find(domain) pos = url.find(domain)
if pos != -1: if pos != -1:
rel_url = url[pos+len(domain):] rel_url = url[pos + len(domain):]
attempts = 0 attempts = 0
#number of attempts #number of attempts
@ -534,11 +535,11 @@ def url_open(url):
try: try:
conn.request("GET", rel_url) conn.request("GET", rel_url)
r = conn.getresponse() r = conn.getresponse()
print 'Status',r.status,r.reason,'accessing',rel_url print 'Status', r.status, r.reason, 'accessing', rel_url
if r.status == 404: if r.status == 404:
print " it's not possible to recover this error." print " it's not possible to recover this error."
errors += 1 errors += 1
return ('','') return ('', '')
if r.status == 500: if r.status == 500:
print " eventually this error might be recovered. let's try again." print " eventually this error might be recovered. let's try again."
print ' reconnecting...' print ' reconnecting...'
@ -554,11 +555,11 @@ def url_open(url):
if attempts != 0: if attempts != 0:
recovered = True recovered = True
if r.status != 200: if r.status != 200:
print " Status other than 200, 404, 500, 403. It is: ",r.status print " Status other than 200, 404, 500, 403. It is: ", r.status
success = True success = True
except httplib.HTTPException, e: except httplib.HTTPException, e:
print 'ERROR',e.__class__.__name__,'while retrieving', url print 'ERROR', e.__class__.__name__, 'while retrieving', url
conn.close conn.close
if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']: if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
print "eventually this error might be recovered. let's try again." print "eventually this error might be recovered. let's try again."
@ -568,7 +569,7 @@ def url_open(url):
else: else:
print "it's not possible to recover this error." print "it's not possible to recover this error."
errors += 1 errors += 1
return ('','') return ('', '')
if recovered: if recovered:
print "error recovered" print "error recovered"
@ -616,7 +617,8 @@ def url_to_filename(url):
lpath = L[2].split('/') lpath = L[2].split('/')
if not '.' in lpath[-1]: if not '.' in lpath[-1]:
# url ends with a directory name. Store it under index.html. # url ends with a directory name. Store it under index.html.
L[2] += '/' + INDEX_HTML # L[2] += '/' + INDEX_HTML
L[2]=L[2]
else: else:
# 'title=' parsing # 'title=' parsing
if L[4].startswith('title=') and L[2].endswith('index.php'): if L[4].startswith('title=') and L[2].endswith('index.php'):
@ -692,7 +694,7 @@ def url_to_filename(url):
# Not really needed since we checked that the directory # Not really needed since we checked that the directory
# outdir didn't exist at the top of run(), but let's double check. # outdir didn't exist at the top of run(), but let's double check.
if os.path.exists(ans) and not config.overwrite: if os.path.exists(ans) and not config.overwrite:
out.write('File already exists: ' + str(ans)) out.write('File already exists: ' + str(ans)) #@UndefinedVariable
sys.exit(1) sys.exit(1)
f = open(ans, mode) f = open(ans, mode)
@ -780,7 +782,7 @@ def should_follow(url):
print url, 'with multiple query fields' print url, 'with multiple query fields'
return False return False
if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-' )): if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
if config.debug: if config.debug:
print url, 'is a forbidden wiki page' print url, 'is a forbidden wiki page'
return False return False
@ -806,7 +808,7 @@ def should_follow(url):
print url, 'is a file outside of scope with unknown extension' print url, 'is a file outside of scope with unknown extension'
return False return False
forbidden_parents = ['.php','.html','.htm'] forbidden_parents = ['.php', '.html', '.htm']
for fp in forbidden_parents: for fp in forbidden_parents:
if fp in L[-1]: if fp in L[-1]:
if config.debug: if config.debug:
@ -876,7 +878,7 @@ def run(out=sys.stdout):
n = 0 n = 0
if not config.overwrite and os.path.exists(config.outdir): if not config.overwrite and os.path.exists(config.outdir):
out.write('Error: Directory exists: ' + str(config.outdir) ) out.write('Error: Directory exists: ' + str(config.outdir))
sys.exit(1) sys.exit(1)
domain = get_domain(config.rooturl) domain = get_domain(config.rooturl)
@ -941,7 +943,7 @@ def run(out=sys.stdout):
# Save document changes to disk # Save document changes to disk
update = False update = False
text_ext = ( 'txt', 'html', 'rtf', 'css', 'sgml', 'xml' ) text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
for ext in text_ext: for ext in text_ext:
if filename.endswith(ext): if filename.endswith(ext):
update = True update = True
@ -957,7 +959,7 @@ def run(out=sys.stdout):
n += 1 n += 1
conn.close() conn.close()
print "connection to",domain,"closed." print "connection to", domain, "closed."
out.write(str(n) + ' files saved\n') out.write(str(n) + ' files saved\n')
print counter, "httplib requests done" print counter, "httplib requests done"
print errors, "errors not recovered" print errors, "errors not recovered"
@ -1029,7 +1031,7 @@ def main():
'no-hack-skin', 'no-made-by', 'left=', 'no-hack-skin', 'no-made-by', 'left=',
'top=', 'bottom=', 'index=', 'no-move-href', 'top=', 'bottom=', 'index=', 'no-move-href',
'no-remove-png', 'no-remove-history', 'limit-parent', 'no-remove-png', 'no-remove-history', 'limit-parent',
'special-mode','debug','no-images']) 'special-mode', 'debug', 'no-images'])
except getopt.GetoptError: except getopt.GetoptError:
usage() usage()
@ -1066,7 +1068,7 @@ def main():
config.special_mode = True config.special_mode = True
config.sidebar = 'sidebar.html' config.sidebar = 'sidebar.html'
config.footer = 'footer.html' config.footer = 'footer.html'
if opt in ['-d','--debug']: if opt in ['-d', '--debug']:
config.debug = True config.debug = True
if opt in ['-l', '--left']: if opt in ['-l', '--left']:
config.sidebar = os.path.abspath(arg) config.sidebar = os.path.abspath(arg)