Mirror of https://github.com/cookiengineer/audacity, synced 2025-04-30 15:49:41 +02:00

commit eb904062ad (parent 9438fdf3e5)

    Avoid duplicate html files with names like 'index.htm'
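The commit touches two files: the bundled htmldata module (tag and URL extraction; whitespace-only cleanup) and the MediaWiki crawler script. The one behavioral change is in url_to_filename, near the end of the diff: a URL whose path ends in a directory name is no longer rewritten to store under 'index.html', which could previously save the same page twice under names like 'Foo.html' and 'Foo/index.htm'. A minimal sketch of the old and new path mapping (hypothetical URLs and helper names; the real function also handles 'title=' query parsing, quoting, and extensions):

    import urlparse  # Python 2, as used by the script

    INDEX_HTML = 'index.html'

    def dir_path_before(url):
        # Before this commit: a path whose last segment has no '.'
        # gained a trailing '/index.html'.
        L = list(urlparse.urlparse(url))
        lpath = L[2].split('/')
        if '.' not in lpath[-1]:
            L[2] += '/' + INDEX_HTML
        return L[2]

    def dir_path_after(url):
        # After this commit: the path is left exactly as it arrived.
        return list(urlparse.urlparse(url))[2]

    # Hypothetical example:
    #   dir_path_before('http://wiki.example.org/man/Main_Page')
    #     -> '/man/Main_Page/index.html'
    #   dir_path_after('http://wiki.example.org/man/Main_Page')
    #     -> '/man/Main_Page'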
@@ -189,7 +189,7 @@ def _enumerate(L):
 
     Returns a list instead of an iterator.
     """
-    return zip(range(len(L)),L)
+    return zip(range(len(L)), L)
 
 def _ignore_tag_index(s, i):
     """
@@ -199,9 +199,9 @@ def _ignore_tag_index(s, i):
     the index. Otherwise, return C{-1}.
     """
     for (j, (a, b)) in _enumerate(_IGNORE_TAGS):
-        if s[i:i+len(a)+1].lower() == '<' + a:
+        if s[i:i + len(a) + 1].lower() == '<' + a:
             return j
-    return -1
+    return - 1
 
 def _html_split(s):
     """
@@ -233,7 +233,7 @@ def _html_split(s):
         c = s[i]
         if c == '<':
             # Left bracket, handle various cases.
-            if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
+            if s[i:i + len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT):
                 # HTML begin comment tag, '<!--'. Scan for '-->'.
                 i2 = s.find(_END_COMMENT, i)
                 if i2 < 0:
@@ -242,9 +242,9 @@ def _html_split(s):
                     break
                 else:
                     # Append the comment.
-                    L.append(s[i:i2+len(_END_COMMENT)])
+                    L.append(s[i:i2 + len(_END_COMMENT)])
                     i = i2 + len(_END_COMMENT)
-            elif s[i:i+len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
+            elif s[i:i + len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA):
                 # XHTML begin CDATA tag. Scan for ']]>'.
                 i2 = s.find(_END_CDATA, i)
                 if i2 < 0:
@@ -253,7 +253,7 @@ def _html_split(s):
                     break
                 else:
                     # Append the CDATA.
-                    L.append(s[i:i2+len(_END_CDATA)])
+                    L.append(s[i:i2 + len(_END_CDATA)])
                     i = i2 + len(_END_CDATA)
             else:
                 # Regular HTML tag. Scan for '>'.
@@ -261,19 +261,19 @@ def _html_split(s):
                 found = False
                 in_quot1 = False
                 in_quot2 = False
-                for i2 in xrange(i+1, len(s)):
+                for i2 in xrange(i + 1, len(s)):
                     c2 = s[i2]
                     if c2 == '"' and not in_quot1:
                         in_quot2 = not in_quot2
                         # Only turn on double quote if it's in a realistic place.
                         if in_quot2 and not in_quot1:
-                            if i2 > 0 and s[i2-1] not in [' ', '\t', '=']:
+                            if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
                                 in_quot2 = False
                     elif c2 == "'" and not in_quot2:
                         in_quot1 = not in_quot1
                         # Only turn on single quote if it's in a realistic place.
                         if in_quot1 and not in_quot2:
-                            if i2 > 0 and s[i2-1] not in [' ', '\t', '=']:
+                            if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']:
                                 in_quot1 = False
                     elif c2 == '>' and (not in_quot2 and not in_quot1):
                         found = True
@@ -285,7 +285,7 @@ def _html_split(s):
                     break
                 else:
                     # Append the tag.
-                    L.append(s[i:i2+1])
+                    L.append(s[i:i2 + 1])
                     i = i2 + 1
 
             # Check whether we found a special ignore tag, eg '<script>'
@@ -460,18 +460,18 @@ def _tag_dict(s):
 
         # Strip spaces.
         while k1 < k2 and s[k1] in string.whitespace: k1 += 1
-        while k1 < k2 and s[k2-1] in string.whitespace: k2 -= 1
+        while k1 < k2 and s[k2 - 1] in string.whitespace: k2 -= 1
 
         while v1 < v2 and s[v1] in string.whitespace: v1 += 1
-        while v1 < v2 and s[v2-1] in string.whitespace: v2 -= 1
+        while v1 < v2 and s[v2 - 1] in string.whitespace: v2 -= 1
 
         # Strip one pair of double quotes around value.
-        if v1 < v2 - 1 and s[v1] == '"' and s[v2-1] == '"':
+        if v1 < v2 - 1 and s[v1] == '"' and s[v2 - 1] == '"':
             v1 += 1
             v2 -= 1
 
         # Strip one pair of single quotes around value.
-        if v1 < v2 - 1 and s[v1] == "'" and s[v2-1] == "'":
+        if v1 < v2 - 1 and s[v1] == "'" and s[v2 - 1] == "'":
             v1 += 1
             v2 -= 1
 
@@ -512,12 +512,12 @@ def _test_tag_dict():
     assert _tag_dict(' \t\r \n\n \r\n ') == ({}, {}, {})
     assert _tag_dict('bgcolor=#ffffff text="#000000" blink') == \
         ({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
-         {'bgcolor':(0,7), 'text':(16,20), 'blink':(31,36)},
-         {'bgcolor':(8,15), 'text':(22,29), 'blink':(36,36)})
+         {'bgcolor':(0, 7), 'text':(16, 20), 'blink':(31, 36)},
+         {'bgcolor':(8, 15), 'text':(22, 29), 'blink':(36, 36)})
     assert _tag_dict("bgcolor='#ffffff'text='#000000' blink") == \
         ({'bgcolor':'#ffffff', 'text':'#000000', 'blink': None},
-         {'bgcolor':(0,7), 'text':(17,21), 'blink':(32,37)},
-         {'bgcolor':(9,16), 'text':(23,30), 'blink':(37,37)})
+         {'bgcolor':(0, 7), 'text':(17, 21), 'blink':(32, 37)},
+         {'bgcolor':(9, 16), 'text':(23, 30), 'blink':(37, 37)})
     s = ' \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n'
     (a, b, c) = _tag_dict(s)
     assert a == {'text': 'hi you', 'bg': 'val', 'e': '5', 'name': None}
@@ -541,7 +541,7 @@ def _full_tag_extract(s):
     # Starting position of each L[i] in s.
     Lstart = [0] * len(L)
     for i in range(1, len(L)):
-        Lstart[i] = Lstart[i-1] + len(L[i-1])
+        Lstart[i] = Lstart[i - 1] + len(L[i - 1])
 
     class NotTagError(Exception): pass
 
@@ -590,7 +590,7 @@ def _full_tag_extract(s):
                 (name, dtext) = (text, '')
             else:
                 name = text[:first_space]
-                dtext = text[first_space+1:len(text)]
+                dtext = text[first_space + 1:len(text)]
 
             # Position of dtext relative to original text.
             dtext_offset = len(name) + 1 + orig_offset # +1 for space.
@@ -610,10 +610,10 @@ def _full_tag_extract(s):
             (attrs, key_pos, value_pos) = _tag_dict(dtext)
             # Correct offsets in key_pos and value_pos.
             for key in attrs.keys():
-                key_pos[key] = (key_pos[key][0]+Lstart[i]+dtext_offset,
-                                key_pos[key][1]+Lstart[i]+dtext_offset)
-                value_pos[key] = (value_pos[key][0]+Lstart[i]+dtext_offset,
-                                  value_pos[key][1]+Lstart[i]+dtext_offset)
+                key_pos[key] = (key_pos[key][0] + Lstart[i] + dtext_offset,
+                                key_pos[key][1] + Lstart[i] + dtext_offset)
+                value_pos[key] = (value_pos[key][0] + Lstart[i] + dtext_offset,
+                                  value_pos[key][1] + Lstart[i] + dtext_offset)
 
         pos = (Lstart[i], Lstart[i] + len(L[i]))
 
@@ -761,7 +761,7 @@ def _remove_comments(doc):
             ans += [doc[i:]]
             break
         ans += [doc[i:i2]]
-        i3 = doc.find('*/', i2+1)
+        i3 = doc.find('*/', i2 + 1)
         if i3 < 0:
             i3 = len(doc) - 2
         ans += [' ' * (i3 - i2 + 2)]
@@ -775,7 +775,7 @@ def _test_remove_comments():
     """
     s = '/*d s kjlsdf */*//*/*//**/**/*//**/a' * 50
     assert len(_remove_comments(s)) == len(s)
-    s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd'+'/*//**//'
+    s = '/**/' * 50 + '/*5845*/*/*//*/**/dfd' + '/*//**//'
     assert len(_remove_comments(s)) == len(s)
     s = 'a/**/' * 50 + '/**//**/////***/****/*//**//*/' * 5
     assert len(_remove_comments(s)) == len(s)
@@ -916,8 +916,8 @@ def _tuple_replace(s, Lindices, Lreplace):
     Lindices.sort()
     if len(Lindices) != len(Lreplace):
         raise ValueError('lists differ in length')
-    for i in range(len(Lindices)-1):
-        if Lindices[i][1] > Lindices[i+1][0]:
+    for i in range(len(Lindices) - 1):
+        if Lindices[i][1] > Lindices[i + 1][0]:
             raise ValueError('tuples overlap')
         if Lindices[i][1] < Lindices[i][0]:
             raise ValueError('invalid tuple')
@@ -932,7 +932,7 @@ def _tuple_replace(s, Lindices, Lreplace):
         len1 = Lindices[i][1] - Lindices[i][0]
         len2 = len(Lreplace[i])
 
-        ans.append(s[j:Lindices[i][0]+offset])
+        ans.append(s[j:Lindices[i][0] + offset])
         ans.append(Lreplace[i])
 
         j = Lindices[i][1]
@@ -943,12 +943,12 @@ def _test_tuple_replace():
     """
     Unit test for L{_tuple_replace}.
     """
-    assert _tuple_replace('',[],[]) == ''
-    assert _tuple_replace('0123456789',[],[]) == '0123456789'
-    assert _tuple_replace('0123456789',[(4,5),(6,9)],['abc', 'def'])== \
+    assert _tuple_replace('', [], []) == ''
+    assert _tuple_replace('0123456789', [], []) == '0123456789'
+    assert _tuple_replace('0123456789', [(4, 5), (6, 9)], ['abc', 'def']) == \
         '0123abc5def9'
     assert _tuple_replace('01234567890123456789', \
-        [(1,9),(13,14),(16,18)],['abcd','efg','hijk']) == \
+        [(1, 9), (13, 14), (16, 18)], ['abcd', 'efg', 'hijk']) == \
         '0abcd9012efg45hijk89'
 
 def urljoin(s, L):
@@ -1196,7 +1196,7 @@ def _test_tagextract(str_class=str):
              '<img test="5%ff" /></body></html>\nBye!\n')
     doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
              '<test tag="5" content=6><is broken=False><yay>' +
-             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+
+             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
              '<script language="JavaScript"><>!><!_!_!-->!_-></script>')
     doc3 = f('\r\t< html >< tag> <!--comment--> <tag a = 5> ' +
              '<foo \r\nbg = val text \t= "hi you" name\t e="5"\t\t\t\n>')
@@ -1239,7 +1239,7 @@ def _test_tagextract(str_class=str):
         ' what', '<style>', 'hi<><>>', '</style>',
         '<script language="Java">', '<aL><>><>>', '</script>', 'a'])
 
-    s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag '+
+    s = f('<!-- ><# -->!<!-!._-><!-- aa--> <style><tag//</style> <tag ' +
           '<tag <! <! -> <!-- </who< <who> tag> <huh-->-</style>' +
           '</style<style>')
     assert s == f('').join(_html_split(s))
@@ -1282,7 +1282,7 @@ def _test_tagextract(str_class=str):
     doc2old = doc2
     doc2 = f('\r<HTML><!-- Comment<a href="blah"> --><hiYa><foo>' +
              '<test tag="5" content=6><is broken=False><yay>' +
-             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> '+
+             '<style><><>><</style><foo bar=5>end<!-- <!-- nested --> ' +
              '<script language="JavaScript"><>!><!_!_!-->!_-></script>')
     assert doc2old == doc2
 
@@ -1334,7 +1334,7 @@ def _test_tagextract(str_class=str):
     L = tagextract(doc4)
     assert len(L) == n
     for i in range(n):
-        assert L[i] == f([('tag/',{'name':'5','value':'6afdjherknc4 cdk j',
+        assert L[i] == f([('tag/', {'name':'5', 'value':'6afdjherknc4 cdk j',
                            'a':'7', 'b':'8'})])[0]
 
 # -----------------------------------------------------------------
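The hunks above are all whitespace-only changes to the bundled htmldata module, whose behavior is pinned by its self-tests, including the lossless round-trip asserted in _test_tagextract: _html_split partitions a document into tag and non-tag chunks without dropping a character. A standalone restatement of that invariant (a sketch, assuming htmldata.py from this directory is importable):

    from htmldata import _html_split

    # Joining the chunks must reproduce the input exactly, even
    # around comments and quoted attribute values.
    s = '<html><!-- note --><b class="x">hi</b></html>'
    assert s == ''.join(_html_split(s))

The remaining hunks are from the crawler script itself (an mw2html derivative; note the hunk numbering restarts at line 28).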
@@ -28,6 +28,7 @@ import urllib
 import textwrap
 import urlparse
 import os, os.path
 
+import errno
 import hashlib
 import httplib
@@ -129,27 +130,27 @@ def normalize_url(url, lower=True):
 
     return url
 
-def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point = 0):
+def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0):
     # find tag limits - start_string must be an unique identifier within doc
 
     i1 = doc.find(filter_string, start_point)
 
     if i1 == -1:
-        return (-1,-1)
+        return (-1, -1)
 
-    aux = doc.rfind(start_tag, start_point, i1+len(filter_string))
+    aux = doc.rfind(start_tag, start_point, i1 + len(filter_string))
 
     # we've found the filter_string but it has not the start_tag, so we return a different value
     # telling the script to keep searching starting on the end of the filter_string found
     if aux == -1:
-        return (-2, i1+len(filter_string))
+        return (-2, i1 + len(filter_string))
 
     i1 = aux
     sdiv = i1
     ediv = i1 + len(start_tag)
     while(sdiv < ediv and sdiv != -1):
-        sdiv = doc.find(start_tag, sdiv+len(start_tag))
-        ediv = doc.find(end_tag , ediv+len(end_tag))
+        sdiv = doc.find(start_tag, sdiv + len(start_tag))
+        ediv = doc.find(end_tag , ediv + len(end_tag))
 
     return (i1, ediv)
 
@@ -163,9 +164,9 @@ def clean_tag(doc, filter_string, end_tag, start_tag):
         if start1 == -2:
             start_point = start2
             continue
-        end1 = doc.find('>', start1)+1;
+        end1 = doc.find('>', start1) + 1;
         end2 = start2 + len(end_tag);
-        doc = doc[:start1]+doc[end1:start2]+doc[end2:]
+        doc = doc[:start1] + doc[end1:start2] + doc[end2:]
 
 def remove_tag(doc, start_string, end_tag, start_tag):
     #remove tagged text function
@@ -173,7 +174,7 @@ def remove_tag(doc, start_string, end_tag, start_tag):
         (i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag)
         if i1 == -1 or i2 == -1:
             return doc
-        doc = doc[:i1]+doc[i2+len(end_tag):]
+        doc = doc[:i1] + doc[i2 + len(end_tag):]
 
 def monobook_fix_html(doc, page_url):
     """
@@ -185,7 +186,7 @@ def monobook_fix_html(doc, page_url):
     doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
 
     doc = remove_tag(doc, '<div class="portlet" id="p-personal">', '</div>', '<div')
-    doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>','<div')
+    doc = remove_tag(doc, '<div id="p-search" class="portlet">', '</div>', '<div')
     doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')
     #James also remove the page/discussion/source/history/ div.
     doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
@@ -193,20 +194,20 @@ def monobook_fix_html(doc, page_url):
     #andre special mode
     if config.special_mode:
         # Remove ul list
-        doc = remove_tag(doc,'<ul id="f-list">','</ul>', '<ul')
+        doc = remove_tag(doc, '<ul id="f-list">', '</ul>', '<ul')
 
         # Remove link rel alternate and edit
-        doc = re.sub(r'<link rel="alternate"[\s\S]+?/>',r'',doc)
-        doc = re.sub(r'<link rel="edit"[\s\S]+?/>',r'',doc)
+        doc = re.sub(r'<link rel="alternate"[\s\S]+?/>', r'', doc)
+        doc = re.sub(r'<link rel="edit"[\s\S]+?/>', r'', doc)
 
         # Remove print footer
-        doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>',r'',doc)
+        doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>', r'', doc)
 
         # Remove noexport
-        doc = remove_tag(doc,'<div class="noexport"','</div>', '<div')
+        doc = remove_tag(doc, '<div class="noexport"', '</div>', '<div')
 
         # Remove editornote
-        doc = remove_tag(doc,'<div class="editornote"','</div>', '<div')
+        doc = remove_tag(doc, '<div class="editornote"', '</div>', '<div')
 
     else:
         # Remove powered by MediaWiki logo
@@ -261,24 +262,24 @@ def pos_html_transform(doc, url):
         sidebar_html = f.read()
         f.close()
 
-        doc = re.sub( r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
+        doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)
 
     # Remove empty links
     doc = clean_tag(doc, 'href=""', '</a>', '<a ');
 
     if config.special_mode:
         # Remove external link rel stylesheet
-        doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>',r'',doc)
+        doc = re.sub(r'<link rel="stylesheet" href="http://[\s\S]+?/>', r'', doc)
 
         # Remove external javascript
-        doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>',r'',doc)
+        doc = re.sub(r'<script type="text/javascript" src="http://[\s\S]+?</script>', r'', doc)
 
     # Replace remaining text with footer, if available (this needs to be done after parse_html to avoid rewriting of urls
     if config.footer is not None:
         s1 = '<div id="footer">'
 
         # match correct divs
-        (i1,i2) = find_tag_limits(doc, s1, '</div>', '<div')
+        (i1, i2) = find_tag_limits(doc, s1, '</div>', '<div')
 
         if (i1 == -1):
             return doc
@@ -298,7 +299,7 @@ def pos_html_transform(doc, url):
             # keep MediaWiki credits
             doc = doc[:i2] + footer_html + doc[i2:]
         else:
-            doc = doc[:i1+len(s1)] + footer_html + doc[i2:]
+            doc = doc[:i1 + len(s1)] + footer_html + doc[i2:]
 
     return doc
 
@@ -308,9 +309,9 @@ def fix_move_href_tags(doc):
     """
     while '<' + MOVE_HREF in doc:
         i1 = doc.index('<' + MOVE_HREF)
-        i2 = doc.index('</' + MOVE_HREF, i1+1)
-        i3 = doc.index('>', i2+1)
-        (start, end) = (i1, i3+1)
+        i2 = doc.index('</' + MOVE_HREF, i1 + 1)
+        i3 = doc.index('>', i2 + 1)
+        (start, end) = (i1, i3 + 1)
         tags = htmldata.tagextract(doc[start:end])
         assert tags[0][0] == MOVE_HREF
         assert tags[-1][0] == '/' + MOVE_HREF
@@ -420,7 +421,7 @@ def post_css_transform(doc, url):
 def move_to_index_if_needed(ans):
     global config
     if ans.endswith(config.index):
-        ans = ans[:len(ans)-len(config.index)] + INDEX_HTML
+        ans = ans[:len(ans) - len(config.index)] + INDEX_HTML
     return ans
 
 def file_exists_in_written_set(filename):
@@ -456,11 +457,11 @@ def clean_filename(url, ans):
 
     # Replace % escape codes with underscores, dashes with underscores.
     while '%%' in ans:
-        ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%')+2:]
+        ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%') + 2:]
     while '%25' in ans:
-        ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25')+5:]
+        ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25') + 5:]
     while '%' in ans:
-        ans = ans[:ans.index('%')] + '_' + ans[ans.index('%')+3:]
+        ans = ans[:ans.index('%')] + '_' + ans[ans.index('%') + 3:]
     ans = ans.replace('-', '_')
     while '__' in ans:
         ans = ans.replace('__', '_')
@@ -512,15 +513,15 @@ def url_open(url):
     L = urlparse.urlparse(url)
     if L[1] != domain:
         conn.close()
-        print "connection to",domain,"closed."
+        print "connection to", domain, "closed."
         conn = httplib.HTTPConnection(L[1])
         domain = L[1]
-        print "connection to",domain,"opened."
+        print "connection to", domain, "opened."
 
     rel_url = url
     pos = url.find(domain)
     if pos != -1:
-        rel_url = url[pos+len(domain):]
+        rel_url = url[pos + len(domain):]
 
     attempts = 0
     #number of attempts
@@ -534,11 +535,11 @@ def url_open(url):
         try:
             conn.request("GET", rel_url)
             r = conn.getresponse()
-            print 'Status',r.status,r.reason,'accessing',rel_url
+            print 'Status', r.status, r.reason, 'accessing', rel_url
            if r.status == 404:
                 print " it's not possible to recover this error."
                 errors += 1
-                return ('','')
+                return ('', '')
             if r.status == 500:
                 print " eventually this error might be recovered. let's try again."
                 print ' reconnecting...'
@@ -554,11 +555,11 @@ def url_open(url):
             if attempts != 0:
                 recovered = True
             if r.status != 200:
-                print " Status other than 200, 404, 500, 403. It is: ",r.status
+                print " Status other than 200, 404, 500, 403. It is: ", r.status
             success = True
 
         except httplib.HTTPException, e:
-            print 'ERROR',e.__class__.__name__,'while retrieving', url
+            print 'ERROR', e.__class__.__name__, 'while retrieving', url
             conn.close
             if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
                 print "eventually this error might be recovered. let's try again."
@@ -568,7 +569,7 @@ def url_open(url):
             else:
                 print "it's not possible to recover this error."
                 errors += 1
-                return ('','')
+                return ('', '')
 
     if recovered:
         print "error recovered"
@@ -616,7 +617,8 @@ def url_to_filename(url):
     lpath = L[2].split('/')
     if not '.' in lpath[-1]:
         # url ends with a directory name. Store it under index.html.
-        L[2] += '/' + INDEX_HTML
+        # L[2] += '/' + INDEX_HTML
+        L[2]=L[2]
     else:
         # 'title=' parsing
         if L[4].startswith('title=') and L[2].endswith('index.php'):
@@ -692,7 +694,7 @@ def url_to_filename(url):
     # Not really needed since we checked that the directory
     # outdir didn't exist at the top of run(), but let's double check.
     if os.path.exists(ans) and not config.overwrite:
-        out.write('File already exists: ' + str(ans))
+        out.write('File already exists: ' + str(ans)) #@UndefinedVariable
         sys.exit(1)
 
     f = open(ans, mode)
@@ -780,7 +782,7 @@ def should_follow(url):
             print url, 'with multiple query fields'
         return False
 
-    if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-' )):
+    if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
         if config.debug:
             print url, 'is a forbidden wiki page'
         return False
@@ -806,7 +808,7 @@ def should_follow(url):
             print url, 'is a file outside of scope with unknown extension'
         return False
 
-    forbidden_parents = ['.php','.html','.htm']
+    forbidden_parents = ['.php', '.html', '.htm']
     for fp in forbidden_parents:
         if fp in L[-1]:
             if config.debug:
@@ -876,7 +878,7 @@ def run(out=sys.stdout):
     n = 0
 
     if not config.overwrite and os.path.exists(config.outdir):
-        out.write('Error: Directory exists: ' + str(config.outdir) )
+        out.write('Error: Directory exists: ' + str(config.outdir))
         sys.exit(1)
 
     domain = get_domain(config.rooturl)
@@ -941,7 +943,7 @@ def run(out=sys.stdout):
 
             # Save document changes to disk
             update = False
-            text_ext = ( 'txt', 'html', 'rtf', 'css', 'sgml', 'xml' )
+            text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
             for ext in text_ext:
                 if filename.endswith(ext):
                     update = True
@@ -957,7 +959,7 @@ def run(out=sys.stdout):
             n += 1
 
     conn.close()
-    print "connection to",domain,"closed."
+    print "connection to", domain, "closed."
     out.write(str(n) + ' files saved\n')
     print counter, "httplib requests done"
     print errors, "errors not recovered"
@@ -1029,7 +1031,7 @@ def main():
                        'no-hack-skin', 'no-made-by', 'left=',
                        'top=', 'bottom=', 'index=', 'no-move-href',
                        'no-remove-png', 'no-remove-history', 'limit-parent',
-                       'special-mode','debug','no-images'])
+                       'special-mode', 'debug', 'no-images'])
     except getopt.GetoptError:
         usage()
 
@@ -1066,7 +1068,7 @@ def main():
             config.special_mode = True
             config.sidebar = 'sidebar.html'
             config.footer = 'footer.html'
-        if opt in ['-d','--debug']:
+        if opt in ['-d', '--debug']:
             config.debug = True
         if opt in ['-l', '--left']:
             config.sidebar = os.path.abspath(arg)
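One helper above deserves a worked example: find_tag_limits locates the closing tag that matches a start tag by advancing past inner start/end pairs in its while loop, which is how remove_tag and clean_tag survive nested <div>s. A sketch against a hypothetical document (assuming the function is in scope):

    # The inner <div>...</div> is skipped, so i2 lands on the
    # outer </div> (index 37 in this string), not the first one.
    doc = '<div id="footer"><div>inner</div>tail</div>rest'
    (i1, i2) = find_tag_limits(doc, '<div id="footer">', '</div>', '<div')
    assert (i1, i2) == (0, 37)
    assert doc[i2:i2 + len('</div>')] == '</div>'

Note that the returned end index points at the start of the closing tag; callers such as remove_tag then slice with i2 + len(end_tag).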