Replace msgfmt.py with one from Cython

Since it actually works. :-)
2025-10-10 16:43:33 +02:00 · 2021-02-13 23:35:20 -06:00
parent 56852ae4c1
commit 0ab7645105
1 changed files with 210 additions and 269 deletions
--- a/locale/msgfmt.py
+++ b/locale/msgfmt.py
@@ -1,305 +1,246 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
-# -*- coding: iso-8859-1 -*-
+# Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
 # Written by Martin v. Loewis <loewis@informatik.hu-berlin.de>
 #
 # Changed by Christian 'Tiran' Heimes <tiran@cheimes.de> for the placeless
 # translation service (PTS) of Zope
 #
 # Fixed some bugs and updated to support msgctxt
 # by Hanno Schlichting <hanno@hannosch.eu>
 """Generate binary message catalog from textual translation description.
 This program converts a textual Uniforum-style message catalog (.po file) into
-a binary GNU catalog (.mo file). This is essentially the same function as the
+a binary GNU catalog (.mo file).  This is essentially the same function as the
-GNU msgfmt program, however, it is a simpler implementation.
+GNU msgfmt program, however, it is a simpler implementation.  Currently it
 does not handle plural forms but it does handle message contexts.
-This file was taken from Python-2.3.2/Tools/i18n and altered in several ways.
+Usage: msgfmt.py [OPTIONS] filename.po
 Now you can simply use it from another python module:
-  from msgfmt import Msgfmt
+Options:
-  mo = Msgfmt(po).get()
+    -o file
    --output-file=file
        Specify the output file to write to.  If omitted, output will go to a
        file named filename.mo (based off the input file name).
-where po is path to a po file as string, an opened po file ready for reading or
+    -h
-a list of strings (readlines of a po file) and mo is the compiled mo file as
+    --help
-binary string.
+        Print this message and exit.
-Exceptions:
+    -V
-
+    --version
-  * IOError if the file couldn't be read
+        Display version information and exit.
  * msgfmt.PoSyntaxError if the po file has syntax errors
 """
-from __future__ import print_function
+import os
-import array
+import sys
-from ast import literal_eval
+import ast
 import codecs
 from email.parser import HeaderParser
 import getopt
 import struct
-import sys
+import array
 from email.parser import HeaderParser
-PY3 = sys.version_info[0] == 3
+__version__ = "1.2"
 if PY3:
    def header_charset(s):
        p = HeaderParser()
        return p.parsestr(s).get_content_charset()
-    import io
+MESSAGES = {}
    BytesIO = io.BytesIO
    FILE_TYPE = io.IOBase
 else:
    def header_charset(s):
        p = HeaderParser()
        return p.parsestr(s.encode('utf-8', 'ignore')).get_content_charset()
    from cStringIO import StringIO as BytesIO
    FILE_TYPE = file
-class PoSyntaxError(Exception):
+# Print the module docstring (which holds the usage text) to stderr, followed
+# by `msg` when one is given, then terminate with exit status `code`.
+def usage(code, msg=''):
-    """ Syntax error in a po file """
+    print(__doc__, file=sys.stderr)
-
+    if msg:
-    def __init__(self, msg):
+        print(msg, file=sys.stderr)
-        self.msg = msg
+    # Never returns: sys.exit() raises SystemExit.
+    sys.exit(code)
    def __str__(self):
        return 'Po file syntax error: %s' % self.msg
-class Msgfmt:
+# NOTE(review): the parameter names `id` and `str` shadow the builtins inside
+# this function.
+def add(ctxt, id, str, fuzzy):
-
+    "Add a non-fuzzy translation to the dictionary."
-    def __init__(self, po, name='unknown'):
+    global MESSAGES
-        self.po = po
+    # Entries flagged fuzzy, and entries whose translation is empty, are
+    # silently left out of the catalog.
+    if not fuzzy and str:
-        self.name = name
+        if ctxt is None:
-        self.messages = {}
+            # No message context: the msgid itself is the catalog key.
+            MESSAGES[id] = str
        self.openfile = False
        # Start off assuming latin-1, so everything decodes without failure,
        # until we know the exact encoding
        self.encoding = 'latin-1'
    def readPoData(self):
        """ read po data from self.po and return an iterator """
        output = []
        if isinstance(self.po, str):
            output = open(self.po, 'rb')
        elif isinstance(self.po, FILE_TYPE):
            self.po.seek(0)
            self.openfile = True
            output = self.po
        elif isinstance(self.po, list):
            output = self.po
        if not output:
            raise ValueError("self.po is invalid! %s" % type(self.po))
        if isinstance(output, FILE_TYPE):
            # remove BOM from the start of the parsed input
            first = output.readline()
            if len(first) == 0:
                return output.readlines()
            if first.startswith(codecs.BOM_UTF8):
                first = first.lstrip(codecs.BOM_UTF8)
            return [first] + output.readlines()
        return output
    def add(self, context, id, string, fuzzy):
        "Add a non-empty and non-fuzzy translation to the dictionary."
        if string and not fuzzy:
            # The context is put before the id and separated by a EOT char.
            if context:
                id = context + u'\x04' + id
            if not id:
                # See whether there is an encoding declaration
                charset = header_charset(string)
                if charset:
                    # decode header in proper encoding
                    string = string.encode(self.encoding).decode(charset)
                    if not PY3:
                        # undo damage done by literal_eval in Python 2.x
                        string = string.encode(self.encoding).decode(charset)
                    self.encoding = charset
            self.messages[id] = string
    def generate(self):
        "Return the generated output."
        # the keys are sorted in the .mo file
        keys = sorted(self.messages.keys())
        offsets = []
        ids = strs = b''
        for id in keys:
            msg = self.messages[id].encode(self.encoding)
            id = id.encode(self.encoding)
            # For each string, we need size and file offset. Each string is
            # NUL terminated; the NUL does not count into the size.
            offsets.append((len(ids), len(id), len(strs),
                            len(msg)))
            ids += id + b'\0'
            strs += msg + b'\0'
        output = b''
        # The header is 7 32-bit unsigned integers. We don't use hash tables,
        # so the keys start right after the index tables.
        keystart = 7 * 4 + 16 * len(keys)
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets
        # Even though we don't use a hashtable, we still set its offset to be
        # binary compatible with the gnu gettext format produced by:
        # msgfmt file.po --no-hash
        output = struct.pack("Iiiiiii",
                             0x950412de,        # Magic
                             0,                 # Version
                             len(keys),         # # of entries
                             7 * 4,             # start of key index
                             7 * 4 + len(keys) * 8,  # start of value index
                             0, keystart)       # size and offset of hash table
        if PY3:
            output += array.array("i", offsets).tobytes()
        else:
-            output += array.array("i", offsets).tostring()
+            MESSAGES[b"%b\x04%b" % (ctxt, id)] = str
        output += ids
        output += strs
        return output
    def get(self):
        """ """
        self.read()
        # Compute output
        return self.generate()
-    def read(self, header_only=False):
+def generate():
-        """ """
+    "Return the generated output."
-        ID = 1
+    # Serialises the module-level MESSAGES dict into the binary .mo layout and
+    # returns it as one bytes object.  Keys and values are concatenated with
+    # bytes literals below, so both must already be bytes.
+    global MESSAGES
-        STR = 2
+    # the keys are sorted in the .mo file
-        CTXT = 3
+    keys = sorted(MESSAGES.keys())
    offsets = []
    ids = strs = b''
    for id in keys:
        # For each string, we need size and file offset.  Each string is NUL
        # terminated; the NUL does not count into the size.
        # The offsets recorded here are relative to the start of the key blob
        # and the value blob; they are made absolute further down.
        offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
        ids += id + b'\0'
        strs += MESSAGES[id] + b'\0'
    # NOTE(review): placeholder only; `output` is rebound to the packed header
    # below before it is ever read.
    output = ''
    # The header is 7 integers, which the arithmetic below assumes to be 4
    # bytes each.  We don't use hash tables, so the keys start right after the
    # two index tables.  Every message needs 16 bytes of index: a
    # (length, offset) pair for its key and another pair for its
    # translated string.
    keystart = 7*4+16*len(keys)
    # and the values start after the keys
    valuestart = keystart + len(ids)
    koffsets = []
    voffsets = []
    # The string table first has the list of keys, then the list of values.
    # Each entry has first the size of the string, then the file offset.
    for o1, l1, o2, l2 in offsets:
        koffsets += [l1, o1+keystart]
        voffsets += [l2, o2+valuestart]
    offsets = koffsets + voffsets
    # The format string has no byte-order prefix, so struct packs the header
    # in the machine's native byte order; array("i") below is native as well.
    output = struct.pack("Iiiiiii",
                         0x950412de,       # Magic
                         0,                 # Version
                         len(keys),         # # of entries
                         7*4,               # start of key index
                         7*4+len(keys)*8,   # start of value index
                         0, 0)              # size and offset of hash table
    # Layout after the header: key index, value index, key blob, value blob.
    output += array.array("i", offsets).tobytes()
    output += ids
    output += strs
    return output
        section = None
        fuzzy = 0
        msgid = msgstr = msgctxt = u''
-        # Parse the catalog
+def make(filename, outfile):
-        lno = 0
+    ID = 1
-        for l in self.readPoData():
+    STR = 2
-            l = l.decode(self.encoding)
+    CTXT = 3
            lno += 1
            # If we get a comment line after a msgstr or a line starting with
            # msgid or msgctxt, this is a new entry
            if section == STR and (l[0] == '#' or (l[0] == 'm' and
               (l.startswith('msgctxt') or l.startswith('msgid')))):
                self.add(msgctxt, msgid, msgstr, fuzzy)
                section = None
                fuzzy = 0
                # If we only want the header we stop after the first message
                if header_only:
                    break
            # Record a fuzzy mark
            if l[:2] == '#,' and 'fuzzy' in l:
                fuzzy = 1
            # Skip comments
            if l[0] == '#':
                continue
            # Now we are in a msgctxt section
            if l.startswith('msgctxt'):
                section = CTXT
                l = l[7:]
                msgctxt = u''
            # Now we are in a msgid section, output previous section
            elif (l.startswith('msgid') and
                  not l.startswith('msgid_plural')):
                if section == STR:
                    self.add(msgid, msgstr, fuzzy)
                section = ID
                l = l[5:]
                msgid = msgstr = u''
                is_plural = False
            # This is a message with plural forms
            elif l.startswith('msgid_plural'):
                if section != ID:
                    raise PoSyntaxError(
                        'msgid_plural not preceded by '
                        'msgid on line %d of po file %s' %
                        (lno, repr(self.name)))
                l = l[12:]
                msgid += u'\0'  # separator of singular and plural
                is_plural = True
            # Now we are in a msgstr section
            elif l.startswith('msgstr'):
                section = STR
                if l.startswith('msgstr['):
                    if not is_plural:
                        raise PoSyntaxError(
                            'plural without msgid_plural '
                            'on line %d of po file %s' %
                            (lno, repr(self.name)))
                    l = l.split(']', 1)[1]
                    if msgstr:
                        # Separator of the various plural forms
                        msgstr += u'\0'
                else:
                    if is_plural:
                        raise PoSyntaxError(
                            'indexed msgstr required for '
                            'plural on line %d of po file %s' %
                            (lno, repr(self.name)))
                    l = l[6:]
            # Skip empty lines
            l = l.strip()
            if not l:
                continue
            # TODO: Does this always follow Python escape semantics?
            try:
                l = literal_eval(l)
            except Exception as msg:
                raise PoSyntaxError(
                    '%s (line %d of po file %s): \n%s' %
                    (msg, lno, repr(self.name), l))
            if isinstance(l, bytes):
                l = l.decode(self.encoding)
            if section == CTXT:
                msgctxt += l
            elif section == ID:
                msgid += l
            elif section == STR:
                msgstr += l
            else:
                raise PoSyntaxError(
                    'error on line %d of po file %s' %
                    (lno, repr(self.name)))
-        # Add last entry
+    # Compute .mo name from .po name and arguments
-        if section == STR:
+    if filename.endswith('.po'):
-            self.add(msgctxt, msgid, msgstr, fuzzy)
+        infile = filename
    else:
        infile = filename + '.po'
    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '.mo'
        if self.openfile:
            self.po.close()
    def getAsFile(self):
        return BytesIO(self.get())
 def main():
    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'o:')
+        with open(infile, 'rb') as f:
-    except getopt.error as msg:
+            lines = f.readlines()
    except IOError as msg:
        print(msg, file=sys.stderr)
        sys.exit(1)
    # Parser state.  `section` names the keyword whose string is currently
    # being continued (ID, STR, CTXT, or None before/between entries);
    # `msgctxt` stays None until a msgctxt line is seen for the current entry.
    section = msgctxt = None
    fuzzy = 0
    # Start off assuming Latin-1, so everything decodes without failure,
    # until we know the exact encoding
    encoding = 'latin-1'
    # Parse the catalog
    lno = 0
    for l in lines:
        l = l.decode(encoding)
        lno += 1
        # If we get a comment line after a msgstr, this is a new entry
        # NOTE(review): this path resets `section`, so when the header entry
        # (empty msgid) is followed by a comment line, the charset detection
        # in the msgid branch below is skipped and `encoding` stays latin-1.
        if l[0] == '#' and section == STR:
            add(msgctxt, msgid, msgstr, fuzzy)
            section = msgctxt = None
            fuzzy = 0
        # Record a fuzzy mark
        if l[:2] == '#,' and 'fuzzy' in l:
            fuzzy = 1
        # Skip comments
        if l[0] == '#':
            continue
        # Now we are in a msgid or msgctxt section, output previous section
        if l.startswith('msgctxt'):
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
            section = CTXT
            l = l[7:]
            msgctxt = b''
        elif l.startswith('msgid') and not l.startswith('msgid_plural'):
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
                # An entry with an empty msgid is treated as the catalog
                # header: its msgstr is parsed as mail-style headers and the
                # Content-Type charset, if present, is used for all lines
                # decoded and encoded from here on.
                if not msgid:
                    # See whether there is an encoding declaration
                    p = HeaderParser()
                    charset = p.parsestr(msgstr.decode(encoding)).get_content_charset()
                    if charset:
                        encoding = charset
            section = ID
            l = l[5:]
            msgid = msgstr = b''
            is_plural = False
        # This is a message with plural forms
        elif l.startswith('msgid_plural'):
            if section != ID:
                print('msgid_plural not preceded by msgid on %s:%d' % (infile, lno),
                      file=sys.stderr)
                sys.exit(1)
            l = l[12:]
            msgid += b'\0' # separator of singular and plural
            is_plural = True
        # Now we are in a msgstr section
        elif l.startswith('msgstr'):
            section = STR
            if l.startswith('msgstr['):
                if not is_plural:
                    print('plural without msgid_plural on %s:%d' % (infile, lno),
                          file=sys.stderr)
                    sys.exit(1)
                # Drop the "msgstr[N]" prefix; the index N itself is not
                # inspected, forms are stored in the order they appear.
                l = l.split(']', 1)[1]
                if msgstr:
                    msgstr += b'\0' # Separator of the various plural forms
            else:
                if is_plural:
                    print('indexed msgstr required for plural on  %s:%d' % (infile, lno),
                          file=sys.stderr)
                    sys.exit(1)
                l = l[6:]
        # Skip empty lines
        l = l.strip()
        if not l:
            continue
        # What remains is expected to be a quoted string; it is evaluated as a
        # Python literal to resolve the escapes.
        # NOTE(review): a malformed line makes literal_eval raise, and nothing
        # here catches it, so the user gets a traceback rather than the
        # 'Syntax error' message below.
        l = ast.literal_eval(l)
        # Append the string to whichever field is currently being continued.
        if section == CTXT:
            msgctxt += l.encode(encoding)
        elif section == ID:
            msgid += l.encode(encoding)
        elif section == STR:
            msgstr += l.encode(encoding)
        else:
            # A string appeared while no msgctxt/msgid/msgstr was open.
            print('Syntax error on %s:%d' % (infile, lno), \
                  'before:', file=sys.stderr)
            print(l, file=sys.stderr)
            sys.exit(1)
    # Add last entry
    if section == STR:
        add(msgctxt, msgid, msgstr, fuzzy)
    # Compute output
    # NOTE(review): generate() serialises the module-level MESSAGES dict, and
    # nothing visible here clears it, so a second call in the same process
    # would also emit the entries collected by earlier calls -- confirm.
    output = generate()
    try:
        with open(outfile,"wb") as f:
            f.write(output)
    except IOError as msg:
        # NOTE(review): the write failure is reported but, unlike the read
        # failure above, does not call sys.exit(1).
        print(msg, file=sys.stderr)
 # Command-line entry point: parse the options from sys.argv, then compile
 # every remaining argument with make().
 def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
                                   ['help', 'version', 'output-file='])
    except getopt.error as msg:
        # usage() prints the help text plus the error and exits with status 1.
        usage(1, msg)
    # None is passed through to make() when no -o/--output-file is given.
    outfile = None
    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-V', '--version'):
            print("msgfmt.py", __version__)
            sys.exit(0)
        elif opt in ('-o', '--output-file'):
            outfile = arg
    # do it
    if not args:
        print('No input file given', file=sys.stderr)
-        sys.exit(1)
+        print("Try `msgfmt --help' for more information.", file=sys.stderr)
        # NOTE(review): plain return, so this error path no longer exits with
        # a non-zero status.
        return
-    if not opts:
+    # NOTE(review): the same `outfile` is handed to every make() call, so with
+    # -o and several inputs each call is given the same output path.
+    for filename in args:
-        print('No output file given', file=sys.stderr)
+        make(filename, outfile)
        sys.exit(1)
    with open(opts[0][1], "wb") as mo:
        mo.write(Msgfmt(args[0]).get())
 if __name__ == '__main__':
    main()