Replace msgfmt.py with one from Cython

Since it actually works. :-)
2025-11-24 14:20:19 +01:00 · 2021-02-13 23:35:20 -06:00
parent 56852ae4c1
commit 0ab7645105
1 changed files with 210 additions and 269 deletions
--- a/locale/msgfmt.py
+++ b/locale/msgfmt.py
@@ -1,305 +1,246 @@
-#! /usr/bin/env python
-# -*- coding: iso-8859-1 -*-
-# Written by Martin v. Loewis <loewis@informatik.hu-berlin.de>
-#
-# Changed by Christian 'Tiran' Heimes <tiran@cheimes.de> for the placeless
-# translation service (PTS) of Zope
-#
-# Fixed some bugs and updated to support msgctxt
-# by Hanno Schlichting <hanno@hannosch.eu>
+#! /usr/bin/env python3
+# Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>

 """Generate binary message catalog from textual translation description.

 This program converts a textual Uniforum-style message catalog (.po file) into
-a binary GNU catalog (.mo file). This is essentially the same function as the
-GNU msgfmt program, however, it is a simpler implementation.
+a binary GNU catalog (.mo file).  This is essentially the same function as the
+GNU msgfmt program, however, it is a simpler implementation.  Currently it
+does not handle plural forms but it does handle message contexts.

-This file was taken from Python-2.3.2/Tools/i18n and altered in several ways.
-Now you can simply use it from another python module:
+Usage: msgfmt.py [OPTIONS] filename.po

-  from msgfmt import Msgfmt
-  mo = Msgfmt(po).get()
+Options:
+    -o file
+    --output-file=file
+        Specify the output file to write to.  If omitted, output will go to a
+        file named filename.mo (based off the input file name).

-where po is path to a po file as string, an opened po file ready for reading or
-a list of strings (readlines of a po file) and mo is the compiled mo file as
-binary string.
+    -h
+    --help
+        Print this message and exit.

-Exceptions:
-
-  * IOError if the file couldn't be read
-
-  * msgfmt.PoSyntaxError if the po file has syntax errors
+    -V
+    --version
+        Display version information and exit.
 """

-from __future__ import print_function
-import array
-from ast import literal_eval
-import codecs
-from email.parser import HeaderParser
+import os
+import sys
+import ast
 import getopt
 import struct
-import sys
+import array
+from email.parser import HeaderParser

-PY3 = sys.version_info[0] == 3
-if PY3:
-    def header_charset(s):
-        p = HeaderParser()
-        return p.parsestr(s).get_content_charset()
+__version__ = "1.2"

-    import io
-    BytesIO = io.BytesIO
-    FILE_TYPE = io.IOBase
-else:
-    def header_charset(s):
-        p = HeaderParser()
-        return p.parsestr(s.encode('utf-8', 'ignore')).get_content_charset()
-
-    from cStringIO import StringIO as BytesIO
-    FILE_TYPE = file
+MESSAGES = {}


-class PoSyntaxError(Exception):
-    """ Syntax error in a po file """
-
-    def __init__(self, msg):
-        self.msg = msg
-
-    def __str__(self):
-        return 'Po file syntax error: %s' % self.msg
+def usage(code, msg=''):
+    """Print the module help text to stderr and terminate the program.
+
+    The module docstring is always printed.  When msg is truthy it is
+    printed on its own line after the help text.  The process then ends
+    through sys.exit(code).
+    """
+    report = [__doc__]
+    if msg:
+        report.append(msg)
+    # One print call with a newline separator emits the same text as
+    # printing each item separately.
+    print(*report, sep='\n', file=sys.stderr)
+    sys.exit(code)


-class Msgfmt:
-
-    def __init__(self, po, name='unknown'):
-        self.po = po
-        self.name = name
-        self.messages = {}
-        self.openfile = False
-        # Start off assuming latin-1, so everything decodes without failure,
-        # until we know the exact encoding
-        self.encoding = 'latin-1'
-
-    def readPoData(self):
-        """ read po data from self.po and return an iterator """
-        output = []
-        if isinstance(self.po, str):
-            output = open(self.po, 'rb')
-        elif isinstance(self.po, FILE_TYPE):
-            self.po.seek(0)
-            self.openfile = True
-            output = self.po
-        elif isinstance(self.po, list):
-            output = self.po
-        if not output:
-            raise ValueError("self.po is invalid! %s" % type(self.po))
-        if isinstance(output, FILE_TYPE):
-            # remove BOM from the start of the parsed input
-            first = output.readline()
-            if len(first) == 0:
-                return output.readlines()
-            if first.startswith(codecs.BOM_UTF8):
-                first = first.lstrip(codecs.BOM_UTF8)
-            return [first] + output.readlines()
-        return output
-
-    def add(self, context, id, string, fuzzy):
-        "Add a non-empty and non-fuzzy translation to the dictionary."
-        if string and not fuzzy:
-            # The context is put before the id and separated by a EOT char.
-            if context:
-                id = context + u'\x04' + id
-            if not id:
-                # See whether there is an encoding declaration
-                charset = header_charset(string)
-                if charset:
-                    # decode header in proper encoding
-                    string = string.encode(self.encoding).decode(charset)
-                    if not PY3:
-                        # undo damage done by literal_eval in Python 2.x
-                        string = string.encode(self.encoding).decode(charset)
-                    self.encoding = charset
-            self.messages[id] = string
-
-    def generate(self):
-        "Return the generated output."
-        # the keys are sorted in the .mo file
-        keys = sorted(self.messages.keys())
-        offsets = []
-        ids = strs = b''
-        for id in keys:
-            msg = self.messages[id].encode(self.encoding)
-            id = id.encode(self.encoding)
-            # For each string, we need size and file offset. Each string is
-            # NUL terminated; the NUL does not count into the size.
-            offsets.append((len(ids), len(id), len(strs),
-                            len(msg)))
-            ids += id + b'\0'
-            strs += msg + b'\0'
-        output = b''
-        # The header is 7 32-bit unsigned integers. We don't use hash tables,
-        # so the keys start right after the index tables.
-        keystart = 7 * 4 + 16 * len(keys)
-        # and the values start after the keys
-        valuestart = keystart + len(ids)
-        koffsets = []
-        voffsets = []
-        # The string table first has the list of keys, then the list of values.
-        # Each entry has first the size of the string, then the file offset.
-        for o1, l1, o2, l2 in offsets:
-            koffsets += [l1, o1 + keystart]
-            voffsets += [l2, o2 + valuestart]
-        offsets = koffsets + voffsets
-        # Even though we don't use a hashtable, we still set its offset to be
-        # binary compatible with the gnu gettext format produced by:
-        # msgfmt file.po --no-hash
-        output = struct.pack("Iiiiiii",
-                             0x950412de,        # Magic
-                             0,                 # Version
-                             len(keys),         # # of entries
-                             7 * 4,             # start of key index
-                             7 * 4 + len(keys) * 8,  # start of value index
-                             0, keystart)       # size and offset of hash table
-        if PY3:
-            output += array.array("i", offsets).tobytes()
+def add(ctxt, id, str, fuzzy):
+    """Add a non-fuzzy translation to the dictionary.
+
+    The entry is stored in the module-level MESSAGES mapping only when
+    fuzzy is false and str is non-empty.  When ctxt is None the key is id
+    itself; otherwise the key is ctxt and id joined by an EOT byte (0x04).
+    """
+    global MESSAGES
+    if not fuzzy and str:
+        if ctxt is None:
+            MESSAGES[id] = str
        else:
-            output += array.array("i", offsets).tostring()
-        output += ids
-        output += strs
-        return output
+            # Key layout: context, EOT separator (0x04), then the message id
+            MESSAGES[b"%b\x04%b" % (ctxt, id)] = str

-    def get(self):
-        """ """
-        self.read()
-        # Compute output
-        return self.generate()

-    def read(self, header_only=False):
-        """ """
-        ID = 1
-        STR = 2
-        CTXT = 3
+def generate():
+    """Return the binary .mo catalog for the entries in MESSAGES.
+
+    The result is a bytes object laid out as: a header of seven 32-bit
+    integers, the key index table, the value index table, the
+    NUL-terminated keys and finally the NUL-terminated values.  Keys are
+    written in sorted order.  No hash table is emitted; its size and
+    offset are stored as 0.  Integers are packed with struct/array native
+    formats, i.e. in the byte order of the running machine.
+    """
+    # the keys are sorted in the .mo file
+    keys = sorted(MESSAGES)
+    values = [MESSAGES[key] for key in keys]
+    offsets = []
+    id_pos = str_pos = 0
+    for key, value in zip(keys, values):
+        # For each string, we need size and file offset.  Each string is NUL
+        # terminated; the NUL does not count into the size.
+        offsets.append((id_pos, len(key), str_pos, len(value)))
+        id_pos += len(key) + 1
+        str_pos += len(value) + 1
+    # Join once instead of growing bytes objects with += inside the loop
+    ids = b''.join(key + b'\0' for key in keys)
+    strs = b''.join(value + b'\0' for value in values)
+    # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
+    # the keys start right after the index tables.
+    keystart = 7*4+16*len(keys)
+    # and the values start after the keys
+    valuestart = keystart + len(ids)
+    koffsets = []
+    voffsets = []
+    # The string table first has the list of keys, then the list of values.
+    # Each entry has first the size of the string, then the file offset.
+    for o1, l1, o2, l2 in offsets:
+        koffsets += [l1, o1+keystart]
+        voffsets += [l2, o2+valuestart]
+    output = struct.pack("Iiiiiii",
+                         0x950412de,       # Magic
+                         0,                 # Version
+                         len(keys),         # # of entries
+                         7*4,               # start of key index
+                         7*4+len(keys)*8,   # start of value index
+                         0, 0)              # size and offset of hash table
+    output += array.array("i", koffsets + voffsets).tobytes()
+    output += ids
+    output += strs
+    return output

-        section = None
-        fuzzy = 0
-        msgid = msgstr = msgctxt = u''

-        # Parse the catalog
-        lno = 0
-        for l in self.readPoData():
-            l = l.decode(self.encoding)
-            lno += 1
-            # If we get a comment line after a msgstr or a line starting with
-            # msgid or msgctxt, this is a new entry
-            if section == STR and (l[0] == '#' or (l[0] == 'm' and
-               (l.startswith('msgctxt') or l.startswith('msgid')))):
-                self.add(msgctxt, msgid, msgstr, fuzzy)
-                section = None
-                fuzzy = 0
-                # If we only want the header we stop after the first message
-                if header_only:
-                    break
-            # Record a fuzzy mark
-            if l[:2] == '#,' and 'fuzzy' in l:
-                fuzzy = 1
-            # Skip comments
-            if l[0] == '#':
-                continue
-            # Now we are in a msgctxt section
-            if l.startswith('msgctxt'):
-                section = CTXT
-                l = l[7:]
-                msgctxt = u''
-            # Now we are in a msgid section, output previous section
-            elif (l.startswith('msgid') and
-                  not l.startswith('msgid_plural')):
-                if section == STR:
-                    self.add(msgid, msgstr, fuzzy)
-                section = ID
-                l = l[5:]
-                msgid = msgstr = u''
-                is_plural = False
-            # This is a message with plural forms
-            elif l.startswith('msgid_plural'):
-                if section != ID:
-                    raise PoSyntaxError(
-                        'msgid_plural not preceded by '
-                        'msgid on line %d of po file %s' %
-                        (lno, repr(self.name)))
-                l = l[12:]
-                msgid += u'\0'  # separator of singular and plural
-                is_plural = True
-            # Now we are in a msgstr section
-            elif l.startswith('msgstr'):
-                section = STR
-                if l.startswith('msgstr['):
-                    if not is_plural:
-                        raise PoSyntaxError(
-                            'plural without msgid_plural '
-                            'on line %d of po file %s' %
-                            (lno, repr(self.name)))
-                    l = l.split(']', 1)[1]
-                    if msgstr:
-                        # Separator of the various plural forms
-                        msgstr += u'\0'
-                else:
-                    if is_plural:
-                        raise PoSyntaxError(
-                            'indexed msgstr required for '
-                            'plural on line %d of po file %s' %
-                            (lno, repr(self.name)))
-                    l = l[6:]
-            # Skip empty lines
-            l = l.strip()
-            if not l:
-                continue
-            # TODO: Does this always follow Python escape semantics?
-            try:
-                l = literal_eval(l)
-            except Exception as msg:
-                raise PoSyntaxError(
-                    '%s (line %d of po file %s): \n%s' %
-                    (msg, lno, repr(self.name), l))
-            if isinstance(l, bytes):
-                l = l.decode(self.encoding)
-            if section == CTXT:
-                msgctxt += l
-            elif section == ID:
-                msgid += l
-            elif section == STR:
-                msgstr += l
-            else:
-                raise PoSyntaxError(
-                    'error on line %d of po file %s' %
-                    (lno, repr(self.name)))
+def make(filename, outfile):
+    """Compile the .po catalog named by filename into a binary .mo file.
+
+    filename is the input path; '.po' is appended when it is missing.
+    outfile is the output path, or None to use the input path with its
+    extension replaced by '.mo'.
+
+    Entries are collected through add() and serialized by generate().
+    Read errors, write errors and syntax errors are reported on stderr and
+    end the program with exit status 1.
+
+    NOTE: MESSAGES is not cleared here, so entries added by an earlier
+    make() call in the same process are written to this output as well.
+    """
+    ID = 1
+    STR = 2
+    CTXT = 3

-        # Add last entry
-        if section == STR:
-            self.add(msgctxt, msgid, msgstr, fuzzy)
+    # Compute .mo name from .po name and arguments
+    if filename.endswith('.po'):
+        infile = filename
+    else:
+        infile = filename + '.po'
+    if outfile is None:
+        outfile = os.path.splitext(infile)[0] + '.mo'

-        if self.openfile:
-            self.po.close()
-
-    def getAsFile(self):
-        return BytesIO(self.get())
-
-def main():
    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'o:')
-    except getopt.error as msg:
+        with open(infile, 'rb') as f:
+            lines = f.readlines()
+    except IOError as msg:
        print(msg, file=sys.stderr)
        sys.exit(1)

+    section = msgctxt = None
+    # Start with empty values so a msgstr that precedes any msgid cannot
+    # hit unbound local names
+    msgid = msgstr = b''
+    is_plural = False
+    fuzzy = 0
+
+    # Start off assuming Latin-1, so everything decodes without failure,
+    # until we know the exact encoding
+    encoding = 'latin-1'
+
+    # Parse the catalog
+    lno = 0
+    for l in lines:
+        l = l.decode(encoding)
+        lno += 1
+        # If we get a comment line after a msgstr, this is a new entry
+        if l[0] == '#' and section == STR:
+            add(msgctxt, msgid, msgstr, fuzzy)
+            section = msgctxt = None
+            fuzzy = 0
+        # Record a fuzzy mark
+        if l[:2] == '#,' and 'fuzzy' in l:
+            fuzzy = 1
+        # Skip comments
+        if l[0] == '#':
+            continue
+        # Now we are in a msgid or msgctxt section, output previous section
+        if l.startswith('msgctxt'):
+            if section == STR:
+                add(msgctxt, msgid, msgstr, fuzzy)
+                # A fuzzy mark only applies to the entry it precedes
+                fuzzy = 0
+            section = CTXT
+            l = l[7:]
+            msgctxt = b''
+        elif l.startswith('msgid') and not l.startswith('msgid_plural'):
+            if section == STR:
+                add(msgctxt, msgid, msgstr, fuzzy)
+                if not msgid:
+                    # See whether there is an encoding declaration
+                    p = HeaderParser()
+                    charset = p.parsestr(msgstr.decode(encoding)).get_content_charset()
+                    if charset:
+                        encoding = charset
+                # The previous entry is finished: do not let its context or
+                # fuzzy mark leak into the entry that starts here
+                msgctxt = None
+                fuzzy = 0
+            section = ID
+            l = l[5:]
+            msgid = msgstr = b''
+            is_plural = False
+        # This is a message with plural forms
+        elif l.startswith('msgid_plural'):
+            if section != ID:
+                print('msgid_plural not preceded by msgid on %s:%d' % (infile, lno),
+                      file=sys.stderr)
+                sys.exit(1)
+            l = l[12:]
+            msgid += b'\0' # separator of singular and plural
+            is_plural = True
+        # Now we are in a msgstr section
+        elif l.startswith('msgstr'):
+            section = STR
+            if l.startswith('msgstr['):
+                if not is_plural:
+                    print('plural without msgid_plural on %s:%d' % (infile, lno),
+                          file=sys.stderr)
+                    sys.exit(1)
+                l = l.split(']', 1)[1]
+                if msgstr:
+                    msgstr += b'\0' # Separator of the various plural forms
+            else:
+                if is_plural:
+                    print('indexed msgstr required for plural on  %s:%d' % (infile, lno),
+                          file=sys.stderr)
+                    sys.exit(1)
+                l = l[6:]
+        # Skip empty lines
+        l = l.strip()
+        if not l:
+            continue
+        try:
+            text = ast.literal_eval(l)
+        except (SyntaxError, ValueError):
+            text = None
+        if not isinstance(text, str):
+            # Not a valid quoted string: report it instead of a traceback
+            print('Syntax error on %s:%d' % (infile, lno), \
+                  'before:', file=sys.stderr)
+            print(l, file=sys.stderr)
+            sys.exit(1)
+        l = text
+        if section == CTXT:
+            msgctxt += l.encode(encoding)
+        elif section == ID:
+            msgid += l.encode(encoding)
+        elif section == STR:
+            msgstr += l.encode(encoding)
+        else:
+            print('Syntax error on %s:%d' % (infile, lno), \
+                  'before:', file=sys.stderr)
+            print(l, file=sys.stderr)
+            sys.exit(1)
+    # Add last entry
+    if section == STR:
+        add(msgctxt, msgid, msgstr, fuzzy)
+
+    # Compute output
+    output = generate()
+
+    try:
+        with open(outfile,"wb") as f:
+            f.write(output)
+    except IOError as msg:
+        print(msg, file=sys.stderr)
+        # Fail like the read path does, so callers see a non-zero status
+        sys.exit(1)
+
+
+def main():
+    """Command-line entry point: parse options and compile each input file.
+
+    Recognizes -h/--help, -V/--version and -o/--output-file.  Every
+    positional argument is passed to make() together with the output file
+    given by -o (or None).  Exits with status 1 when the options are
+    invalid or when no input file is given.
+    """
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
+                                   ['help', 'version', 'output-file='])
+    except getopt.error as msg:
+        usage(1, msg)
+
+    outfile = None
+    # parse options
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage(0)
+        elif opt in ('-V', '--version'):
+            print("msgfmt.py", __version__)
+            sys.exit(0)
+        elif opt in ('-o', '--output-file'):
+            outfile = arg
+    # do it
    if not args:
        print('No input file given', file=sys.stderr)
-        sys.exit(1)
+        print("Try `msgfmt --help' for more information.", file=sys.stderr)
+        # A missing input file is a usage error, so report failure
+        sys.exit(1)

-    if not opts:
-        print('No output file given', file=sys.stderr)
-        sys.exit(1)
+    for filename in args:
+        make(filename, outfile)

-    with open(opts[0][1], "wb") as mo:
-        mo.write(Msgfmt(args[0]).get())

 if __name__ == '__main__':
    main()
-