#! /usr/bin/env python3
# Written by Martin v. Löwis

"""Generate binary message catalog from textual translation description.

This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file).  This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation.  It handles
message contexts and plural forms.

Usage: msgfmt.py [OPTIONS] filename.po

Options:
    -o file
    --output-file=file
        Specify the output file to write to.  If omitted, output will go to a
        file named filename.mo (based off the input file name).  If several
        input files are given together with this option, their messages are
        merged into that one file.

    -h
    --help
        Print this message and exit.

    -V
    --version
        Display version information and exit.
"""

import os
import sys
import ast
import codecs
import getopt
import struct
import array
from email.parser import HeaderParser

__version__ = "1.2"

# Catalog being built: maps encoded msgid (optionally prefixed with
# b"<context>\x04") to the encoded msgstr.  Always mutated in place so that
# references obtained with "from msgfmt import MESSAGES" stay valid.
MESSAGES = {}


def usage(code, msg=''):
    """Print the module help text and, if given, *msg* to stderr; exit with *code*."""
    print(__doc__, file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)


def add(ctxt, id, str, fuzzy):
    """Add a non-empty, non-fuzzy translation to MESSAGES.

    ctxt  -- message context as bytes, or None when the entry has no msgctxt
    id    -- encoded msgid (singular and plural joined by NUL for plurals)
    str   -- encoded msgstr (plural variants joined by NUL)
    fuzzy -- true when the entry carries the "fuzzy" flag; such entries are
             skipped, as are entries with an empty translation
    """
    if fuzzy or not str:
        return
    if ctxt is None:
        MESSAGES[id] = str
    else:
        # The context is put before the id, separated by an EOT character.
        MESSAGES[b"%b\x04%b" % (ctxt, id)] = str


def generate():
    """Return the binary .mo image (bytes) for the current MESSAGES.

    Layout: a header of seven 32-bit integers, the key index, the value
    index, then all NUL-terminated keys followed by all NUL-terminated
    values.  No hash table is written.
    """
    # The keys are sorted in the .mo file.
    keys = sorted(MESSAGES)
    ids = b''.join(key + b'\0' for key in keys)
    strs = b''.join(MESSAGES[key] + b'\0' for key in keys)

    # We don't use hash tables, so the keys start right after the two index
    # tables (8 bytes per entry in each table) ...
    keystart = 7 * 4 + 16 * len(keys)
    # ... and the values start after the keys.
    valuestart = keystart + len(ids)

    # Each index entry holds first the size of the string, then its file
    # offset.  The terminating NUL does not count into the size.
    koffsets = []
    voffsets = []
    kpos = keystart
    vpos = valuestart
    for key in keys:
        value = MESSAGES[key]
        koffsets += [len(key), kpos]
        voffsets += [len(value), vpos]
        kpos += len(key) + 1
        vpos += len(value) + 1

    header = struct.pack("Iiiiiii",
                         0x950412de,                # Magic
                         0,                         # Version
                         len(keys),                 # # of entries
                         7 * 4,                     # start of key index
                         7 * 4 + len(keys) * 8,     # start of value index
                         0, 0)                      # size and offset of hash table
    return header + array.array("i", koffsets + voffsets).tobytes() + ids + strs


def _fail(*lines):
    """Print each of *lines* to stderr and exit with status 1."""
    for line in lines:
        print(line, file=sys.stderr)
    sys.exit(1)


def _header_charset(header, fallback):
    """Return the charset declared in the encoded catalog *header*.

    *header* is the msgstr of the entry with the empty msgid, encoded with
    *fallback*.  *fallback* is returned when no charset is declared or when
    the declared name is not a codec Python knows (for instance the
    "CHARSET" placeholder of an unfilled template).
    """
    declared = HeaderParser().parsestr(header.decode(fallback)).get_content_charset()
    if not declared:
        return fallback
    try:
        codecs.lookup(declared)
    except LookupError:
        return fallback
    return declared


def _compile(filename, outfile, reset):
    """Parse one .po file into MESSAGES and write the resulting .mo file.

    filename -- input path; ".po" is appended when it is missing
    outfile  -- output path, or None to derive it from the input path
    reset    -- when true, MESSAGES is emptied first so the output contains
                only this file's messages; when false, the file is merged
                into what MESSAGES already holds

    Exits with status 1 on unreadable input or on a syntax error in the
    catalog.  A failure to write the output is reported on stderr only.
    """
    ID = 1
    STR = 2
    CTXT = 3

    # Compute .mo name from .po name and arguments
    infile = filename if filename.endswith('.po') else filename + '.po'
    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '.mo'

    try:
        with open(infile, 'rb') as f:
            lines = f.readlines()
    except IOError as msg:
        _fail(msg)

    if reset:
        MESSAGES.clear()

    # A UTF-8 byte order mark would otherwise be glued to the first keyword.
    if lines and lines[0].startswith(codecs.BOM_UTF8):
        lines[0] = lines[0][len(codecs.BOM_UTF8):]

    section = msgctxt = None
    msgid = msgstr = b''
    is_plural = False
    fuzzy = 0

    # Start off assuming Latin-1, so everything decodes without failure,
    # until we know the exact encoding
    encoding = 'latin-1'

    # Parse the catalog
    for lno, raw in enumerate(lines, 1):
        l = raw.decode(encoding)
        is_comment = l.startswith('#')
        # If we get a comment line after a msgstr, this is a new entry
        if is_comment and section == STR:
            add(msgctxt, msgid, msgstr, fuzzy)
            if not msgid:
                encoding = _header_charset(msgstr, encoding)
            section = msgctxt = None
            fuzzy = 0
        # Record a fuzzy mark
        if l[:2] == '#,' and 'fuzzy' in l:
            fuzzy = 1
        # Skip comments
        if is_comment:
            continue
        # Now we are in a msgid or msgctxt section, output previous section
        if l.startswith('msgctxt'):
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
                if not msgid:
                    encoding = _header_charset(msgstr, encoding)
                # No comment was seen since the previous entry, so its fuzzy
                # flag must not carry over to this one.
                fuzzy = 0
            section = CTXT
            l = l[7:]
            msgctxt = b''
        elif l.startswith('msgid') and not l.startswith('msgid_plural'):
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
                if not msgid:
                    encoding = _header_charset(msgstr, encoding)
                # The finished entry's context and fuzzy flag do not apply
                # to the entry that starts here.
                msgctxt = None
                fuzzy = 0
            section = ID
            l = l[5:]
            msgid = msgstr = b''
            is_plural = False
        # This is a message with plural forms
        elif l.startswith('msgid_plural'):
            if section != ID:
                _fail('msgid_plural not preceded by msgid on %s:%d' % (infile, lno))
            l = l[12:]
            msgid += b'\0'  # separator of singular and plural
            is_plural = True
        # Now we are in a msgstr section
        elif l.startswith('msgstr'):
            section = STR
            if l.startswith('msgstr['):
                if not is_plural:
                    _fail('plural without msgid_plural on %s:%d' % (infile, lno))
                _, bracket, rest = l.partition(']')
                if not bracket:
                    _fail('Syntax error on %s:%d before:' % (infile, lno), l.strip())
                l = rest
                if msgstr:
                    msgstr += b'\0'  # Separator of the various plural forms
            else:
                if is_plural:
                    _fail('indexed msgstr required for plural on %s:%d' % (infile, lno))
                l = l[6:]
        # Skip empty lines
        l = l.strip()
        if not l:
            continue
        # The quoted text is evaluated as a Python string literal.
        try:
            text = ast.literal_eval(l)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            text = None
        if not isinstance(text, str) or section is None:
            _fail('Syntax error on %s:%d before:' % (infile, lno), l)
        data = text.encode(encoding)
        if section == CTXT:
            msgctxt += data
        elif section == ID:
            msgid += data
        else:
            msgstr += data
    # Add last entry
    if section == STR:
        add(msgctxt, msgid, msgstr, fuzzy)

    # Compute output
    output = generate()

    try:
        with open(outfile, "wb") as f:
            f.write(output)
    except IOError as msg:
        print(msg, file=sys.stderr)


def make(filename, outfile):
    """Compile the catalog *filename* (.po) into the binary file *outfile*.

    ".po" is appended to *filename* when missing.  When *outfile* is None the
    output name is the input name with its extension replaced by ".mo".
    Each call starts from an empty MESSAGES, so the output contains only the
    messages of *filename*.
    """
    _compile(filename, outfile, reset=True)


def main():
    """Command-line entry point; see the module docstring for the options."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
                                   ['help', 'version', 'output-file='])
    except getopt.error as msg:
        usage(1, msg)

    outfile = None
    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-V', '--version'):
            print("msgfmt.py", __version__)
            sys.exit(0)
        elif opt in ('-o', '--output-file'):
            outfile = arg
    # do it
    if not args:
        print('No input file given', file=sys.stderr)
        print("Try `msgfmt --help' for more information.", file=sys.stderr)
        return

    for index, filename in enumerate(args):
        # Without -o every input gets its own .mo and must start from an
        # empty catalog.  With -o all inputs accumulate into the one output.
        _compile(filename, outfile, reset=(outfile is None or index == 0))


if __name__ == '__main__':
    main()