1
0
mirror of https://github.com/cookiengineer/audacity synced 2025-05-05 06:09:47 +02:00
2010-01-24 09:19:39 +00:00

675 lines
18 KiB
C

/* -*- Mode: c; c-basic-offset: 2 -*-
*
* rdf_utf8.c - RDF UTF8 / Unicode chars helper routines Implementation
*
* Copyright (C) 2000-2008, David Beckett http://www.dajobe.org/
* Copyright (C) 2000-2004, University of Bristol, UK http://www.bristol.ac.uk/
*
* This package is Free Software and part of Redland http://librdf.org/
*
* It is licensed under the following three licenses as alternatives:
* 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
* 2. GNU General Public License (GPL) V2 or any newer version
* 3. Apache License, V2.0 or any newer version
*
* You may not use this file except in compliance with at least one of
* the above three licenses.
*
* See LICENSE.html or LICENSE.txt at the top of this package for the
* complete terms and further detail along with the license texts for
* the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
*
*
*/
#ifdef HAVE_CONFIG_H
#include <rdf_config.h>
#endif
#ifdef WIN32
#include <win32_rdf_config.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h> /* for isprint() */
#include <redland.h>
#include <rdf_utf8.h>
#ifndef STANDALONE
/* UTF-8 encoding of 32 bit Unicode chars
*
* Characters 0x00000000 to 0x0000007f are US-ASCII
* Characters 0x00000080 to 0x000000ff are ISO Latin 1 (ISO 8859-1)
*
* incoming char| outgoing
* bytes | bits | representation
* ==================================================
* 1 | 7 | 0xxxxxxx
* 2 | 11 | 110xxxxx 10xxxxxx
* 3 | 16 | 1110xxxx 10xxxxxx 10xxxxxx
* 4 | 21 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 5 | 26 | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 6 | 31 | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* The first byte of a multi-byte sequence is always in the range 0xC0-0xFD
* Further bytes are all in the range 0x80-0xBF
* No byte is ever 0xFE or 0xFF
*
*/
/*
* Unicode 3.0 Corrigendum #1: UTF-8 Shortest Form
* http://www.unicode.org/versions/corrigendum1.html
*
* C12
*
* (a) When a process generates data in a Unicode Transformation
* Format, it shall not emit ill-formed code unit sequences.
*
* (b) When a process interprets data in a Unicode Transformation
* Format, it shall treat illegal code unit sequences as an error
* condition.
*
* (c) A conformant process shall not interpret illegal UTF code unit
* sequences as characters.
*
* (d) Irregular UTF code unit sequences shall not be used for
* encoding any other information.
*
*
* My Summary: never encode non-shortest form UTF-8 sequences - they
* are illegal sequences. Do not accept them on decoding.
*
* Table 3.1B. Legal UTF-8 Byte Sequences
* Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
* U+0000..U+007F 00..7F
* U+0080..U+07FF C2..DF 80..BF
* U+0800..U+0FFF E0 A0..BF 80..BF
* U+1000..U+FFFF E1..EF 80..BF 80..BF
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
*
*/
/**
 * librdf_unicode_char_to_utf8:
 * @c: Unicode character
 * @output: UTF-8 string buffer or NULL
 * @length: buffer size
 *
 * Encode one Unicode character as a UTF-8 byte sequence.
 *
 * When @output is NULL nothing is written; only the number of bytes
 * the encoding needs is returned, so the caller can size a buffer and
 * call again.  No terminating NUL is written in either case.
 *
 * Return value: bytes written to output buffer (or bytes needed when
 * @output is NULL), or <0 on failure
 **/
int
librdf_unicode_char_to_utf8(librdf_unichar c, byte *output, int length)
{
  int size;
  int pos;

  /* Code positions that have no legal encoding:
   *   U+D800 to U+DFFF (UTF-16 surrogates)
   *   U+FFFE and U+FFFF
   *   anything beyond U+10FFFF
   */
  if(c >= 0xD800 && c <= 0xDFFF)
    return -1;
  if(c == 0xFFFE || c == 0xFFFF)
    return -1;
  if(c > 0x10ffff)
    return -1;

  /* pick the shortest form that can hold the value */
  if(c >= 0x00010000)
    size=4;
  else if(c >= 0x00000800)
    size=3;
  else if(c >= 0x00000080)
    size=2;
  else
    size=1;

  /* size-only query */
  if(!output)
    return size;

  if(length < size)
    return -1;

  /* trailing bytes, last one first: 10xxxxxx with 6 payload bits each */
  for(pos=size-1; pos > 0; pos--) {
    output[pos]=0x80 | (c & 0x3F);
    c= c >> 6;
  }

  /* whatever is left of c fits under the length marker of the lead byte */
  switch(size) {
    case 4:
      output[0]=0xF0 | c;   /* 11110xxx */
      break;
    case 3:
      output[0]=0xE0 | c;   /* 1110xxxx */
      break;
    case 2:
      output[0]=0xC0 | c;   /* 110xxxxx */
      break;
    default:
      output[0]=c;          /* 0xxxxxxx */
      break;
  }

  return size;
}
/**
 * librdf_utf8_to_unicode_char:
 * @output: Pointer to the Unicode character or NULL
 * @input: UTF-8 string buffer
 * @length: buffer size
 *
 * Convert an UTF-8 encoded buffer to a Unicode character.
 *
 * If output is NULL, then will calculate the number of bytes that
 * will be used from the input buffer and not perform the conversion.
 * In that mode only the first byte is inspected: the sequence is not
 * validated and the size is not checked against @length.
 *
 * When converting, the sequence is rejected if it is truncated, if any
 * trailing byte is not of the form 10xxxxxx, if it is an overlong
 * (non-shortest) form, or if it encodes a UTF-16 surrogate, U+FFFE,
 * U+FFFF or a value above U+10FFFF.  @output is only written on success.
 *
 * Return value: bytes used from input buffer or <0 on failure
 * (-2 for an overlong sequence, -1 for every other failure)
 **/
int
librdf_utf8_to_unicode_char(librdf_unichar *output, const byte *input, int length)
{
  /* smallest code point that legitimately needs a sequence of each size */
  static const librdf_unichar min_for_size[5]={0, 0, 0x80, 0x800, 0x10000};
  byte in;
  int size;
  int i;
  librdf_unichar c=0;

  if(length < 1)
    return -1;

  /* the lead byte gives the sequence length and the top payload bits */
  in=input[0];
  if((in & 0x80) == 0) {           /* First byte 00..7F */
    size=1;
    c= in & 0x7f;
  } else if((in & 0xe0) == 0xc0) { /* First byte C0..DF */
    size=2;
    c= in & 0x1f;
  } else if((in & 0xf0) == 0xe0) { /* First byte E0..EF */
    size=3;
    c= in & 0x0f;
  } else if((in & 0xf8) == 0xf0) { /* First byte F0..F7 */
    size=4;
    c = in & 0x07;
  } else                           /* First byte anything else: 80..BF F8..FF - illegal */
    return -1;

  /* size-only query */
  if(!output)
    return size;

  if(length < size)
    return -1;

  /* Fold in the trailing bytes, 6 bits each.  Each one must be in
   * 80..BF; without this check an arbitrary byte (including a string's
   * terminating NUL) would be silently absorbed into the character.
   */
  for(i=1; i < size; i++) {
    in=input[i];
    if((in & 0xc0) != 0x80)
      return -1;
    c= (c << 6) | (in & 0x3f);
  }

  /* check for overlong UTF-8 sequences */
  if(size > 1 && c < min_for_size[size])
    return -2;

  /* check for illegal code positions:
   * U+D800 to U+DFFF (UTF-16 surrogates)
   * U+FFFE and U+FFFF
   */
  if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
    return -1;

  /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
  /* of course this makes some 4 byte forms illegal */
  if(c > 0x10ffff)
    return -1;

  *output=c;
  return size;
}
/**
* librdf_utf8_to_latin1:
* @input: UTF-8 string buffer
* @length: buffer size
* @output_length: Pointer to variable to store resulting string length or NULL
*
* Convert a UTF-8 string to ISO Latin-1.
*
* Converts the given UTF-8 string to the ISO Latin-1 subset of
* Unicode (characters 0x00-0xff), discarding any out of range
* characters.
*
* If the output_length pointer is not NULL, the returned string
* length will be stored there.
*
* Return value: pointer to new ISO Latin-1 string or NULL on failure
**/
byte*
librdf_utf8_to_latin1(const byte *input, int length, int *output_length)
{
int utf8_char_length=0;
int utf8_byte_length=0;
int i;
int j;
byte *output;
i=0;
while(input[i]) {
int size=librdf_utf8_to_unicode_char(NULL, &input[i], length-i);
if(size <= 0)
return NULL;
utf8_char_length++;
i+= size;
}
/* This is a maximal length; since chars may be discarded, the
* actual length of the resulting can be shorter
*/
utf8_byte_length=i;
output=(byte*)LIBRDF_MALLOC(byte_string, utf8_byte_length+1);
if(!output)
return NULL;
i=0; j=0;
while(i < utf8_byte_length) {
librdf_unichar c;
int size=librdf_utf8_to_unicode_char(&c, &input[i], length-i);
if(size <= 0)
return NULL;
if(c < 0x100) /* Discards characters! */
output[j++]=c;
i+= size;
}
output[j]='\0';
if(output_length)
*output_length=j;
return output;
}
/**
* librdf_latin1_to_utf8:
* @input: ISO Latin-1 string buffer
* @length: buffer size
* @output_length: Pointer to variable to store resulting string length or NULL
*
* Convert an ISO Latin-1 encoded string to UTF-8.
*
* Converts the given ISO Latin-1 string to an UTF-8 encoded string
* representing the same content. This is lossless.
*
* If the output_length pointer is not NULL, the returned string
* length will be stored there.
*
* Return value: pointer to new UTF-8 string or NULL on failure
**/
byte*
librdf_latin1_to_utf8(const byte *input, int length, int *output_length)
{
int utf8_length=0;
int i;
int j;
byte *output;
for(i=0; input[i]; i++) {
int size=librdf_unicode_char_to_utf8(input[i], NULL, length-i);
if(size <= 0)
return NULL;
utf8_length += size;
}
output=(byte*)LIBRDF_MALLOC(byte_string, utf8_length+1);
if(!output)
return NULL;
j=0;
for(i=0; input[i]; i++) {
int size=librdf_unicode_char_to_utf8(input[i], &output[j], length-i);
if(size <= 0)
return NULL;
j+= size;
}
output[j]='\0';
if(output_length)
*output_length=j;
return output;
}
/**
 * librdf_utf8_print:
 * @input: UTF-8 string buffer
 * @length: buffer size
 * @stream: FILE* stream
 *
 * Print a UTF-8 string to a stream.
 *
 * Characters below U+0100 that pass the isprint() test are written
 * as-is.  Everything else is written in a pseudo-C escape format:
 * \u<emphasis>hex digits</emphasis> for characters below U+10000 and
 * \U<emphasis>hex digits</emphasis> for the rest.
 *
 * Printing stops at a NUL byte, after @length bytes, or silently at
 * the first sequence that cannot be decoded.
 **/
void
librdf_utf8_print(const byte *input, int length, FILE *stream)
{
  const byte *cursor=input;
  int remaining=length;

  while(remaining > 0 && *cursor) {
    librdf_unichar c;
    int used=librdf_utf8_to_unicode_char(&c, cursor, remaining);

    /* undecodable input: give up on the rest of the string */
    if(used <= 0)
      return;

    if(c < 0x100) {
      if(isprint(c))
        fputc(c, stream);
      else
        fprintf(stream, "\\u%02X", c);
    } else if(c < 0x10000) {
      fprintf(stream, "\\u%04X", c);
    } else {
      fprintf(stream, "\\U%08X", c);
    }

    cursor += used;
    remaining -= used;
  }
}
#endif
/* TEST CODE */
#ifdef STANDALONE
/* static prototypes */
void librdf_bad_string_print(const byte *input, int length, FILE *stream);
int main(int argc, char *argv[]);
/*
 * librdf_bad_string_print - print a possibly malformed byte string
 * @input: byte buffer
 * @length: buffer size
 * @stream: FILE* stream
 *
 * Writes printable bytes as-is and every other byte as \xHH.  No
 * UTF-8 decoding is attempted, so this is safe to use on input that
 * failed conversion.  Stops after @length bytes or at a NUL byte.
 */
void
librdf_bad_string_print(const byte *input, int length, FILE *stream)
{
  /* test length first so the byte after the buffer is never read */
  while(length > 0 && *input) {
    /* isprint() is only defined for unsigned char values (and EOF);
     * a plain char holding a byte >= 0x80 may be negative.
     */
    unsigned char c=(unsigned char)*input;
    if(isprint(c))
      fputc(c, stream);
    else
      fprintf(stream, "\\x%02X", (unsigned int)c);
    input++;
    length--;
  }
}
/*
 * Self-test driver (STANDALONE builds only).
 *
 * Checks that:
 *  - every entry of test_values decodes to the expected character and
 *    encodes back to the same bytes
 *  - every entry of bad_test_values is rejected by the decoder
 *  - a sample string survives UTF-8 -> Latin-1 -> UTF-8, minus the
 *    character that has no Latin-1 form
 *
 * Returns the number of failed checks, so 0 means success.
 */
int
main(int argc, char *argv[])
{
  const char *program=librdf_basename((const char*)argv[0]);
  librdf_unichar c;
  struct tv {
    const byte *string;
    const int length;
    const librdf_unichar result;
  };
  struct tv *t;
  struct tv test_values[]={
    /* what is the capital of England? 'E' */
    {(const byte*)"E", 1, 'E'},
    /* latin small letter e with acute, U+00E9 ISOlat1 */
    {(const byte*)"\xc3\xa9", 2, 0xE9},
    /* euro sign, U+20AC NEW */
    {(const byte*)"\xe2\x82\xac", 3, 0x20AC},
    /* First possible sequence of a certain length */
    {(const byte*)"\x00", 1, 0x00000000},
    {(const byte*)"\xc2\x80", 2, 0x00000080},
    {(const byte*)"\xe0\xa0\x80", 3, 0x00000800},
    {(const byte*)"\xf0\x90\x80\x80", 4, 0x00010000},
    /* Last possible sequence of a certain length */
    {(const byte*)"\x7f", 1, 0x0000007F},
    {(const byte*)"\xdf\xbf", 2, 0x000007FF},
    {(const byte*)"\xef\xbf\xbd", 3, 0x0000FFFD}, /*no FFFE-FFFF */
    {(const byte*)"\xf4\x8f\xbf\xbf", 4, 0x0010FFFF},
    /* Boundary conditions */
    {(const byte*)"\xed\x9f\xbf", 3, 0x0000D7FF},
    {(const byte*)"\xee\x80\x80", 3, 0x0000E000},
    {(const byte*)"\xef\xbf\xbd", 3, 0x0000FFFD},
    {(const byte*)"\xf4\x8f\xbf\xbf", 4, 0x0010FFFF},
    {NULL, 0, 0}
  };
  struct tv bad_test_values[]={
    /* Sequences that cannot appear in UTF-8 */
    {(const byte*)"\xfe", 1, 0x000000FE},
    {(const byte*)"\xff", 1, 0x000000FF},
    {(const byte*)"\xef\xbf\xbe", 3, 0x0000FFFE},
    {(const byte*)"\xef\xbf\xbf", 3, 0x0000FFFF},
    /* Minimum (ASCII NUL) overlong sequences */
    {(const byte*)"\xc0\x80", 2, 0x00000000},
    {(const byte*)"\xe0\x80\x80", 3, 0x00000000},
    {(const byte*)"\xf0\x80\x80\x80", 4, 0x00000000},
    /* Maximum overlong sequences */
    {(const byte*)"\xc1\xbf", 2, 0x0000007F},
    {(const byte*)"\xe0\x9f\xbf", 3, 0x000007FF},
    {(const byte*)"\xf0\x8f\xbf\xbf", 4, 0x0000FFFF},
    /* Beyond U+10FFFF */
    {(const byte*)"\xf4\x90\x80\x80", 4, 0x00110000},
    {NULL, 0, 0}
  };
  const byte test_utf8_string[]="Lib" "\xc3\xa9" "ration costs " "\xe2\x82\xac" "3.50";
  int test_utf8_string_length=strlen((const char*)test_utf8_string);
  const byte result_latin1_string[]="Lib" "\xe9" "ration costs 3.50";
  int result_latin1_string_length=strlen((const char*)result_latin1_string);
  const byte result_utf8_string[]="Lib" "\xc3\xa9" "ration costs 3.50";
  int result_utf8_string_length=strlen((const char*)result_utf8_string);
  int i;
  byte *latin1_string=NULL;
  int latin1_string_length=0;
  byte *utf8_string=NULL;
  int utf8_string_length=0;
  int failures=0;
  int verbose=0;

  /* Round trip: decode each good sequence, then encode the result */
  for(i=0; (t=&test_values[i]) && t->string; i++) {
    int size;
    const byte *buffer=t->string;
    int length=t->length;
#define OUT_BUFFER_SIZE 6
    byte out_buffer[OUT_BUFFER_SIZE];

    size=librdf_utf8_to_unicode_char(&c, buffer, length);
    if(size < 0) {
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char FAILED to convert UTF-8 string '", program);
      librdf_bad_string_print(buffer, length, stderr);
      fprintf(stderr, "' (length %d) to Unicode\n", length);
      failures++;
      continue;
    }
    if(c != t->result) {
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char FAILED conversion of UTF-8 string '", program);
      librdf_bad_string_print(buffer, size, stderr);
      fprintf(stderr, "' to Unicode char U+%04X, expected U+%04X\n",
              (u32)c, (u32)t->result);
      failures++;
      continue;
    }
    if(verbose) {
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char converted UTF-8 string '", program);
      librdf_utf8_print(buffer, size, stderr);
      fprintf(stderr, "' to Unicode char U+%04X correctly\n", (u32)c);
    }

    size=librdf_unicode_char_to_utf8(t->result, out_buffer, OUT_BUFFER_SIZE);
    if(size <= 0) {
      fprintf(stderr, "%s: librdf_unicode_char_to_utf8 FAILED to convert U+%04X to UTF-8 string\n", program, (u32)t->result);
      failures++;
      continue;
    }
    /* a different length is a mismatch even when the leading bytes agree */
    if(size != length || memcmp(out_buffer, buffer, length)) {
      fprintf(stderr, "%s: librdf_unicode_char_to_utf8 FAILED conversion U+%04X to UTF-8 - returned '", program, (u32)t->result);
      /* out_buffer is what the encoder produced; buffer is the reference */
      librdf_utf8_print(out_buffer, size, stderr);
      fputs("', expected '", stderr);
      librdf_utf8_print(buffer, length, stderr);
      fputs("'\n", stderr);
      failures++;
      continue;
    }
    if(verbose) {
      fprintf(stderr, "%s: librdf_unicode_char_to_utf8 converted U+%04X to UTF-8 string '", program, (u32)t->result);
      librdf_utf8_print(out_buffer, size, stderr);
      fputs("' correctly\n", stderr);
    }
  }

  /* Check for failures: every bad sequence must be rejected */
  for(i=0; (t=&bad_test_values[i]) && t->string; i++) {
    int size;
    const byte *buffer=t->string;
    int length=t->length;

    size=librdf_utf8_to_unicode_char(&c, buffer, length);
    if(size >= 0) {
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char SUCCEEDED when it should have failed to convert UTF-8 string '", program);
      librdf_bad_string_print(buffer, length, stderr);
      fprintf(stderr, "' (length %d) to Unicode\n", length);
      failures++;
      continue;
    }
    if(verbose) {
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char failed as expected converting bad UTF-8 string '", program);
      librdf_bad_string_print(buffer, length, stderr);
      fprintf(stderr, "' (length %d) to Unicode\n", length);
    }
  }

  /* UTF-8 -> Latin-1; the euro sign has no Latin-1 form and is dropped */
  latin1_string=librdf_utf8_to_latin1(test_utf8_string,
                                      test_utf8_string_length,
                                      &latin1_string_length);
  if(!latin1_string) {
    fprintf(stderr, "%s: librdf_utf8_to_latin1 FAILED to convert UTF-8 string '", program);
    librdf_bad_string_print(test_utf8_string, test_utf8_string_length, stderr);
    fputs("' to Latin-1\n", stderr);
    failures++;
    /* nothing to compare or convert back; the checks below would
     * dereference NULL
     */
    goto tidy;
  }
  if(latin1_string_length != result_latin1_string_length ||
     memcmp(latin1_string, result_latin1_string, result_latin1_string_length)) {
    fprintf(stderr, "%s: librdf_utf8_to_latin1 FAILED to convert UTF-8 string '", program);
    librdf_utf8_print(test_utf8_string, test_utf8_string_length, stderr);
    fprintf(stderr, "' to Latin-1 - returned '%s' but expected '%s'\n",
            latin1_string, result_latin1_string);
    failures++;
  }
  if(verbose) {
    fprintf(stderr, "%s: librdf_utf8_to_latin1 converted UTF-8 string '",
            program);
    librdf_utf8_print(test_utf8_string, test_utf8_string_length, stderr);
    fprintf(stderr, "' to Latin-1 string '%s' OK\n", latin1_string);
  }

  /* Latin-1 -> UTF-8 */
  utf8_string=librdf_latin1_to_utf8(latin1_string, latin1_string_length,
                                    &utf8_string_length);
  if(!utf8_string) {
    fprintf(stderr, "%s: librdf_latin1_to_utf8 FAILED to convert Latin-1 string '%s' to UTF-8\n", program, latin1_string);
    failures++;
    goto tidy;
  }
  if(utf8_string_length != result_utf8_string_length ||
     memcmp(utf8_string, result_utf8_string, result_utf8_string_length)) {
    fprintf(stderr, "%s: librdf_latin1_to_utf8 FAILED to convert Latin-1 string '%s' to UTF-8 - returned '", program, latin1_string);
    librdf_utf8_print(utf8_string, utf8_string_length, stderr);
    fputs("' but expected '", stderr);
    librdf_utf8_print(result_utf8_string, result_utf8_string_length, stderr);
    fputs("'\n", stderr);
    failures++;
  }
  if(verbose) {
    fprintf(stderr, "%s: librdf_latin1_to_utf8 converted Latin-1 string '%s' to UTF-8 string '", program, latin1_string);
    librdf_utf8_print(utf8_string, utf8_string_length, stderr);
    fputs("' OK\n", stderr);
  }

tidy:
  if(latin1_string) {
    LIBRDF_FREE(cstring, latin1_string);
  }
  if(utf8_string) {
    LIBRDF_FREE(cstring, utf8_string);
  }
#ifdef LIBRDF_MEMORY_DEBUG
  librdf_memory_report(stderr);
#endif
  return failures;
}
#endif