/* -*- Mode: c; c-basic-offset: 2 -*- * * rdf_utf8.c - RDF UTF8 / Unicode chars helper routines Implementation * * Copyright (C) 2000-2008, David Beckett http://www.dajobe.org/ * Copyright (C) 2000-2004, University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * * */ #ifdef HAVE_CONFIG_H #include #endif #ifdef WIN32 #include #endif #include #include #include /* for isprint() */ #include #include #ifndef STANDALONE /* UTF-8 encoding of 32 bit Unicode chars * * Characters 0x00000000 to 0x0000007f are US-ASCII * Characters 0x00000080 to 0x000000ff are ISO Latin 1 (ISO 8859-1) * * incoming char| outgoing * bytes | bits | representation * ================================================== * 1 | 7 | 0xxxxxxx * 2 | 11 | 110xxxxx 10xxxxxx * 3 | 16 | 1110xxxx 10xxxxxx 10xxxxxx * 4 | 21 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 5 | 26 | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 6 | 31 | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * * The first byte is always in the range 0xC0-0xFD * Further bytes are all in the range 0x80-0xBF * No byte is ever 0xFE or 0xFF * */ /* * Unicode 3.0 Corrigendum #1: UTF-8 Shortest Form * http://www.unicode.org/versions/corrigendum1.html * * C12 * * (a) When a process generates data in a Unicode Transformation * Format, it shall not emit ill-formed code unit sequences. 
 *
 * (b) When a process interprets data in a Unicode Transformation
 *     Format, it shall treat illegal code unit sequences as an error
 *     condition.
 *
 * (c) A conformant process shall not interpret illegal UTF code unit
 *     sequences as characters.
 *
 * (d) Irregular UTF code unit sequences shall not be used for
 *     encoding any other information.
 *
 *
 * My Summary: never encode non-shortest form UTF-8 sequences - they
 * are illegal sequences.  Do not accept them on decoding.
 *
 * Table 3.1B. Legal UTF-8 Byte Sequences
 * Code Points          1st Byte  2nd Byte  3rd Byte  4th Byte
 * U+0000..U+007F       00..7F
 * U+0080..U+07FF       C2..DF    80..BF
 * U+0800..U+0FFF       E0        A0..BF    80..BF
 * U+1000..U+FFFF       E1..EF    80..BF    80..BF
 * U+10000..U+3FFFF     F0        90..BF    80..BF    80..BF
 * U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
 * U+100000..U+10FFFF   F4        80..8F    80..BF    80..BF
 *
 */

/**
 * librdf_unicode_char_to_utf8:
 * @c: Unicode character
 * @output: UTF-8 string buffer or NULL
 * @length: buffer size
 *
 * Convert a Unicode character to UTF-8 encoding.
 *
 * If @output is NULL, the number of bytes the encoding needs is
 * calculated and returned without performing the conversion.  This
 * can be used by the caller to allocate space and then re-call this
 * function with the new buffer.
* * Return value: bytes written to output buffer or <0 on failure **/ int librdf_unicode_char_to_utf8(librdf_unichar c, byte *output, int length) { int size=0; /* check for illegal code positions: * U+D800 to U+DFFF (UTF-16 surrogates) * U+FFFE and U+FFFF */ if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF) return -1; /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */ if(c > 0x10ffff) return -1; if (c < 0x00000080) size=1; else if (c < 0x00000800) size=2; else if (c < 0x00010000) size=3; else size=4; /* when no buffer given, return size */ if(!output) return size; if(size > length) return -1; switch(size) { case 4: output[3]=0x80 | (c & 0x3F); c= c >> 6; /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */ c |= 0x10000; /* 0x10000 = 0x10 << 12 */ /* FALLTHROUGH */ case 3: output[2]=0x80 | (c & 0x3F); c= c >> 6; /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */ c |= 0x800; /* 0x800 = 0x20 << 6 */ /* FALLTHROUGH */ case 2: output[1]=0x80 | (c & 0x3F); c= c >> 6; /* set bits 7,6 on last byte */ c |= 0xc0; /* FALLTHROUGH */ case 1: output[0]=c; } return size; } /** * librdf_utf8_to_unicode_char: * @output: Pointer to the Unicode character or NULL * @input: UTF-8 string buffer * @length: buffer size * * Convert an UTF-8 encoded buffer to a Unicode character. * * If output is NULL, then will calculate the number of bytes that * will be used from the input buffer and not perform the conversion. 
* * Return value: bytes used from input buffer or <0 on failure **/ int librdf_utf8_to_unicode_char(librdf_unichar *output, const byte *input, int length) { byte in; int size; librdf_unichar c=0; if(length < 1) return -1; in=*input++; if((in & 0x80) == 0) { /* First byte 00..7F */ size=1; c= in & 0x7f; } else if((in & 0xe0) == 0xc0) { /* First byte C0..DF */ size=2; c= in & 0x1f; } else if((in & 0xf0) == 0xe0) { /* First byte E0..EF */ size=3; c= in & 0x0f; } else if((in & 0xf8) == 0xf0) { /* First byte F0..F7 */ size=4; c = in & 0x07; } else /* First byte anything else: 80..BF F8..FF - illegal */ return -1; if(!output) return size; if(length < size) return -1; switch(size) { case 4: in=*input++ & 0x3f; c= c << 6; c |= in; /* FALLTHROUGH */ case 3: in=*input++ & 0x3f; c= c << 6; c |= in; /* FALLTHROUGH */ case 2: in=*input++ & 0x3f; c= c << 6; c |= in; /* FALLTHROUGH */ default: break; } /* check for overlong UTF-8 sequences */ switch(size) { case 2: if(c < 0x00000080) return -2; break; case 3: if(c < 0x00000800) return -2; break; case 4: if(c < 0x00010000) return -2; break; default: /* 1 */ break; } /* check for illegal code positions: * U+D800 to U+DFFF (UTF-16 surrogates) * U+FFFE and U+FFFF */ if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF) return -1; /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */ /* of course this makes some 4 byte forms illegal */ if(c > 0x10ffff) return -1; *output=c; return size; } /** * librdf_utf8_to_latin1: * @input: UTF-8 string buffer * @length: buffer size * @output_length: Pointer to variable to store resulting string length or NULL * * Convert a UTF-8 string to ISO Latin-1. * * Converts the given UTF-8 string to the ISO Latin-1 subset of * Unicode (characters 0x00-0xff), discarding any out of range * characters. * * If the output_length pointer is not NULL, the returned string * length will be stored there. 
* * Return value: pointer to new ISO Latin-1 string or NULL on failure **/ byte* librdf_utf8_to_latin1(const byte *input, int length, int *output_length) { int utf8_char_length=0; int utf8_byte_length=0; int i; int j; byte *output; i=0; while(input[i]) { int size=librdf_utf8_to_unicode_char(NULL, &input[i], length-i); if(size <= 0) return NULL; utf8_char_length++; i+= size; } /* This is a maximal length; since chars may be discarded, the * actual length of the resulting can be shorter */ utf8_byte_length=i; output=(byte*)LIBRDF_MALLOC(byte_string, utf8_byte_length+1); if(!output) return NULL; i=0; j=0; while(i < utf8_byte_length) { librdf_unichar c; int size=librdf_utf8_to_unicode_char(&c, &input[i], length-i); if(size <= 0) return NULL; if(c < 0x100) /* Discards characters! */ output[j++]=c; i+= size; } output[j]='\0'; if(output_length) *output_length=j; return output; } /** * librdf_latin1_to_utf8: * @input: ISO Latin-1 string buffer * @length: buffer size * @output_length: Pointer to variable to store resulting string length or NULL * * Convert an ISO Latin-1 encoded string to UTF-8. * * Converts the given ISO Latin-1 string to an UTF-8 encoded string * representing the same content. This is lossless. * * If the output_length pointer is not NULL, the returned string * length will be stored there. 
* * Return value: pointer to new UTF-8 string or NULL on failure **/ byte* librdf_latin1_to_utf8(const byte *input, int length, int *output_length) { int utf8_length=0; int i; int j; byte *output; for(i=0; input[i]; i++) { int size=librdf_unicode_char_to_utf8(input[i], NULL, length-i); if(size <= 0) return NULL; utf8_length += size; } output=(byte*)LIBRDF_MALLOC(byte_string, utf8_length+1); if(!output) return NULL; j=0; for(i=0; input[i]; i++) { int size=librdf_unicode_char_to_utf8(input[i], &output[j], length-i); if(size <= 0) return NULL; j+= size; } output[j]='\0'; if(output_length) *output_length=j; return output; } /** * librdf_utf8_print: * @input: UTF-8 string buffer * @length: buffer size * @stream: FILE* stream * * Print a UTF-8 string to a stream. * * Pretty prints the UTF-8 string in a pseudo-C character * format like \uhex digits when the characters fail * the isprint() test. **/ void librdf_utf8_print(const byte *input, int length, FILE *stream) { int i=0; while(i0) { char c=*input; if(isprint(c)) fputc(c, stream); else fprintf(stream, "\\x%02X", (c & 0xff)); input++; length--; } } int main(int argc, char *argv[]) { const char *program=librdf_basename((const char*)argv[0]); librdf_unichar c; struct tv { const byte *string; const int length; const librdf_unichar result; }; struct tv *t; struct tv test_values[]={ /* what is the capital of England? 
'E' */ {(const byte*)"E", 1, 'E'}, /* latin small letter e with acute, U+00E9 ISOlat1 */ {(const byte*)"\xc3\xa9", 2, 0xE9}, /* euro sign, U+20AC NEW */ {(const byte*)"\xe2\x82\xac", 3, 0x20AC}, /* unknown char - U+1FFFFF (21 bits) */ /* First possible sequence of a certain length */ {(const byte*)"\x00", 1, 0x00000000}, {(const byte*)"\xc2\x80", 2, 0x00000080}, {(const byte*)"\xe0\xa0\x80", 3, 0x00000800}, {(const byte*)"\xf0\x90\x80\x80", 4, 0x00010000}, /* Last possible sequence of a certain length */ {(const byte*)"\x7f", 1, 0x0000007F}, {(const byte*)"\xdf\xbf", 2, 0x000007FF}, {(const byte*)"\xef\xbf\xbd", 3, 0x0000FFFD}, /*no FFFE-FFFF */ {(const byte*)"\xf4\x8f\xbf\xbf", 4, 0x0010FFFF}, /* Boundary conditions */ {(const byte*)"\xed\x9f\xbf", 3, 0x0000D7FF}, {(const byte*)"\xee\x80\x80", 3, 0x0000E000}, {(const byte*)"\xef\xbf\xbd", 3, 0x0000FFFD}, {(const byte*)"\xf4\x8f\xbf\xbf", 4, 0x0010FFFF}, {NULL, 0, 0} }; struct tv bad_test_values[]={ /* Sequences that cannot appear in UTF-8 */ {(const byte*)"\xfe", 1, 0x000000FE}, {(const byte*)"\xff", 1, 0x000000FF}, {(const byte*)"\xef\xbf\xbe", 3, 0x0000FFFE}, {(const byte*)"\xef\xbf\xbf", 3, 0x0000FFFF}, /* Minumum (ASCII NUL) overlong sequences */ {(const byte*)"\xc0\x80", 2, 0x00000000}, {(const byte*)"\xe0\x80\x80", 3, 0x00000000}, {(const byte*)"\xf0\x80\x80\x80", 4, 0x00000000}, /* Maximum overlong sequences */ {(const byte*)"\xc1\xbf", 2, 0x0000007F}, {(const byte*)"\xe0\x9f\xbf", 3, 0x000007FF}, {(const byte*)"\xf0\x8f\xbf\xbf", 4, 0x0000FFFF}, /* Beyond U+10FFFF */ {(const byte*)"\xf4\x90\x80\x80", 4, 0x00110000}, {NULL, 0, 0} }; const byte test_utf8_string[]="Lib" "\xc3\xa9" "ration costs " "\xe2\x82\xac" "3.50"; int test_utf8_string_length=strlen((const char*)test_utf8_string); const byte result_latin1_string[]="Lib" "\xe9" "ration costs 3.50"; int result_latin1_string_length=strlen((const char*)result_latin1_string); const byte result_utf8_string[]="Lib" "\xc3\xa9" "ration costs 3.50"; int 
result_utf8_string_length=strlen((const char*)result_utf8_string); int i; byte *latin1_string; int latin1_string_length; byte *utf8_string; int utf8_string_length; int failures=0; int verbose=0; for(i=0; (t=&test_values[i]) && t->string; i++) { int size; const byte *buffer=t->string; int length=t->length; #define OUT_BUFFER_SIZE 6 byte out_buffer[OUT_BUFFER_SIZE]; size=librdf_utf8_to_unicode_char(&c, buffer, length); if(size < 0) { fprintf(stderr, "%s: librdf_utf8_to_unicode_char FAILED to convert UTF-8 string '", program); librdf_bad_string_print(buffer, length, stderr); fprintf(stderr, "' (length %d) to Unicode\n", length); failures++; continue; } if(c != t->result) { fprintf(stderr, "%s: librdf_utf8_to_unicode_char FAILED conversion of UTF-8 string '", program); librdf_bad_string_print(buffer, size, stderr); fprintf(stderr, "' to Unicode char U+%04X, expected U+%04X\n", (u32)c, (u32)t->result); failures++; continue; } if(verbose) { fprintf(stderr, "%s: librdf_utf8_to_unicode_char converted UTF-8 string '", program); librdf_utf8_print(buffer, size, stderr); fprintf(stderr, "' to Unicode char U+%04X correctly\n", (u32)c); } size=librdf_unicode_char_to_utf8(t->result, out_buffer, OUT_BUFFER_SIZE); if(size <= 0) { fprintf(stderr, "%s: librdf_unicode_char_to_utf8 FAILED to convert U+%04X to UTF-8 string\n", program, (u32)t->result); failures++; continue; } if(memcmp(out_buffer, buffer, length)) { fprintf(stderr, "%s: librdf_unicode_char_to_utf8 FAILED conversion U+%04X to UTF-8 - returned '", program, (u32)t->result); librdf_utf8_print(buffer, size, stderr); fputs("', expected '", stderr); librdf_utf8_print(out_buffer, t->length, stderr); fputs("'\n", stderr); failures++; continue; } if(verbose) { fprintf(stderr, "%s: librdf_unicode_char_to_utf8 converted U+%04X to UTF-8 string '", program, (u32)t->result); librdf_utf8_print(out_buffer, size, stderr); fputs("' correctly\n", stderr); } } /* Check for failures */ for(i=0; (t=&bad_test_values[i]) && t->string; i++) { 
int size; const byte *buffer=t->string; int length=t->length; size=librdf_utf8_to_unicode_char(&c, buffer, length); if(size >= 0) { fprintf(stderr, "%s: librdf_utf8_to_unicode_char SUCCEEDED when it should have failed to convert UTF-8 string '", program); librdf_bad_string_print(buffer, length, stderr); fprintf(stderr, "' (length %d) to Unicode\n", length); failures++; continue; } if(verbose) { fprintf(stderr, "%s: librdf_utf8_to_unicode_char failed as expected converting bad UTF-8 string '", program); librdf_bad_string_print(buffer, length, stderr); fprintf(stderr, "' (length %d) to Unicode\n", length); } } latin1_string=librdf_utf8_to_latin1(test_utf8_string, test_utf8_string_length, &latin1_string_length); if(!latin1_string) { fprintf(stderr, "%s: librdf_utf8_to_latin1 FAILED to convert UTF-8 string '", program); librdf_bad_string_print(test_utf8_string, test_utf8_string_length, stderr); fputs("' to Latin-1\n", stderr); failures++; } if(memcmp(latin1_string, result_latin1_string, result_latin1_string_length)) { fprintf(stderr, "%s: librdf_utf8_to_latin1 FAILED to convert UTF-8 string '", program); librdf_utf8_print(test_utf8_string, test_utf8_string_length, stderr); fprintf(stderr, "' to Latin-1 - returned '%s' but expected '%s'\n", latin1_string, result_latin1_string); failures++; } if(verbose) { fprintf(stderr, "%s: librdf_utf8_to_latin1 converted UTF-8 string '", program); librdf_utf8_print(test_utf8_string, test_utf8_string_length, stderr); fprintf(stderr, "' to Latin-1 string '%s' OK\n", latin1_string); } utf8_string=librdf_latin1_to_utf8(latin1_string, latin1_string_length, &utf8_string_length); if(!utf8_string) { fprintf(stderr, "%s: librdf_latin1_to_utf8 FAILED to convert Latin-1 string '%s' to UTF-8\n", program, latin1_string); failures++; } if(memcmp(utf8_string, result_utf8_string, result_utf8_string_length)) { fprintf(stderr, "%s: librdf_latin1_to_utf8 FAILED to convert Latin-1 string '%s' to UTF-8 - returned '", program, latin1_string); 
librdf_utf8_print(utf8_string, utf8_string_length, stderr); fputs("' but expected '", stderr); librdf_utf8_print(result_utf8_string, result_utf8_string_length, stderr); fputs("'\n", stderr); failures++; } if(verbose) { fprintf(stderr, "%s: librdf_latin1_to_utf8 converted Latin-1 string '%s' to UTF-8 string '", program, latin1_string); librdf_utf8_print(utf8_string, utf8_string_length, stderr); fputs("' OK\n", stderr); } LIBRDF_FREE(cstring, latin1_string); LIBRDF_FREE(cstring, utf8_string); #ifdef LIBRDF_MEMORY_DEBUG librdf_memory_report(stderr); #endif return failures; } #endif