mirror of
				https://github.com/cookiengineer/audacity
				synced 2025-11-04 08:04:06 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			675 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			675 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* -*- Mode: c; c-basic-offset: 2 -*-
 | 
						|
 *
 | 
						|
 * rdf_utf8.c - RDF UTF8 / Unicode chars helper routines Implementation
 | 
						|
 *
 | 
						|
 * Copyright (C) 2000-2008, David Beckett http://www.dajobe.org/
 | 
						|
 * Copyright (C) 2000-2004, University of Bristol, UK http://www.bristol.ac.uk/
 | 
						|
 * 
 | 
						|
 * This package is Free Software and part of Redland http://librdf.org/
 | 
						|
 * 
 | 
						|
 * It is licensed under the following three licenses as alternatives:
 | 
						|
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
 | 
						|
 *   2. GNU General Public License (GPL) V2 or any newer version
 | 
						|
 *   3. Apache License, V2.0 or any newer version
 | 
						|
 * 
 | 
						|
 * You may not use this file except in compliance with at least one of
 | 
						|
 * the above three licenses.
 | 
						|
 * 
 | 
						|
 * See LICENSE.html or LICENSE.txt at the top of this package for the
 | 
						|
 * complete terms and further detail along with the license texts for
 | 
						|
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
 | 
						|
 * 
 | 
						|
 * 
 | 
						|
 */
 | 
						|
 | 
						|
 | 
						|
#ifdef HAVE_CONFIG_H
 | 
						|
#include <rdf_config.h>
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef WIN32
 | 
						|
#include <win32_rdf_config.h>
 | 
						|
#endif
 | 
						|
 | 
						|
#include <stdio.h>
 | 
						|
#include <string.h>
 | 
						|
#include <ctype.h> /* for isprint() */
 | 
						|
 | 
						|
#include <redland.h>
 | 
						|
#include <rdf_utf8.h>
 | 
						|
 | 
						|
 | 
						|
#ifndef STANDALONE
 | 
						|
 | 
						|
/* UTF-8 encoding of 32 bit Unicode chars
 *
 * Characters  0x00000000 to 0x0000007f are US-ASCII
 * Characters  0x00000080 to 0x000000ff are ISO Latin 1 (ISO 8859-1)
 *
 * incoming char| outgoing
 * bytes | bits | representation
 * ==================================================
 *     1 |    7 | 0xxxxxxx
 *     2 |   11 | 110xxxxx 10xxxxxx
 *     3 |   16 | 1110xxxx 10xxxxxx 10xxxxxx
 *     4 |   21 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *     5 |   26 | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *     6 |   31 | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * The first byte of a multi-byte sequence is always in the range 0xC0-0xFD
 * Further bytes are all in the range 0x80-0xBF
 * No byte is ever 0xFE or 0xFF
 *
*/
 | 
						|
 | 
						|
/*
 * Unicode 3.0 Corrigendum #1: UTF-8 Shortest Form
 * http://www.unicode.org/versions/corrigendum1.html
 *
 * C12 
 *
 * (a) When a process generates data in a Unicode Transformation
 * Format, it shall not emit ill-formed code unit sequences.
 *
 * (b) When a process interprets data in a Unicode Transformation
 * Format, it shall treat illegal code unit sequences as an error
 * condition.
 *
 * (c) A conformant process shall not interpret illegal UTF code unit
 * sequences as characters.
 *
 * (d) Irregular UTF code unit sequences shall not be used for
 * encoding any other information.
 *
 *
 * My Summary: never encode non-shortest form UTF-8 sequences - they
 * are illegal sequences.  Do not accept them on decoding.
 *
 *       Table 3.1B. Legal UTF-8 Byte Sequences
 *   Code Points         1st Byte  2nd Byte  3rd Byte  4th Byte
 *   U+0000..U+007F      00..7F
 *   U+0080..U+07FF      C2..DF    80..BF
 *   U+0800..U+0FFF      E0        A0..BF    80..BF
 *   U+1000..U+FFFF      E1..EF    80..BF    80..BF
 *   U+10000..U+3FFFF    F0        90..BF    80..BF    80..BF
 *   U+40000..U+FFFFF    F1..F3    80..BF    80..BF    80..BF
 *   U+100000..U+10FFFF  F4        80..8F    80..BF    80..BF
 *
 */
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 * librdf_unicode_char_to_utf8:
 | 
						|
 * @c: Unicode character
 | 
						|
 * @output: UTF-8 string buffer or NULL
 | 
						|
 * @length: buffer size
 | 
						|
 *
 | 
						|
 * Convert a Unicode character to UTF-8 encoding.
 | 
						|
 * 
 | 
						|
 * If buffer is NULL, then will calculate the length rather than
 | 
						|
 * perform it.  This can be used by the caller to allocate space
 | 
						|
 * and then re-call this function with the new buffer.
 | 
						|
 * 
 | 
						|
 * Return value: bytes written to output buffer or <0 on failure
 | 
						|
 **/
 | 
						|
int
 | 
						|
librdf_unicode_char_to_utf8(librdf_unichar c, byte *output, int length)
 | 
						|
{
 | 
						|
  int size=0;
 | 
						|
 | 
						|
  /* check for illegal code positions:
 | 
						|
   * U+D800 to U+DFFF (UTF-16 surrogates)
 | 
						|
   * U+FFFE and U+FFFF
 | 
						|
   */
 | 
						|
  if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
 | 
						|
    return -1;
 | 
						|
 | 
						|
  /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
 | 
						|
  if(c > 0x10ffff)
 | 
						|
    return -1;
 | 
						|
  
 | 
						|
  if      (c < 0x00000080)
 | 
						|
    size=1;
 | 
						|
  else if (c < 0x00000800)
 | 
						|
    size=2;
 | 
						|
  else if (c < 0x00010000)
 | 
						|
    size=3;
 | 
						|
  else
 | 
						|
    size=4;
 | 
						|
 | 
						|
  /* when no buffer given, return size */
 | 
						|
  if(!output)
 | 
						|
    return size;
 | 
						|
 | 
						|
  if(size > length)
 | 
						|
    return -1;
 | 
						|
  
 | 
						|
  switch(size) {
 | 
						|
    case 4:
 | 
						|
      output[3]=0x80 | (c & 0x3F);
 | 
						|
      c= c >> 6;
 | 
						|
       /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */
 | 
						|
      c |= 0x10000; /* 0x10000 = 0x10 << 12 */
 | 
						|
      /* FALLTHROUGH */
 | 
						|
    case 3:
 | 
						|
      output[2]=0x80 | (c & 0x3F);
 | 
						|
      c= c >> 6;
 | 
						|
      /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */
 | 
						|
      c |= 0x800; /* 0x800 = 0x20 << 6 */
 | 
						|
      /* FALLTHROUGH */
 | 
						|
    case 2:
 | 
						|
      output[1]=0x80 | (c & 0x3F);
 | 
						|
      c= c >> 6;
 | 
						|
      /* set bits 7,6 on last byte */
 | 
						|
      c |= 0xc0; 
 | 
						|
      /* FALLTHROUGH */
 | 
						|
    case 1:
 | 
						|
      output[0]=c;
 | 
						|
  }
 | 
						|
 | 
						|
  return size;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 * librdf_utf8_to_unicode_char:
 | 
						|
 * @output: Pointer to the Unicode character or NULL
 | 
						|
 * @input: UTF-8 string buffer
 | 
						|
 * @length: buffer size
 | 
						|
 *
 | 
						|
 * Convert an UTF-8 encoded buffer to a Unicode character.
 | 
						|
 * 
 | 
						|
 * If output is NULL, then will calculate the number of bytes that
 | 
						|
 * will be used from the input buffer and not perform the conversion.
 | 
						|
 * 
 | 
						|
 * Return value: bytes used from input buffer or <0 on failure
 | 
						|
 **/
 | 
						|
int
 | 
						|
librdf_utf8_to_unicode_char(librdf_unichar *output, const byte *input, int length)
 | 
						|
{
 | 
						|
  byte in;
 | 
						|
  int size;
 | 
						|
  librdf_unichar c=0;
 | 
						|
  
 | 
						|
  if(length < 1)
 | 
						|
    return -1;
 | 
						|
 | 
						|
  in=*input++;
 | 
						|
  if((in & 0x80) == 0) { /* First byte 00..7F */
 | 
						|
    size=1;
 | 
						|
    c= in & 0x7f;
 | 
						|
  } else if((in & 0xe0) == 0xc0) { /* First byte C0..DF */
 | 
						|
    size=2;
 | 
						|
    c= in & 0x1f;
 | 
						|
  } else if((in & 0xf0) == 0xe0) { /* First byte E0..EF */
 | 
						|
    size=3;
 | 
						|
    c= in & 0x0f;
 | 
						|
  } else if((in & 0xf8) == 0xf0) { /* First byte F0..F7 */
 | 
						|
    size=4;
 | 
						|
    c = in & 0x07;
 | 
						|
  } else /* First byte anything else: 80..BF F8..FF - illegal */
 | 
						|
    return -1;
 | 
						|
 | 
						|
 | 
						|
  if(!output)
 | 
						|
    return size;
 | 
						|
 | 
						|
  if(length < size)
 | 
						|
    return -1;
 | 
						|
 | 
						|
  switch(size) {
 | 
						|
    case 4:
 | 
						|
      in=*input++ & 0x3f;
 | 
						|
      c= c << 6;
 | 
						|
      c |= in;
 | 
						|
      /* FALLTHROUGH */
 | 
						|
    case 3:
 | 
						|
      in=*input++ & 0x3f;
 | 
						|
      c= c << 6;
 | 
						|
      c |= in;
 | 
						|
      /* FALLTHROUGH */
 | 
						|
    case 2:
 | 
						|
      in=*input++ & 0x3f;
 | 
						|
      c= c << 6;
 | 
						|
      c |= in;
 | 
						|
      /* FALLTHROUGH */
 | 
						|
    default:
 | 
						|
      break;
 | 
						|
  }
 | 
						|
 | 
						|
 | 
						|
  /* check for overlong UTF-8 sequences */
 | 
						|
  switch(size) {
 | 
						|
    case 2:
 | 
						|
      if(c < 0x00000080)
 | 
						|
        return -2;
 | 
						|
      break;
 | 
						|
    case 3:
 | 
						|
      if(c < 0x00000800)
 | 
						|
        return -2;
 | 
						|
      break;
 | 
						|
    case 4:
 | 
						|
      if(c < 0x00010000)
 | 
						|
        return -2;
 | 
						|
      break;
 | 
						|
 | 
						|
    default: /* 1 */
 | 
						|
      break;
 | 
						|
  }
 | 
						|
 | 
						|
 | 
						|
  /* check for illegal code positions:
 | 
						|
   * U+D800 to U+DFFF (UTF-16 surrogates)
 | 
						|
   * U+FFFE and U+FFFF
 | 
						|
   */
 | 
						|
  if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
 | 
						|
    return -1;
 | 
						|
 | 
						|
  /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
 | 
						|
  /* of course this makes some 4 byte forms illegal */
 | 
						|
  if(c > 0x10ffff)
 | 
						|
    return -1;
 | 
						|
 | 
						|
  *output=c;
 | 
						|
 | 
						|
  return size;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 * librdf_utf8_to_latin1:
 | 
						|
 * @input: UTF-8 string buffer
 | 
						|
 * @length: buffer size
 | 
						|
 * @output_length: Pointer to variable to store resulting string length or NULL
 | 
						|
 *
 | 
						|
 * Convert a UTF-8 string to ISO Latin-1.
 | 
						|
 * 
 | 
						|
 * Converts the given UTF-8 string to the ISO Latin-1 subset of
 | 
						|
 * Unicode (characters 0x00-0xff), discarding any out of range
 | 
						|
 * characters.
 | 
						|
 *
 | 
						|
 * If the output_length pointer is not NULL, the returned string
 | 
						|
 * length will be stored there.
 | 
						|
 *
 | 
						|
 * Return value: pointer to new ISO Latin-1 string or NULL on failure
 | 
						|
 **/
 | 
						|
byte*
 | 
						|
librdf_utf8_to_latin1(const byte *input, int length, int *output_length)
 | 
						|
{
 | 
						|
  int utf8_char_length=0;
 | 
						|
  int utf8_byte_length=0;
 | 
						|
  int i;
 | 
						|
  int j;
 | 
						|
  byte *output;
 | 
						|
 | 
						|
  i=0;
 | 
						|
  while(input[i]) {
 | 
						|
    int size=librdf_utf8_to_unicode_char(NULL, &input[i], length-i);
 | 
						|
    if(size <= 0)
 | 
						|
      return NULL;
 | 
						|
    utf8_char_length++;
 | 
						|
    i+= size;
 | 
						|
  }
 | 
						|
 | 
						|
  /* This is a maximal length; since chars may be discarded, the
 | 
						|
   * actual length of the resulting can be shorter
 | 
						|
   */
 | 
						|
  utf8_byte_length=i;
 | 
						|
 | 
						|
 | 
						|
  output=(byte*)LIBRDF_MALLOC(byte_string, utf8_byte_length+1);
 | 
						|
  if(!output)
 | 
						|
    return NULL;
 | 
						|
  
 | 
						|
 | 
						|
  i=0; j=0;
 | 
						|
  while(i < utf8_byte_length) {
 | 
						|
    librdf_unichar c;
 | 
						|
    int size=librdf_utf8_to_unicode_char(&c, &input[i], length-i);
 | 
						|
    if(size <= 0)
 | 
						|
      return NULL;
 | 
						|
    if(c < 0x100) /* Discards characters! */
 | 
						|
      output[j++]=c;
 | 
						|
    i+= size;
 | 
						|
  } 
 | 
						|
  output[j]='\0';
 | 
						|
 | 
						|
  if(output_length)
 | 
						|
    *output_length=j;
 | 
						|
  
 | 
						|
  return output;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 * librdf_latin1_to_utf8:
 | 
						|
 * @input: ISO Latin-1 string buffer
 | 
						|
 * @length: buffer size
 | 
						|
 * @output_length: Pointer to variable to store resulting string length or NULL
 | 
						|
 *
 | 
						|
 * Convert an ISO Latin-1 encoded string to UTF-8.
 | 
						|
 * 
 | 
						|
 * Converts the given ISO Latin-1 string to an UTF-8 encoded string
 | 
						|
 * representing the same content.  This is lossless.
 | 
						|
 * 
 | 
						|
 * If the output_length pointer is not NULL, the returned string
 | 
						|
 * length will be stored there.
 | 
						|
 *
 | 
						|
 * Return value: pointer to new UTF-8 string or NULL on failure
 | 
						|
 **/
 | 
						|
byte*
 | 
						|
librdf_latin1_to_utf8(const byte *input, int length, int *output_length)
 | 
						|
{
 | 
						|
  int utf8_length=0;
 | 
						|
  int i;
 | 
						|
  int j;
 | 
						|
  byte *output;
 | 
						|
 | 
						|
  for(i=0; input[i]; i++) {
 | 
						|
    int size=librdf_unicode_char_to_utf8(input[i], NULL, length-i);
 | 
						|
    if(size <= 0)
 | 
						|
      return NULL;
 | 
						|
    utf8_length += size;
 | 
						|
  }
 | 
						|
 | 
						|
  output=(byte*)LIBRDF_MALLOC(byte_string, utf8_length+1);
 | 
						|
  if(!output)
 | 
						|
    return NULL;
 | 
						|
  
 | 
						|
 | 
						|
  j=0;
 | 
						|
  for(i=0; input[i]; i++) {
 | 
						|
    int size=librdf_unicode_char_to_utf8(input[i], &output[j], length-i);
 | 
						|
    if(size <= 0)
 | 
						|
      return NULL;
 | 
						|
    j+= size;
 | 
						|
  } 
 | 
						|
  output[j]='\0';
 | 
						|
 | 
						|
  if(output_length)
 | 
						|
    *output_length=j;
 | 
						|
  
 | 
						|
  return output;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 * librdf_utf8_print:
 | 
						|
 * @input: UTF-8 string buffer
 | 
						|
 * @length: buffer size
 | 
						|
 * @stream: FILE* stream
 | 
						|
 *
 | 
						|
 * Print a UTF-8 string to a stream.
 | 
						|
 * 
 | 
						|
 * Pretty prints the UTF-8 string in a pseudo-C character
 | 
						|
 * format like \u<emphasis>hex digits</emphasis> when the characters fail
 | 
						|
 * the isprint() test.
 | 
						|
 **/
 | 
						|
void
 | 
						|
librdf_utf8_print(const byte *input, int length, FILE *stream)
 | 
						|
{
 | 
						|
  int i=0;
 | 
						|
  
 | 
						|
  while(i<length && *input) {
 | 
						|
    librdf_unichar c;
 | 
						|
    int size=librdf_utf8_to_unicode_char(&c, input, length-i);
 | 
						|
    if(size <= 0)
 | 
						|
      return;
 | 
						|
    if(c < 0x100) {
 | 
						|
      if(isprint(c))
 | 
						|
        fputc(c, stream);
 | 
						|
      else
 | 
						|
        fprintf(stream, "\\u%02X", c);
 | 
						|
    } else if (c < 0x10000)
 | 
						|
      fprintf(stream, "\\u%04X", c);
 | 
						|
    else
 | 
						|
      fprintf(stream, "\\U%08X", c);
 | 
						|
    input += size;
 | 
						|
    i += size;
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
/* TEST CODE */
 | 
						|
 | 
						|
 | 
						|
#ifdef STANDALONE
 | 
						|
 | 
						|
/* static prototypes */
 | 
						|
void librdf_bad_string_print(const byte *input, int length, FILE *stream);
 | 
						|
int main(int argc, char *argv[]);
 | 
						|
 | 
						|
void
 | 
						|
librdf_bad_string_print(const byte *input, int length, FILE *stream)
 | 
						|
{
 | 
						|
  while(*input && length>0) {
 | 
						|
    char c=*input;
 | 
						|
    if(isprint(c))
 | 
						|
      fputc(c, stream);
 | 
						|
    else
 | 
						|
      fprintf(stream, "\\x%02X", (c & 0xff));
 | 
						|
    input++;
 | 
						|
    length--;
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
int
 | 
						|
main(int argc, char *argv[]) 
 | 
						|
{
 | 
						|
  const char *program=librdf_basename((const char*)argv[0]);
 | 
						|
  librdf_unichar c;
 | 
						|
  struct tv {
 | 
						|
    const byte *string;
 | 
						|
    const int length;
 | 
						|
    const librdf_unichar result;
 | 
						|
  };
 | 
						|
  struct tv *t;
 | 
						|
  struct tv test_values[]={
 | 
						|
    /* what is the capital of England? 'E' */
 | 
						|
    {(const byte*)"E", 1, 'E'},
 | 
						|
    /* latin small letter e with acute, U+00E9 ISOlat1 */
 | 
						|
    {(const byte*)"\xc3\xa9", 2, 0xE9},
 | 
						|
    /*  euro sign, U+20AC NEW */
 | 
						|
    {(const byte*)"\xe2\x82\xac", 3, 0x20AC}, 
 | 
						|
    /* unknown char - U+1FFFFF (21 bits) */
 | 
						|
 | 
						|
    /* First possible sequence of a certain length */
 | 
						|
    {(const byte*)"\x00",                     1, 0x00000000},
 | 
						|
    {(const byte*)"\xc2\x80",                 2, 0x00000080},
 | 
						|
    {(const byte*)"\xe0\xa0\x80",             3, 0x00000800}, 
 | 
						|
    {(const byte*)"\xf0\x90\x80\x80",         4, 0x00010000},
 | 
						|
 | 
						|
    /* Last possible sequence of a certain length */
 | 
						|
    {(const byte*)"\x7f",                     1, 0x0000007F},
 | 
						|
    {(const byte*)"\xdf\xbf",                 2, 0x000007FF},
 | 
						|
    {(const byte*)"\xef\xbf\xbd",             3, 0x0000FFFD}, /*no FFFE-FFFF */
 | 
						|
    {(const byte*)"\xf4\x8f\xbf\xbf",         4, 0x0010FFFF},
 | 
						|
 | 
						|
    /* Boundary conditions */
 | 
						|
    {(const byte*)"\xed\x9f\xbf",     3, 0x0000D7FF},
 | 
						|
    {(const byte*)"\xee\x80\x80",     3, 0x0000E000},
 | 
						|
    {(const byte*)"\xef\xbf\xbd",     3, 0x0000FFFD}, 
 | 
						|
    {(const byte*)"\xf4\x8f\xbf\xbf", 4, 0x0010FFFF},
 | 
						|
 | 
						|
    {NULL, 0, 0}
 | 
						|
  };
 | 
						|
  struct tv bad_test_values[]={
 | 
						|
    /* Sequences that cannot appear in UTF-8 */
 | 
						|
    {(const byte*)"\xfe",                     1, 0x000000FE},
 | 
						|
    {(const byte*)"\xff",                     1, 0x000000FF},
 | 
						|
    {(const byte*)"\xef\xbf\xbe",             3, 0x0000FFFE},
 | 
						|
    {(const byte*)"\xef\xbf\xbf",             3, 0x0000FFFF},
 | 
						|
 | 
						|
    /* Minumum (ASCII NUL) overlong sequences */
 | 
						|
    {(const byte*)"\xc0\x80",                 2, 0x00000000},
 | 
						|
    {(const byte*)"\xe0\x80\x80",             3, 0x00000000},
 | 
						|
    {(const byte*)"\xf0\x80\x80\x80",         4, 0x00000000},
 | 
						|
 | 
						|
    /* Maximum overlong sequences */
 | 
						|
    {(const byte*)"\xc1\xbf",                 2, 0x0000007F},
 | 
						|
    {(const byte*)"\xe0\x9f\xbf",             3, 0x000007FF},
 | 
						|
    {(const byte*)"\xf0\x8f\xbf\xbf",         4, 0x0000FFFF},
 | 
						|
 | 
						|
    /* Beyond U+10FFFF */
 | 
						|
    {(const byte*)"\xf4\x90\x80\x80",         4, 0x00110000},
 | 
						|
 | 
						|
    {NULL, 0, 0}
 | 
						|
  };
 | 
						|
 | 
						|
  const byte test_utf8_string[]="Lib" "\xc3\xa9" "ration costs " "\xe2\x82\xac" "3.50";
 | 
						|
  int test_utf8_string_length=strlen((const char*)test_utf8_string);
 | 
						|
  const byte result_latin1_string[]="Lib" "\xe9" "ration costs 3.50";
 | 
						|
  int result_latin1_string_length=strlen((const char*)result_latin1_string);
 | 
						|
  const byte result_utf8_string[]="Lib" "\xc3\xa9" "ration costs 3.50";
 | 
						|
  int result_utf8_string_length=strlen((const char*)result_utf8_string);
 | 
						|
  
 | 
						|
  int i;
 | 
						|
  byte *latin1_string;
 | 
						|
  int latin1_string_length;
 | 
						|
  byte *utf8_string;
 | 
						|
  int utf8_string_length;
 | 
						|
  int failures=0;
 | 
						|
  int verbose=0;
 | 
						|
 | 
						|
  for(i=0; (t=&test_values[i]) && t->string; i++) {
 | 
						|
    int size;
 | 
						|
    const byte *buffer=t->string;
 | 
						|
    int length=t->length;
 | 
						|
#define OUT_BUFFER_SIZE 6
 | 
						|
    byte out_buffer[OUT_BUFFER_SIZE];
 | 
						|
    
 | 
						|
    size=librdf_utf8_to_unicode_char(&c, buffer, length);
 | 
						|
    if(size < 0) {
 | 
						|
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char FAILED to convert UTF-8 string '", program);
 | 
						|
      librdf_bad_string_print(buffer, length, stderr);
 | 
						|
      fprintf(stderr, "' (length %d) to Unicode\n", length);
 | 
						|
      failures++;
 | 
						|
      continue;
 | 
						|
    }
 | 
						|
    if(c != t->result) {
 | 
						|
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char FAILED conversion of UTF-8 string '", program);
 | 
						|
      librdf_bad_string_print(buffer, size, stderr);
 | 
						|
      fprintf(stderr, "' to Unicode char U+%04X, expected U+%04X\n",
 | 
						|
              (u32)c, (u32)t->result);
 | 
						|
      failures++;
 | 
						|
      continue;
 | 
						|
    }
 | 
						|
 | 
						|
    if(verbose) {
 | 
						|
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char converted UTF-8 string '", program);
 | 
						|
      librdf_utf8_print(buffer, size, stderr);
 | 
						|
      fprintf(stderr, "' to Unicode char U+%04X correctly\n", (u32)c);
 | 
						|
    }
 | 
						|
 | 
						|
    size=librdf_unicode_char_to_utf8(t->result, out_buffer, OUT_BUFFER_SIZE);
 | 
						|
    if(size <= 0) {
 | 
						|
      fprintf(stderr, "%s: librdf_unicode_char_to_utf8 FAILED to convert U+%04X to UTF-8 string\n", program, (u32)t->result);
 | 
						|
      failures++;
 | 
						|
      continue;
 | 
						|
    }
 | 
						|
 | 
						|
    if(memcmp(out_buffer, buffer, length)) {
 | 
						|
      fprintf(stderr, "%s: librdf_unicode_char_to_utf8 FAILED conversion U+%04X to UTF-8 - returned '", program, (u32)t->result);
 | 
						|
      librdf_utf8_print(buffer, size, stderr);
 | 
						|
      fputs("', expected '", stderr);
 | 
						|
      librdf_utf8_print(out_buffer, t->length, stderr);
 | 
						|
      fputs("'\n", stderr);
 | 
						|
      failures++;
 | 
						|
      continue;
 | 
						|
    }
 | 
						|
    
 | 
						|
    if(verbose) {
 | 
						|
      fprintf(stderr, "%s: librdf_unicode_char_to_utf8 converted U+%04X to UTF-8 string '", program, (u32)t->result);
 | 
						|
      librdf_utf8_print(out_buffer, size, stderr);
 | 
						|
      fputs("' correctly\n", stderr);
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
 | 
						|
  /* Check for failures */
 | 
						|
  for(i=0; (t=&bad_test_values[i]) && t->string; i++) {
 | 
						|
    int size;
 | 
						|
    const byte *buffer=t->string;
 | 
						|
    int length=t->length;
 | 
						|
    
 | 
						|
    size=librdf_utf8_to_unicode_char(&c, buffer, length);
 | 
						|
    if(size >= 0) {
 | 
						|
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char SUCCEEDED when it should have failed to convert UTF-8 string '", program);
 | 
						|
      librdf_bad_string_print(buffer, length, stderr);
 | 
						|
      fprintf(stderr, "' (length %d) to Unicode\n", length);
 | 
						|
      failures++;
 | 
						|
      continue;
 | 
						|
    }
 | 
						|
    if(verbose) {
 | 
						|
      fprintf(stderr, "%s: librdf_utf8_to_unicode_char failed as expected converting bad UTF-8 string '", program);
 | 
						|
      librdf_bad_string_print(buffer, length, stderr);
 | 
						|
      fprintf(stderr, "' (length %d) to Unicode\n", length);
 | 
						|
    }
 | 
						|
  }
 | 
						|
  
 | 
						|
 | 
						|
 | 
						|
  latin1_string=librdf_utf8_to_latin1(test_utf8_string, 
 | 
						|
                                      test_utf8_string_length,
 | 
						|
                                      &latin1_string_length);
 | 
						|
  if(!latin1_string) {
 | 
						|
    fprintf(stderr, "%s: librdf_utf8_to_latin1 FAILED to convert UTF-8 string '", program);
 | 
						|
    librdf_bad_string_print(test_utf8_string, test_utf8_string_length, stderr);
 | 
						|
    fputs("' to Latin-1\n", stderr);
 | 
						|
    failures++;
 | 
						|
  }
 | 
						|
 | 
						|
  if(memcmp(latin1_string, result_latin1_string, result_latin1_string_length)) {
 | 
						|
    fprintf(stderr, "%s: librdf_utf8_to_latin1 FAILED to convert UTF-8 string '", program);
 | 
						|
    librdf_utf8_print(test_utf8_string, test_utf8_string_length, stderr);
 | 
						|
    fprintf(stderr, "' to Latin-1 - returned '%s' but expected '%s'\n",
 | 
						|
            latin1_string, result_latin1_string);
 | 
						|
    failures++;
 | 
						|
  }
 | 
						|
 | 
						|
  if(verbose) {
 | 
						|
    fprintf(stderr, "%s: librdf_utf8_to_latin1 converted UTF-8 string '",
 | 
						|
            program);
 | 
						|
    librdf_utf8_print(test_utf8_string, test_utf8_string_length, stderr);
 | 
						|
    fprintf(stderr, "' to Latin-1 string '%s' OK\n", latin1_string);
 | 
						|
  }
 | 
						|
  
 | 
						|
 | 
						|
  utf8_string=librdf_latin1_to_utf8(latin1_string, latin1_string_length,
 | 
						|
                                    &utf8_string_length);
 | 
						|
  if(!utf8_string) {
 | 
						|
    fprintf(stderr, "%s: librdf_latin1_to_utf8 FAILED to convert Latin-1 string '%s' to UTF-8\n", program, latin1_string);
 | 
						|
    failures++;
 | 
						|
  }
 | 
						|
 | 
						|
  if(memcmp(utf8_string, result_utf8_string, result_utf8_string_length)) {
 | 
						|
    fprintf(stderr, "%s: librdf_latin1_to_utf8 FAILED to convert Latin-1 string '%s' to UTF-8 - returned '", program, latin1_string);
 | 
						|
    librdf_utf8_print(utf8_string, utf8_string_length, stderr);
 | 
						|
    fputs("' but expected '", stderr);
 | 
						|
    librdf_utf8_print(result_utf8_string, result_utf8_string_length, stderr);
 | 
						|
    fputs("'\n", stderr);
 | 
						|
    failures++;
 | 
						|
  }
 | 
						|
 | 
						|
  if(verbose) {
 | 
						|
    fprintf(stderr, "%s: librdf_latin1_to_utf8 converted Latin-1 string '%s' to UTF-8 string '", program, latin1_string);
 | 
						|
    librdf_utf8_print(utf8_string, utf8_string_length, stderr);
 | 
						|
    fputs("' OK\n", stderr);
 | 
						|
  }
 | 
						|
 | 
						|
  LIBRDF_FREE(cstring, latin1_string);
 | 
						|
  LIBRDF_FREE(cstring, utf8_string);
 | 
						|
 | 
						|
#ifdef LIBRDF_MEMORY_DEBUG 
 | 
						|
  librdf_memory_report(stderr);
 | 
						|
#endif
 | 
						|
 
 | 
						|
  return failures;
 | 
						|
}
 | 
						|
 | 
						|
#endif
 |