mirror of
				https://github.com/cookiengineer/audacity
				synced 2025-10-31 14:13:50 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			366 lines
		
	
	
		
			8.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			366 lines
		
	
	
		
			8.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * libid3tag - ID3 tag manipulation library
 | |
|  * Copyright (C) 2000-2004 Underbit Technologies, Inc.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU General Public License as published by
 | |
|  * the Free Software Foundation; either version 2 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  *
 | |
|  * This program is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU General Public License
 | |
|  * along with this program; if not, write to the Free Software
 | |
|  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | |
|  *
 | |
|  * $Id: utf8.c,v 1.3 2004-06-08 06:38:15 dmazzoni Exp $
 | |
|  */
 | |
| 
 | |
| # ifdef HAVE_CONFIG_H
 | |
| #  include "config.h"
 | |
| # endif
 | |
| 
 | |
| # include "global.h"
 | |
| 
 | |
| # include <stdlib.h>
 | |
| 
 | |
| # include "id3tag.h"
 | |
| # include "utf8.h"
 | |
| # include "ucs4.h"
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->length()
 | |
|  * DESCRIPTION:	return the number of ucs4 chars represented by a utf8 string
 | |
|  */
 | |
| id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
 | |
| {
 | |
|   id3_length_t length = 0;
 | |
| 
 | |
|   while (*utf8) {
 | |
|     if ((utf8[0] & 0x80) == 0x00)
 | |
|       ++length;
 | |
|     else if ((utf8[0] & 0xe0) == 0xc0 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80) {
 | |
|       if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
 | |
| 	++length;
 | |
| 	utf8 += 1;
 | |
|       }
 | |
|     }
 | |
|     else if ((utf8[0] & 0xf0) == 0xe0 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80) {
 | |
|       if ((((utf8[0] & 0x0fL) << 12) |
 | |
| 	   ((utf8[1] & 0x3fL) <<  6)) >= 0x00000800L) {
 | |
| 	++length;
 | |
| 	utf8 += 2;
 | |
|       }
 | |
|     }
 | |
|     else if ((utf8[0] & 0xf8) == 0xf0 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[3] & 0xc0) == 0x80) {
 | |
|       if ((((utf8[0] & 0x07L) << 18) |
 | |
| 	   ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
 | |
| 	++length;
 | |
| 	utf8 += 3;
 | |
|       }
 | |
|     }
 | |
|     else if ((utf8[0] & 0xfc) == 0xf8 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[3] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[4] & 0xc0) == 0x80) {
 | |
|       if ((((utf8[0] & 0x03L) << 24) |
 | |
| 	   ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
 | |
| 	++length;
 | |
| 	utf8 += 4;
 | |
|       }
 | |
|     }
 | |
|     else if ((utf8[0] & 0xfe) == 0xfc &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[3] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[4] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[5] & 0xc0) == 0x80) {
 | |
|       if ((((utf8[0] & 0x01L) << 30) |
 | |
| 	   ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
 | |
| 	++length;
 | |
| 	utf8 += 5;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     ++utf8;
 | |
|   }
 | |
| 
 | |
|   return length;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->size()
 | |
|  * DESCRIPTION:	return the encoding size of a utf8 string
 | |
|  */
 | |
| id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
 | |
| {
 | |
|   id3_utf8_t const *ptr = utf8;
 | |
| 
 | |
|   while (*ptr)
 | |
|     ++ptr;
 | |
| 
 | |
|   return ptr - utf8 + 1;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->ucs4duplicate()
 | |
|  * DESCRIPTION:	duplicate and decode a utf8 string into ucs4
 | |
|  */
 | |
| id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
 | |
| {
 | |
|   id3_ucs4_t *ucs4;
 | |
| 
 | |
|   ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
 | |
|   if (ucs4)
 | |
|     id3_utf8_decode(utf8, ucs4);
 | |
| 
 | |
|   return release(ucs4);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->decodechar()
 | |
|  * DESCRIPTION:	decode a series of utf8 chars into a single ucs4 char
 | |
|  */
 | |
| id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
 | |
| {
 | |
|   id3_utf8_t const *start = utf8;
 | |
| 
 | |
|   while (1) {
 | |
|     if ((utf8[0] & 0x80) == 0x00) {
 | |
|       *ucs4 = utf8[0];
 | |
|       return utf8 - start + 1;
 | |
|     }
 | |
|     else if ((utf8[0] & 0xe0) == 0xc0 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80) {
 | |
|       *ucs4 =
 | |
| 	((utf8[0] & 0x1fL) << 6) |
 | |
| 	((utf8[1] & 0x3fL) << 0);
 | |
|       if (*ucs4 >= 0x00000080L)
 | |
| 	return utf8 - start + 2;
 | |
|     }
 | |
|     else if ((utf8[0] & 0xf0) == 0xe0 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80) {
 | |
|       *ucs4 =
 | |
| 	((utf8[0] & 0x0fL) << 12) |
 | |
| 	((utf8[1] & 0x3fL) <<  6) |
 | |
| 	((utf8[2] & 0x3fL) <<  0);
 | |
|       if (*ucs4 >= 0x00000800L)
 | |
| 	return utf8 - start + 3;
 | |
|     }
 | |
|     else if ((utf8[0] & 0xf8) == 0xf0 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[3] & 0xc0) == 0x80) {
 | |
|       *ucs4 =
 | |
| 	((utf8[0] & 0x07L) << 18) |
 | |
| 	((utf8[1] & 0x3fL) << 12) |
 | |
| 	((utf8[2] & 0x3fL) <<  6) |
 | |
| 	((utf8[3] & 0x3fL) <<  0);
 | |
|       if (*ucs4 >= 0x00010000L)
 | |
| 	return utf8 - start + 4;
 | |
|     }
 | |
|     else if ((utf8[0] & 0xfc) == 0xf8 &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[3] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[4] & 0xc0) == 0x80) {
 | |
|       *ucs4 =
 | |
| 	((utf8[0] & 0x03L) << 24) |
 | |
| 	((utf8[1] & 0x3fL) << 18) |
 | |
| 	((utf8[2] & 0x3fL) << 12) |
 | |
| 	((utf8[3] & 0x3fL) <<  6) |
 | |
| 	((utf8[4] & 0x3fL) <<  0);
 | |
|       if (*ucs4 >= 0x00200000L)
 | |
| 	return utf8 - start + 5;
 | |
|     }
 | |
|     else if ((utf8[0] & 0xfe) == 0xfc &&
 | |
| 	     (utf8[1] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[2] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[3] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[4] & 0xc0) == 0x80 &&
 | |
| 	     (utf8[5] & 0xc0) == 0x80) {
 | |
|       *ucs4 =
 | |
| 	((utf8[0] & 0x01L) << 30) |
 | |
| 	((utf8[1] & 0x3fL) << 24) |
 | |
| 	((utf8[2] & 0x3fL) << 18) |
 | |
| 	((utf8[3] & 0x3fL) << 12) |
 | |
| 	((utf8[4] & 0x3fL) <<  6) |
 | |
| 	((utf8[5] & 0x3fL) <<  0);
 | |
|       if (*ucs4 >= 0x04000000L)
 | |
| 	return utf8 - start + 6;
 | |
|     }
 | |
| 
 | |
|     ++utf8;
 | |
|   }
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->encodechar()
 | |
|  * DESCRIPTION:	encode a single ucs4 char into a series of up to 6 utf8 chars
 | |
|  */
 | |
| id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
 | |
| {
 | |
|   if (ucs4 <= 0x0000007fL) {
 | |
|     utf8[0] = ucs4;
 | |
| 
 | |
|     return 1;
 | |
|   }
 | |
|   else if (ucs4 <= 0x000007ffL) {
 | |
|     utf8[0] = 0xc0 | ((ucs4 >>  6) & 0x1f);
 | |
|     utf8[1] = 0x80 | ((ucs4 >>  0) & 0x3f);
 | |
| 
 | |
|     return 2;
 | |
|   }
 | |
|   else if (ucs4 <= 0x0000ffffL) {
 | |
|     utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
 | |
|     utf8[1] = 0x80 | ((ucs4 >>  6) & 0x3f);
 | |
|     utf8[2] = 0x80 | ((ucs4 >>  0) & 0x3f);
 | |
| 
 | |
|     return 3;
 | |
|   }
 | |
|   else if (ucs4 <= 0x001fffffL) {
 | |
|     utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
 | |
|     utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
 | |
|     utf8[2] = 0x80 | ((ucs4 >>  6) & 0x3f);
 | |
|     utf8[3] = 0x80 | ((ucs4 >>  0) & 0x3f);
 | |
| 
 | |
|     return 4;
 | |
|   }
 | |
|   else if (ucs4 <= 0x03ffffffL) {
 | |
|     utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
 | |
|     utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
 | |
|     utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
 | |
|     utf8[3] = 0x80 | ((ucs4 >>  6) & 0x3f);
 | |
|     utf8[4] = 0x80 | ((ucs4 >>  0) & 0x3f);
 | |
| 
 | |
|     return 5;
 | |
|   }
 | |
|   else if (ucs4 <= 0x7fffffffL) {
 | |
|     utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
 | |
|     utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
 | |
|     utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
 | |
|     utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
 | |
|     utf8[4] = 0x80 | ((ucs4 >>  6) & 0x3f);
 | |
|     utf8[5] = 0x80 | ((ucs4 >>  0) & 0x3f);
 | |
| 
 | |
|     return 6;
 | |
|   }
 | |
| 
 | |
|   /* default */
 | |
| 
 | |
|   return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->decode()
 | |
|  * DESCRIPTION:	decode a complete utf8 string into a ucs4 string
 | |
|  */
 | |
| void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
 | |
| {
 | |
|   do
 | |
|     utf8 += id3_utf8_decodechar(utf8, ucs4);
 | |
|   while (*ucs4++);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->encode()
 | |
|  * DESCRIPTION:	encode a complete ucs4 string into a utf8 string
 | |
|  */
 | |
| void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
 | |
| {
 | |
|   do
 | |
|     utf8 += id3_utf8_encodechar(utf8, *ucs4);
 | |
|   while (*ucs4++);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->put()
 | |
|  * DESCRIPTION:	serialize a single utf8 character
 | |
|  */
 | |
| id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
 | |
| {
 | |
|   if (ptr)
 | |
|     *(*ptr)++ = utf8;
 | |
| 
 | |
|   return 1;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->get()
 | |
|  * DESCRIPTION:	deserialize a single utf8 character
 | |
|  */
 | |
| id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
 | |
| {
 | |
|   return *(*ptr)++;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->serialize()
 | |
|  * DESCRIPTION:	serialize a ucs4 string using utf8 encoding
 | |
|  */
 | |
| id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
 | |
| 				int terminate)
 | |
| {
 | |
|   id3_length_t size = 0;
 | |
|   id3_utf8_t utf8[6], *out;
 | |
| 
 | |
|   while (*ucs4) {
 | |
|     switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
 | |
|     case 6: size += id3_utf8_put(ptr, *out++);
 | |
|     case 5: size += id3_utf8_put(ptr, *out++);
 | |
|     case 4: size += id3_utf8_put(ptr, *out++);
 | |
|     case 3: size += id3_utf8_put(ptr, *out++);
 | |
|     case 2: size += id3_utf8_put(ptr, *out++);
 | |
|     case 1: size += id3_utf8_put(ptr, *out++);
 | |
|     case 0: break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   if (terminate)
 | |
|     size += id3_utf8_put(ptr, 0);
 | |
| 
 | |
|   return size;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NAME:	utf8->deserialize()
 | |
|  * DESCRIPTION:	deserialize a ucs4 string using utf8 encoding
 | |
|  */
 | |
| id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
 | |
| {
 | |
|   id3_byte_t const *end;
 | |
|   id3_utf8_t *utf8ptr, *utf8;
 | |
|   id3_ucs4_t *ucs4;
 | |
| 
 | |
|   end = *ptr + length;
 | |
| 
 | |
|   utf8 = malloc((length + 1) * sizeof(*utf8));
 | |
|   if (utf8 == 0)
 | |
|     return 0;
 | |
| 
 | |
|   utf8ptr = utf8;
 | |
|   while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
 | |
|     ++utf8ptr;
 | |
| 
 | |
|   *utf8ptr = 0;
 | |
| 
 | |
|   ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
 | |
|   if (ucs4)
 | |
|     id3_utf8_decode(utf8, ucs4);
 | |
| 
 | |
|   free(utf8);
 | |
| 
 | |
|   return ucs4;
 | |
| }
 |