PeerCast.root/PeerCast/core/common/utf8.c

   1 /*
   2  * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
   3  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  */
  19
  20 /*
  21  * Convert a string between UTF-8 and the locale's charset.
  22  */
  23
  24 #include <stdlib.h>
  25 #include <string.h>
  26
  27 #include "utf8.h"
  28 #include "identify_encoding.h"
  29 #ifdef _WIN32
  30
  31 /* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
  32  * code.
  33  */
  34
  35 #include <stdio.h>
  36 #include <windows.h>
  37
  38 static unsigned char *make_utf8_string(const wchar_t *unicode)
  39 {
  40         int size = 0, index = 0, out_index = 0;
  41         unsigned char *out;
  42         unsigned short c;
  43
  44         /* first calculate the size of the target string */
  45         c = unicode[index++];
  46         while (c)
  47         {
  48                 if (c < 0x0080)
  49                 {
  50                         size += 1;
  51                 }
  52                 else if (c < 0x0800)
  53                 {
  54                         size += 2;
  55         }else{
  56                         size += 3;
  57                 }
  58                 c = unicode[index++];
  59         }
  60
  61         out = (unsigned char *)malloc(size + 1);
  62         if (out == NULL)
  63                 return NULL;
  64         index = 0;
  65
  66         c = unicode[index++];
  67         while (c)
  68     {
  69                 if (c < 0x080)
  70                 {
  71                         out[out_index++] = (unsigned char)c;
  72                 }else if (c < 0x800)
  73                 {
  74                         out[out_index++] = 0xc0 | (c >> 6);
  75                         out[out_index++] = 0x80 | (c & 0x3f);
  76         }else{
  77                         out[out_index++] = 0xe0 | (c >> 12);
  78                         out[out_index++] = 0x80 | ((c >> 6) & 0x3f);
  79                         out[out_index++] = 0x80 | (c & 0x3f);
  80                 }
  81                 c = unicode[index++];
  82         }
  83         out[out_index] = 0x00;
  84
  85         return out;
  86 }
  87
  88 static wchar_t *make_unicode_string(const unsigned char *utf8)
  89 {
  90         int size = 0, index = 0, out_index = 0;
  91         wchar_t *out;
  92         unsigned char c;
  93
  94         /* first calculate the size of the target string */
  95         c = utf8[index++];
  96         while (c)
  97         {
  98                 if ((c & 0x80) == 0)
  99                 {
 100                         index += 0;
 101                 }else if((c & 0xe0) == 0xe0)
 102                 {
 103                         index += 2;
 104         }else{
 105                         index += 1;
 106                 }
 107                 size += 1;
 108                 c = utf8[index++];
 109         }
 110
 111         out = (wchar_t *)malloc((size + 1) * sizeof(wchar_t));
 112         if (out == NULL)
 113                 return NULL;
 114         index = 0;
 115
 116         c = utf8[index++];
 117         while (c)
 118         {
 119                 if ((c & 0x80) == 0)
 120                 {
 121             out[out_index++] = c;
 122         }else if ((c & 0xe0) == 0xe0)
 123                 {
 124                         out[out_index] = (c & 0x1F) << 12;
 125                         c = utf8[index++];
 126                         out[out_index] |= (c & 0x3F) << 6;
 127                         c = utf8[index++];
 128                         out[out_index++] |= (c & 0x3F);
 129         }else{
 130                         out[out_index] = (c & 0x3F) << 6;
 131                         c = utf8[index++];
 132                         out[out_index++] |= (c & 0x3F);
 133                 }
 134                 c = utf8[index++];
 135         }
 136         out[out_index] = 0;
 137
 138         return out;
 139 }
 140
 141 int utf8_encode(const char *from, char **to)
 142 {
 143         wchar_t *unicode;
 144         int wchars, err;
 145
 146
 147         wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
 148                         strlen(from), NULL, 0);
 149
 150         if(wchars == 0)
 151         {
 152 //              fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 153                 return -1;
 154         }
 155
 156         unicode = (wchar_t *)calloc(wchars + 1, sizeof(unsigned short));
 157         if(unicode == NULL)
 158         {
 159 //              fprintf(stderr, "Out of memory processing string to UTF8\n");
 160                 return -1;
 161         }
 162
 163         err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, strlen(from), unicode, wchars);
 164         if (err != wchars)
 165         {
 166                 free(unicode);
 167 //              fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 168                 return -1;
 169         }
 170
 171         /* On NT-based windows systems, we could use WideCharToMultiByte(), but
 172          * MS doesn't actually have a consistent API across win32.
 173          */
 174         *to = (char *)make_utf8_string(unicode);
 175
 176         free(unicode);
 177         return 0;
 178 }
 179
 180 int utf8_decode(const char *from, char **to)
 181 {
 182         wchar_t *unicode;
 183         int chars, err;
 184
 185         const char *cds;
 186         identify_encoding_t *cdt;
 187         enum identify_encoding_order order;
 188         order = ieo_SJIS;
 189         cdt = identify_encoding_open(order);
 190         cds = identify_encoding(cdt,(char *)from);
 191         if (strcmp(cds,"UTF-8")!=0)
 192         {
 193                 identify_encoding_close(cdt);
 194                 return -1;
 195         }
 196         identify_encoding_close(cdt);
 197
 198
 199         /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
 200          * MS doesn't actually have a consistent API across win32.
 201          */
 202         unicode = (wchar_t *)make_unicode_string((const unsigned char *)from);
 203         if (unicode == NULL)
 204         {
 205 //              fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
 206                 return -1;
 207         }
 208
 209         chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode, -1, NULL, 0, NULL, NULL);
 210
 211         if (chars == 0)
 212         {
 213 //              fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 214                 free(unicode);
 215                 return -1;
 216         }
 217
 218         *to = (char *)calloc(chars + 1, sizeof(unsigned char));
 219         if (*to == NULL)
 220         {
 221 //              fprintf(stderr, "Out of memory processing string to local charset\n");
 222                 free(unicode);
 223                 return -1;
 224         }
 225
 226         err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode, -1, *to, chars, NULL, NULL);
 227         if (err != chars)
 228         {
 229 //              fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 230                 free(unicode);
 231                 free(*to);
 232                 *to = NULL;
 233                 return -1;
 234         }
 235
 236         free(unicode);
 237         return 0;
 238 }
 239
 240 #else /* End win32. Rest is for real operating systems */
 241
 242
 243 #ifdef HAVE_LANGINFO_CODESET
 244 #include <langinfo.h>
 245 #endif
 246
 247 int iconvert(const char *fromcode, const char *tocode,
 248                          const char *from, size_t fromlen,
 249                          char **to, size_t *tolen);
 250
 251 static char *current_charset = 0; /* means "US-ASCII" */
 252
 253 void convert_set_charset(const char *charset)
 254 {
 255
 256         if (!charset)
 257                 charset = getenv("CHARSET");
 258
 259 #ifdef HAVE_LANGINFO_CODESET
 260         if (!charset)
 261                 charset = nl_langinfo(CODESET);
 262 #endif
 263
 264         free(current_charset);
 265         current_charset = 0;
 266         if (charset && *charset)
 267                 current_charset = strdup(charset);
 268 }
 269
 270 static int convert_buffer(const char *fromcode, const char *tocode,
 271                                                   const char *from, size_t fromlen,
 272                                                   char **to, size_t *tolen)
 273 {
 274         int ret = -1;
 275
 276 #ifdef HAVE_ICONV
 277         ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
 278         if (ret != -1)
 279                 return ret;
 280 #endif
 281
 282 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
 283         ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
 284         if (ret != -1)
 285                 return ret;
 286 #endif
 287
 288         return ret;
 289 }
 290
 291 static int convert_string(const char *fromcode, const char *tocode,
 292                                                   const char *from, char **to, char replace)
 293 {
 294         int ret;
 295         size_t fromlen;
 296         char *s;
 297
 298         fromlen = strlen(from);
 299         ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
 300         if (ret == -2)
 301                 return -1;
 302         if (ret != -1)
 303                 return ret;
 304
 305         s = malloc(fromlen + 1);
 306         if (!s)
 307                 return -1;
 308         strcpy(s, from);
 309         *to = s;
 310         for (; *s; s++)
 311                 if (*s & ~0x7f)
 312                         *s = replace;
 313         return 3;
 314 }
 315
 316 int utf8_encode(const char *from, char **to)
 317 {
 318         char *charset;
 319
 320         if (!current_charset)
 321                 convert_set_charset(0);
 322         charset = current_charset ? current_charset : "US-ASCII";
 323         return convert_string(charset, "UTF-8", from, to, '#');
 324 }
 325
 326 int utf8_decode(const char *from, char **to)
 327 {
 328         char *charset;
 329
 330         if (*from == 0)
 331         {
 332                 *to = malloc(1);
 333                 **to = 0;
 334                 return 1;
 335         }
 336
 337         if (!current_charset)
 338                 convert_set_charset(0);
 339         charset = current_charset ? current_charset : "US-ASCII";
 340         return convert_string("UTF-8", charset, from, to, '?');
 341 }
 342
 343 #endif