OSDN Git Service

pgindent run.
[pg-rex/syncrep.git] / src / backend / utils / mb / wchar.c
1 /*
2  * conversion functions between pg_wchar and multibyte streams.
3  * Tatsuo Ishii
4  * $Id: wchar.c,v 1.30 2002/09/04 20:31:31 momjian Exp $
5  *
6  * WIN1250 client encoding updated by Pavel Behal
7  *
8  */
9 /* can be used in either frontend or backend */
10 #ifdef FRONTEND
11 #include "postgres_fe.h"
12 #define Assert(condition)
13 #else
14 #include "postgres.h"
15 #endif
16
17 #include "mb/pg_wchar.h"
18
19
/*
 * Conversion to pg_wchar is table driven.
 * To add support for an encoding, define mb2wchar_with_len() and mblen()
 * functions for that particular encoding. Note that if the encoding is
 * only supported in the client, you don't need to define the
 * mb2wchar_with_len() function (SJIS is such a case).
 */
27
28 /*
29  * SQL/ASCII
30  */
31 static int      pg_ascii2wchar_with_len
32                         (const unsigned char *from, pg_wchar *to, int len)
33 {
34         int                     cnt = 0;
35
36         while (len > 0 && *from)
37         {
38                 *to++ = *from++;
39                 len--;
40                 cnt++;
41         }
42         *to = 0;
43         return (cnt);
44 }
45
/* Every character occupies exactly one byte; the argument is not examined. */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;
	return 1;
}
51
52 /*
53  * EUC
54  */
55
56 static int      pg_euc2wchar_with_len
57                         (const unsigned char *from, pg_wchar *to, int len)
58 {
59         int                     cnt = 0;
60
61         while (len > 0 && *from)
62         {
63                 if (*from == SS2 && len >= 2)
64                 {
65                         from++;
66                         *to = 0xff & *from++;
67                         len -= 2;
68                 }
69                 else if (*from == SS3 && len >= 3)
70                 {
71                         from++;
72                         *to = *from++ << 8;
73                         *to |= 0x3f & *from++;
74                         len -= 3;
75                 }
76                 else if ((*from & 0x80) && len >= 2)
77                 {
78                         *to = *from++ << 8;
79                         *to |= *from++;
80                         len -= 2;
81                 }
82                 else
83                 {
84                         *to = *from++;
85                         len--;
86                 }
87                 to++;
88                 cnt++;
89         }
90         *to = 0;
91         return (cnt);
92 }
93
94 static int
95 pg_euc_mblen(const unsigned char *s)
96 {
97         int                     len;
98
99         if (*s == SS2)
100                 len = 2;
101         else if (*s == SS3)
102                 len = 3;
103         else if (*s & 0x80)
104                 len = 2;
105         else
106                 len = 1;
107         return (len);
108 }
109
110 /*
111  * EUC_JP
112  */
113 static int      pg_eucjp2wchar_with_len
114                         (const unsigned char *from, pg_wchar *to, int len)
115 {
116         return (pg_euc2wchar_with_len(from, to, len));
117 }
118
/* EUC_JP character length is computed by the generic EUC routine. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
124
125 /*
126  * EUC_KR
127  */
128 static int      pg_euckr2wchar_with_len
129                         (const unsigned char *from, pg_wchar *to, int len)
130 {
131         return (pg_euc2wchar_with_len(from, to, len));
132 }
133
/* EUC_KR character length is computed by the generic EUC routine. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
139
140 /*
141  * EUC_CN
142  */
143 static int      pg_euccn2wchar_with_len
144                         (const unsigned char *from, pg_wchar *to, int len)
145 {
146         int                     cnt = 0;
147
148         while (len > 0 && *from)
149         {
150                 if (*from == SS2 && len >= 3)
151                 {
152                         from++;
153                         *to = 0x3f00 & (*from++ << 8);
154                         *to = *from++;
155                         len -= 3;
156                 }
157                 else if (*from == SS3 && len >= 3)
158                 {
159                         from++;
160                         *to = *from++ << 8;
161                         *to |= 0x3f & *from++;
162                         len -= 3;
163                 }
164                 else if ((*from & 0x80) && len >= 2)
165                 {
166                         *to = *from++ << 8;
167                         *to |= *from++;
168                         len -= 2;
169                 }
170                 else
171                 {
172                         *to = *from++;
173                         len--;
174                 }
175                 to++;
176                 cnt++;
177         }
178         *to = 0;
179         return (cnt);
180 }
181
/*
 * Byte length of the EUC_CN character starting at s: 2 when the first
 * byte has its high bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return ((*s & 0x80) != 0) ? 2 : 1;
}
193
194 /*
195  * EUC_TW
196  */
197 static int      pg_euctw2wchar_with_len
198                         (const unsigned char *from, pg_wchar *to, int len)
199 {
200         int                     cnt = 0;
201
202         while (len > 0 && *from)
203         {
204                 if (*from == SS2 && len >= 4)
205                 {
206                         from++;
207                         *to = *from++ << 16;
208                         *to |= *from++ << 8;
209                         *to |= *from++;
210                         len -= 4;
211                 }
212                 else if (*from == SS3 && len >= 3)
213                 {
214                         from++;
215                         *to = *from++ << 8;
216                         *to |= 0x3f & *from++;
217                         len -= 3;
218                 }
219                 else if ((*from & 0x80) && len >= 2)
220                 {
221                         *to = *from++ << 8;
222                         *to |= *from++;
223                         len -= 2;
224                 }
225                 else
226                 {
227                         *to = *from++;
228                         len--;
229                 }
230                 to++;
231                 cnt++;
232         }
233         *to = 0;
234         return (cnt);
235 }
236
237 static int
238 pg_euctw_mblen(const unsigned char *s)
239 {
240         int                     len;
241
242         if (*s == SS2)
243                 len = 4;
244         else if (*s == SS3)
245                 len = 3;
246         else if (*s & 0x80)
247                 len = 2;
248         else
249                 len = 1;
250         return (len);
251 }
252
253 /*
254  * JOHAB
255  */
256 static int
257 pg_johab2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
258 {
259         return (pg_euc2wchar_with_len(from, to, len));
260 }
261
/* JOHAB character length is computed by the generic EUC routine. */
static int
pg_johab_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
267
268 /*
269  * convert UTF-8 string to pg_wchar (UCS-2)
270  * caller should allocate enough space for "to"
271  * len: length of from.
272  * "from" not necessarily null terminated.
273  */
274 static int
275 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
276 {
277         unsigned char c1,
278                                 c2,
279                                 c3;
280         int                     cnt = 0;
281
282         while (len > 0 && *from)
283         {
284                 if ((*from & 0x80) == 0)
285                 {
286                         *to = *from++;
287                         len--;
288                 }
289                 else if ((*from & 0xe0) == 0xc0 && len >= 2)
290                 {
291                         c1 = *from++ & 0x1f;
292                         c2 = *from++ & 0x3f;
293                         *to = c1 << 6;
294                         *to |= c2;
295                         len -= 2;
296                 }
297                 else if ((*from & 0xe0) == 0xe0 && len >= 3)
298                 {
299                         c1 = *from++ & 0x0f;
300                         c2 = *from++ & 0x3f;
301                         c3 = *from++ & 0x3f;
302                         *to = c1 << 12;
303                         *to |= c2 << 6;
304                         *to |= c3;
305                         len -= 3;
306                 }
307                 else
308                 {
309                         *to = *from++;
310                         len--;
311                 }
312                 to++;
313                 cnt++;
314         }
315         *to = 0;
316         return (cnt);
317 }
318
/*
 * Returns the byte length of the UTF-8 character pointed to by s, judged
 * from its first byte only: 110xxxxx -> 2, 111xxxxx -> 3, anything
 * else -> 1.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xe0) == 0xe0)
		return 3;
	return 1;
}
335
336 /*
337  * convert mule internal code to pg_wchar
338  * caller should allocate enough space for "to"
339  * len: length of from.
340  * "from" not necessarily null terminated.
341  */
342 static int
343 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
344 {
345         int                     cnt = 0;
346
347         while (len > 0 && *from)
348         {
349                 if (IS_LC1(*from) && len >= 2)
350                 {
351                         *to = *from++ << 16;
352                         *to |= *from++;
353                         len -= 2;
354                 }
355                 else if (IS_LCPRV1(*from) && len >= 3)
356                 {
357                         from++;
358                         *to = *from++ << 16;
359                         *to |= *from++;
360                         len -= 3;
361                 }
362                 else if (IS_LC2(*from) && len >= 3)
363                 {
364                         *to = *from++ << 16;
365                         *to |= *from++ << 8;
366                         *to |= *from++;
367                         len -= 3;
368                 }
369                 else if (IS_LCPRV2(*from) && len >= 4)
370                 {
371                         from++;
372                         *to = *from++ << 16;
373                         *to |= *from++ << 8;
374                         *to |= *from++;
375                         len -= 4;
376                 }
377                 else
378                 {                                               /* assume ASCII */
379                         *to = (unsigned char) *from++;
380                         len--;
381                 }
382                 to++;
383                 cnt++;
384         }
385         *to = 0;
386         return (cnt);
387 }
388
/*
 * Returns the byte length of the mule internal code character starting at
 * s, judged from its first byte only: IS_LC1 -> 2, IS_LCPRV1 -> 3,
 * IS_LC2 -> 3, IS_LCPRV2 -> 4, anything else (assumed ASCII) -> 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (IS_LC1(lead))
		return 2;
	if (IS_LCPRV1(lead))
		return 3;
	if (IS_LC2(lead))
		return 3;
	if (IS_LCPRV2(lead))
		return 4;

	/* assume ASCII */
	return 1;
}
408
409 /*
410  * ISO8859-1
411  */
412 static int
413 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
414 {
415         int                     cnt = 0;
416
417         while (len > 0 && *from)
418         {
419                 *to++ = *from++;
420                 len--;
421                 cnt++;
422         }
423         *to = 0;
424         return (cnt);
425 }
426
/* Every character occupies exactly one byte; the argument is not examined. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;
	return 1;
}
432
433 /*
434  * SJIS
435  */
/*
 * Byte length of the SJIS character starting at s, judged from its first
 * byte only: bytes 0xa1..0xdf (1 byte kana) and bytes up to 0x7f (ASCII)
 * are one byte long; every other byte starts a two-byte character
 * (kanji).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;
	const int	is_kana = (lead >= 0xa1 && lead <= 0xdf);

	if (lead > 0x7f && !is_kana)
		return 2;				/* kanji? */

	return 1;					/* 1 byte kana or ASCII */
}
455
456 /*
457  * Big5
458  */
/*
 * Byte length of the Big5 character starting at s: 2 when the first byte
 * is above 0x7f, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return (*s > 0x7f) ? 2 : 1;
}
474
475 /*
476  * GBK
477  */
/*
 * Byte length of the GBK character starting at s: 2 when the first byte
 * is above 0x7f, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	if (*s <= 0x7f)
		return 1;

	return 2;
}
493
494 /*
495  * UHC
496  */
/*
 * Byte length of the UHC character starting at s: 2 when the first byte
 * is above 0x7f, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	const int	is_two_byte = (*s > 0x7f);

	return is_two_byte ? 2 : 1;
}
512
/*
 * GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Byte length of the character starting at s.  A first byte up to 0x7f is
 * ASCII and one byte long.  Otherwise the second byte decides: a value in
 * 0x30..0x39 marks a four-byte character, any other value a two-byte one.
 * The second byte is only read when the first byte is above 0x7f.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	unsigned char second;

	if (*s <= 0x7f)
		return 1;				/* ASCII */

	second = s[1];
	if (second >= 0x30 && second <= 0x39)
		return 4;

	return 2;
}
537
538
539 pg_wchar_tbl pg_wchar_table[] = {
540         {pg_ascii2wchar_with_len, pg_ascii_mblen, 1},           /* 0; PG_SQL_ASCII      */
541         {pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3},           /* 1; PG_EUC_JP */
542         {pg_euccn2wchar_with_len, pg_euccn_mblen, 3},           /* 2; PG_EUC_CN */
543         {pg_euckr2wchar_with_len, pg_euckr_mblen, 3},           /* 3; PG_EUC_KR */
544         {pg_euctw2wchar_with_len, pg_euctw_mblen, 3},           /* 4; PG_EUC_TW */
545         {pg_johab2wchar_with_len, pg_johab_mblen, 3},           /* 5; PG_JOHAB */
546         {pg_utf2wchar_with_len, pg_utf_mblen, 3},       /* 6; PG_UNICODE */
547         {pg_mule2wchar_with_len, pg_mule_mblen, 3}, /* 7; PG_MULE_INTERNAL */
548         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 8; PG_LATIN1 */
549         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 9; PG_LATIN2 */
550         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 10; PG_LATIN3 */
551         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 11; PG_LATIN4 */
552         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 12; PG_LATIN5 */
553         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 13; PG_LATIN6 */
554         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 14; PG_LATIN7 */
555         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 15; PG_LATIN8 */
556         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 16; PG_LATIN9 */
557         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 17; PG_LATIN10 */
558         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 18; PG_WIN1256 */
559         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 19; PG_TCVN */
560         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 20; PG_WIN874 */
561         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 21; PG_KOI8 */
562         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 22; PG_WIN1251 */
563         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 23; PG_ALT */
564         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 24; ISO-8859-5 */
565         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 25; ISO-8859-6 */
566         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 26; ISO-8859-7 */
567         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 27; ISO-8859-8 */
568         {0, pg_sjis_mblen, 2},          /* 28; PG_SJIS */
569         {0, pg_big5_mblen, 2},          /* 29; PG_BIG5 */
570         {0, pg_gbk_mblen, 2},           /* 30; PG_GBK */
571         {0, pg_uhc_mblen, 2},           /* 31; PG_UHC */
572         {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 32; PG_WIN1250 */
573         {0, pg_gb18030_mblen, 2}        /* 33; PG_GB18030 */
574 };
575
/*
 * Returns the byte length of a word for mule internal code.
 * This simply forwards to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	const int	nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
582
583 /*
584  * Returns the byte length of a multibyte word.
585  */
586 int
587 pg_encoding_mblen(int encoding, const unsigned char *mbstr)
588 {
589         Assert(PG_VALID_ENCODING(encoding));
590
591         return ((encoding >= 0 &&
592                          encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
593                         ((*pg_wchar_table[encoding].mblen) (mbstr)) :
594                         ((*pg_wchar_table[PG_SQL_ASCII].mblen) (mbstr)));
595 }
596
597 /*
598  * fetch maximum length of a char encoding
599  */
600 int
601 pg_encoding_max_length(int encoding)
602 {
603         Assert(PG_VALID_ENCODING(encoding));
604
605         return pg_wchar_table[encoding].maxmblen;
606 }
607
608 #ifndef FRONTEND
609 /*
610  * Verify mbstr to make sure that it has a valid character sequence.
611  * mbstr is not necessarily NULL terminated. length of mbstr is
612  * specified by len. If an error was found, returns an error message.
613  * Note that the message is kept in a static buffer, the next invocation
614  * might break the message.
615  * If no error was found, this function returns NULL.
616  */
617 char *
618 pg_verifymbstr(const unsigned char *mbstr, int len)
619 {
620         int                     l;
621         int                     i,
622                                 j;
623         static char buf[256];
624         int                     slen = 0;
625
626         /* we do not check single byte encodings */
627         if (pg_database_encoding_max_length() <= 1)
628                 return NULL;
629
630         while (len > 0 && *mbstr)
631         {
632                 /* special UTF-8 check */
633                 if (GetDatabaseEncoding() == PG_UTF8 &&
634                         (*mbstr & 0xf8) == 0xf0)
635                 {
636                         snprintf(buf, sizeof(buf), "Unicode >= 0x10000 is not supoorted");
637                         return (buf);
638                 }
639
640                 l = pg_mblen(mbstr);
641
642                 /* multibyte letter? */
643                 if (l > 1)
644                 {
645                         for (i = 1; i < l; i++)
646                         {
647                                 if (i > len || *(mbstr + i) == '\0' ||
648
649                                 /*
650                                  * we assume that every multibyte letter consists of bytes
651                                  * being the 8th bit set
652                                  */
653                                         ((*(mbstr + i) & 0x80) == 0))
654                                 {
655                                         int                     remains = sizeof(buf);
656                                         char       *p = buf;
657
658                                         slen = snprintf(p, remains, "Invalid %s character sequence found (0x",
659                                                                         GetDatabaseEncodingName());
660                                         p += slen;
661                                         remains -= slen;
662
663                                         i = ((*(mbstr + i) & 0x80) == 0) ? l : i;
664
665                                         for (j = 0; j < i; j++)
666                                         {
667                                                 slen = snprintf(p, remains, "%02x",
668                                                                                 *(mbstr + j));
669                                                 p += slen;
670                                                 remains -= slen;
671                                         }
672                                         snprintf(p, remains, ")");
673                                         return (buf);
674                                 }
675                         }
676                 }
677                 len -= l;
678                 mbstr += l;
679         }
680         return NULL;
681 }
682
683 /*
684  * fetch maximum length of a char encoding for the current database
685  */
686 int
687 pg_database_encoding_max_length(void)
688 {
689         return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
690 }
691
692 #endif