Change the backend to reject strings containing invalidly-encoded multibyte characters

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 21 May 2006 20:05:21 +0000 (20:05 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 21 May 2006 20:05:21 +0000 (20:05 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 21 May 2006 20:05:21 +0000 (20:05 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 21 May 2006 20:05:21 +0000 (20:05 +0000)
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index 2b49094..6678522 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.263 2006/04/05 22:11:54 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.264 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1023,9 +1023,15 @@ DoCopy(const CopyStmt *stmt)
         cstate->raw_buf_index = cstate->raw_buf_len = 0;
         cstate->processed = 0;
  
-       /* Set up encoding conversion info */
+       /*
+        * Set up encoding conversion info.  Even if the client and server
+        * encodings are the same, we must apply pg_client_to_server() to
+        * validate data in multibyte encodings.
+        */
         cstate->client_encoding = pg_get_client_encoding();
-       cstate->need_transcoding = (cstate->client_encoding != GetDatabaseEncoding());
+       cstate->need_transcoding =
+               (cstate->client_encoding != GetDatabaseEncoding() ||
+                pg_database_encoding_max_length() > 1);
         /* See Multibyte encoding comment above */
         cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
  
diff --git a/src/backend/utils/adt/name.c b/src/backend/utils/adt/name.c

index ed6f891..aa53fb0 100644 (file)
--- a/src/backend/utils/adt/name.c
+++ b/src/backend/utils/adt/name.c
@@ -14,7 +14,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/name.c,v 1.57 2006/03/05 15:58:43 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/name.c,v 1.58 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -49,10 +49,7 @@ namein(PG_FUNCTION_ARGS)
         NameData   *result;
         int                     len;
  
-       /* verify encoding */
         len = strlen(s);
-       pg_verifymbstr(s, len, false);
-
         len = pg_mbcliplen(s, len, NAMEDATALEN - 1);
  
         result = (NameData *) palloc0(NAMEDATALEN);
diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c

index f72b372..c6ea11d 100644 (file)
--- a/src/backend/utils/adt/varchar.c
+++ b/src/backend/utils/adt/varchar.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.115 2006/03/05 15:58:44 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.116 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -73,9 +73,6 @@ bpchar_input(const char *s, size_t len, int32 atttypmod)
         char       *r;
         size_t          maxlen;
  
-       /* verify encoding */
-       pg_verifymbstr(s, len, false);
-
         /* If typmod is -1 (or invalid), use the actual string length */
         if (atttypmod < (int32) VARHDRSZ)
                 maxlen = len;
@@ -393,9 +390,6 @@ varchar_input(const char *s, size_t len, int32 atttypmod)
         VarChar    *result;
         size_t          maxlen;
  
-       /* verify encoding */
-       pg_verifymbstr(s, len, false);
-
         maxlen = atttypmod - VARHDRSZ;
  
         if (atttypmod >= (int32) VARHDRSZ && len > maxlen)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 79dc017..7bc5a09 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.146 2006/04/04 19:35:36 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.147 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -256,10 +256,7 @@ textin(PG_FUNCTION_ARGS)
         text       *result;
         int                     len;
  
-       /* verify encoding */
         len = strlen(inputText);
-       pg_verifymbstr(inputText, len, false);
-
         result = (text *) palloc(len + VARHDRSZ);
         VARATT_SIZEP(result) = len + VARHDRSZ;
  
@@ -299,9 +296,6 @@ textrecv(PG_FUNCTION_ARGS)
  
         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
  
-       /* verify encoding */
-       pg_verifymbstr(str, nbytes, false);
-
         result = (text *) palloc(nbytes + VARHDRSZ);
         VARATT_SIZEP(result) = nbytes + VARHDRSZ;
         memcpy(VARDATA(result), str, nbytes);
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c

index a544f10..deaf912 100644 (file)
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -6,170 +6,81 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.59 2006/03/05 15:58:46 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.60 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
  #include "mb/pg_wchar.h"
  
-/*
- * convert bogus chars that cannot be represented in the current
- * encoding system.
- */
-void
-pg_print_bogus_char(unsigned char **mic, unsigned char **p)
-{
-       char            strbuf[16];
-       int                     l = pg_mic_mblen(*mic);
-
-       *(*p)++ = '(';
-       while (l--)
-       {
-               sprintf(strbuf, "%02x", *(*mic)++);
-               *(*p)++ = strbuf[0];
-               *(*p)++ = strbuf[1];
-       }
-       *(*p)++ = ')';
-}
-
-#ifdef NOT_USED
  
  /*
- * GB18030 ---> MIC
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
-{
-       int                     c1;
-       int                     c2;
-
-       while (len > 0 && (c1 = *gb18030++))
-       {
-               if (c1 < 0x80)
-               {                                               /* should be ASCII */
-                       len--;
-                       *p++ = c1;
-               }
-               else if (c1 >= 0x81 && c1 <= 0xfe)
-               {
-                       c2 = *gb18030++;
-
-                       if (c2 >= 0x30 && c2 <= 0x69)
-                       {
-                               len -= 4;
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *gb18030++;
-                               *p++ = *gb18030++;
-                               *p++ = *gb18030++;
-                       }
-                       else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
-                       {
-                               len -= 2;
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *gb18030++;
-                       }
-                       else
-                       {                                       /* throw the strange code */
-                               len--;
-                       }
-               }
-       }
-       *p = '\0';
-}
-
-/*
- * MIC ---> GB18030
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-mic2gb18030(unsigned char *mic, unsigned char *p, int len)
-{
-       int                     c1;
-       int                     c2;
-
-       while (len > 0 && (c1 = *mic))
-       {
-               len -= pg_mic_mblen(mic++);
-
-               if (!IS_HIGHBIT_SET(c1))                /* ASCII */
-                       *p++ = c1;
-               else if (c1 >= 0x81 && c1 <= 0xfe)
-               {
-                       c2 = *mic++;
-
-                       if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
-                       {
-                               *p++ = c1;
-                               *p++ = c2;
-                       }
-                       else if (c2 >= 0x30 && c2 <= 0x39)
-                       {
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *mic++;
-                               *p++ = *mic++;
-                       }
-                       else
-                       {
-                               mic--;
-                               pg_print_bogus_char(&mic, &p);
-                               mic--;
-                               pg_print_bogus_char(&mic, &p);
-                       }
-               }
-               else
-               {
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
-               }
-       }
-       *p = '\0';
-}
-#endif
-
-/*
- * LATINn ---> MIC
+ * LATINn ---> MIC when the charset's local codes map directly to MIC
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
   */
  void
-latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
+latin2mic(const unsigned char *l, unsigned char *p, int len,
+                 int lc, int encoding)
  {
         int                     c1;
  
-       while (len-- > 0 && (c1 = *l++))
+       while (len > 0)
         {
+               c1 = *l;
+               if (c1 == 0)
+                       report_invalid_encoding(encoding, (const char *) l, len);
                 if (IS_HIGHBIT_SET(c1))
-                       *p++ = lc;                      /* Latin? */
+                       *p++ = lc;
                 *p++ = c1;
+               l++;
+               len--;
         }
         *p = '\0';
  }
  
  /*
- * MIC ---> LATINn
+ * MIC ---> LATINn when the charset's local codes map directly to MIC
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
   */
  void
-mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
+mic2latin(const unsigned char *mic, unsigned char *p, int len,
+                 int lc, int encoding)
  {
         int                     c1;
  
-       while (len > 0 && (c1 = *mic))
+       while (len > 0)
         {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == lc)
-                       *p++ = *mic++;
-               else if (IS_HIGHBIT_SET(c1))
+               c1 = *mic;
+               if (c1 == 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (!IS_HIGHBIT_SET(c1))
                 {
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       /* easy for ASCII */
+                       *p++ = c1;
+                       mic++;
+                       len--;
                 }
                 else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
+               {
+                       int             l = pg_mic_mblen(mic);
+
+                       if (len < l)
+                               report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                                                               len);
+                       if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
+                               report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                                                                  (const char *) mic, len);
+                       *p++ = mic[1];
+                       mic += 2;
+                       len -= 2;
                 }
         }
         *p = '\0';
@@ -178,14 +89,25 @@ mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
  
  /*
   * ASCII ---> MIC
+ *
+ * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
+ * characters, here we must take a hard line because we don't know
+ * the appropriate MIC equivalent.
   */
  void
-pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
+pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
  {
         int                     c1;
  
-       while (len-- > 0 && (c1 = *l++))
-               *p++ = (c1 & 0x7f);
+       while (len > 0)
+       {
+               c1 = *l;
+               if (c1 == 0 || IS_HIGHBIT_SET(c1))
+                       report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
+               *p++ = c1;
+               l++;
+               len--;
+       }
         *p = '\0';
  }
  
@@ -193,19 +115,19 @@ pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
   * MIC ---> ASCII
   */
  void
-pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
+pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
  {
         int                     c1;
  
-       while (len-- > 0 && (c1 = *mic))
+       while (len > 0)
         {
-               if (IS_HIGHBIT_SET(c1))
-                       pg_print_bogus_char(&mic, &p);
-               else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-                       mic++;
-               }
+               c1 = *mic;
+               if (c1 == 0 || IS_HIGHBIT_SET(c1))
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
+                                                                          (const char *) mic, len);
+               *p++ = c1;
+               mic++;
+               len--;
         }
         *p = '\0';
  }
@@ -213,86 +135,103 @@ pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
  /*
   * latin2mic_with_table: a generic single byte charset encoding
   * conversion from a local charset to the mule internal code.
- * with a encoding conversion table.
- * the table is ordered according to the local charset,
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the local charset
   * starting from 128 (0x80). each entry in the table
   * holds the corresponding code point for the mule internal code.
   */
  void
-latin2mic_with_table(
-                                        unsigned char *l,      /* local charset string (source) */
-                                        unsigned char *p,      /* pointer to store mule internal code
-                                                                                * (destination) */
-                                        int len,       /* length of l */
-                                        int lc,        /* leading character of p */
-                                        unsigned char *tab /* code conversion table */
-)
+latin2mic_with_table(const unsigned char *l,
+                                        unsigned char *p,
+                                        int len,
+                                        int lc,
+                                        int encoding,
+                                        const unsigned char *tab)
  {
         unsigned char c1,
                                 c2;
  
-       while (len-- > 0 && (c1 = *l++))
+       while (len > 0)
         {
-               if (c1 < 128)
+               c1 = *l;
+               if (c1 == 0)
+                       report_invalid_encoding(encoding, (const char *) l, len);
+               if (!IS_HIGHBIT_SET(c1))
                         *p++ = c1;
                 else
                 {
-                       c2 = tab[c1 - 128];
+                       c2 = tab[c1 - HIGHBIT];
                         if (c2)
                         {
                                 *p++ = lc;
                                 *p++ = c2;
                         }
                         else
-                       {
-                               *p++ = ' ';             /* cannot convert */
-                       }
+                               report_untranslatable_char(encoding, PG_MULE_INTERNAL,
+                                                                                  (const char *) l, len);
                 }
+               l++;
+               len--;
         }
         *p = '\0';
  }
  
  /*
   * mic2latin_with_table: a generic single byte charset encoding
- * conversion from the mule internal code to a local charset
- * with a encoding conversion table.
- * the table is ordered according to the second byte of the mule
- * internal code starting from 128 (0x80).
- * each entry in the table
- * holds the corresponding code point for the local code.
+ * conversion from the mule internal code to a local charset.
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the mule internal code's
+ * second byte, starting from 128 (0x80). each entry in the table
+ * holds the corresponding code point for the local charset.
   */
  void
-mic2latin_with_table(
-                                        unsigned char *mic,            /* mule internal code (source) */
-                                        unsigned char *p,      /* local code (destination) */
-                                        int len,       /* length of p */
-                                        int lc,        /* leading character */
-                                        unsigned char *tab /* code conversion table */
-)
+mic2latin_with_table(const unsigned char *mic,
+                                        unsigned char *p,
+                                        int len,
+                                        int lc,
+                                        int encoding,
+                                        const unsigned char *tab)
  {
-
         unsigned char c1,
                                 c2;
  
-       while (len-- > 0 && (c1 = *mic++))
+       while (len > 0)
         {
-               if (c1 < 128)
-                       *p++ = c1;
-               else if (c1 == lc)
+               c1 = *mic;
+               if (c1 == 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (!IS_HIGHBIT_SET(c1))
                 {
-                       c1 = *mic++;
+                       /* easy for ASCII */
+                       *p++ = c1;
+                       mic++;
                         len--;
-                       c2 = tab[c1 - 128];
-                       if (c2)
-                               *p++ = c2;
-                       else
-                       {
-                               *p++ = ' ';             /* cannot convert */
-                       }
                 }
                 else
                 {
-                       *p++ = ' ';                     /* bogus character */
+                       int             l = pg_mic_mblen(mic);
+
+                       if (len < l)
+                               report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                                                               len);
+                       if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
+                               (c2 = tab[mic[1] - HIGHBIT]) == 0)
+                       {
+                               report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                                                                  (const char *) mic, len);
+                               break;                  /* keep compiler quiet */
+                       }
+                       *p++ = c2;
+                       mic += 2;
+                       len -= 2;
                 }
         }
         *p = '\0';
@@ -331,25 +270,38 @@ compare2(const void *p1, const void *p2)
  /*
   * UTF8 ---> local code
   *
- * utf: input UTF8 string. Its length is limited by "len" parameter
- *             or a null terminator.
- * iso: pointer to the output.
+ * utf: input UTF8 string (need not be null-terminated).
+ * iso: pointer to the output area (must be large enough!)
   * map: the conversion map.
   * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
   */
  void
-UtfToLocal(unsigned char *utf, unsigned char *iso,
-                  pg_utf_to_local *map, int size, int len)
+UtfToLocal(const unsigned char *utf, unsigned char *iso,
+                  const pg_utf_to_local *map, int size, int encoding, int len)
  {
         unsigned int iutf;
         int                     l;
         pg_utf_to_local *p;
  
-       for (; len > 0 && *utf; len -= l)
+       for (; len > 0; len -= l)
         {
+               /* "break" cases all represent errors */
+               if (*utf == '\0')
+                       break;
+
                 l = pg_utf_mblen(utf);
+
+               if (len < l)
+                       break;
+
+               if (!pg_utf8_islegal(utf, l))
+                       break;
+
                 if (l == 1)
                 {
+                       /* ASCII case is easy */
                         *iso++ = *utf++;
                         continue;
                 }
@@ -371,16 +323,14 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
                         iutf |= *utf++ << 8;
                         iutf |= *utf++;
                 }
+
                 p = bsearch(&iutf, map, size,
                                         sizeof(pg_utf_to_local), compare1);
+
                 if (p == NULL)
-               {
-                       ereport(WARNING,
-                                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-                                        errmsg("ignoring unconvertible UTF-8 character 0x%04x",
-                                                       iutf)));
-                       continue;
-               }
+                       report_untranslatable_char(PG_UTF8, encoding,
+                                                                          (const char *) (utf - l), len);
+
                 if (p->code & 0xff000000)
                         *iso++ = p->code >> 24;
                 if (p->code & 0x00ff0000)
@@ -390,15 +340,26 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
                 if (p->code & 0x000000ff)
                         *iso++ = p->code & 0x000000ff;
         }
+
+       if (len > 0)
+               report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+
         *iso = '\0';
  }
  
  /*
   * local code ---> UTF8
+ *
+ * iso: input local string (need not be null-terminated).
+ * utf: pointer to the output area (must be large enough!)
+ * map: the conversion map.
+ * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
   */
  void
-LocalToUtf(unsigned char *iso, unsigned char *utf,
-                  pg_local_to_utf *map, int size, int encoding, int len)
+LocalToUtf(const unsigned char *iso, unsigned char *utf,
+                  const pg_local_to_utf *map, int size, int encoding, int len)
  {
         unsigned int iiso;
         int                     l;
@@ -409,16 +370,23 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                  errmsg("invalid encoding number: %d", encoding)));
  
-       for (; len > 0 && *iso; len -= l)
+       for (; len > 0; len -= l)
         {
+               /* "break" cases all represent errors */
+               if (*iso == '\0')
+                       break;
+
                 if (!IS_HIGHBIT_SET(*iso))
                 {
+                       /* ASCII case is easy */
                         *utf++ = *iso++;
                         l = 1;
                         continue;
                 }
  
-               l = pg_encoding_mblen(encoding, (char *) iso);
+               l = pg_encoding_verifymb(encoding, (const char *) iso, len);
+               if (l < 0)
+                       break;
  
                 if (l == 1)
                         iiso = *iso++;
@@ -440,16 +408,13 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                         iiso |= *iso++ << 8;
                         iiso |= *iso++;
                 }
+
                 p = bsearch(&iiso, map, size,
                                         sizeof(pg_local_to_utf), compare2);
                 if (p == NULL)
-               {
-                       ereport(WARNING,
-                                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-                                        errmsg("ignoring unconvertible %s character 0x%04x",
-                                                       (&pg_enc2name_tbl[encoding])->name, iiso)));
-                       continue;
-               }
+                       report_untranslatable_char(encoding, PG_UTF8,
+                                                                          (const char *) (iso - l), len);
+
                 if (p->utf & 0xff000000)
                         *utf++ = p->utf >> 24;
                 if (p->utf & 0x00ff0000)
@@ -459,5 +424,9 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                 if (p->utf & 0x000000ff)
                         *utf++ = p->utf & 0x000000ff;
         }
+
+       if (len > 0)
+               report_invalid_encoding(encoding, (const char *) iso, len);
+
         *utf = '\0';
  }
diff --git a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c

index 544530d..57fc2d7 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c
+++ b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.12 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.13 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -70,14 +70,14 @@ extern Datum win866_to_iso(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void koi8r2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2koi8r(unsigned char *mic, unsigned char *p, int len);
-static void iso2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2iso(unsigned char *mic, unsigned char *p, int len);
-static void win12512mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win1251(unsigned char *mic, unsigned char *p, int len);
-static void win8662mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win866(unsigned char *mic, unsigned char *p, int len);
+static void koi8r2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2koi8r(const unsigned char *mic, unsigned char *p, int len);
+static void iso2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2iso(const unsigned char *mic, unsigned char *p, int len);
+static void win12512mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win1251(const unsigned char *mic, unsigned char *p, int len);
+static void win8662mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win866(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  koi8r_to_mic(PG_FUNCTION_ARGS)
@@ -401,7 +401,7 @@ win1251_to_iso(PG_FUNCTION_ARGS)
  
         buf = palloc(len * ENCODING_GROWTH_RATE);
         win12512mic(src, buf, len);
-       mic2win1251(buf, dest, strlen((char *) buf));
+       mic2iso(buf, dest, strlen((char *) buf));
         pfree(buf);
  
         PG_RETURN_VOID();
@@ -441,7 +441,7 @@ win866_to_iso(PG_FUNCTION_ARGS)
  
         buf = palloc(len * ENCODING_GROWTH_RATE);
         win8662mic(src, buf, len);
-       mic2win866(buf, dest, strlen((char *) buf));
+       mic2iso(buf, dest, strlen((char *) buf));
         pfree(buf);
  
         PG_RETURN_VOID();
@@ -460,23 +460,23 @@ win866_to_iso(PG_FUNCTION_ARGS)
  
  /* koi8r2mic: KOI8-R to Mule internal code */
  static void
-koi8r2mic(unsigned char *l, unsigned char *p, int len)
+koi8r2mic(const unsigned char *l, unsigned char *p, int len)
  {
-       latin2mic(l, p, len, LC_KOI8_R);
+       latin2mic(l, p, len, LC_KOI8_R, PG_KOI8R);
  }
  
  /* mic2koi8r: Mule internal code to KOI8-R */
  static void
-mic2koi8r(unsigned char *mic, unsigned char *p, int len)
+mic2koi8r(const unsigned char *mic, unsigned char *p, int len)
  {
-       mic2latin(mic, p, len, LC_KOI8_R);
+       mic2latin(mic, p, len, LC_KOI8_R, PG_KOI8R);
  }
  
  /* iso2mic: ISO-8859-5 to Mule internal code */
  static void
-iso2mic(unsigned char *l, unsigned char *p, int len)
+iso2mic(const unsigned char *l, unsigned char *p, int len)
  {
-       static unsigned char iso2koi[] = {
+       static const unsigned char iso2koi[] = {
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -495,14 +495,14 @@ iso2mic(unsigned char *l, unsigned char *p, int len)
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
         };
  
-       latin2mic_with_table(l, p, len, LC_KOI8_R, iso2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi);
  }
  
  /* mic2iso: Mule internal code to ISO8859-5 */
  static void
-mic2iso(unsigned char *mic, unsigned char *p, int len)
+mic2iso(const unsigned char *mic, unsigned char *p, int len)
  {
-       static unsigned char koi2iso[] = {
+       static const unsigned char koi2iso[] = {
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -521,14 +521,14 @@ mic2iso(unsigned char *mic, unsigned char *p, int len)
                 0xcc, 0xcb, 0xb7, 0xc8, 0xcd, 0xc9, 0xc7, 0xca
         };
  
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2iso);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso);
  }
  
  /* win2mic: CP1251 to Mule internal code */
  static void
-win12512mic(unsigned char *l, unsigned char *p, int len)
+win12512mic(const unsigned char *l, unsigned char *p, int len)
  {
-       static unsigned char win2koi[] = {
+       static const unsigned char win2koi[] = {
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -547,14 +547,14 @@ win12512mic(unsigned char *l, unsigned char *p, int len)
                 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1
         };
  
-       latin2mic_with_table(l, p, len, LC_KOI8_R, win2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN1251, win2koi);
  }
  
  /* mic2win: Mule internal code to CP1251 */
  static void
-mic2win1251(unsigned char *mic, unsigned char *p, int len)
+mic2win1251(const unsigned char *mic, unsigned char *p, int len)
  {
-       static unsigned char koi2win[] = {
+       static const unsigned char koi2win[] = {
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -573,14 +573,14 @@ mic2win1251(unsigned char *mic, unsigned char *p, int len)
                 0xdc, 0xdb, 0xc7, 0xd8, 0xdd, 0xd9, 0xd7, 0xda
         };
  
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN1251, koi2win);
  }
  
  /* win8662mic: CP866 to Mule internal code */
  static void
-win8662mic(unsigned char *l, unsigned char *p, int len)
+win8662mic(const unsigned char *l, unsigned char *p, int len)
  {
-       static unsigned char win8662koi[] = {
+       static const unsigned char win8662koi[] = {
                 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
                 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
                 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
@@ -599,14 +599,14 @@ win8662mic(unsigned char *l, unsigned char *p, int len)
                 0xb6, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
         };
  
-       latin2mic_with_table(l, p, len, LC_KOI8_R, win8662koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN866, win8662koi);
  }
  
  /* mic2win866: Mule internal code to CP866 */
  static void
-mic2win866(unsigned char *mic, unsigned char *p, int len)
+mic2win866(const unsigned char *mic, unsigned char *p, int len)
  {
-       static unsigned char koi2win866[] = {
+       static const unsigned char koi2win866[] = {
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -625,5 +625,5 @@ mic2win866(unsigned char *mic, unsigned char *p, int len)
                 0x9c, 0x9b, 0x87, 0x98, 0x9d, 0x99, 0x97, 0x9a
         };
  
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win866);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN866, koi2win866);
  }
diff --git a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c

index 8a05738..9121ac4 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c
+++ b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.13 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.14 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -32,8 +32,8 @@ extern Datum mic_to_euc_cn(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len);
+static void euc_cn2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_cn(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  euc_cn_to_mic(PG_FUNCTION_ARGS)
@@ -71,23 +71,30 @@ mic_to_euc_cn(PG_FUNCTION_ARGS)
   * EUC_CN ---> MIC
   */
  static void
-euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
+euc_cn2mic(const unsigned char *euc, unsigned char *p, int len)
  {
         int                     c1;
  
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
         {
+               c1 = *euc;
                 if (IS_HIGHBIT_SET(c1))
                 {
-                       len -= 2;
+                       if (len < 2 || !IS_HIGHBIT_SET(euc[1]))
+                               report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
                         *p++ = LC_GB2312_80;
                         *p++ = c1;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       euc += 2;
+                       len -= 2;
                 }
                 else
                 {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
                         *p++ = c1;
+                       euc++;
+                       len--;
                 }
         }
         *p = '\0';
@@ -97,26 +104,35 @@ euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
   * MIC ---> EUC_CN
   */
  static void
-mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
+mic2euc_cn(const unsigned char *mic, unsigned char *p, int len)
  {
         int                     c1;
  
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
         {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == LC_GB2312_80)
+               c1 = *mic;
+               if (IS_HIGHBIT_SET(c1))
                 {
+                       if (c1 != LC_GB2312_80)
+                               report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN,
+                                                                                  (const char *) mic, len);
+                       if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2]))
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       mic++;
                         *p++ = *mic++;
                         *p++ = *mic++;
-               }
-               else if (IS_HIGHBIT_SET(c1))
-               {                                               /* cannot convert to EUC_CN! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       len -= 3;
                 }
                 else
-                       *p++ = c1;              /* should be ASCII */
+               {                                               /* should be ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+               }
         }
         *p = '\0';
  }
diff --git a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c

index 31fb737..372dda2 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
+++ b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.15 2006/03/04 10:57:35 ishii Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.16 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -22,9 +22,6 @@
  #define PGSJISALTCODE 0x81ac
  #define PGEUCALTCODE 0xa2ae
  
-#define ISSJISHEAD(c) ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc))
-#define ISSJISTAIL(c) ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc))
-
  /*
   * conversion table between SJIS UDC (IBM kanji) and EUC_JP
   */
@@ -57,12 +54,12 @@ extern Datum mic_to_sjis(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void sjis2mic(unsigned char *sjis, unsigned char *p, int len);
-static void mic2sjis(unsigned char *mic, unsigned char *p, int len);
-static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len);
-static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len);
-static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len);
+static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len);
+static void mic2sjis(const unsigned char *mic, unsigned char *p, int len);
+static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len);
+static void euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len);
+static void sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  euc_jp_to_sjis(PG_FUNCTION_ARGS)
@@ -164,38 +161,34 @@ mic_to_sjis(PG_FUNCTION_ARGS)
   * SJIS ---> MIC
   */
  static void
-sjis2mic(unsigned char *sjis, unsigned char *p, int len)
+sjis2mic(const unsigned char *sjis, unsigned char *p, int len)
  {
         int                     c1,
                                 c2,
-/* Eiji Tokuya patched begin */
                                 i,
                                 k,
                                 k2;
  
-/* Eiji Tokuya patched end */
-       while (len >= 0 && (c1 = *sjis++))
+       while (len > 0)
         {
+               c1 = *sjis;
                 if (c1 >= 0xa1 && c1 <= 0xdf)
                 {
                         /* JIS X0201 (1 byte kana) */
-                       len--;
                         *p++ = LC_JISX0201K;
                         *p++ = c1;
+                       sjis++;
+                       len--;
                 }
                 else if (IS_HIGHBIT_SET(c1))
                 {
                         /*
                          * JIS X0208, X0212, user defined extended characters
                          */
-                       c2 = *sjis++;
-                       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                       errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x",
-                                                   c1, c2)));
+                       if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1]))
+                               report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
+                       c2 = sjis[1];
                         k = (c1 << 8) + c2;
-/* Eiji Tokuya patched begin */
                         if (k >= 0xed40 && k < 0xf040)
                         {
                                 /* NEC selection IBM kanji */
@@ -214,19 +207,15 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                         }
  
                         if (k < 0xeb3f)
-/* Eiji Tokuya patched end */
                         {
                                 /* JIS X0208 */
-                               len -= 2;
                                 *p++ = LC_JISX0208;
                                 *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
                                 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
                         }
-/* Eiji Tokuya patched begin */
                         else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
                         {
                                 /* NEC selection IBM kanji - Other undecided justice */
-/* Eiji Tokuya patched end */
                                 *p++ = LC_JISX0208;
                                 *p++ = PGEUCALTCODE >> 8;
                                 *p++ = PGEUCALTCODE & 0xff;
@@ -237,7 +226,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                  * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
                                  * 0x7e7e EUC 0xf5a1 - 0xfefe
                                  */
-                               len -= 2;
                                 *p++ = LC_JISX0208;
                                 c1 -= 0x6f;
                                 *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
@@ -249,7 +237,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                  * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
                                  * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
                                  */
-                               len -= 2;
                                 *p++ = LC_JISX0212;
                                 c1 -= 0x74;
                                 *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
@@ -259,9 +246,7 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                         {
                                 /*
                                  * mapping IBM kanji to X0208 and X0212
-                                *
                                  */
-                               len -= 2;
                                 for (i = 0;; i++)
                                 {
                                         k2 = ibmkanji[i].sjis;
@@ -285,11 +270,16 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                         }
                                 }
                         }
+                       sjis += 2;
+                       len -= 2;
                 }
                 else
                 {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
                         *p++ = c1;
+                       sjis++;
+                       len--;
                 }
         }
         *p = '\0';
@@ -299,22 +289,37 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
   * MIC ---> SJIS
   */
  static void
-mic2sjis(unsigned char *mic, unsigned char *p, int len)
+mic2sjis(const unsigned char *mic, unsigned char *p, int len)
  {
         int                     c1,
                                 c2,
-                               k;
+                               k,
+                               l;
  
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
         {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                 if (c1 == LC_JISX0201K)
-                       *p++ = *mic++;
+                       *p++ = mic[1];
                 else if (c1 == LC_JISX0208)
                 {
-                       c1 = *mic++;
-                       c2 = *mic++;
+                       c1 = mic[1];
+                       c2 = mic[2];
                         k = (c1 << 8) | (c2 & 0xff);
                         if (k >= 0xf5a1)
                         {
@@ -331,8 +336,8 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
                         int                     i,
                                                 k2;
  
-                       c1 = *mic++;
-                       c2 = *mic++;
+                       c1 = mic[1];
+                       c2 = mic[2];
                         k = c1 << 8 | c2;
                         if (k >= 0xf5a1)
                         {
@@ -363,14 +368,11 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
                                 }
                         }
                 }
-               else if (IS_HIGHBIT_SET(c1))
-               {
-                       /* cannot convert to SJIS! */
-                       *p++ = PGSJISALTCODE >> 8;
-                       *p++ = PGSJISALTCODE & 0xff;
-               }
                 else
-                       *p++ = c1;              /* should be ASCII */
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_SJIS,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
         }
         *p = '\0';
  }
@@ -379,37 +381,48 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
   * EUC_JP ---> MIC
   */
  static void
-euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
+euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
  {
         int                     c1;
+       int                     l;
  
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
         {
+               c1 = *euc;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_JP,
+                                                                               (const char *) euc, len);
+                       *p++ = c1;
+                       euc++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_EUC_JP,
+                                                                       (const char *) euc, len);
                 if (c1 == SS2)
                 {                                               /* 1 byte kana? */
-                       len -= 2;
                         *p++ = LC_JISX0201K;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
                 }
                 else if (c1 == SS3)
                 {                                               /* JIS X0212 kanji? */
-                       len -= 3;
                         *p++ = LC_JISX0212;
-                       *p++ = *euc++;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       *p++ = euc[2];
                 }
-               else if (c1 & 0x80)
+               else
                 {                                               /* kanji? */
-                       len -= 2;
                         *p++ = LC_JISX0208;
                         *p++ = c1;
-                       *p++ = *euc++;
-               }
-               else
-               {                                               /* should be ASCII */
-                       len--;
-                       *p++ = c1;
+                       *p++ = euc[1];
                 }
+               euc += l;
+               len -= l;
         }
         *p = '\0';
  }
@@ -418,37 +431,50 @@ euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
   * MIC ---> EUC_JP
   */
  static void
-mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
+mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
  {
         int                     c1;
+       int                     l;
  
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
         {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                 if (c1 == LC_JISX0201K)
                 {
                         *p++ = SS2;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
                 }
                 else if (c1 == LC_JISX0212)
                 {
                         *p++ = SS3;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                 }
                 else if (c1 == LC_JISX0208)
                 {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
-               }
-               else if (IS_HIGHBIT_SET(c1))
-               {                                               /* cannot convert to EUC_JP! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                 }
                 else
-                       *p++ = c1;              /* should be ASCII */
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_JP,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
         }
         *p = '\0';
  }
@@ -457,30 +483,41 @@ mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
   * EUC_JP -> SJIS
   */
  static void
-euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
+euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len)
  {
         int                     c1,
                                 c2,
                                 k;
-       unsigned char *euc_end = euc + len;
+       int                     l;
  
-       while (euc_end >= euc && (c1 = *euc++))
+       while (len > 0)
         {
-               if (c1 < 0x80)
+               c1 = *euc;
+               if (!IS_HIGHBIT_SET(c1))
                 {
-                       /* should be ASCII */
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_JP,
+                                                                               (const char *) euc, len);
                         *p++ = c1;
+                       euc++;
+                       len--;
+                       continue;
                 }
-               else if (c1 == SS2)
+               l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_EUC_JP,
+                                                                       (const char *) euc, len);
+               if (c1 == SS2)
                 {
                         /* hankaku kana? */
-                       *p++ = *euc++;
+                       *p++ = euc[1];
                 }
                 else if (c1 == SS3)
                 {
                         /* JIS X0212 kanji? */
-                       c1 = *euc++;
-                       c2 = *euc++;
+                       c1 = euc[1];
+                       c2 = euc[2];
                         k = c1 << 8 | c2;
                         if (k >= 0xf5a1)
                         {
@@ -517,7 +554,7 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
                 else
                 {
                         /* JIS X0208 kanji? */
-                       c2 = *euc++;
+                       c2 = euc[1];
                         k = (c1 << 8) | (c2 & 0xff);
                         if (k >= 0xf5a1)
                         {
@@ -529,6 +566,8 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
                                 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
                         *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
                 }
+               euc += l;
+               len -= l;
         }
         *p = '\0';
  }
@@ -537,23 +576,34 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
   * SJIS ---> EUC_JP
   */
  static void
-sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
+sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len)
  {
         int                     c1,
                                 c2,
                                 i,
                                 k,
                                 k2;
-       unsigned char *sjis_end = sjis + len;
+       int                     l;
  
-       while (sjis_end >= sjis && (c1 = *sjis++))
+       while (len > 0)
         {
-               if (c1 < 0x80)
+               c1 = *sjis;
+               if (!IS_HIGHBIT_SET(c1))
                 {
-                       /* should be ASCII */
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_SJIS,
+                                                                               (const char *) sjis, len);
                         *p++ = c1;
+                       sjis++;
+                       len--;
+                       continue;
                 }
-               else if (c1 >= 0xa1 && c1 <= 0xdf)
+               l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_SJIS,
+                                                                       (const char *) sjis, len);
+               if (c1 >= 0xa1 && c1 <= 0xdf)
                 {
                         /* JIS X0201 (1 byte kana) */
                         *p++ = SS2;
@@ -564,12 +614,7 @@ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
                         /*
                          * JIS X0208, X0212, user defined extended characters
                          */
-                       c2 = *sjis++;
-                       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                       errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x",
-                                                   c1, c2)));
+                       c2 = sjis[1];
                         k = (c1 << 8) + c2;
                         if (k >= 0xed40 && k < 0xf040)
                         {
@@ -650,6 +695,8 @@ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
                                 }
                         }
                 }
+               sjis += l;
+               len -= l;
         }
         *p = '\0';
  }
diff --git a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c

index ff08a4f..a424803 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
+++ b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.13 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.14 2006/05/21 20:05:19 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -32,8 +32,8 @@ extern Datum mic_to_euc_kr(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len);
+static void euc_kr2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_kr(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  euc_kr_to_mic(PG_FUNCTION_ARGS)
@@ -71,23 +71,34 @@ mic_to_euc_kr(PG_FUNCTION_ARGS)
   * EUC_KR ---> MIC
   */
  static void
-euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
+euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
  {
         int                     c1;
+       int                     l;
  
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
         {
+               c1 = *euc;
                 if (IS_HIGHBIT_SET(c1))
                 {
-                       len -= 2;
+                       l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
+                       if (l != 2)
+                               report_invalid_encoding(PG_EUC_KR,
+                                                                               (const char *) euc, len);
                         *p++ = LC_KS5601;
                         *p++ = c1;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       euc += 2;
+                       len -= 2;
                 }
                 else
                 {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_KR,
+                                                                               (const char *) euc, len);
                         *p++ = c1;
+                       euc++;
+                       len--;
                 }
         }
         *p = '\0';
@@ -97,26 +108,39 @@ euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
   * MIC ---> EUC_KR
   */
  static void
-mic2euc_kr(unsigned char *mic, unsigned char *p, int len)
+mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
  {
         int                     c1;
+       int                     l;
  
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
         {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == LC_KS5601)
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
                 {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
                 }
-               else if (IS_HIGHBIT_SET(c1))
-               {                                               /* cannot convert to EUC_KR! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
+               if (c1 == LC_KS5601)
+               {
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                 }
                 else
-                       *p++ = c1;              /* should be ASCII */
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
         }
         *p = '\0';
  }
diff --git a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c

index 501dab4..28ca458 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
+++ b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.13 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.14 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -42,10 +42,10 @@ extern Datum mic_to_big5(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void big52mic(unsigned char *big5, unsigned char *p, int len);
-static void mic2big5(unsigned char *mic, unsigned char *p, int len);
-static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len);
+static void big52mic(const unsigned char *big5, unsigned char *p, int len);
+static void mic2big5(const unsigned char *mic, unsigned char *p, int len);
+static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  euc_tw_to_big5(PG_FUNCTION_ARGS)
@@ -114,7 +114,7 @@ mic_to_euc_tw(PG_FUNCTION_ARGS)
         Assert(PG_GETARG_INT32(1) == PG_EUC_TW);
         Assert(len >= 0);
  
-       mic2big5(src, dest, len);
+       mic2euc_tw(src, dest, len);
  
         PG_RETURN_VOID();
  }
@@ -155,39 +155,52 @@ mic_to_big5(PG_FUNCTION_ARGS)
   * EUC_TW ---> MIC
   */
  static void
-euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
+euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
  {
         int                     c1;
+       int                     l;
  
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
         {
-               if (c1 == SS2)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                 {
-                       len -= 4;
-                       c1 = *euc++;            /* plane No. */
-                       if (c1 == 0xa1)
-                               *p++ = LC_CNS11643_1;
-                       else if (c1 == 0xa2)
-                               *p++ = LC_CNS11643_2;
-                       else
+                       l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
+                       if (l < 0)
+                               report_invalid_encoding(PG_EUC_TW,
+                                                                               (const char *) euc, len);
+                       if (c1 == SS2)
                         {
-                               *p++ = 0x9d;    /* LCPRV2 */
-                               *p++ = 0xa3 - c1 + LC_CNS11643_3;
+                               c1 = euc[1];            /* plane No. */
+                               if (c1 == 0xa1)
+                                       *p++ = LC_CNS11643_1;
+                               else if (c1 == 0xa2)
+                                       *p++ = LC_CNS11643_2;
+                               else
+                               {
+                                       *p++ = 0x9d;    /* LCPRV2 */
+                                       *p++ = c1 - 0xa3 + LC_CNS11643_3;
+                               }
+                               *p++ = euc[2];
+                               *p++ = euc[3];
                         }
-                       *p++ = *euc++;
-                       *p++ = *euc++;
-               }
-               else if (IS_HIGHBIT_SET(c1))
-               {                                               /* CNS11643-1 */
-                       len -= 2;
-                       *p++ = LC_CNS11643_1;
-                       *p++ = c1;
-                       *p++ = *euc++;
+                       else
+                       {                                               /* CNS11643-1 */
+                               *p++ = LC_CNS11643_1;
+                               *p++ = c1;
+                               *p++ = euc[1];
+                       }
+                       euc += l;
+                       len -= l;
                 }
                 else
                 {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_TW,
+                                                                               (const char *) euc, len);
                         *p++ = c1;
+                       euc++;
+                       len--;
                 }
         }
         *p = '\0';
@@ -197,40 +210,54 @@ euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
   * MIC ---> EUC_TW
   */
  static void
-mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
+mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
  {
         int                     c1;
+       int                     l;
  
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
         {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                 if (c1 == LC_CNS11643_1)
                 {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                 }
                 else if (c1 == LC_CNS11643_2)
                 {
                         *p++ = SS2;
                         *p++ = 0xa2;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                 }
-               else if (c1 == 0x9d)
+               else if (c1 == 0x9d &&
+                                mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
                 {                                               /* LCPRV2? */
                         *p++ = SS2;
-                       *p++ = *mic++ - LC_CNS11643_3 + 0xa3;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
-               }
-               else if (IS_HIGHBIT_SET(c1))
-               {                                               /* cannot convert to EUC_TW! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       *p++ = mic[1] - LC_CNS11643_3 + 0xa3;
+                       *p++ = mic[2];
+                       *p++ = mic[3];
                 }
                 else
-                       *p++ = c1;              /* should be ASCII */
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
         }
         *p = '\0';
  }
@@ -239,52 +266,49 @@ mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
   * Big5 ---> MIC
   */
  static void
-big52mic(unsigned char *big5, unsigned char *p, int len)
+big52mic(const unsigned char *big5, unsigned char *p, int len)
  {
         unsigned short c1;
         unsigned short big5buf,
                                 cnsBuf;
         unsigned char lc;
-       char            bogusBuf[3];
-       int                     i;
+       int                     l;
  
-       while (len >= 0 && (c1 = *big5++))
+       while (len > 0)
         {
+               c1 = *big5;
                 if (!IS_HIGHBIT_SET(c1))
-               {                                               /* ASCII */
-                       len--;
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_BIG5,
+                                                                               (const char *) big5, len);
                         *p++ = c1;
+                       big5++;
+                       len--;
+                       continue;
                 }
-               else
+               l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_BIG5,
+                                                                       (const char *) big5, len);
+               big5buf = (c1 << 8) | big5[1];
+               cnsBuf = BIG5toCNS(big5buf, &lc);
+               if (lc != 0)
                 {
-                       len -= 2;
-                       big5buf = c1 << 8;
-                       c1 = *big5++;
-                       big5buf |= c1;
-                       cnsBuf = BIG5toCNS(big5buf, &lc);
-                       if (lc != 0)
+                       if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
                         {
-                               if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
-                               {
-                                       *p++ = 0x9d;    /* LCPRV2 */
-                               }
-                               *p++ = lc;              /* Plane No. */
-                               *p++ = (cnsBuf >> 8) & 0x00ff;
-                               *p++ = cnsBuf & 0x00ff;
-                       }
-                       else
-                       {                                       /* cannot convert */
-                               big5 -= 2;
-                               *p++ = '(';
-                               for (i = 0; i < 2; i++)
-                               {
-                                       sprintf(bogusBuf, "%02x", *big5++);
-                                       *p++ = bogusBuf[0];
-                                       *p++ = bogusBuf[1];
-                               }
-                               *p++ = ')';
+                               *p++ = 0x9d;    /* LCPRV2 */
                         }
+                       *p++ = lc;              /* Plane No. */
+                       *p++ = (cnsBuf >> 8) & 0x00ff;
+                       *p++ = cnsBuf & 0x00ff;
                 }
+               else
+                       report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL,
+                                                                          (const char *) big5, len);
+               big5 += l;
+               len -= l;
         }
         *p = '\0';
  }
@@ -293,46 +317,55 @@ big52mic(unsigned char *big5, unsigned char *p, int len)
   * MIC ---> Big5
   */
  static void
-mic2big5(unsigned char *mic, unsigned char *p, int len)
+mic2big5(const unsigned char *mic, unsigned char *p, int len)
  {
-       int                     l;
         unsigned short c1;
         unsigned short big5buf,
                                 cnsBuf;
+       int                     l;
  
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
         {
-               l = pg_mic_mblen(mic++);
-               len -= l;
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                 /* 0x9d means LCPRV2 */
                 if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == 0x9d)
                 {
                         if (c1 == 0x9d)
                         {
-                               c1 = *mic++;    /* get plane no. */
-                       }
-                       cnsBuf = (*mic++) << 8;
-                       cnsBuf |= (*mic++) & 0x00ff;
-                       big5buf = CNStoBIG5(cnsBuf, c1);
-                       if (big5buf == 0)
-                       {                                       /* cannot convert to Big5! */
-                               mic -= l;
-                               pg_print_bogus_char(&mic, &p);
+                               c1 = mic[1];    /* get plane no. */
+                               cnsBuf = (mic[2] << 8) | mic[3];
                         }
                         else
                         {
-                               *p++ = (big5buf >> 8) & 0x00ff;
-                               *p++ = big5buf & 0x00ff;
+                               cnsBuf = (mic[1] << 8) | mic[2];
                         }
+                       big5buf = CNStoBIG5(cnsBuf, c1);
+                       if (big5buf == 0)
+                               report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
+                                                                                  (const char *) mic, len);
+                       *p++ = (big5buf >> 8) & 0x00ff;
+                       *p++ = big5buf & 0x00ff;
                 }
-               else if (!IS_HIGHBIT_SET(c1))   /* ASCII */
-                       *p++ = c1;
                 else
-               {                                               /* cannot convert to Big5! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
         }
         *p = '\0';
  }
diff --git a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c

index b85ebf4..5563b20 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c
+++ b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.11 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.12 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -42,10 +42,10 @@ extern Datum win1250_to_latin2(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void latin22mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin2(unsigned char *mic, unsigned char *p, int len);
-static void win12502mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win1250(unsigned char *mic, unsigned char *p, int len);
+static void latin22mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin2(const unsigned char *mic, unsigned char *p, int len);
+static void win12502mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win1250(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  latin2_to_mic(PG_FUNCTION_ARGS)
@@ -152,14 +152,15 @@ win1250_to_latin2(PG_FUNCTION_ARGS)
  }
  
  static void
-latin22mic(unsigned char *l, unsigned char *p, int len)
+latin22mic(const unsigned char *l, unsigned char *p, int len)
  {
-       latin2mic(l, p, len, LC_ISO8859_2);
+       latin2mic(l, p, len, LC_ISO8859_2, PG_LATIN2);
  }
+
  static void
-mic2latin2(unsigned char *mic, unsigned char *p, int len)
+mic2latin2(const unsigned char *mic, unsigned char *p, int len)
  {
-       mic2latin(mic, p, len, LC_ISO8859_2);
+       mic2latin(mic, p, len, LC_ISO8859_2, PG_LATIN2);
  }
  
  /*-----------------------------------------------------------------
@@ -167,9 +168,9 @@ mic2latin2(unsigned char *mic, unsigned char *p, int len)
   * Microsoft's CP1250(windows-1250)
   *-----------------------------------------------------------------*/
  static void
-win12502mic(unsigned char *l, unsigned char *p, int len)
+win12502mic(const unsigned char *l, unsigned char *p, int len)
  {
-       static unsigned char win1250_2_iso88592[] = {
+       static const unsigned char win1250_2_iso88592[] = {
                 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                 0x88, 0x89, 0xA9, 0x8B, 0xA6, 0xAB, 0xAE, 0xAC,
                 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
@@ -188,12 +189,14 @@ win12502mic(unsigned char *l, unsigned char *p, int len)
                 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
         };
  
-       latin2mic_with_table(l, p, len, LC_ISO8859_2, win1250_2_iso88592);
+       latin2mic_with_table(l, p, len, LC_ISO8859_2, PG_WIN1250,
+                                                win1250_2_iso88592);
  }
+
  static void
-mic2win1250(unsigned char *mic, unsigned char *p, int len)
+mic2win1250(const unsigned char *mic, unsigned char *p, int len)
  {
-       static unsigned char iso88592_2_win1250[] = {
+       static const unsigned char iso88592_2_win1250[] = {
                 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                 0x88, 0x89, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x00,
                 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
@@ -212,5 +215,6 @@ mic2win1250(unsigned char *mic, unsigned char *p, int len)
                 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
         };
  
-       mic2latin_with_table(mic, p, len, LC_ISO8859_2, iso88592_2_win1250);
+       mic2latin_with_table(mic, p, len, LC_ISO8859_2, PG_WIN1250,
+                                                iso88592_2_win1250);
  }
diff --git a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c

index adf72e2..14c220c 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c
+++ b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.11 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.12 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -40,12 +40,12 @@ extern Datum mic_to_latin4(PG_FUNCTION_ARGS);
   * ----------
   */
  
-static void latin12mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin1(unsigned char *mic, unsigned char *p, int len);
-static void latin32mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin3(unsigned char *mic, unsigned char *p, int len);
-static void latin42mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin4(unsigned char *mic, unsigned char *p, int len);
+static void latin12mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin1(const unsigned char *mic, unsigned char *p, int len);
+static void latin32mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin3(const unsigned char *mic, unsigned char *p, int len);
+static void latin42mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin4(const unsigned char *mic, unsigned char *p, int len);
  
  Datum
  latin1_to_mic(PG_FUNCTION_ARGS)
@@ -144,32 +144,37 @@ mic_to_latin4(PG_FUNCTION_ARGS)
  }
  
  static void
-latin12mic(unsigned char *l, unsigned char *p, int len)
+latin12mic(const unsigned char *l, unsigned char *p, int len)
  {
-       latin2mic(l, p, len, LC_ISO8859_1);
+       latin2mic(l, p, len, LC_ISO8859_1, PG_LATIN1);
  }
+
  static void
-mic2latin1(unsigned char *mic, unsigned char *p, int len)
+mic2latin1(const unsigned char *mic, unsigned char *p, int len)
  {
-       mic2latin(mic, p, len, LC_ISO8859_1);
+       mic2latin(mic, p, len, LC_ISO8859_1, PG_LATIN1);
  }
+
  static void
-latin32mic(unsigned char *l, unsigned char *p, int len)
+latin32mic(const unsigned char *l, unsigned char *p, int len)
  {
-       latin2mic(l, p, len, LC_ISO8859_3);
+       latin2mic(l, p, len, LC_ISO8859_3, PG_LATIN3);
  }
+
  static void
-mic2latin3(unsigned char *mic, unsigned char *p, int len)
+mic2latin3(const unsigned char *mic, unsigned char *p, int len)
  {
-       mic2latin(mic, p, len, LC_ISO8859_3);
+       mic2latin(mic, p, len, LC_ISO8859_3, PG_LATIN3);
  }
+
  static void
-latin42mic(unsigned char *l, unsigned char *p, int len)
+latin42mic(const unsigned char *l, unsigned char *p, int len)
  {
-       latin2mic(l, p, len, LC_ISO8859_4);
+       latin2mic(l, p, len, LC_ISO8859_4, PG_LATIN4);
  }
+
  static void
-mic2latin4(unsigned char *mic, unsigned char *p, int len)
+mic2latin4(const unsigned char *mic, unsigned char *p, int len)
  {
-       mic2latin(mic, p, len, LC_ISO8859_4);
+       mic2latin(mic, p, len, LC_ISO8859_4, PG_LATIN4);
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c b/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c

index 33bd4bc..ce9639d 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.12 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.13 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -43,6 +43,7 @@ ascii_to_utf8(PG_FUNCTION_ARGS)
         Assert(PG_GETARG_INT32(1) == PG_UTF8);
         Assert(len >= 0);
  
+       /* this looks wrong, but basically we're just rejecting high-bit-set */
         pg_ascii2mic(src, dest, len);
  
         PG_RETURN_VOID();
@@ -59,6 +60,7 @@ utf8_to_ascii(PG_FUNCTION_ARGS)
         Assert(PG_GETARG_INT32(1) == PG_SQL_ASCII);
         Assert(len >= 0);
  
+       /* this looks wrong, but basically we're just rejecting high-bit-set */
         pg_mic2ascii(src, dest, len);
  
         PG_RETURN_VOID();
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c

index 3ec1497..00fd62c 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.12 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.13 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_big5(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapBIG5,
-                          sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), PG_BIG5, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c

index 3dd4796..0854e0d 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.14 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.15 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -46,7 +46,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapKOI8R,
-                          sizeof(ULmapKOI8R) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapKOI8R) / sizeof(pg_utf_to_local), PG_KOI8R, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c

index 860445f..23a1a50 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.13 2006/03/05 15:58:47 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.14 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapEUC_CN,
-                          sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), PG_EUC_CN, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c

index 1662d79..11bcd7e 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.13 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.14 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapEUC_JP,
-                          sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), PG_EUC_JP, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c

index 2059ad9..6895843 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.13 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.14 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapEUC_KR,
-                          sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), PG_EUC_KR, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c

index f7141df..a26139d 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.13 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.14 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapEUC_TW,
-                          sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), PG_EUC_TW, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c

index 379dae6..f2587b0 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.13 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.14 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapGB18030,
-                          sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), PG_GB18030, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c

index f52004f..be7a283 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.12 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.13 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapGBK,
-                          sizeof(ULmapGBK) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapGBK) / sizeof(pg_utf_to_local), PG_GBK, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c

index 8a86c35..6de77c1 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.18 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.19 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -153,7 +153,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS)
         {
                 if (encoding == maps[i].encoding)
                 {
-                       UtfToLocal(src, dest, maps[i].map2, maps[i].size2, len);
+                       UtfToLocal(src, dest, maps[i].map2, maps[i].size2, encoding, len);
                         PG_RETURN_VOID();
                 }
         }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c

index 1686137..038f678 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.15 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.16 2006/05/21 20:05:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -44,8 +44,11 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS)
         Assert(PG_GETARG_INT32(1) == PG_UTF8);
         Assert(len >= 0);
  
-       while (len-- > 0 && (c = *src++))
+       while (len > 0)
         {
+               c = *src;
+               if (c == 0)
+                       report_invalid_encoding(PG_LATIN1, (const char *) src, len);
                 if (!IS_HIGHBIT_SET(c))
                         *dest++ = c;
                 else
@@ -53,6 +56,8 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS)
                         *dest++ = (c >> 6) | 0xc0;
                         *dest++ = (c & 0x003f) | HIGHBIT;
                 }
+               src++;
+               len--;
         }
         *dest = '\0';
  
@@ -66,32 +71,44 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS)
         unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
         int                     len = PG_GETARG_INT32(4);
         unsigned short c,
-                               c1,
-                               c2;
+                               c1;
  
         Assert(PG_GETARG_INT32(0) == PG_UTF8);
         Assert(PG_GETARG_INT32(1) == PG_LATIN1);
         Assert(len >= 0);
  
-       while (len >= 0 && (c = *src++))
+       while (len > 0)
         {
-               if ((c & 0xe0) == 0xc0)
+               c = *src;
+               if (c == 0)
+                       report_invalid_encoding(PG_UTF8, (const char *) src, len);
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(c))
                 {
-                       c1 = c & 0x1f;
-                       c2 = *src++ & 0x3f;
-                       *dest = c1 << 6;
-                       *dest++ |= c2;
-                       len -= 2;
+                       *dest++ = c;
+                       src++;
+                       len--;
                 }
-               else if ((c & 0xe0) == 0xe0)
-                       ereport(WARNING,
-                                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-                                        errmsg("ignoring unconvertible UTF-8 character 0x%04x",
-                                                       c)));
                 else
                 {
-                       *dest++ = c;
-                       len--;
+                       int             l = pg_utf_mblen(src);
+
+                       if (l > len || !pg_utf8_islegal(src, l))
+                               report_invalid_encoding(PG_UTF8, (const char *) src, len);
+                       if (l != 2)
+                               report_untranslatable_char(PG_UTF8, PG_LATIN1,
+                                                                                  (const char *) src, len);
+                       c1 = src[1] & 0x3f;
+                       c = ((c & 0x1f) << 6) | c1;
+                       if (c >= 0x80 && c <= 0xff)
+                       {
+                               *dest++ = (unsigned char) c;
+                               src += 2;
+                               len -= 2;
+                       }
+                       else
+                               report_untranslatable_char(PG_UTF8, PG_LATIN1,
+                                                                                  (const char *) src, len);
                 }
         }
         *dest = '\0';
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c

index 5896faf..7223ece 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.13 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.14 2006/05/21 20:05:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_johab(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapJOHAB,
-                          sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), PG_JOHAB, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c

index decdc5f..12f9f43 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.12 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.13 2006/05/21 20:05:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapSJIS,
-                          sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), PG_SJIS, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c

index 5689dc5..860b475 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.12 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.13 2006/05/21 20:05:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,7 +62,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS)
         Assert(len >= 0);
  
         UtfToLocal(src, dest, ULmapUHC,
-                          sizeof(ULmapUHC) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapUHC) / sizeof(pg_utf_to_local), PG_UHC, len);
  
         PG_RETURN_VOID();
  }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c

index ad7f319..932b164 100644 (file)
--- a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c,v 1.2 2006/03/05 15:58:48 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c,v 1.3 2006/05/21 20:05:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -143,7 +143,7 @@ utf8_to_win(PG_FUNCTION_ARGS)
         {
                 if (encoding == maps[i].encoding)
                 {
-                       UtfToLocal(src, dest, maps[i].map2, maps[i].size2, len);
+                       UtfToLocal(src, dest, maps[i].map2, maps[i].size2, encoding, len);
                         PG_RETURN_VOID();
                 }
         }
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c

index dc43b04..935e4a8 100644 (file)
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -4,7 +4,7 @@
   * (currently mule internal code (mic) is used)
   * Tatsuo Ishii
   *
- * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.55 2006/01/12 22:04:02 neilc Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.56 2006/05/21 20:05:19 tgl Exp $
   */
  #include "postgres.h"
  
@@ -362,8 +362,49 @@ pg_client_to_server(const char *s, int len)
         Assert(DatabaseEncoding);
         Assert(ClientEncoding);
  
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding)
+       if (len <= 0)
+               return (char *) s;
+
+       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
+               ClientEncoding->encoding == PG_SQL_ASCII)
+       {
+               /*
+                * No conversion is needed, but we must still validate the data.
+                */
+               (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
+               return (char *) s;
+       }
+
+       if (DatabaseEncoding->encoding == PG_SQL_ASCII)
+       {
+               /*
+                * No conversion is possible, but we must still validate the data,
+                * because the client-side code might have done string escaping
+                * using the selected client_encoding.  If the client encoding is
+                * ASCII-safe then we just do a straight validation under that
+                * encoding.  For an ASCII-unsafe encoding we have a problem:
+                * we dare not pass such data to the parser but we have no way
+                * to convert it.  We compromise by rejecting the data if it
+                * contains any non-ASCII characters.
+                */
+               if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
+                       (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
+               else
+               {
+                       int             i;
+
+                       for (i = 0; i < len; i++)
+                       {
+                               if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                                                        errmsg("invalid byte value for encoding \"%s\": 0x%02x",
+                                                                       pg_enc2name_tbl[PG_SQL_ASCII].name,
+                                                                       (unsigned char) s[i])));
+                       }
+               }
                 return (char *) s;
+       }
  
         return perform_default_encoding_conversion(s, len, true);
  }
@@ -377,9 +418,14 @@ pg_server_to_client(const char *s, int len)
         Assert(DatabaseEncoding);
         Assert(ClientEncoding);
  
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding)
+       if (len <= 0)
                 return (char *) s;
  
+       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
+               ClientEncoding->encoding == PG_SQL_ASCII ||
+               DatabaseEncoding->encoding == PG_SQL_ASCII)
+               return (char *) s;              /* assume data is valid */
+
         return perform_default_encoding_conversion(s, len, false);
  }
  
@@ -398,9 +444,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_
                                 dest_encoding;
         FmgrInfo   *flinfo;
  
-       if (len <= 0)
-               return (char *) src;
-
         if (is_client_to_server)
         {
                 src_encoding = ClientEncoding->encoding;
@@ -417,12 +460,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_
         if (flinfo == NULL)
                 return (char *) src;
  
-       if (src_encoding == dest_encoding)
-               return (char *) src;
-
-       if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
-               return (char *) src;
-
         result = palloc(len * 4 + 1);
  
         FunctionCall5(flinfo,
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c

index d996b6c..0cd1d31 100644 (file)
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,7 +1,7 @@
  /*
   * conversion functions between pg_wchar and multibyte streams.
   * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.54 2006/02/18 16:15:22 petere Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.55 2006/05/21 20:05:19 tgl Exp $
   *
   * WIN1250 client encoding updated by Pavel Behal
   *
@@ -19,17 +19,21 @@
  
  /*
   * conversion to pg_wchar is done by "table driven."
- * to add an encoding support, define mb2wchar_with_len(), mblen()
+ * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen()
   * for the particular encoding. Note that if the encoding is only
   * supported in the client, you don't need to define
   * mb2wchar_with_len() function (SJIS is the case).
   *
+ * These functions generally assume that their input is validly formed.
+ * The "verifier" functions, further down in the file, have to be more
+ * paranoid.  We expect that mblen() does not need to examine more than
+ * the first byte of the character to discover the correct length.
+ *
   * Note: for the display output of psql to work properly, the return values
- * of these functions must conform to the Unicode standard. In particular
+ * of the dsplen functions must conform to the Unicode standard. In particular
   * the NUL character is zero width and control characters are generally
   * width -1. It is recommended that non-ASCII encodings refer their ASCII
- * subset to the ASCII routines to ensure consistancy.
- *
+ * subset to the ASCII routines to ensure consistency.
   */
  
  /*
@@ -109,7 +113,7 @@ static int  pg_euc2wchar_with_len
         return cnt;
  }
  
-static int
+static inline int
  pg_euc_mblen(const unsigned char *s)
  {
         int                     len;
@@ -125,7 +129,7 @@ pg_euc_mblen(const unsigned char *s)
         return len;
  }
  
-static int
+static inline int
  pg_euc_dsplen(const unsigned char *s)
  {
         int                     len;
@@ -316,7 +320,7 @@ pg_euctw_mblen(const unsigned char *s)
         else if (IS_HIGHBIT_SET(*s))
                 len = 2;
         else
-               len = pg_ascii_dsplen(s);
+               len = 1;
         return len;
  }
  
@@ -409,7 +413,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  }
  
  /*
- * returns the byte length of a UTF8 word pointed to by s
+ * returns the byte length of a UTF8 character pointed to by s
   */
  int
  pg_utf_mblen(const unsigned char *s)
@@ -680,7 +684,20 @@ pg_mule_mblen(const unsigned char *s)
  static int
  pg_mule_dsplen(const unsigned char *s)
  {
-       return pg_ascii_dsplen(s);                                      /* XXX fix me! */
+       int len;
+
+       if (IS_LC1(*s))
+               len = 1;
+       else if (IS_LCPRV1(*s))
+               len = 1;
+       else if (IS_LC2(*s))
+               len = 2;
+       else if (IS_LCPRV2(*s))
+               len = 2;
+       else
+               len = 1;        /* assume ASCII */
+
+       return len;
  }
  
  /*
@@ -860,233 +877,646 @@ pg_gb18030_dsplen(const unsigned char *s)
         return len;
  }
  
+/*
+ *-------------------------------------------------------------------
+ * multibyte sequence validators
+ *
+ * These functions accept "s", a pointer to the first byte of a string,
+ * and "len", the remaining length of the string.  If there is a validly
+ * encoded character beginning at *s, return its length in bytes; else
+ * return -1.
+ *
+ * The functions can assume that len > 0 and that *s != '\0', but they must
+ * test for and reject zeroes in any additional bytes of a multibyte character.
+ *
+ * Note that this definition allows the function for a single-byte
+ * encoding to be just "return 1".
+ *-------------------------------------------------------------------
+ */
  
-pg_wchar_tbl pg_wchar_table[] = {
-       {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1},          /* 0; PG_SQL_ASCII      */
-       {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3},          /* 1; PG_EUC_JP */
-       {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3},          /* 2; PG_EUC_CN */
-       {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},          /* 3; PG_EUC_KR */
-       {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},          /* 4; PG_EUC_TW */
-       {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},          /* 5; PG_JOHAB */
-       {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4},        /* 6; PG_UTF8 */
-       {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 8; PG_LATIN1 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 9; PG_LATIN2 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 10; PG_LATIN3 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 11; PG_LATIN4 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 12; PG_LATIN5 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 13; PG_LATIN6 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 14; PG_LATIN7 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 15; PG_LATIN8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 16; PG_LATIN9 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 17; PG_LATIN10 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 18; PG_WIN1256 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 19; PG_WIN1258 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 20; PG_WIN874 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 21; PG_KOI8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 22; PG_WIN1251 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 22; PG_WIN1252 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 23; PG_WIN866 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 24; ISO-8859-5 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 25; ISO-8859-6 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 26; ISO-8859-7 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 27; ISO-8859-8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 28; PG_WIN1250 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 29; PG_WIN1253 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 30; PG_WIN1254 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 31; PG_WIN1255 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 32; PG_WIN1257 */
-       {0, pg_sjis_mblen, pg_sjis_dsplen, 2},          /* 33; PG_SJIS */
-       {0, pg_big5_mblen, pg_big5_dsplen, 2},          /* 34; PG_BIG5 */
-       {0, pg_gbk_mblen, pg_gbk_dsplen, 2},            /* 35; PG_GBK */
-       {0, pg_uhc_mblen, pg_uhc_dsplen, 2},            /* 36; PG_UHC */
-       {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 37; PG_GB18030 */
-};
+static int
+pg_ascii_verifier(const unsigned char *s, int len)
+{
+       return 1;
+}
  
-/* returns the byte length of a word for mule internal code */
-int
-pg_mic_mblen(const unsigned char *mbstr)
+#define IS_EUC_RANGE_VALID(c)  ((c) >= 0xa1 && (c) <= 0xfe)
+
+static int
+pg_eucjp_verifier(const unsigned char *s, int len)
  {
-       return pg_mule_mblen(mbstr);
+       int                     l;
+       unsigned char c1, c2;
+
+       c1 = *s++;
+
+       switch (c1)
+       {
+               case SS2:               /* JIS X 0201 */
+                       l = 2;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       if (c2 < 0xa1 || c2 > 0xdf)
+                               return -1;
+                       break;
+
+               case SS3:               /* JIS X 0212 */
+                       l = 3;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       break;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1))         /* JIS X 0208? */
+                       {
+                               l = 2;
+                               if (l > len)
+                                       return -1;
+                               if (!IS_EUC_RANGE_VALID(c1))
+                                       return -1;
+                               c2 = *s++;
+                               if (!IS_EUC_RANGE_VALID(c2))
+                                       return -1;
+                       }
+                       else            /* must be ASCII */
+                       {
+                               l = 1;
+                       }
+                       break;
+       }
+
+       return l;
  }
  
-/*
- * Returns the byte length of a multibyte word.
- */
-int
-pg_encoding_mblen(int encoding, const char *mbstr)
+static int
+pg_euckr_verifier(const unsigned char *s, int len)
  {
-       Assert(PG_VALID_ENCODING(encoding));
+       int                     l;
+       unsigned char c1, c2;
  
-       return ((encoding >= 0 &&
-                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-               ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
-       ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
+       c1 = *s++;
+
+       if (IS_HIGHBIT_SET(c1))
+       {
+               l = 2;
+               if (l > len)
+                       return -1;
+               if (!IS_EUC_RANGE_VALID(c1))
+                       return -1;
+               c2 = *s++;
+               if (!IS_EUC_RANGE_VALID(c2))
+                       return -1;
+       }
+       else            /* must be ASCII */
+       {
+               l = 1;
+       }
+
+       return l;
  }
  
-/*
- * Returns the display length of a multibyte word.
- */
-int
-pg_encoding_dsplen(int encoding, const char *mbstr)
+/* EUC-CN byte sequences are exactly the same as EUC-KR */
+#define pg_euccn_verifier      pg_euckr_verifier
+
+static int
+pg_euctw_verifier(const unsigned char *s, int len)
  {
-       Assert(PG_VALID_ENCODING(encoding));
+       int                     l;
+       unsigned char c1, c2;
  
-       return ((encoding >= 0 &&
-                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-          ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) :
-       ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));
+       c1 = *s++;
+
+       switch (c1)
+       {
+               case SS2:               /* CNS 11643 Plane 1-7 */
+                       l = 4;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       if (c2 < 0xa1 || c2 > 0xa7)
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       break;
+
+               case SS3:               /* unused */
+                       return -1;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1))         /* CNS 11643 Plane 1 */
+                       {
+                               l = 2;
+                               if (l > len)
+                                       return -1;
+                               /* no further range check on c1? */
+                               c2 = *s++;
+                               if (!IS_EUC_RANGE_VALID(c2))
+                                       return -1;
+                       }
+                       else            /* must be ASCII */
+                       {
+                               l = 1;
+                       }
+                       break;
+       }
+       return l;
  }
  
-/*
- * fetch maximum length of a char encoding
- */
-int
-pg_encoding_max_length(int encoding)
+static int
+pg_johab_verifier(const unsigned char *s, int len)
  {
-       Assert(PG_VALID_ENCODING(encoding));
+       int l, mbl;
+       unsigned char c;
  
-       return pg_wchar_table[encoding].maxmblen;
+       l = mbl = pg_johab_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       if (!IS_HIGHBIT_SET(*s))
+               return mbl;
+
+       while (--l > 0)
+       {
+               c = *++s;
+               if (!IS_EUC_RANGE_VALID(c))
+                       return -1;
+       }
+       return mbl;
  }
  
-#ifndef FRONTEND
+static int
+pg_mule_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+       unsigned char c;
+
+       l = mbl = pg_mule_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               c = *++s;
+               if (!IS_HIGHBIT_SET(c))
+                       return -1;
+       }
+       return mbl;
+}
+
+static int
+pg_latin1_verifier(const unsigned char *s, int len)
+{
+       return 1;
+}
+
+static int
+pg_sjis_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+       unsigned char c1, c2;
+
+       l = mbl = pg_sjis_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       if (l == 1)                                     /* pg_sjis_mblen already verified it */
+               return mbl;
+
+       c1 = *s++;
+       c2 = *s;
+       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
+               return -1;
+       return mbl;
+}
+
+static int
+pg_big5_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_big5_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_gbk_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_gbk_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
  
+static int
+pg_uhc_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_uhc_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_gb18030_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_gb18030_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_utf8_verifier(const unsigned char *s, int len)
+{
+       int l = pg_utf_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       if (!pg_utf8_islegal(s, l))
+               return -1;
+
+       return l;
+}
+
+/*
+ * Check for validity of a single UTF-8 encoded character
+ *
+ * This directly implements the rules in RFC3629.  The bizarre-looking
+ * restrictions on the second byte are meant to ensure that there isn't
+ * more than one encoding of a given Unicode character point; that is,
+ * you may not use a longer-than-necessary byte sequence with high order
+ * zero bits to represent a character that would fit in fewer bytes.
+ * To do otherwise is to create security hazards (eg, create an apparent
+ * non-ASCII character that decodes to plain ASCII).
+ *
+ * length is assumed to have been obtained by pg_utf_mblen(), and the
+ * caller must have checked that that many bytes are present in the buffer.
+ */
  bool
  pg_utf8_islegal(const unsigned char *source, int length)
  {
         unsigned char a;
-       const unsigned char *srcptr = source + length;
  
         switch (length)
         {
                 default:
+                       /* reject lengths 5 and 6 for now */
                         return false;
-                       /* Everything else falls through when "true"... */
                 case 4:
-                       if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
+                       a = source[3];
+                       if (a < 0x80 || a > 0xBF)
                                 return false;
+                       /* FALL THRU */
                 case 3:
-                       if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
+                       a = source[2];
+                       if (a < 0x80 || a > 0xBF)
                                 return false;
+                       /* FALL THRU */
                 case 2:
-                       if ((a = (*--srcptr)) > 0xBF)
-                               return false;
+                       a = source[1];
                         switch (*source)
                         {
-                                       /* no fall-through in this inner switch */
                                 case 0xE0:
-                                       if (a < 0xA0)
+                                       if (a < 0xA0 || a > 0xBF)
                                                 return false;
                                         break;
                                 case 0xED:
-                                       if (a > 0x9F)
+                                       if (a < 0x80 || a > 0x9F)
                                                 return false;
                                         break;
                                 case 0xF0:
-                                       if (a < 0x90)
+                                       if (a < 0x90 || a > 0xBF)
                                                 return false;
                                         break;
                                 case 0xF4:
-                                       if (a > 0x8F)
+                                       if (a < 0x80 || a > 0x8F)
                                                 return false;
                                         break;
                                 default:
-                                       if (a < 0x80)
+                                       if (a < 0x80 || a > 0xBF)
                                                 return false;
+                                       break;
                         }
-
+                       /* FALL THRU */
                 case 1:
-                       if (*source >= 0x80 && *source < 0xC2)
+                       a = *source;
+                       if (a >= 0x80 && a < 0xC2)
+                               return false;
+                       if (a > 0xF4)
                                 return false;
+                       break;
         }
-       if (*source > 0xF4)
-               return false;
         return true;
  }
  
+/*
+ *-------------------------------------------------------------------
+ * encoding info table
+ *-------------------------------------------------------------------
+ */
+pg_wchar_tbl pg_wchar_table[] = {
+       {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1},               /* 0; PG_SQL_ASCII      */
+       {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},               /* 1; PG_EUC_JP */
+       {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3},               /* 2; PG_EUC_CN */
+       {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},               /* 3; PG_EUC_KR */
+       {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3},               /* 4; PG_EUC_TW */
+       {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},               /* 5; PG_JOHAB */
+       {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},      /* 6; PG_UTF8 */
+       {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 8; PG_LATIN1 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 9; PG_LATIN2 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 10; PG_LATIN3 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 11; PG_LATIN4 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 12; PG_LATIN5 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 13; PG_LATIN6 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 14; PG_LATIN7 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 15; PG_LATIN8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 16; PG_LATIN9 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 17; PG_LATIN10 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 18; PG_WIN1256 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 19; PG_WIN1258 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 20; PG_WIN874 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 21; PG_KOI8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 22; PG_WIN1251 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 23; PG_WIN1252 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 24; PG_WIN866 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 25; ISO-8859-5 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 26; ISO-8859-6 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 27; ISO-8859-7 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 28; ISO-8859-8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 29; PG_WIN1250 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 30; PG_WIN1253 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 31; PG_WIN1254 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 32; PG_WIN1255 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 33; PG_WIN1257 */
+       {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2},                /* 34; PG_SJIS */
+       {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2},                /* 35; PG_BIG5 */
+       {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},           /* 36; PG_GBK */
+       {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},           /* 37; PG_UHC */
+       {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 38; PG_GB18030 */
+};
+
+/* returns the byte length of a character in mule internal code */
+int
+pg_mic_mblen(const unsigned char *mbstr)
+{
+       return pg_mule_mblen(mbstr);    /* thin wrapper around pg_mule_mblen() */
+}
+
+/*
+ * Returns the byte length of a multibyte character in the given encoding.
+ */
+int
+pg_encoding_mblen(int encoding, const char *mbstr)
+{
+       Assert(PG_VALID_ENCODING(encoding));
+
+       return ((encoding >= 0 &&       /* out-of-range IDs fall back to the PG_SQL_ASCII entry */
+                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+               ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
+       ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
+}
+
+/*
+ * Returns the display length of a multibyte character in the given encoding.
+ */
+int
+pg_encoding_dsplen(int encoding, const char *mbstr)
+{
+       Assert(PG_VALID_ENCODING(encoding));
+
+       return ((encoding >= 0 &&       /* out-of-range IDs fall back to the PG_SQL_ASCII entry */
+                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+          ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) :
+       ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));
+}
+
+/*
+ * Verify the first multibyte character of the given string.
+ * Return its byte length if good, -1 if bad.  (See comments above for
+ * full details of the mbverify API.)
+ */
+int
+pg_encoding_verifymb(int encoding, const char *mbstr, int len)
+{
+       Assert(PG_VALID_ENCODING(encoding));
+
+       return ((encoding >= 0 &&       /* out-of-range IDs fall back to the PG_SQL_ASCII entry */
+                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+               ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) :
+       ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len)));
+}
  
  /*
- * Verify mbstr to make sure that it has a valid character sequence.
- * mbstr is not necessarily NULL terminated; length of mbstr is
+ * fetch maximum byte length of a character in the given encoding
+ */
+int
+pg_encoding_max_length(int encoding)
+{
+       Assert(PG_VALID_ENCODING(encoding));    /* the only range check on encoding here */
+
+       return pg_wchar_table[encoding].maxmblen;       /* NOTE(review): no PG_SQL_ASCII fallback, unlike pg_encoding_mblen() */
+}
+
+#ifndef FRONTEND
+
+/*
+ * fetch maximum byte length of a character in the current database encoding
+ */
+int
+pg_database_encoding_max_length(void)
+{
+       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the current
+ * database encoding.  Otherwise same as pg_verify_mbstr().
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+       return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
   * specified by len.
   *
   * If OK, return TRUE. If a problem is found, return FALSE when noError is
   * true; when noError is false, ereport() a descriptive message.
   */
  bool
-pg_verifymbstr(const char *mbstr, int len, bool noError)
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
  {
-       int                     l;
-       int                     i;
-       int                     encoding;
+       mbverifier      mbverify;
+
+       Assert(PG_VALID_ENCODING(encoding));
+
+       /*
+        * In single-byte encodings, we need only reject nulls (\0).
+        */
+       if (pg_encoding_max_length(encoding) <= 1)
+       {
+               const char *nullpos = memchr(mbstr, 0, len);
  
-       /* we do not need any check in single-byte encodings */
-       if (pg_database_encoding_max_length() <= 1)
-               return true;
+               if (nullpos == NULL)
+                       return true;
+               if (noError)
+                       return false;
+               report_invalid_encoding(encoding, nullpos, 1);
+       }
  
-       encoding = GetDatabaseEncoding();
+       /* fetch function pointer just once */
+       mbverify = pg_wchar_table[encoding].mbverify;
  
-       while (len > 0 && *mbstr)
+       while (len > 0)
         {
-               l = pg_mblen(mbstr);
+               int                     l;
  
-               /* special UTF-8 check */
-               if (encoding == PG_UTF8)
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(*mbstr))
                 {
-                       if (!pg_utf8_islegal((const unsigned char *) mbstr, l))
+                       if (*mbstr != '\0')
                         {
-                               if (noError)
-                                       return false;
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                                errmsg("invalid UTF-8 byte sequence detected near byte 0x%02x",
-                                                               (unsigned char) *mbstr)));
+                               mbstr++;
+                               len--;
+                               continue;
                         }
+                       if (noError)
+                               return false;
+                       report_invalid_encoding(encoding, mbstr, len);
                 }
-               else
-               {
-                       for (i = 1; i < l; i++)
-                       {
-                               /*
-                                * we expect that every multibyte char consists of bytes
-                                * having the 8th bit set
-                                */
-                               if (i >= len || !IS_HIGHBIT_SET(mbstr[i]))
-                               {
-                                       char            buf[8 * 2 + 1];
-                                       char       *p = buf;
-                                       int                     j,
-                                                               jlimit;
-
-                                       if (noError)
-                                               return false;
-
-                                       jlimit = Min(l, len);
-                                       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
  
-                                       for (j = 0; j < jlimit; j++)
-                                               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+               l = (*mbverify) ((const unsigned char *) mbstr, len);
  
-                                       ereport(ERROR,
-                                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                       errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
-                                                  GetDatabaseEncodingName(), buf)));
-                               }
-                       }
+               if (l < 0)
+               {
+                       if (noError)
+                               return false;
+                       report_invalid_encoding(encoding, mbstr, len);
                 }
-               len -= l;
+
                 mbstr += l;
+               len -= l;
         }
         return true;
  }
  
  /*
- * fetch maximum length of a char encoding for the current database
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
+{
+       int                     l = pg_encoding_mblen(encoding, mbstr);
+       char            buf[8 * 2 + 1]; /* up to 8 bytes as two hex digits each, plus NUL */
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);   /* never read past the end of the string */
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       for (j = 0; j < jlimit; j++)    /* hex-dump the offending bytes into buf */
+               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                        errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
+                                       pg_enc2name_tbl[encoding].name,
+                                       buf)));
+}
+
+/*
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
   */
-int
-pg_database_encoding_max_length(void)
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                  const char *mbstr, int len)
  {
-       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+       int                     l = pg_encoding_mblen(src_encoding, mbstr);
+       char            buf[8 * 2 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       for (j = 0; j < jlimit; j++)
+               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+                        errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"",
+                                       buf,
+                                       pg_enc2name_tbl[src_encoding].name,
+                                       pg_enc2name_tbl[dest_encoding].name)));
  }
  
  #endif
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index d049f4e..1bb8042 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.67 2006/02/18 16:15:23 petere Exp $ */
+/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.68 2006/05/21 20:05:21 tgl Exp $ */
  
  #ifndef PG_WCHAR_H
  #define PG_WCHAR_H
@@ -24,10 +24,16 @@ typedef unsigned int pg_wchar;
  #define SS3 0x8f                               /* single shift 3 (JIS0212) */
  
  /*
+ * SJIS validation macros
+ */
+#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
+#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
+
+/*
   * Leading byte types or leading prefix byte for MULE internal code.
   * See http://www.xemacs.org for more details. (there is a doc titled
   * "XEmacs Internals Manual", "MULE Character Sets and Encodings"
- * section.
+ * section.)
   */
  /*
   * Is a leading byte for "official" single byte encodings?
@@ -64,7 +70,7 @@ typedef unsigned int pg_wchar;
  #define LC_ISO8859_8           0x88    /* Hebrew (not supported yet) */
  #define LC_JISX0201K           0x89    /* Japanese 1 byte kana */
  #define LC_JISX0201R           0x8a    /* Japanese 1 byte Roman */
-/* Note that 0x8b seems to be unused in as of Emacs 20.7.
+/* Note that 0x8b seems to be unused as of Emacs 20.7.
   * However, there might be a chance that 0x8b could be used
   * in later version of Emacs.
   */
@@ -135,13 +141,13 @@ typedef unsigned int pg_wchar;
  /* #define FREE                                0xff    free (unused) */
  
  /*
- * Encoding numeral identificators
+ * PostgreSQL encoding identifiers
   *
   * WARNING: the order of this table must be same as order
   *                     in the pg_enc2name[] (mb/encnames.c) array!
   *
- *                     If you add some encoding don'y forget check
- *                     PG_ENCODING_[BE|FE]_LAST macros.
+ *                     If you add an encoding, don't forget to check the
+ *                     PG_ENCODING_BE_LAST macro.
   *
   * The PG_SQL_ASCII is default encoding and must be = 0.
   */
@@ -208,8 +214,7 @@ typedef enum pg_enc
  #define PG_VALID_ENCODING(_enc) \
                 ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_)
  
-/* On FE are possible all encodings
- */
+/* All encodings are possible on the FE (frontend) side */
  #define PG_VALID_FE_ENCODING(_enc)     PG_VALID_ENCODING(_enc)
  
  /*
@@ -249,18 +254,21 @@ extern const char *pg_encoding_to_char(int encoding);
  typedef int (*mb2wchar_with_len_converter) (const unsigned char *from,
                                                                                                                 pg_wchar *to,
                                                                                                                 int len);
+
  typedef int (*mblen_converter) (const unsigned char *mbstr);
  
  typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
  
+typedef int (*mbverifier) (const unsigned char *mbstr, int len);
+
  typedef struct
  {
         mb2wchar_with_len_converter mb2wchar_with_len;          /* convert a multibyte
                                                                                                                  * string to a wchar */
-       mblen_converter mblen;          /* returns the length of a multibyte char */
-       mbdisplaylen_converter dsplen;          /* returns the lenghth of a display
-                                                                                * length */
-       int                     maxmblen;               /* max bytes for a char in this charset */
+       mblen_converter mblen;          /* get byte length of a char */
+       mbdisplaylen_converter dsplen;          /* get display width of a char */
+       mbverifier      mbverify;               /* verify multibyte sequence */
+       int                     maxmblen;               /* max bytes for a char in this encoding */
  } pg_wchar_tbl;
  
  extern pg_wchar_tbl pg_wchar_table[];
@@ -293,6 +301,7 @@ extern int  pg_mblen(const char *mbstr);
  extern int     pg_dsplen(const char *mbstr);
  extern int     pg_encoding_mblen(int encoding, const char *mbstr);
  extern int     pg_encoding_dsplen(int encoding, const char *mbstr);
+extern int     pg_encoding_verifymb(int encoding, const char *mbstr, int len);
  extern int     pg_mule_mblen(const unsigned char *mbstr);
  extern int     pg_mic_mblen(const unsigned char *mbstr);
  extern int     pg_mbstrlen(const char *mbstr);
@@ -326,21 +335,32 @@ extern char *pg_server_to_client(const char *s, int len);
  extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
  extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
  
-extern void LocalToUtf(unsigned char *iso, unsigned char *utf,
-                  pg_local_to_utf *map, int size, int encoding, int len);
+extern void LocalToUtf(const unsigned char *iso, unsigned char *utf,
+                  const pg_local_to_utf *map, int size, int encoding, int len);
  
-extern void UtfToLocal(unsigned char *utf, unsigned char *iso,
-                  pg_utf_to_local *map, int size, int len);
+extern void UtfToLocal(const unsigned char *utf, unsigned char *iso,
+                  const pg_utf_to_local *map, int size, int encoding, int len);
  
  extern bool pg_verifymbstr(const char *mbstr, int len, bool noError);
-
-extern void pg_ascii2mic(unsigned char *src, unsigned char *dest, int len);
-extern void pg_mic2ascii(unsigned char *src, unsigned char *dest, int len);
-extern void pg_print_bogus_char(unsigned char **mic, unsigned char **p);
-extern void latin2mic(unsigned char *l, unsigned char *p, int len, int lc);
-extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
-extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
-extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
+extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len,
+                                                       bool noError);
+
+extern void report_invalid_encoding(int encoding, const char *mbstr, int len);
+extern void report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                                          const char *mbstr, int len);
+
+extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len);
+extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len);
+extern void latin2mic(const unsigned char *l, unsigned char *p, int len,
+                                         int lc, int encoding);
+extern void mic2latin(const unsigned char *mic, unsigned char *p, int len,
+                                         int lc, int encoding);
+extern void latin2mic_with_table(const unsigned char *l, unsigned char *p,
+                                                                int len, int lc, int encoding,
+                                                                const unsigned char *tab);
+extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
+                                                                int len, int lc, int encoding,
+                                                                const unsigned char *tab);
  
  extern bool pg_utf8_islegal(const unsigned char *source, int length);
  
diff --git a/src/test/mb/expected/mule_internal.out b/src/test/mb/expected/mule_internal.out

index fa1f836..ac8b57d 100644 (file)
--- a/src/test/mb/expected/mule_internal.out
+++ b/src/test/mb/expected/mule_internal.out
@@ -8,81 +8,81 @@ insert into 
  insert into \92·×\92»»\92µ¡\92ÍÑ\92¸ì values('\92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼','\92¿ÍZ01\92²¼');
  vacuum \92·×\92»»\92µ¡\92ÍÑ\92¸ì;
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì;
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
------------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       | 
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       | 
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼    | \92¿ÍZ01\92²¼       | 
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+----------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    | 
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    | 
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼   | \92¿ÍZ01\92²¼    | 
  (3 rows)
  
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì where \92Ê¬\92Îà\92¥³\92¡¼\92¥É = '\92¿ÍZ01\92²¼';
-                \92ÍÑ\92¸ì                | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
---------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼       | 
+           \92ÍÑ\92¸ì           | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+--------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼    | 
  (1 row)
  
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì where \92Ê¬\92Îà\92¥³\92¡¼\92¥É ~* '\92¿Íz01\92²¼';
-                \92ÍÑ\92¸ì                | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
---------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼       | 
+           \92ÍÑ\92¸ì           | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+--------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼    | 
  (1 row)
  
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì where \92Ê¬\92Îà\92¥³\92¡¼\92¥É like '_Z01_';
-                \92ÍÑ\92¸ì                | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
---------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼       | 
+           \92ÍÑ\92¸ì           | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+--------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼    | 
  (1 row)
  
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì where \92Ê¬\92Îà\92¥³\92¡¼\92¥É like '_Z%';
-                \92ÍÑ\92¸ì                | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
---------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼       | 
+           \92ÍÑ\92¸ì           | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+--------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼ | \92¿ÍZ01\92²¼    | 
  (1 row)
  
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì where \92ÍÑ\92¸ì ~ '\92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿[\92¥Ç\92¥°]';
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
------------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       | 
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       | 
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+----------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    | 
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    | 
  (2 rows)
  
  select * from \92·×\92»»\92µ¡\92ÍÑ\92¸ì where \92ÍÑ\92¸ì ~* '\92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿[\92¥Ç\92¥°]';
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
------------------------------------------+-----------------+----------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       | 
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       | 
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è 
+----------------------------+------------+------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    | 
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    | 
  (2 rows)
  
  select *,character_length(\92ÍÑ\92¸ì) from \92·×\92»»\92µ¡\92ÍÑ\92¸ì;
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | character_length 
------------------------------------------+-----------------+----------------+------------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       |                |               12
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       |                |               13
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼    | \92¿ÍZ01\92²¼       |                |               12
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | character_length 
+----------------------------+------------+------------+------------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    |            |               12
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    |            |               13
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼   | \92¿ÍZ01\92²¼    |            |               12
  (3 rows)
  
  select *,octet_length(\92ÍÑ\92¸ì) from \92·×\92»»\92µ¡\92ÍÑ\92¸ì;
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | octet_length 
------------------------------------------+-----------------+----------------+--------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       |                |           36
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       |                |           39
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼    | \92¿ÍZ01\92²¼       |                |           36
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | octet_length 
+----------------------------+------------+------------+--------------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    |            |           36
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    |            |           39
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼   | \92¿ÍZ01\92²¼    |            |           36
  (3 rows)
  
  select *,position('\92¥Ç' in \92ÍÑ\92¸ì) from \92·×\92»»\92µ¡\92ÍÑ\92¸ì;
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | position 
------------------------------------------+-----------------+----------------+----------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       |                |        7
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       |                |        0
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼    | \92¿ÍZ01\92²¼       |                |        0
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | position 
+----------------------------+------------+------------+----------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    |            |        7
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    |            |        0
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼   | \92¿ÍZ01\92²¼    |            |        0
  (3 rows)
  
  select *,substring(\92ÍÑ\92¸ì from 10 for 4) from \92·×\92»»\92µ¡\92ÍÑ\92¸ì;
-                 \92ÍÑ\92¸ì                  | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è |  substring   
------------------------------------------+-----------------+----------------+--------------
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤    | \92µ¡A01\92¾å       |                | \92¥×\92¥ì\92¥¤
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ       |                | \92¥£\92¥Ã\92¥¯\92¥¹
- \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼    | \92¿ÍZ01\92²¼       |                | \92¥é\92¥Þ\92¡¼
+            \92ÍÑ\92¸ì            | \92Ê¬\92Îà\92¥³\92¡¼\92¥É | \92È÷\92¹Í1a\92¤À\92¤è | substring 
+----------------------------+------------+------------+-----------
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥Ç\92¥£\92¥¹\92¥×\92¥ì\92¥¤   | \92µ¡A01\92¾å    |            | \92¥×\92¥ì\92¥¤
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥°\92¥é\92¥Õ\92¥£\92¥Ã\92¥¯\92¥¹ | \92Ê¬B10\92Ãæ    |            | \92¥£\92¥Ã\92¥¯\92¥¹
+ \92¥³\92¥ó\92¥Ô\92¥å\92¡¼\92¥¿\92¥×\92¥í\92¥°\92¥é\92¥Þ\92¡¼   | \92¿ÍZ01\92²¼    |            | \92¥é\92¥Þ\92¡¼
  (3 rows)
  
  drop table \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
@@ -95,81 +95,81 @@ insert into 
  insert into \91¼Æ\91Ëã\91»ú\91Êõ\91Óï values('\91µç\91ÄÔ\91³Ì\91Ðò\91Ô±','\91ÈËZ01\91ÏÂ');
  vacuum \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ | 
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ | 
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ | 
   \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ | 
  (3 rows)
  
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï where \91·Ö\91Àà\91ºÅ = '\91ÈËZ01\91ÏÂ';
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ | 
  (1 row)
  
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï where \91·Ö\91Àà\91ºÅ ~* '\91ÈËz01\91ÏÂ';
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ | 
  (1 row)
  
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï where \91·Ö\91Àà\91ºÅ like '_Z01_';
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ | 
  (1 row)
  
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï where \91·Ö\91Àà\91ºÅ like '_Z%';
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ | 
  (1 row)
  
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï where \91Êõ\91Óï ~ '\91µç\91ÄÔ[\91ÏÔ\91Í¼]';
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ | 
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ | 
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ | 
  (2 rows)
  
  select * from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï where \91Êõ\91Óï ~* '\91µç\91ÄÔ[\91ÏÔ\91Í¼]';
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a 
------------------+-----------+----------
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a 
+------------+---------+--------
   \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ | 
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ | 
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ | 
  (2 rows)
  
  select *,character_length(\91Êõ\91Óï) from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a | character_length 
------------------+-----------+----------+------------------
- \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |          |                5
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ |          |                4
- \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |          |                5
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a | character_length 
+------------+---------+--------+------------------
+ \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |        |                5
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ |        |                4
+ \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |        |                5
  (3 rows)
  
  select *,octet_length(\91Êõ\91Óï) from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a | octet_length 
------------------+-----------+----------+--------------
- \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |          |           15
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ |          |           12
- \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |          |           15
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a | octet_length 
+------------+---------+--------+--------------
+ \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |        |           15
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ |        |           12
+ \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |        |           15
  (3 rows)
  
  select *,position('\91ÏÔ' in \91Êõ\91Óï) from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a | position 
------------------+-----------+----------+----------
- \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |          |        3
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ |          |        0
- \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |          |        0
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a | position 
+------------+---------+--------+----------
+ \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |        |        3
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ |        |        0
+ \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |        |        0
  (3 rows)
  
  select *,substring(\91Êõ\91Óï from 3 for 4) from \91¼Æ\91Ëã\91»ú\91Êõ\91Óï;
-     \91Êõ\91Óï      | \91·Ö\91Àà\91ºÅ | \91±¸\91×¢1a | substring 
------------------+-----------+----------+-----------
- \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |          | \91ÏÔ\91Ê¾\91ÆÁ
- \91µç\91ÄÔ\91Í¼\91ÐÎ    | \91·ÖB01\91ÖÐ |          | \91Í¼\91ÐÎ
- \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |          | \91³Ì\91Ðò\91Ô±
+    \91Êõ\91Óï    | \91·Ö\91Àà\91ºÅ  | \91±¸\91×¢1a | substring 
+------------+---------+--------+-----------
+ \91µç\91ÄÔ\91ÏÔ\91Ê¾\91ÆÁ | \91»úA01\91ÉÏ |        | \91ÏÔ\91Ê¾\91ÆÁ
+ \91µç\91ÄÔ\91Í¼\91ÐÎ   | \91·ÖB01\91ÖÐ |        | \91Í¼\91ÐÎ
+ \91µç\91ÄÔ\91³Ì\91Ðò\91Ô± | \91ÈËZ01\91ÏÂ |        | \91³Ì\91Ðò\91Ô±
  (3 rows)
  
  drop table \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
@@ -182,81 +182,81 @@ insert into 
  insert into \93Íª\93ß©\93Ñ¦\93¿ë\93¾î values('\93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó', '\93ìÑZ01\93ù»');
  vacuum \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    | 
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    | 
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  | 
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  | 
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  | 
  (3 rows)
  
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î where \93ÝÂ\93×¾\93ÄÚ\93µå = '\93ìÑZ01\93ù»';
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  | 
  (1 row)
  
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î where \93ÝÂ\93×¾\93ÄÚ\93µå ~* '\93ìÑz01\93ù»';
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  | 
  (1 row)
  
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î where \93ÝÂ\93×¾\93ÄÚ\93µå like '_Z01_';
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  | 
  (1 row)
  
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î where \93ÝÂ\93×¾\93ÄÚ\93µå like '_Z%';
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  | 
  (1 row)
  
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î where \93¿ë\93¾î ~ '\93ÄÄ\93Ç»\93ÅÍ[\93µð\93±×]';
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    | 
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  | 
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  | 
  (2 rows)
  
  select * from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î where \93¿ë\93¾î ~* '\93ÄÄ\93Ç»\93ÅÍ[\93µð\93±×]';
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
---------------------------+--------------+----------------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    | 
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    | 
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ 
+------------------+----------+------------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  | 
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  | 
  (2 rows)
  
  select *,character_length(\93¿ë\93¾î) from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | character_length 
---------------------------+--------------+----------------+------------------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    |                |                8
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    |                |                7
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    |                |                8
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | character_length 
+------------------+----------+------------+------------------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  |            |                8
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  |            |                7
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  |            |                8
  (3 rows)
  
  select *,octet_length(\93¿ë\93¾î) from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | octet_length 
---------------------------+--------------+----------------+--------------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    |                |           24
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    |                |           21
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    |                |           24
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | octet_length 
+------------------+----------+------------+--------------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  |            |           24
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  |            |           21
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  |            |           24
  (3 rows)
  
  select *,position('\93µð' in \93¿ë\93¾î) from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | position 
---------------------------+--------------+----------------+----------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    |                |        4
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    |                |        0
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    |                |        0
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | position 
+------------------+----------+------------+----------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  |            |        4
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  |            |        0
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  |            |        0
  (3 rows)
  
  select *,substring(\93¿ë\93¾î from 3 for 4) from \93Íª\93ß©\93Ñ¦\93¿ë\93¾î;
-          \93¿ë\93¾î          | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ |  substring   
---------------------------+--------------+----------------+--------------
- \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾    |                | \93ÅÍ\93µð\93½º\93ÇÃ
- \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º    | \93ÝÂB10\93ñé    |                | \93ÅÍ\93±×\93·¡\93ÇÈ
- \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»    |                | \93ÅÍ\93ÇÁ\93·Î\93±×
+       \93¿ë\93¾î       | \93ÝÂ\93×¾\93ÄÚ\93µå | \93ºñ\93°í1a\93¶ó\93±¸ | substring 
+------------------+----------+------------+-----------
+ \93ÄÄ\93Ç»\93ÅÍ\93µð\93½º\93ÇÃ\93·¹\93ÀÌ | \93Ñ¦A01\93ß¾  |            | \93ÅÍ\93µð\93½º\93ÇÃ
+ \93ÄÄ\93Ç»\93ÅÍ\93±×\93·¡\93ÇÈ\93½º   | \93ÝÂB10\93ñé  |            | \93ÅÍ\93±×\93·¡\93ÇÈ
+ \93ÄÄ\93Ç»\93ÅÍ\93ÇÁ\93·Î\93±×\93·¡\93¸Ó | \93ìÑZ01\93ù»  |            | \93ÅÍ\93ÇÁ\93·Î\93±×
  (3 rows)
  
  drop table test;
@@ -269,8 +269,8 @@ insert into test values('
  insert into test values('ENGLISH FRAN\81ÇAIS ESPA\81ÑOL \81ÍSLENSKA');
  vacuum test;
  select * from test;
-                  t                   
---------------------------------------
+                 t                 
+-----------------------------------
   ENGLISH
   FRAN\81ÇAIS
   ESPA\81ÑOL
@@ -279,55 +279,55 @@ select * from test;
  (5 rows)
  
  select * from test where t = 'ESPA\81ÑOL';
-    t     
-----------
+    t    
+---------
   ESPA\81ÑOL
  (1 row)
  
  select * from test where t ~* 'espa\81Ñol';
-                  t                   
---------------------------------------
+                 t                 
+-----------------------------------
   ESPA\81ÑOL
   ENGLISH FRAN\81ÇAIS ESPA\81ÑOL \81ÍSLENSKA
  (2 rows)
  
  select *,character_length(t) from test;
-                  t                   | character_length 
---------------------------------------+------------------
- ENGLISH                              |                7
- FRAN\81ÇAIS                            |                8
- ESPA\81ÑOL                             |                7
- \81ÍSLENSKA                            |                8
+                 t                 | character_length 
+-----------------------------------+------------------
+ ENGLISH                           |                7
+ FRAN\81ÇAIS                          |                8
+ ESPA\81ÑOL                           |                7
+ \81ÍSLENSKA                          |                8
   ENGLISH FRAN\81ÇAIS ESPA\81ÑOL \81ÍSLENSKA |               33
  (5 rows)
  
  select *,octet_length(t) from test;
-                  t                   | octet_length 
---------------------------------------+--------------
- ENGLISH                              |            7
- FRAN\81ÇAIS                            |            9
- ESPA\81ÑOL                             |            8
- \81ÍSLENSKA                            |            9
+                 t                 | octet_length 
+-----------------------------------+--------------
+ ENGLISH                           |            7
+ FRAN\81ÇAIS                          |            9
+ ESPA\81ÑOL                           |            8
+ \81ÍSLENSKA                          |            9
   ENGLISH FRAN\81ÇAIS ESPA\81ÑOL \81ÍSLENSKA |           36
  (5 rows)
  
  select *,position('L' in t) from test;
-                  t                   | position 
---------------------------------------+----------
- ENGLISH                              |        4
- FRAN\81ÇAIS                            |        0
- ESPA\81ÑOL                             |        7
- \81ÍSLENSKA                            |        3
+                 t                 | position 
+-----------------------------------+----------
+ ENGLISH                           |        4
+ FRAN\81ÇAIS                          |        0
+ ESPA\81ÑOL                           |        7
+ \81ÍSLENSKA                          |        3
   ENGLISH FRAN\81ÇAIS ESPA\81ÑOL \81ÍSLENSKA |        4
  (5 rows)
  
  select *,substring(t from 3 for 4) from test;
-                  t                   | substring 
---------------------------------------+-----------
- ENGLISH                              | GLIS
- FRAN\81ÇAIS                            | AN\81ÇA
- ESPA\81ÑOL                             | PA\81ÑO
- \81ÍSLENSKA                            | LENS
+                 t                 | substring 
+-----------------------------------+-----------
+ ENGLISH                           | GLIS
+ FRAN\81ÇAIS                          | AN\81ÇA
+ ESPA\81ÑOL                           | PA\81ÑO
+ \81ÍSLENSKA                          | LENS
   ENGLISH FRAN\81ÇAIS ESPA\81ÑOL \81ÍSLENSKA | GLIS
  (5 rows)
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 21 May 2006 20:05:21 +0000 (20:05 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 21 May 2006 20:05:21 +0000 (20:05 +0000)
src/backend/commands/copy.c		patch \| blob \| history
src/backend/utils/adt/name.c		patch \| blob \| history
src/backend/utils/adt/varchar.c		patch \| blob \| history
src/backend/utils/adt/varlena.c		patch \| blob \| history
src/backend/utils/mb/conv.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c		patch \| blob \| history
src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c		patch \| blob \| history
src/backend/utils/mb/mbutils.c		patch \| blob \| history
src/backend/utils/mb/wchar.c		patch \| blob \| history
src/include/mb/pg_wchar.h		patch \| blob \| history
src/test/mb/expected/mule_internal.out		patch \| blob \| history