Fix up handling of C/POSIX collations.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 20 Mar 2011 16:43:39 +0000 (12:43 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 20 Mar 2011 16:44:13 +0000 (12:44 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 20 Mar 2011 16:43:39 +0000 (12:43 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 20 Mar 2011 16:44:13 +0000 (12:44 -0400)
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml

index dd96d00..66f02c6 100644 (file)
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -68,7 +68,7 @@ initdb --locale=sv_SE
     <para>
      This example for Unix systems sets the locale to Swedish
      (<literal>sv</>) as spoken
-    in Sweden (<literal>SE</>).  Other possibilities might be
+    in Sweden (<literal>SE</>).  Other possibilities might include
      <literal>en_US</> (U.S. English) and <literal>fr_CA</> (French
      Canadian).  If more than one character set can be used for a
      locale then the specifications can take the form
@@ -133,7 +133,8 @@ initdb --locale=sv_SE
  
     <para>
      If you want the system to behave as if it had no locale support,
-    use the special locale <literal>C</> or <literal>POSIX</>.
+    use the special locale name <literal>C</>, or equivalently
+    <literal>POSIX</>.
     </para>
  
     <para>
@@ -257,7 +258,9 @@ initdb --locale=sv_SE
      operator classes exist. These allow the creation of an index that
      performs a strict character-by-character comparison, ignoring
      locale comparison rules. Refer to <xref linkend="indexes-opclass">
-    for more information.
+    for more information.  Another approach is to create indexes using
+    the <literal>C</> collation, as discussed in
+    <xref linkend="collation">.
     </para>
    </sect2>
  
@@ -321,13 +324,6 @@ initdb --locale=sv_SE
     of a database cannot be changed after its creation.
    </para>
  
-  <note>
-   <para>
-    Collation support is currently only known to work on
-    Linux (glibc) and Mac OS X platforms.
-   </para>
-  </note>
-
    <sect2>
     <title>Concepts</title>
  
@@ -335,7 +331,8 @@ initdb --locale=sv_SE
      Conceptually, every expression of a collatable data type has a
      collation.  (The built-in collatable data types are
      <type>text</type>, <type>varchar</type>, and <type>char</type>.
-    User-defined base types can also be marked collatable.)  If the
+    User-defined base types can also be marked collatable, and of course
+    a domain over a collatable data type is collatable.)  If the
      expression is a column reference, the collation of the expression is the
      defined collation of the column.  If the expression is a constant, the
      collation is the default collation of the data type of the
@@ -346,8 +343,8 @@ initdb --locale=sv_SE
     <para>
      The collation of an expression can be the <quote>default</quote>
      collation, which means the locale settings defined for the
-    database.  In some cases, an expression can also have no known
-    collation.  In such cases, ordering operations and other
+    database.  It is also possible for an expression's collation to be
+    indeterminate.  In such cases, ordering operations and other
      operations that need to know the collation will fail.
     </para>
  
@@ -379,7 +376,7 @@ initdb --locale=sv_SE
      The <firstterm>collation derivation</firstterm> of an expression can be
      implicit or explicit.  This distinction affects how collations are
      combined when multiple different collations appear in an
-    expression.  An explicit collation derivation arises when a
+    expression.  An explicit collation derivation occurs when a
      <literal>COLLATE</literal> clause is used; all other collation
      derivations are implicit.  When multiple collations need to be
      combined, for example in a function call, the following rules are
@@ -399,34 +396,90 @@ initdb --locale=sv_SE
       <listitem>
        <para>
         Otherwise, all input expressions must have the same implicit
-       collation derivation or the default collation.  If any
-       implicitly derived collation is present, that is the result of
-       the collation combination.  Otherwise, the result is the
-       default collation.
+       collation derivation or the default collation.  If any non-default
+       collation is present, that is the result of the collation combination.
+       Otherwise, the result is the default collation.
+      </para>
+     </listitem>
+
+     <listitem>
+      <para>
+       If there are conflicting non-default implicit collations among the
+       input expressions, then the combination is deemed to have indeterminate
+       collation.  This is not an error condition unless the particular
+       function being invoked requires knowledge of the collation it should
+       apply.  If it does, an error will be raised at run-time.
        </para>
       </listitem>
      </orderedlist>
  
-    For example, take this table definition:
+    For example, consider this table definition:
  <programlisting>
  CREATE TABLE test1 (
-    a text COLLATE "x",
+    a text COLLATE "de_DE",
+    b text COLLATE "es_ES",
      ...
  );
  </programlisting>
  
      Then in
  <programlisting>
-SELECT a || 'foo' FROM test1;
+SELECT a &lt; 'foo' FROM test1;
  </programlisting>
-    the result collation of the <literal>||</literal> operator is
-    <literal>"x"</literal> because it combines an implicitly derived
-    collation with the default collation.  But in
+    the <literal>&lt;</literal> comparison is performed according to
+    <literal>de_DE</literal> rules, because the expression combines an
+    implicitly derived collation with the default collation.  But in
  <programlisting>
-SELECT a || ('foo' COLLATE "y") FROM test1;
+SELECT a &lt; ('foo' COLLATE "fr_FR") FROM test1;
+</programlisting>
+    the comparison is performed using <literal>fr_FR</literal> rules,
+    because the explicit collation derivation overrides the implicit one.
+    Furthermore, given
+<programlisting>
+SELECT a &lt; b FROM test1;
+</programlisting>
+    the parser cannot determine which collation to apply, since the
+    <structfield>a</> and <structfield>b</> columns have conflicting
+    implicit collations.  Since the <literal>&lt;</literal> operator
+    does need to know which collation to use, this will result in an
+    error.  The error can be resolved by attaching an explicit collation
+    specifier to either input expression, thus:
+<programlisting>
+SELECT a &lt; b COLLATE "de_DE" FROM test1;
+</programlisting>
+    or equivalently
+<programlisting>
+SELECT a COLLATE "de_DE" &lt; b FROM test1;
+</programlisting>
+    On the other hand, the structurally similar case
+<programlisting>
+SELECT a || b FROM test1;
+</programlisting>
+    does not result in an error, because the <literal>||</> operator
+    does not care about collations: its result is the same regardless
+    of the collation.
+   </para>
+
+   <para>
+    The collation assigned to a function or operator's combined input
+    expressions is also considered to apply to the function or operator's
+    result, if the function or operator delivers a result of a collatable
+    data type.  So, in
+<programlisting>
+SELECT * FROM test1 ORDER BY a || 'foo';
+</programlisting>
+    the ordering will be done according to <literal>de_DE</literal> rules.
+    But this query:
+<programlisting>
+SELECT * FROM test1 ORDER BY a || b;
+</programlisting>
+    results in an error, because even though the <literal>||</> operator
+    doesn't need to know a collation, the <literal>ORDER BY</> clause does.
+    As before, the conflict can be resolved with an explicit collation
+    specifier:
+<programlisting>
+SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
  </programlisting>
-    the result collation is <literal>"y"</literal> because the explicit
-    collation derivation overrides the implicit one.
     </para>
    </sect2>
  
@@ -449,7 +502,22 @@ SELECT a || ('foo' COLLATE "y") FROM test1;
     </para>
  
     <para>
-    When a database cluster is initialized, <command>initdb</command>
+    On all platforms, the collations named <literal>default</>,
+    <literal>C</>, and <literal>POSIX</> are available.  Additional
+    collations may be available depending on operating system support.
+    The <literal>default</> collation selects the <symbol>LC_COLLATE</symbol>
+    and <symbol>LC_CTYPE</symbol> values specified at database creation time.
+    The <literal>C</> and <literal>POSIX</> collations both specify
+    <quote>traditional C</> behavior, in which only the ASCII letters
+    <quote><literal>A</></quote> through <quote><literal>Z</></quote>
+    are treated as letters, and sorting is done strictly by character
+    code byte values.
+   </para>
+
+   <para>
+    If the operating system provides support for using multiple locales
+    within a single program (<function>newlocale</> and related functions),
+    then when a database cluster is initialized, <command>initdb</command>
      populates the system catalog <literal>pg_collation</literal> with
      collations based on all the locales it finds on the operating
      system at the time.  For example, the operating system might
@@ -484,7 +552,21 @@ SELECT a || ('foo' COLLATE "y") FROM test1;
      within a given database even though it would not be unique globally.
      Use of the stripped collation names is recommendable, since it will
      make one less thing you need to change if you decide to change to
-    another database encoding.
+    another database encoding.  Note however that the <literal>default</>,
+    <literal>C</>, and <literal>POSIX</> collations can be used
+    regardless of the database encoding.
+   </para>
+
+   <para>
+    <productname>PostgreSQL</productname> considers distinct collation
+    objects to be incompatible even when they have identical properties.
+    Thus for example,
+<programlisting>
+SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
+</programlisting>
+    will draw an error even though the <literal>C</> and <literal>POSIX</>
+    collations have identical behaviors.  Stripped and non-stripped collation
+    names likewise denote distinct objects, so mixing them is not recommended.
     </para>
    </sect2>
   </sect1>
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c

index aba1145..5478310 100644 (file)
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1462,10 +1462,16 @@ str_numth(char *dest, char *num, int type)
   * in multibyte character sets.  Note that in either case we are effectively
   * assuming that the database character encoding matches the encoding implied
   * by LC_CTYPE.
+ *
+ * If the system provides locale_t and associated functions (which are
+ * standardized by Open Group's XBD), we can support collations that are
+ * neither default nor C.  The code is written to handle all combinations
+ * of have-wide-characters and have-locale_t, though it's rather unlikely
+ * a platform would have the latter without the former.
   */
  
  /*
- * wide-character-aware lower function
+ * collation-aware, wide-character-aware lower function
   *
   * We pass the number of bytes so we can pass varlena and char*
   * to this function.  The result is a palloc'd, null-terminated string.
@@ -1474,21 +1480,31 @@ char *
  str_tolower(const char *buff, size_t nbytes, Oid collid)
  {
         char       *result;
-       pg_locale_t     mylocale = 0;
  
         if (!buff)
                 return NULL;
  
-       if (collid != DEFAULT_COLLATION_OID)
-               mylocale = pg_newlocale_from_collation(collid);
+       /* C/POSIX collations use this path regardless of database encoding */
+       if (lc_ctype_is_c(collid))
+       {
+               char       *p;
+
+               result = pnstrdup(buff, nbytes);
  
+               for (p = result; *p; p++)
+                       *p = pg_ascii_tolower((unsigned char) *p);
+       }
  #ifdef USE_WIDE_UPPER_LOWER
-       if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
+       else if (pg_database_encoding_max_length() > 1)
         {
+               pg_locale_t     mylocale = 0;
                 wchar_t    *workspace;
                 size_t          curr_char;
                 size_t          result_size;
  
+               if (collid != DEFAULT_COLLATION_OID)
+                       mylocale = pg_newlocale_from_collation(collid);
+
                 /* Overflow paranoia */
                 if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
                         ereport(ERROR,
@@ -1501,12 +1517,14 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
                 char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
  
                 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+               {
  #ifdef HAVE_LOCALE_T
                         if (mylocale)
                                 workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
                         else
  #endif
-                       workspace[curr_char] = towlower(workspace[curr_char]);
+                               workspace[curr_char] = towlower(workspace[curr_char]);
+               }
  
                 /* Make result large enough; case change might change number of bytes */
                 result_size = curr_char * pg_database_encoding_max_length() + 1;
@@ -1515,22 +1533,40 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
                 wchar2char(result, workspace, result_size, collid);
                 pfree(workspace);
         }
-       else
  #endif   /* USE_WIDE_UPPER_LOWER */
+       else
         {
+               pg_locale_t     mylocale = 0;
                 char       *p;
  
+               if (collid != DEFAULT_COLLATION_OID)
+                       mylocale = pg_newlocale_from_collation(collid);
+
                 result = pnstrdup(buff, nbytes);
  
+               /*
+                * Note: we assume that tolower_l() will not be so broken as to need
+                * an isupper_l() guard test.  When using the default collation, we
+                * apply the traditional Postgres behavior that forces ASCII-style
+                * treatment of I/i, but in non-default collations you get exactly
+                * what the collation says.
+                */
                 for (p = result; *p; p++)
-                       *p = pg_tolower((unsigned char) *p);
+               {
+#ifdef HAVE_LOCALE_T
+                       if (mylocale)
+                               *p = tolower_l((unsigned char) *p, mylocale);
+                       else
+#endif
+                               *p = pg_tolower((unsigned char) *p);
+               }
         }
  
         return result;
  }
  
  /*
- * wide-character-aware upper function
+ * collation-aware, wide-character-aware upper function
   *
   * We pass the number of bytes so we can pass varlena and char*
   * to this function.  The result is a palloc'd, null-terminated string.
@@ -1539,21 +1575,31 @@ char *
  str_toupper(const char *buff, size_t nbytes, Oid collid)
  {
         char       *result;
-       pg_locale_t     mylocale = 0;
  
         if (!buff)
                 return NULL;
  
-       if (collid != DEFAULT_COLLATION_OID)
-               mylocale = pg_newlocale_from_collation(collid);
+       /* C/POSIX collations use this path regardless of database encoding */
+       if (lc_ctype_is_c(collid))
+       {
+               char       *p;
  
+               result = pnstrdup(buff, nbytes);
+
+               for (p = result; *p; p++)
+                       *p = pg_ascii_toupper((unsigned char) *p);
+       }
  #ifdef USE_WIDE_UPPER_LOWER
-       if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
+       else if (pg_database_encoding_max_length() > 1)
         {
+               pg_locale_t     mylocale = 0;
                 wchar_t    *workspace;
                 size_t          curr_char;
                 size_t          result_size;
  
+               if (collid != DEFAULT_COLLATION_OID)
+                       mylocale = pg_newlocale_from_collation(collid);
+
                 /* Overflow paranoia */
                 if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
                         ereport(ERROR,
@@ -1566,12 +1612,14 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
                 char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
  
                 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+               {
  #ifdef HAVE_LOCALE_T
                         if (mylocale)
                                 workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
                         else
  #endif
-                       workspace[curr_char] = towupper(workspace[curr_char]);
+                               workspace[curr_char] = towupper(workspace[curr_char]);
+               }
  
                 /* Make result large enough; case change might change number of bytes */
                 result_size = curr_char * pg_database_encoding_max_length() + 1;
@@ -1580,22 +1628,40 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
                 wchar2char(result, workspace, result_size, collid);
                 pfree(workspace);
         }
-       else
  #endif   /* USE_WIDE_UPPER_LOWER */
+       else
         {
+               pg_locale_t     mylocale = 0;
                 char       *p;
  
+               if (collid != DEFAULT_COLLATION_OID)
+                       mylocale = pg_newlocale_from_collation(collid);
+
                 result = pnstrdup(buff, nbytes);
  
+               /*
+                * Note: we assume that toupper_l() will not be so broken as to need
+                * an islower_l() guard test.  When using the default collation, we
+                * apply the traditional Postgres behavior that forces ASCII-style
+                * treatment of I/i, but in non-default collations you get exactly
+                * what the collation says.
+                */
                 for (p = result; *p; p++)
-                       *p = pg_toupper((unsigned char) *p);
+               {
+#ifdef HAVE_LOCALE_T
+                       if (mylocale)
+                               *p = toupper_l((unsigned char) *p, mylocale);
+                       else
+#endif
+                               *p = pg_toupper((unsigned char) *p);
+               }
         }
  
         return result;
  }
  
  /*
- * wide-character-aware initcap function
+ * collation-aware, wide-character-aware initcap function
   *
   * We pass the number of bytes so we can pass varlena and char*
   * to this function.  The result is a palloc'd, null-terminated string.
@@ -1605,21 +1671,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
  {
         char       *result;
         int                     wasalnum = false;
-       pg_locale_t     mylocale = 0;
  
         if (!buff)
                 return NULL;
  
-       if (collid != DEFAULT_COLLATION_OID)
-               mylocale = pg_newlocale_from_collation(collid);
+       /* C/POSIX collations use this path regardless of database encoding */
+       if (lc_ctype_is_c(collid))
+       {
+               char       *p;
+
+               result = pnstrdup(buff, nbytes);
  
+               for (p = result; *p; p++)
+               {
+                       char    c;
+
+                       if (wasalnum)
+                               *p = c = pg_ascii_tolower((unsigned char) *p);
+                       else
+                               *p = c = pg_ascii_toupper((unsigned char) *p);
+                       /* we don't trust isalnum() here */
+                       wasalnum = ((c >= 'A' && c <= 'Z') ||
+                                               (c >= 'a' && c <= 'z') ||
+                                               (c >= '0' && c <= '9'));
+               }
+       }
  #ifdef USE_WIDE_UPPER_LOWER
-       if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
+       else if (pg_database_encoding_max_length() > 1)
         {
+               pg_locale_t     mylocale = 0;
                 wchar_t    *workspace;
                 size_t          curr_char;
                 size_t          result_size;
  
+               if (collid != DEFAULT_COLLATION_OID)
+                       mylocale = pg_newlocale_from_collation(collid);
+
                 /* Overflow paranoia */
                 if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
                         ereport(ERROR,
@@ -1660,20 +1747,44 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
                 wchar2char(result, workspace, result_size, collid);
                 pfree(workspace);
         }
-       else
  #endif   /* USE_WIDE_UPPER_LOWER */
+       else
         {
+               pg_locale_t     mylocale = 0;
                 char       *p;
  
+               if (collid != DEFAULT_COLLATION_OID)
+                       mylocale = pg_newlocale_from_collation(collid);
+
                 result = pnstrdup(buff, nbytes);
  
+               /*
+                * Note: we assume that toupper_l()/tolower_l() will not be so broken
+                * as to need guard tests.  When using the default collation, we apply
+                * the traditional Postgres behavior that forces ASCII-style treatment
+                * of I/i, but in non-default collations you get exactly what the
+                * collation says.
+                */
                 for (p = result; *p; p++)
                 {
-                       if (wasalnum)
-                               *p = pg_tolower((unsigned char) *p);
+#ifdef HAVE_LOCALE_T
+                       if (mylocale)
+                       {
+                               if (wasalnum)
+                                       *p = tolower_l((unsigned char) *p, mylocale);
+                               else
+                                       *p = toupper_l((unsigned char) *p, mylocale);
+                               wasalnum = isalnum_l((unsigned char) *p, mylocale);
+                       }
                         else
-                               *p = pg_toupper((unsigned char) *p);
-                       wasalnum = isalnum((unsigned char) *p);
+#endif
+                       {
+                               if (wasalnum)
+                                       *p = pg_tolower((unsigned char) *p);
+                               else
+                                       *p = pg_toupper((unsigned char) *p);
+                               wasalnum = isalnum((unsigned char) *p);
+                       }
                 }
         }
  
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c

index 2b9b321..15d347c 100644 (file)
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -99,15 +99,24 @@ static char lc_monetary_envbuf[LC_ENV_BUFSIZE];
  static char lc_numeric_envbuf[LC_ENV_BUFSIZE];
  static char lc_time_envbuf[LC_ENV_BUFSIZE];
  
+/* Cache for collation-related knowledge */
+
+typedef struct
+{
+       Oid                     collid;                 /* hash key: pg_collation OID */
+       bool            collate_is_c;   /* is collation's LC_COLLATE C? */
+       bool            ctype_is_c;             /* is collation's LC_CTYPE C? */
+       bool            flags_valid;    /* true if above flags are valid */
+       pg_locale_t     locale;                 /* locale_t struct, or 0 if not valid */
+} collation_cache_entry;
+
+static HTAB *collation_cache = NULL;
+
+
  #if defined(WIN32) && defined(LC_MESSAGES)
  static char *IsoLocaleName(const char *);              /* MSVC specific */
  #endif
  
-static HTAB *locale_cness_cache = NULL;
-#ifdef HAVE_LOCALE_T
-static HTAB *locale_t_cache = NULL;
-#endif
-
  
  /*
   * pg_perm_setlocale
@@ -313,136 +322,6 @@ locale_messages_assign(const char *value, bool doit, GucSource source)
  
  
  /*
- * We'd like to cache whether LC_COLLATE or LC_CTYPE is C (or POSIX),
- * so we can optimize a few code paths in various places.
- *
- * Note that some code relies on this not reporting false negatives
- * (that is, saying it's not C when it is).  For example, char2wchar()
- * could fail if the locale is C, so str_tolower() shouldn't call it
- * in that case.
- */
-
-struct locale_cness_cache_entry
-{
-       Oid                     collid;
-       bool            collate_is_c;
-       bool            ctype_is_c;
-};
-
-static void
-init_locale_cness_cache(void)
-{
-       HASHCTL         ctl;
-
-       memset(&ctl, 0, sizeof(ctl));
-       ctl.keysize = sizeof(Oid);
-       ctl.entrysize = sizeof(struct locale_cness_cache_entry);
-       ctl.hash = oid_hash;
-       locale_cness_cache = hash_create("locale C-ness cache", 1000, &ctl, HASH_ELEM | HASH_FUNCTION);
-}
-
-/*
- * Handle caching of locale "C-ness" for nondefault collation objects.
- * Relying on the system cache directly isn't fast enough.
- */
-static bool
-lookup_collation_cness(Oid collation, int category)
-{
-       struct locale_cness_cache_entry *cache_entry;
-       bool            found;
-       HeapTuple       tp;
-       char       *localeptr;
-
-       Assert(OidIsValid(collation));
-       Assert(category == LC_COLLATE || category == LC_CTYPE);
-
-       if (!locale_cness_cache)
-               init_locale_cness_cache();
-
-       cache_entry = hash_search(locale_cness_cache, &collation, HASH_ENTER, &found);
-       if (found)
-       {
-               if (category == LC_COLLATE)
-                       return cache_entry->collate_is_c;
-               else
-                       return cache_entry->ctype_is_c;
-       }
-
-       tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
-       if (!HeapTupleIsValid(tp))
-               elog(ERROR, "cache lookup failed for collation %u", collation);
-
-       localeptr = NameStr(((Form_pg_collation) GETSTRUCT(tp))->collcollate);
-       cache_entry->collate_is_c = (strcmp(localeptr, "C") == 0) || (strcmp(localeptr, "POSIX") == 0);
-
-       localeptr = NameStr(((Form_pg_collation) GETSTRUCT(tp))->collctype);
-       cache_entry->ctype_is_c = (strcmp(localeptr, "C") == 0) || (strcmp(localeptr, "POSIX") == 0);
-
-       ReleaseSysCache(tp);
-
-       return category == LC_COLLATE ? cache_entry->collate_is_c : cache_entry->ctype_is_c;
-}
-
-
-bool
-lc_collate_is_c(Oid collation)
-{
-       /* Cache result so we only have to compute it once */
-       static int      result = -1;
-       char       *localeptr;
-
-       if (!OidIsValid(collation))
-               return false;
-
-       if (collation != DEFAULT_COLLATION_OID)
-               return lookup_collation_cness(collation, LC_COLLATE);
-
-       if (result >= 0)
-               return (bool) result;
-       localeptr = setlocale(LC_COLLATE, NULL);
-       if (!localeptr)
-               elog(ERROR, "invalid LC_COLLATE setting");
-
-       if (strcmp(localeptr, "C") == 0)
-               result = true;
-       else if (strcmp(localeptr, "POSIX") == 0)
-               result = true;
-       else
-               result = false;
-       return (bool) result;
-}
-
-
-bool
-lc_ctype_is_c(Oid collation)
-{
-       /* Cache result so we only have to compute it once */
-       static int      result = -1;
-       char       *localeptr;
-
-       if (!OidIsValid(collation))
-               return false;
-
-       if (collation != DEFAULT_COLLATION_OID)
-               return lookup_collation_cness(collation, LC_CTYPE);
-
-       if (result >= 0)
-               return (bool) result;
-       localeptr = setlocale(LC_CTYPE, NULL);
-       if (!localeptr)
-               elog(ERROR, "invalid LC_CTYPE setting");
-
-       if (strcmp(localeptr, "C") == 0)
-               result = true;
-       else if (strcmp(localeptr, "POSIX") == 0)
-               result = true;
-       else
-               result = false;
-       return (bool) result;
-}
-
-
-/*
   * Frees the malloced content of a struct lconv.  (But not the struct
   * itself.)
   */
@@ -844,116 +723,295 @@ IsoLocaleName(const char *winlocname)
  #endif   /* WIN32 && LC_MESSAGES */
  
  
-#ifdef HAVE_LOCALE_T
-struct locale_t_cache_entry
+/*
+ * Cache mechanism for collation information.
+ *
+ * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
+ * (or POSIX), so we can optimize a few code paths in various places.
+ * For the built-in C and POSIX collations, we can know that without even
+ * doing a cache lookup, but we want to support aliases for C/POSIX too.
+ * For the "default" collation, there are separate static cache variables,
+ * since consulting the pg_collation catalog doesn't tell us what we need.
+ *
+ * Also, if a pg_locale_t has been requested for a collation, we cache that
+ * for the life of a backend.
+ *
+ * Note that some code relies on the flags not reporting false negatives
+ * (that is, saying it's not C when it is).  For example, char2wchar()
+ * could fail if the locale is C, so str_tolower() shouldn't call it
+ * in that case.
+ *
+ * Note that we currently lack any way to flush the cache.  Since we don't
+ * support ALTER COLLATION, this is OK.  The worst case is that someone
+ * drops a collation, and a useless cache entry hangs around in existing
+ * backends.
+ *
+ * lookup_collation_cache() returns the cache entry for the given collation,
+ * creating the hash table on first use and a new entry (marked invalid,
+ * with no locale) if none exists yet.  The OID must be valid and must not
+ * be DEFAULT_COLLATION_OID; both conditions are Assert'ed.  If set_flags
+ * is true and the entry's flags are not yet valid, the collate_is_c and
+ * ctype_is_c flags are computed from the collation's pg_collation row
+ * (elog(ERROR) if the row cannot be found).  If set_flags is false, the
+ * flags are left as they are and may still be invalid.
+ */
+
+static collation_cache_entry *
+lookup_collation_cache(Oid collation, bool set_flags)
  {
-       Oid                     collid;
-       locale_t        locale;
-};
+       collation_cache_entry *cache_entry;
+       bool            found;
  
-static void
-init_locale_t_cache(void)
+       Assert(OidIsValid(collation));
+       Assert(collation != DEFAULT_COLLATION_OID);
+
+       if (collation_cache == NULL)
+       {
+               /* First time through, initialize the hash table */
+               HASHCTL         ctl;
+
+               memset(&ctl, 0, sizeof(ctl));
+               ctl.keysize = sizeof(Oid);
+               ctl.entrysize = sizeof(collation_cache_entry);
+               ctl.hash = oid_hash;
+               collation_cache = hash_create("Collation cache", 100, &ctl,
+                                                                         HASH_ELEM | HASH_FUNCTION);
+       }
+
+       cache_entry = hash_search(collation_cache, &collation, HASH_ENTER, &found);
+       if (!found)
+       {
+               /*
+                * Make sure cache entry is marked invalid, in case we fail before
+                * setting things.
+                */
+               cache_entry->flags_valid = false;
+               cache_entry->locale = 0;
+       }
+
+       if (set_flags && !cache_entry->flags_valid)
+       {
+               /*
+                * Attempt to set the flags.  flags_valid is set only after both
+                * flags have been stored, so an error raised by the catalog
+                * lookup leaves the entry marked invalid.
+                */
+               HeapTuple       tp;
+               Form_pg_collation collform;
+               const char *collcollate;
+               const char *collctype;
+
+               tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
+               if (!HeapTupleIsValid(tp))
+                       elog(ERROR, "cache lookup failed for collation %u", collation);
+               collform = (Form_pg_collation) GETSTRUCT(tp);
+
+               collcollate = NameStr(collform->collcollate);
+               collctype = NameStr(collform->collctype);
+
+               cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
+                                                                        (strcmp(collcollate, "POSIX") == 0));
+               cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
+                                                                  (strcmp(collctype, "POSIX") == 0));
+
+               cache_entry->flags_valid = true;
+
+               ReleaseSysCache(tp);
+       }
+
+       return cache_entry;
+}
+
+
+/*
+ * Detect whether collation's LC_COLLATE property is C
+ *
+ * Returns true if the collation's LC_COLLATE is "C" or "POSIX", else false.
+ * The checks are made in this order: InvalidOid yields false; the default
+ * collation is answered from setlocale(LC_COLLATE, NULL), with elog(ERROR)
+ * if that returns NULL; the built-in C and POSIX collations yield true
+ * without any lookup; every other collation is answered from the flags
+ * returned by lookup_collation_cache().
+ */
+bool
+lc_collate_is_c(Oid collation)
  {
-       HASHCTL         ctl;
+       /*
+        * If we're asked about "collation 0", return false, so that the code
+        * will go into the non-C path and report that the collation is bogus.
+        */
+       if (!OidIsValid(collation))
+               return false;
+
+       /*
+        * If we're asked about the default collation, we have to inquire of
+        * the C library.  Cache the result so we only have to compute it once.
+        * (result stays -1 until an answer has been stored, so a failed
+        * setlocale() inquiry is retried on the next call.)
+        */
+       if (collation == DEFAULT_COLLATION_OID)
+       {
+               static int      result = -1;
+               char       *localeptr;
+
+               if (result >= 0)
+                       return (bool) result;
+               localeptr = setlocale(LC_COLLATE, NULL);
+               if (!localeptr)
+                       elog(ERROR, "invalid LC_COLLATE setting");
+
+               if (strcmp(localeptr, "C") == 0)
+                       result = true;
+               else if (strcmp(localeptr, "POSIX") == 0)
+                       result = true;
+               else
+                       result = false;
+               return (bool) result;
+       }
+
+       /*
+        * If we're asked about the built-in C/POSIX collations, we know that.
+        */
+       if (collation == C_COLLATION_OID ||
+               collation == POSIX_COLLATION_OID)
+               return true;
+
+       /*
+        * Otherwise, we have to consult pg_collation, but we cache that.
+        */
+       return (lookup_collation_cache(collation, true))->collate_is_c;
+}
+
+/*
+ * Detect whether collation's LC_CTYPE property is C
+ *
+ * Returns true if the collation's LC_CTYPE is "C" or "POSIX", else false.
+ * The checks are made in this order: InvalidOid yields false; the default
+ * collation is answered from setlocale(LC_CTYPE, NULL), with elog(ERROR)
+ * if that returns NULL; the built-in C and POSIX collations yield true
+ * without any lookup; every other collation is answered from the flags
+ * returned by lookup_collation_cache().
+ */
+bool
+lc_ctype_is_c(Oid collation)
+{
+       /*
+        * If we're asked about "collation 0", return false, so that the code
+        * will go into the non-C path and report that the collation is bogus.
+        */
+       if (!OidIsValid(collation))
+               return false;
+
+       /*
+        * If we're asked about the default collation, we have to inquire of
+        * the C library.  Cache the result so we only have to compute it once.
+        * (result stays -1 until an answer has been stored, so a failed
+        * setlocale() inquiry is retried on the next call.)
+        */
+       if (collation == DEFAULT_COLLATION_OID)
+       {
+               static int      result = -1;
+               char       *localeptr;
+
+               if (result >= 0)
+                       return (bool) result;
+               localeptr = setlocale(LC_CTYPE, NULL);
+               if (!localeptr)
+                       elog(ERROR, "invalid LC_CTYPE setting");
+
+               if (strcmp(localeptr, "C") == 0)
+                       result = true;
+               else if (strcmp(localeptr, "POSIX") == 0)
+                       result = true;
+               else
+                       result = false;
+               return (bool) result;
+       }
+
+       /*
+        * If we're asked about the built-in C/POSIX collations, we know that.
+        */
+       if (collation == C_COLLATION_OID ||
+               collation == POSIX_COLLATION_OID)
+               return true;
  
-       memset(&ctl, 0, sizeof(ctl));
-       ctl.keysize = sizeof(Oid);
-       ctl.entrysize = sizeof(struct locale_t_cache_entry);
-       ctl.hash = oid_hash;
-       locale_t_cache = hash_create("locale_t cache", 1000, &ctl, HASH_ELEM | HASH_FUNCTION);
+       /*
+        * Otherwise, we have to consult pg_collation, but we cache that.
+        */
+       return (lookup_collation_cache(collation, true))->ctype_is_c;
  }
-#endif /* HAVE_LOCALE_T */
+
  
  /*
   * Create a locale_t from a collation OID.  Results are cached for the
- * lifetime of the backend.  Thus, do not free the result with
- * freelocale().
+ * lifetime of the backend.  Thus, do not free the result with freelocale().
   *
- * As a special optimization, the default/database collation returns
- * 0.  Callers should then revert to the non-locale_t-enabled code
- * path.  In fact, they shouldn't call this function at all when they
- * are dealing with the default locale.  That can save quite a bit in
- * hotspots.
+ * As a special optimization, the default/database collation returns 0.
+ * Callers should then revert to the non-locale_t-enabled code path.
+ * In fact, they shouldn't call this function at all when they are dealing
+ * with the default locale.  That can save quite a bit in hotspots.
+ * Also, callers should avoid calling this before going down a C/POSIX
+ * fastpath, because such a fastpath should work even on platforms without
+ * locale_t support in the C library.
   *
   * For simplicity, we always generate COLLATE + CTYPE even though we
- * might only need one of them.  Since this is called only once per
- * session, it shouldn't cost much.
+ * might only need one of them.  Since this is called only once per session,
+ * it shouldn't cost much.
+ *
+ * Errors: ERRCODE_INDETERMINATE_COLLATION if collid is InvalidOid; on
+ * platforms without locale_t, ERRCODE_FEATURE_NOT_SUPPORTED for any other
+ * non-default collation; otherwise an error is raised if the collation's
+ * pg_collation row cannot be found or if newlocale() fails.
   */
  pg_locale_t
  pg_newlocale_from_collation(Oid collid)
  {
-#ifdef HAVE_LOCALE_T
-       HeapTuple       tp;
-       const char *collcollate;
-       const char *collctype;
-       locale_t        result;
-       struct locale_t_cache_entry *cache_entry;
-       bool            found;
+       collation_cache_entry *cache_entry;
  
+       /* Return 0 for "default" collation, just in case caller forgets */
         if (collid == DEFAULT_COLLATION_OID)
-               return (locale_t) 0;
+               return (pg_locale_t) 0;
  
+       /*
+        * This is where we'll fail if a collation-aware function is invoked
+        * and no collation OID is passed.  This typically means that the
+        * parser could not resolve a conflict of implicit collations, so
+        * report it that way.
+        */
         if (!OidIsValid(collid))
-               elog(ERROR, "locale operation to be invoked, but no collation was derived");
+               ereport(ERROR,
+                               (errcode(ERRCODE_INDETERMINATE_COLLATION),
+                                errmsg("locale operation to be invoked, but no collation was derived")));
  
-       if (!locale_t_cache)
-               init_locale_t_cache();
+       cache_entry = lookup_collation_cache(collid, false);
  
-       cache_entry = hash_search(locale_t_cache, &collid, HASH_ENTER, &found);
-       if (found)
-               return cache_entry->locale;
+       if (cache_entry->locale == 0)
+       {
+               /* We haven't computed this yet in this session, so do it */
+#ifdef HAVE_LOCALE_T
+               HeapTuple       tp;
+               Form_pg_collation collform;
+               const char *collcollate;
+               const char *collctype;
+               locale_t        result;
  
-       tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
-       if (!HeapTupleIsValid(tp))
-               elog(ERROR, "cache lookup failed for collation %u", collid);
+               tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
+               if (!HeapTupleIsValid(tp))
+                       elog(ERROR, "cache lookup failed for collation %u", collid);
+               collform = (Form_pg_collation) GETSTRUCT(tp);
  
-       collcollate = NameStr(((Form_pg_collation) GETSTRUCT(tp))->collcollate);
-       collctype = NameStr(((Form_pg_collation) GETSTRUCT(tp))->collctype);
+               collcollate = NameStr(collform->collcollate);
+               collctype = NameStr(collform->collctype);
  
-       if (strcmp(collcollate, collctype) == 0)
-       {
-               result = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate, NULL);
-               if (!result)
-                       ereport(ERROR,
-                                       (errcode_for_file_access(),
-                                        errmsg("could not create locale \"%s\": %m", collcollate)));
-       }
-       else
-       {
-               locale_t loc1;
-
-               loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
-               if (!loc1)
-                       ereport(ERROR,
-                                       (errcode_for_file_access(),
-                                        errmsg("could not create locale \"%s\": %m", collcollate)));
-               result = newlocale(LC_CTYPE_MASK, collctype, loc1);
-               if (!result)
-                       ereport(ERROR,
-                                       (errcode_for_file_access(),
-                                        errmsg("could not create locale \"%s\": %m", collctype)));
-       }
+               if (strcmp(collcollate, collctype) == 0)
+               {
+                       /* Normal case where they're the same */
+                       result = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate,
+                                                          NULL);
+                       if (!result)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create locale \"%s\": %m",
+                                                               collcollate)));
+               }
+               else
+               {
+                       /*
+                        * We need two newlocale() steps: build a locale from the
+                        * LC_COLLATE name first, then pass it as the base when
+                        * applying the LC_CTYPE name.
+                        *
+                        * NOTE(review): if the second step fails, loc1 is not freed
+                        * before the error is raised -- confirm this is acceptable.
+                        */
+                       locale_t loc1;
+
+                       loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
+                       if (!loc1)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create locale \"%s\": %m",
+                                                               collcollate)));
+                       result = newlocale(LC_CTYPE_MASK, collctype, loc1);
+                       if (!result)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create locale \"%s\": %m",
+                                                               collctype)));
+               }
  
-       ReleaseSysCache(tp);
+               cache_entry->locale = result;
  
-       cache_entry->locale = result;
+               ReleaseSysCache(tp);
  
-       return result;
  #else /* not HAVE_LOCALE_T */
-       /*
-        * For platforms that don't support locale_t, check that we are
-        * dealing with the default locale.  It's unlikely that we'll get
-        * here, but it's possible if users are creating collations even
-        * though they are not supported, or they are mixing builds in odd
-        * ways.
-        */
-       if (!OidIsValid(collid))
-               elog(ERROR, "locale operation to be invoked, but no collation was derived");
-       else if (collid != DEFAULT_COLLATION_OID)
+
+               /*
+                * For platforms that don't support locale_t, we can't do anything
+                * with non-default collations.
+                */
                 ereport(ERROR,
                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                  errmsg("nondefault collations are not supported on this platform")));
-
-       return 0;
  #endif /* not HAVE_LOCALE_T */
+       }
+
+       return cache_entry->locale;
  }
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c

index d509b13..98e864d 100644 (file)
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -1616,17 +1616,21 @@ setup_collation(void)
                  */
                 skip = false;
                 for (i = 0; i < len; i++)
+               {
                         if (IS_HIGHBIT_SET(localebuf[i]))
                         {
-                               if (debug)
-                                       fprintf(stderr, _("%s: locale name has non-ASCII characters, skipped: %s\n"),
-                                                       progname, localebuf);
-                               skipped++;
                                 skip = true;
                                 break;
                         }
+               }
                 if (skip)
+               {
+                       if (debug)
+                               fprintf(stderr, _("%s: locale name has non-ASCII characters, skipped: %s\n"),
+                                               progname, localebuf);
+                       skipped++;
                         continue;
+               }
  
                 enc = pg_get_encoding_from_locale(localebuf, debug);
                 if (enc < 0)
@@ -1635,7 +1639,7 @@ setup_collation(void)
                         continue;                       /* error message printed by pg_get_encoding_from_locale() */
                 }
                 if (enc == PG_SQL_ASCII)
-                       continue;                       /* SQL_ASCII is handled separately */
+                       continue;                       /* C/POSIX are already in the catalog */
  
                 PG_CMD_PRINTF2("INSERT INTO tmp_pg_collation (locale, encoding) VALUES ('%s', %d);",
                                            escape_quotes(localebuf), enc);
@@ -1651,10 +1655,6 @@ setup_collation(void)
                                                    escape_quotes(alias), escape_quotes(localebuf), enc);
         }
  
-       for (i = PG_SQL_ASCII; i <= PG_ENCODING_BE_LAST; i++)
-               PG_CMD_PRINTF2("INSERT INTO tmp_pg_collation (locale, encoding) VALUES ('C', %d), ('POSIX', %d);",
-                                          i, i);
-
         /* Add an SQL-standard name */
         PG_CMD_PRINTF1("INSERT INTO tmp_pg_collation (collname, locale, encoding) VALUES ('ucs_basic', 'C', %d);", PG_UTF8);
  
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index c10de53..e965909 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
   */
  
  /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     201103191
+#define CATALOG_VERSION_NO     201103201
  
  #endif
diff --git a/src/include/catalog/pg_collation.h b/src/include/catalog/pg_collation.h

index e90aa05..2ab0c50 100644 (file)
--- a/src/include/catalog/pg_collation.h
+++ b/src/include/catalog/pg_collation.h
@@ -58,8 +58,19 @@ typedef FormData_pg_collation *Form_pg_collation;
  #define Anum_pg_collation_collcollate  5
  #define Anum_pg_collation_collctype            6
  
-DATA(insert OID = 100 ( default PGNSP PGUID -1 "" "" ));
+/* ----------------
+ *             initial contents of pg_collation
+ * ----------------
+ */
+
+DATA(insert OID = 100 ( default                PGNSP PGUID -1 "" "" ));
  DESCR("database's default collation");
-#define DEFAULT_COLLATION_OID                  100
+#define DEFAULT_COLLATION_OID  100
+DATA(insert OID = 950 ( C                      PGNSP PGUID -1 "C" "C" ));
+DESCR("standard C collation");
+#define C_COLLATION_OID                        950
+DATA(insert OID = 951 ( POSIX          PGNSP PGUID -1 "POSIX" "POSIX" ));
+DESCR("standard POSIX collation");
+#define POSIX_COLLATION_OID            951
  
  #endif   /* PG_COLLATION_H */
diff --git a/src/include/port.h b/src/include/port.h

index 9d08b39..1116a92 100644 (file)
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -155,6 +155,8 @@ extern int  pg_strcasecmp(const char *s1, const char *s2);
  extern int     pg_strncasecmp(const char *s1, const char *s2, size_t n);
  extern unsigned char pg_toupper(unsigned char ch);
  extern unsigned char pg_tolower(unsigned char ch);
+extern unsigned char pg_ascii_toupper(unsigned char ch);
+extern unsigned char pg_ascii_tolower(unsigned char ch);
  
  #ifdef USE_REPL_SNPRINTF
  
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c

index 1680124..f6e226f 100644 (file)
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -13,6 +13,10 @@
   *
   * NB: this code should match downcase_truncate_identifier() in scansup.c.
   *
+ * We also provide strict ASCII-only case conversion functions, which can
+ * be used to implement C/POSIX case folding semantics no matter what the
+ * C library thinks the locale is.
+ *
   *
   * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
   *
@@ -123,3 +127,25 @@ pg_tolower(unsigned char ch)
                 ch = tolower(ch);
         return ch;
  }
+
+/*
+ * Fold a character to upper case, following C/POSIX locale rules.
+ *
+ * Only values in the range 'a'..'z' are changed; every other value is
+ * returned as-is.  No C library routine is consulted, so the result does
+ * not depend on the current locale.  The offset arithmetic relies on the
+ * letters being contiguous in the character set, as they are in ASCII.
+ */
+unsigned char
+pg_ascii_toupper(unsigned char ch)
+{
+       if (ch >= 'a' && ch <= 'z')
+               ch += 'A' - 'a';
+       return ch;
+}
+
+/*
+ * Fold a character to lower case, following C/POSIX locale rules.
+ *
+ * Only values in the range 'A'..'Z' are changed; every other value is
+ * returned as-is.  No C library routine is consulted, so the result does
+ * not depend on the current locale.  The offset arithmetic relies on the
+ * letters being contiguous in the character set, as they are in ASCII.
+ */
+unsigned char
+pg_ascii_tolower(unsigned char ch)
+{
+       if (ch >= 'A' && ch <= 'Z')
+               ch += 'a' - 'A';
+       return ch;
+}
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 20 Mar 2011 16:43:39 +0000 (12:43 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 20 Mar 2011 16:44:13 +0000 (12:44 -0400)
doc/src/sgml/charset.sgml		patch \| blob \| history
src/backend/utils/adt/formatting.c		patch \| blob \| history
src/backend/utils/adt/pg_locale.c		patch \| blob \| history
src/bin/initdb/initdb.c		patch \| blob \| history
src/include/catalog/catversion.h		patch \| blob \| history
src/include/catalog/pg_collation.h		patch \| blob \| history
src/include/port.h		patch \| blob \| history
src/port/pgstrcasecmp.c		patch \| blob \| history