From 3c381a55b0fa1c4ca328211e39df8e1a715129cb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 11 Apr 2011 12:28:28 -0400 Subject: [PATCH] Teach pattern_fixed_prefix() about collations. This is necessary, not optional, now that ILIKE and regexes are collation aware --- else we might derive a wrong comparison constant for index optimized pattern matches. --- src/backend/optimizer/path/indxpath.c | 34 +++++---- src/backend/utils/adt/selfuncs.c | 137 ++++++++++++++++++++++++---------- src/include/utils/selfuncs.h | 1 + 3 files changed, 117 insertions(+), 55 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index ef65cf2224..c7ed1b6ee9 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -2446,6 +2446,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, bool isIndexable = false; Node *rightop; Oid expr_op; + Oid expr_coll; Const *patt; Const *prefix = NULL; Const *rest = NULL; @@ -2462,6 +2463,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, /* we know these will succeed */ rightop = get_rightop(clause); expr_op = ((OpExpr *) clause)->opno; + expr_coll = ((OpExpr *) clause)->inputcollid; /* again, required for all current special ops: */ if (!IsA(rightop, Const) || @@ -2475,13 +2477,13 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, case OID_BPCHAR_LIKE_OP: case OID_NAME_LIKE_OP: /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll, &prefix, &rest); isIndexable = (pstatus != Pattern_Prefix_None); break; case OID_BYTEA_LIKE_OP: - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll, &prefix, &rest); isIndexable = (pstatus != Pattern_Prefix_None); break; @@ -2490,7 +2492,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, case OID_BPCHAR_ICLIKE_OP: case OID_NAME_ICLIKE_OP: /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, expr_coll, &prefix, &rest); isIndexable = (pstatus != Pattern_Prefix_None); break; @@ -2499,7 +2501,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, case OID_BPCHAR_REGEXEQ_OP: case OID_NAME_REGEXEQ_OP: /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, expr_coll, &prefix, &rest); isIndexable = (pstatus != Pattern_Prefix_None); break; @@ -2508,7 +2510,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, case OID_BPCHAR_ICREGEXEQ_OP: case OID_NAME_ICREGEXEQ_OP: /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, expr_coll, &prefix, &rest); isIndexable = (pstatus != Pattern_Prefix_None); break; @@ -2544,10 +2546,9 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, * * The non-pattern opclasses will not sort the way we need in most non-C * locales. We can use such an index anyway for an exact match (simple - * equality), but not for prefix-match cases. Note that we are looking at - * the index's collation, not the expression's collation -- this test is - * not dependent on the LIKE/regex operator's collation (which would only - * affect case folding behavior of ILIKE, anyway). + * equality), but not for prefix-match cases. Note that here we are + * looking at the index's collation, not the expression's collation -- + * this test is *not* dependent on the LIKE/regex operator's collation. */ switch (expr_op) { @@ -2558,7 +2559,8 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, isIndexable = (opfamily == TEXT_PATTERN_BTREE_FAM_OID) || (opfamily == TEXT_BTREE_FAM_OID && - (pstatus == Pattern_Prefix_Exact || lc_collate_is_c(idxcollation))); + (pstatus == Pattern_Prefix_Exact || + lc_collate_is_c(idxcollation))); break; case OID_BPCHAR_LIKE_OP: @@ -2568,7 +2570,8 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation, isIndexable = (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID) || (opfamily == BPCHAR_BTREE_FAM_OID && - (pstatus == Pattern_Prefix_Exact || lc_collate_is_c(idxcollation))); + (pstatus == Pattern_Prefix_Exact || + lc_collate_is_c(idxcollation))); break; case OID_NAME_LIKE_OP: @@ -2770,6 +2773,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation) Node *leftop = get_leftop(clause); Node *rightop = get_rightop(clause); Oid expr_op = ((OpExpr *) clause)->opno; + Oid expr_coll = ((OpExpr *) clause)->inputcollid; Const *patt = (Const *) rightop; Const *prefix = NULL; Const *rest = NULL; @@ -2791,7 +2795,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation) case OID_BYTEA_LIKE_OP: if (!op_in_opfamily(expr_op, opfamily)) { - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll, &prefix, &rest); return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus); } @@ -2803,7 +2807,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation) if (!op_in_opfamily(expr_op, opfamily)) { /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, expr_coll, &prefix, &rest); return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus); } @@ -2815,7 +2819,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation) if (!op_in_opfamily(expr_op, opfamily)) { /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, expr_coll, &prefix, &rest); return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus); } @@ -2827,7 +2831,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation) if (!op_in_opfamily(expr_op, opfamily)) { /* the right-hand const is type text for all of these */ - pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, + pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, expr_coll, &prefix, &rest); return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus); } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 41c5202146..534425a6b5 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -1181,9 +1181,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) return result; } - /* divide pattern into fixed prefix and remainder */ + /* + * Divide pattern into fixed prefix and remainder. XXX we have to assume + * default collation here, because we don't have access to the actual + * input collation for the operator. FIXME ... + */ patt = (Const *) other; - pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest); + pstatus = pattern_fixed_prefix(patt, ptype, DEFAULT_COLLATION_OID, + &prefix, &rest); /* * If necessary, coerce the prefix constant to the right type. (The "rest" @@ -4756,6 +4761,29 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, */ /* + * Check whether char is a letter (and, hence, subject to case-folding) + * + * In multibyte character sets, we can't use isalpha, and it does not seem + * worth trying to convert to wchar_t to use iswalpha. Instead, just assume + * any multibyte char is potentially case-varying. + */ +static int +pattern_char_isalpha(char c, bool is_multibyte, + pg_locale_t locale, bool locale_is_c) +{ + if (locale_is_c) + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); + else if (is_multibyte && IS_HIGHBIT_SET(c)) + return true; +#ifdef HAVE_LOCALE_T + else if (locale) + return isalpha_l((unsigned char) c, locale); +#endif + else + return isalpha((unsigned char) c); +} + +/* * Extract the fixed prefix, if any, for a pattern. * * *prefix is set to a palloc'd prefix string (in the form of a Const node), @@ -4769,7 +4797,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, */ static Pattern_Prefix_Status -like_fixed_prefix(Const *patt_const, bool case_insensitive, +like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Const **rest_const) { char *match; @@ -4780,15 +4808,39 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, int pos, match_pos; bool is_multibyte = (pg_database_encoding_max_length() > 1); + pg_locale_t locale = 0; + bool locale_is_c = false; /* the right-hand const is type text or bytea */ Assert(typeid == BYTEAOID || typeid == TEXTOID); - if (typeid == BYTEAOID && case_insensitive) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + if (case_insensitive) + { + if (typeid == BYTEAOID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("case insensitive matching not supported on type bytea"))); + /* If case-insensitive, we need locale info */ + if (lc_ctype_is_c(collation)) + locale_is_c = true; + else if (collation != DEFAULT_COLLATION_OID) + { + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a + * conflict of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for ILIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + locale = pg_newlocale_from_collation(collation); + } + } + if (typeid != BYTEAOID) { patt = TextDatumGetCString(patt_const->constvalue); @@ -4822,23 +4874,11 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, break; } - /* - * XXX In multibyte character sets, we can't trust isalpha, so assume - * any multibyte char is potentially case-varying. - */ - if (case_insensitive) - { - if (is_multibyte && (unsigned char) patt[pos] >= 0x80) - break; - if (isalpha((unsigned char) patt[pos])) - break; - } + /* Stop if case-varying character (it's sort of a wildcard) */ + if (case_insensitive && + pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) + break; - /* - * NOTE: this code used to think that %% meant a literal %, but - * textlike() itself does not think that, and the SQL92 spec doesn't - * say any such thing either. - */ match[match_pos++] = patt[pos]; } @@ -4870,7 +4910,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, } static Pattern_Prefix_Status -regex_fixed_prefix(Const *patt_const, bool case_insensitive, +regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Const **rest_const) { char *match; @@ -4883,6 +4923,8 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, char *rest; Oid typeid = patt_const->consttype; bool is_multibyte = (pg_database_encoding_max_length() > 1); + pg_locale_t locale = 0; + bool locale_is_c = false; /* * Should be unnecessary, there are no bytea regex operators defined. As @@ -4894,6 +4936,28 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("regular-expression matching not supported on type bytea"))); + if (case_insensitive) + { + /* If case-insensitive, we need locale info */ + if (lc_ctype_is_c(collation)) + locale_is_c = true; + else if (collation != DEFAULT_COLLATION_OID) + { + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a + * conflict of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for regular expression"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + locale = pg_newlocale_from_collation(collation); + } + } + /* the right-hand const is type text for all of these */ patt = TextDatumGetCString(patt_const->constvalue); @@ -4969,17 +5033,10 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, patt[pos] == '$') break; - /* - * XXX In multibyte character sets, we can't trust isalpha, so assume - * any multibyte char is potentially case-varying. - */ - if (case_insensitive) - { - if (is_multibyte && (unsigned char) patt[pos] >= 0x80) - break; - if (isalpha((unsigned char) patt[pos])) - break; - } + /* Stop if case-varying character (it's sort of a wildcard) */ + if (case_insensitive && + pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) + break; /* * Check for quantifiers. Except for +, this means the preceding @@ -5004,7 +5061,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, * backslash followed by alphanumeric is an escape, not a quoted * character. Must treat it as having multiple possible matches. * Note: since only ASCII alphanumerics are escapes, we don't have to - * be paranoid about multibyte here. + * be paranoid about multibyte or collations here. */ if (patt[pos] == '\\') { @@ -5056,7 +5113,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, } Pattern_Prefix_Status -pattern_fixed_prefix(Const *patt, Pattern_Type ptype, +pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, Const **prefix, Const **rest) { Pattern_Prefix_Status result; @@ -5064,16 +5121,16 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, switch (ptype) { case Pattern_Type_Like: - result = like_fixed_prefix(patt, false, prefix, rest); + result = like_fixed_prefix(patt, false, collation, prefix, rest); break; case Pattern_Type_Like_IC: - result = like_fixed_prefix(patt, true, prefix, rest); + result = like_fixed_prefix(patt, true, collation, prefix, rest); break; case Pattern_Type_Regex: - result = regex_fixed_prefix(patt, false, prefix, rest); + result = regex_fixed_prefix(patt, false, collation, prefix, rest); break; case Pattern_Type_Regex_IC: - result = regex_fixed_prefix(patt, true, prefix, rest); + result = regex_fixed_prefix(patt, true, collation, prefix, rest); break; default: elog(ERROR, "unrecognized ptype: %d", (int) ptype); diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index e9913aa049..c1b417ad8f 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -132,6 +132,7 @@ extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, Pattern_Type ptype, + Oid collation, Const **prefix, Const **rest); extern Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc); -- 2.11.0