Fix performance issues in replace_text(), replace_text_regexp(), and

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 8 Nov 2006 19:22:25 +0000 (19:22 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 8 Nov 2006 19:22:25 +0000 (19:22 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 8 Nov 2006 19:22:25 +0000 (19:22 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 8 Nov 2006 19:22:25 +0000 (19:22 +0000)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 6d8216f..46756a1 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -21,6 +21,7 @@
  #include "catalog/pg_type.h"
  #include "libpq/md5.h"
  #include "libpq/pqformat.h"
+#include "miscadmin.h"
  #include "parser/scansup.h"
  #include "regex/regex.h"
  #include "utils/builtins.h"
@@ -478,6 +479,32 @@ textcat(PG_FUNCTION_ARGS)
  }
  
  /*
+ * charlen_to_bytelen()
+ *     Compute the number of bytes occupied by n characters starting at *p
+ *
+ * It is caller's responsibility that there actually are n characters;
+ * the string need not be null-terminated.
+ */
+static int
+charlen_to_bytelen(const char *p, int n)
+{
+       if (pg_database_encoding_max_length() == 1)
+       {
+               /* Optimization for single-byte encodings */
+               return n;
+       }
+       else
+       {
+               const char *s;
+
+               for (s = p; n > 0; n--)
+                       s += pg_mblen(s);
+
+               return s - p;
+       }
+}
+
+/*
   * text_substr()
   * Return a substring starting at the specified position.
   * - thomas 1997-12-31
@@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)
   *     functions.      Note that the argument is passed as a Datum, to indicate that
   *     it may still be in compressed/toasted form.  We can avoid detoasting all
   *     of it in some cases.
+ *
+ *     The result is always a freshly palloc'd datum.
   */
  static text *
  text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
@@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                          */
                         slice_size = (S1 + L1) * eml;
                 }
-               slice = DatumGetTextPSlice(str, slice_start, slice_size);
+
+               /*
+                * If we're working with an untoasted source, no need to do an
+                * extra copying step.
+                */
+               if (VARATT_IS_EXTENDED(str))
+                       slice = DatumGetTextPSlice(str, slice_start, slice_size);
+               else
+                       slice = (text *) DatumGetPointer(str);
  
                 /* see if we got back an empty string */
                 if ((VARSIZE(slice) - VARHDRSZ) == 0)
+               {
+                       if (slice != (text *) DatumGetPointer(str))
+                               pfree(slice);
                         return PG_STR_GET_TEXT("");
+               }
  
                 /* Now we can get the actual length of the slice in MB characters */
                 slice_strlen = pg_mbstrlen_with_len(VARDATA(slice), VARSIZE(slice) - VARHDRSZ);
@@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                  * says to return a zero-length string.
                  */
                 if (S1 > slice_strlen)
+               {
+                       if (slice != (text *) DatumGetPointer(str))
+                               pfree(slice);
                         return PG_STR_GET_TEXT("");
+               }
  
                 /*
                  * Adjust L1 and E1 now that we know the slice string length. Again
@@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                 VARATT_SIZEP(ret) = VARHDRSZ + (p - s);
                 memcpy(VARDATA(ret), s, (p - s));
  
+               if (slice != (text *) DatumGetPointer(str))
+                       pfree(slice);
+
                 return ret;
         }
         else
@@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)
         int                     src_text_len = TEXTLEN(src_text);
         int                     from_sub_text_len = TEXTLEN(from_sub_text);
         TextPositionState state;
-       text       *chunk_text;
         text       *ret_text;
         int                     start_posn;
         int                     curr_posn;
+       int                     chunk_len;
+       char       *start_ptr;
         StringInfoData str;
  
         if (src_text_len == 0 || from_sub_text_len == 0)
@@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)
                 PG_RETURN_TEXT_P(src_text);
         }
  
+       /* start_ptr points to the start_posn'th character of src_text */
+       start_ptr = (char *) VARDATA(src_text);
+
         initStringInfo(&str);
  
         do
         {
-               chunk_text = text_substring(PointerGetDatum(src_text),
-                                                                       start_posn,
-                                                                       curr_posn - start_posn,
-                                                                       false);
-               appendStringInfoText(&str, chunk_text);
-               pfree(chunk_text);
+               /* copy the data skipped over by last text_position_next() */
+               chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
+               appendBinaryStringInfo(&str, start_ptr, chunk_len);
  
                 appendStringInfoText(&str, to_sub_text);
  
-               start_posn = curr_posn + from_sub_text_len;
+               start_posn = curr_posn;
+               start_ptr += chunk_len;
+               start_posn += from_sub_text_len;
+               start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
+
                 curr_posn = text_position_next(start_posn, &state);
         }
         while (curr_posn > 0);
  
-       /* copy trailing chunk */
-       chunk_text = text_substring(PointerGetDatum(src_text),
-                                                               start_posn,
-                                                               -1,
-                                                               true);
-       appendStringInfoText(&str, chunk_text);
-       pfree(chunk_text);
+       /* copy trailing data */
+       chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
+       appendBinaryStringInfo(&str, start_ptr, chunk_len);
  
         text_position_cleanup(&state);
  
@@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)
   * appendStringInfoRegexpSubstr
   *
   * Append replace_text to str, substituting regexp back references for
- * \n escapes.
+ * \n escapes.  start_ptr is the start of the match in the source string,
+ * at logical character position data_pos.
   */
  static void
  appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
-                                                        regmatch_t *pmatch, text *src_text)
+                                                        regmatch_t *pmatch,
+                                                        char *start_ptr, int data_pos)
  {
         const char *p = VARDATA(replace_text);
         const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
@@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
                 if (so != -1 && eo != -1)
                 {
                         /*
-                        * Copy the text that is back reference of regexp.      Because so and
-                        * eo are counted in characters not bytes, it's easiest to use
-                        * text_substring to pull out the correct chunk of text.
+                        * Copy the text that is back reference of regexp.      Note so and
+                        * eo are counted in characters not bytes.
                          */
-                       text       *append_text;
-
-                       append_text = text_substring(PointerGetDatum(src_text),
-                                                                                so + 1, (eo - so), false);
-                       appendStringInfoText(str, append_text);
-                       pfree(append_text);
+                       char       *chunk_start;
+                       int                     chunk_len;
+
+                       Assert(so >= data_pos);
+                       chunk_start = start_ptr;
+                       chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
+                       chunk_len = charlen_to_bytelen(chunk_start, eo - so);
+                       appendBinaryStringInfo(str, chunk_start, chunk_len);
                 }
         }
  }
@@ -2284,6 +2336,7 @@ replace_text_regexp(text *src_text, void *regexp,
         size_t          data_len;
         int                     search_start;
         int                     data_pos;
+       char       *start_ptr;
         bool            have_escape;
  
         initStringInfo(&buf);
@@ -2295,10 +2348,17 @@ replace_text_regexp(text *src_text, void *regexp,
         /* Check whether replace_text has escape char. */
         have_escape = check_replace_text_has_escape_char(replace_text);
  
-       for (search_start = data_pos = 0; search_start <= data_len;)
+       /* start_ptr points to the data_pos'th character of src_text */
+       start_ptr = (char *) VARDATA(src_text);
+       data_pos = 0;
+
+       search_start = 0;
+       while (search_start <= data_len)
         {
                 int                     regexec_result;
  
+               CHECK_FOR_INTERRUPTS();
+
                 regexec_result = pg_regexec(re,
                                                                         data,
                                                                         data_len,
@@ -2322,20 +2382,22 @@ replace_text_regexp(text *src_text, void *regexp,
                 }
  
                 /*
-                * Copy the text to the left of the match position.  Because we are
-                * working with character not byte indexes, it's easiest to use
-                * text_substring to pull out the needed data.
+                * Copy the text to the left of the match position.  Note we are
+                * given character not byte indexes.
                  */
                 if (pmatch[0].rm_so - data_pos > 0)
                 {
-                       text       *left_text;
-
-                       left_text = text_substring(PointerGetDatum(src_text),
-                                                                          data_pos + 1,
-                                                                          pmatch[0].rm_so - data_pos,
-                                                                          false);
-                       appendStringInfoText(&buf, left_text);
-                       pfree(left_text);
+                       int                     chunk_len;
+
+                       chunk_len = charlen_to_bytelen(start_ptr,
+                                                                                  pmatch[0].rm_so - data_pos);
+                       appendBinaryStringInfo(&buf, start_ptr, chunk_len);
+                       /*
+                        * Advance start_ptr over that text, to avoid multiple rescans
+                        * of it if the replace_text contains multiple back-references.
+                        */
+                       start_ptr += chunk_len;
+                       data_pos = pmatch[0].rm_so;
                 }
  
                 /*
@@ -2343,11 +2405,15 @@ replace_text_regexp(text *src_text, void *regexp,
                  * replace_text has escape characters.
                  */
                 if (have_escape)
-                       appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, src_text);
+                       appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
+                                                                                start_ptr, data_pos);
                 else
                         appendStringInfoText(&buf, replace_text);
  
-               search_start = data_pos = pmatch[0].rm_eo;
+               /* Advance start_ptr and data_pos over the matched text. */
+               start_ptr += charlen_to_bytelen(start_ptr,
+                                                                               pmatch[0].rm_eo - data_pos);
+               data_pos = pmatch[0].rm_eo;
  
                 /*
                  * When global option is off, replace the first instance only.
@@ -2358,6 +2424,7 @@ replace_text_regexp(text *src_text, void *regexp,
                 /*
                  * Search from next character when the matching text is zero width.
                  */
+               search_start = data_pos;
                 if (pmatch[0].rm_so == pmatch[0].rm_eo)
                         search_start++;
         }
@@ -2367,12 +2434,10 @@ replace_text_regexp(text *src_text, void *regexp,
          */
         if (data_pos < data_len)
         {
-               text       *right_text;
+               int                     chunk_len;
  
-               right_text = text_substring(PointerGetDatum(src_text),
-                                                                       data_pos + 1, -1, true);
-               appendStringInfoText(&buf, right_text);
-               pfree(right_text);
+               chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
+               appendBinaryStringInfo(&buf, start_ptr, chunk_len);
         }
  
         ret_text = PG_STR_GET_TEXT(buf.data);
@@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)
         int                     fldnum;
         int                     start_posn;
         int                     end_posn;
+       int                     chunk_len;
+       char       *start_ptr;
         text       *result_text;
         ArrayBuildState *astate = NULL;
  
@@ -2506,6 +2573,9 @@ text_to_array(PG_FUNCTION_ARGS)
         text_position_setup(inputstring, fldsep, &state);
  
         start_posn = 1;
+       /* start_ptr points to the start_posn'th character of inputstring */
+       start_ptr = (char *) VARDATA(inputstring);
+
         for (fldnum = 1;; fldnum++) /* field number is 1 based */
         {
                 end_posn = text_position_next(start_posn, &state);
@@ -2513,20 +2583,19 @@ text_to_array(PG_FUNCTION_ARGS)
                 if (end_posn == 0)
                 {
                         /* fetch last field */
-                       result_text = text_substring(PointerGetDatum(inputstring),
-                                                                                start_posn,
-                                                                                -1,
-                                                                                true);
+                       chunk_len = ((char *) inputstring + VARSIZE(inputstring)) - start_ptr;
                 }
                 else
                 {
                         /* fetch non-last field */
-                       result_text = text_substring(PointerGetDatum(inputstring),
-                                                                                start_posn,
-                                                                                end_posn - start_posn,
-                                                                                false);
+                       chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
                 }
  
+               /* must build a temp text datum to pass to accumArrayResult */
+               result_text = (text *) palloc(VARHDRSZ + chunk_len);
+               VARATT_SIZEP(result_text) = VARHDRSZ + chunk_len;
+               memcpy(VARDATA(result_text), start_ptr, chunk_len);
+
                 /* stash away this field */
                 astate = accumArrayResult(astate,
                                                                   PointerGetDatum(result_text),
@@ -2534,9 +2603,15 @@ text_to_array(PG_FUNCTION_ARGS)
                                                                   TEXTOID,
                                                                   CurrentMemoryContext);
  
+               pfree(result_text);
+
                 if (end_posn == 0)
                         break;
-               start_posn = end_posn + fldsep_len;
+
+               start_posn = end_posn;
+               start_ptr += chunk_len;
+               start_posn += fldsep_len;
+               start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
         }
  
         text_position_cleanup(&state);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 8 Nov 2006 19:22:25 +0000 (19:22 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 8 Nov 2006 19:22:25 +0000 (19:22 +0000)