OSDN Git Service

Tweak tsmatchsel() so that it examines the structure of the tsquery whenever
[pg-rex/syncrep.git] / src / backend / tsearch / ts_selfuncs.c
1 /*-------------------------------------------------------------------------
2  *
3  * ts_selfuncs.c
4  *        Selectivity estimation functions for text search operators.
5  *
6  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.7.6.1 2010/07/31 03:27:48 tgl Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include "catalog/pg_statistic.h"
17 #include "catalog/pg_type.h"
18 #include "miscadmin.h"
19 #include "nodes/nodes.h"
20 #include "tsearch/ts_type.h"
21 #include "utils/lsyscache.h"
22 #include "utils/selfuncs.h"
23 #include "utils/syscache.h"
24
25
26 /*
27  * The default text search selectivity is chosen to be small enough to
28  * encourage indexscans for typical table densities.  See selfuncs.h and
29  * DEFAULT_EQ_SEL for details.
30  */
31 #define DEFAULT_TS_MATCH_SEL 0.005
32
33 /* lookup table type for binary searching through MCELEMs */
34 typedef struct
35 {
36         text       *element;
37         float4          frequency;
38 } TextFreq;
39
40 /* type of keys for bsearch'ing through an array of TextFreqs */
41 typedef struct
42 {
43         char       *lexeme;
44         int                     length;
45 } LexemeKey;
46
47 static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
48 static Selectivity mcelem_tsquery_selec(TSQuery query,
49                                          Datum *mcelem, int nmcelem,
50                                          float4 *numbers, int nnumbers);
51 static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
52                                   TextFreq *lookup, int length, float4 minfreq);
53 static int      compare_lexeme_textfreq(const void *e1, const void *e2);
54
55 #define tsquery_opr_selec_no_stats(query) \
56         tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
57
58
59 /*
60  *      tsmatchsel -- Selectivity of "@@"
61  *
62  * restriction selectivity function for tsvector @@ tsquery and
63  * tsquery @@ tsvector
64  */
65 Datum
66 tsmatchsel(PG_FUNCTION_ARGS)
67 {
68         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
69
70 #ifdef NOT_USED
71         Oid                     operator = PG_GETARG_OID(1);
72 #endif
73         List       *args = (List *) PG_GETARG_POINTER(2);
74         int                     varRelid = PG_GETARG_INT32(3);
75         VariableStatData vardata;
76         Node       *other;
77         bool            varonleft;
78         Selectivity selec;
79
80         /*
81          * If expression is not variable = something or something = variable, then
82          * punt and return a default estimate.
83          */
84         if (!get_restriction_variable(root, args, varRelid,
85                                                                   &vardata, &other, &varonleft))
86                 PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
87
88         /*
89          * Can't do anything useful if the something is not a constant, either.
90          */
91         if (!IsA(other, Const))
92         {
93                 ReleaseVariableStats(vardata);
94                 PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
95         }
96
97         /*
98          * The "@@" operator is strict, so we can cope with NULL right away
99          */
100         if (((Const *) other)->constisnull)
101         {
102                 ReleaseVariableStats(vardata);
103                 PG_RETURN_FLOAT8(0.0);
104         }
105
106         /*
107          * OK, there's a Var and a Const we're dealing with here.  We need the
108          * Const to be a TSQuery, else we can't do anything useful.  We have to
109          * check this because the Var might be the TSQuery not the TSVector.
110          */
111         if (((Const *) other)->consttype == TSQUERYOID)
112         {
113                 /* tsvector @@ tsquery or the other way around */
114                 Assert(vardata.vartype == TSVECTOROID);
115
116                 selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
117         }
118         else
119         {
120                 /* If we can't see the query structure, must punt */
121                 selec = DEFAULT_TS_MATCH_SEL;
122         }
123
124         ReleaseVariableStats(vardata);
125
126         CLAMP_PROBABILITY(selec);
127
128         PG_RETURN_FLOAT8((float8) selec);
129 }
130
131
132 /*
133  *      tsmatchjoinsel -- join selectivity of "@@"
134  *
135  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
136  */
137 Datum
138 tsmatchjoinsel(PG_FUNCTION_ARGS)
139 {
140         /* for the moment we just punt */
141         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
142 }
143
144
145 /*
146  * @@ selectivity for tsvector var vs tsquery constant
147  */
148 static Selectivity
149 tsquerysel(VariableStatData *vardata, Datum constval)
150 {
151         Selectivity selec;
152         TSQuery         query;
153
154         /* The caller made sure the const is a TSQuery, so get it now */
155         query = DatumGetTSQuery(constval);
156
157         /* Empty query matches nothing */
158         if (query->size == 0)
159                 return (Selectivity) 0.0;
160
161         if (HeapTupleIsValid(vardata->statsTuple))
162         {
163                 Form_pg_statistic stats;
164                 Datum      *values;
165                 int                     nvalues;
166                 float4     *numbers;
167                 int                     nnumbers;
168
169                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
170
171                 /* MCELEM will be an array of TEXT elements for a tsvector column */
172                 if (get_attstatsslot(vardata->statsTuple,
173                                                          TEXTOID, -1,
174                                                          STATISTIC_KIND_MCELEM, InvalidOid,
175                                                          NULL,
176                                                          &values, &nvalues,
177                                                          &numbers, &nnumbers))
178                 {
179                         /*
180                          * There is a most-common-elements slot for the tsvector Var, so
181                          * use that.
182                          */
183                         selec = mcelem_tsquery_selec(query, values, nvalues,
184                                                                                  numbers, nnumbers);
185                         free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
186                 }
187                 else
188                 {
189                         /* No most-common-elements info, so do without */
190                         selec = tsquery_opr_selec_no_stats(query);
191                 }
192         }
193         else
194         {
195                 /* No stats at all, so do without */
196                 selec = tsquery_opr_selec_no_stats(query);
197         }
198
199         return selec;
200 }
201
202 /*
203  * Extract data from the pg_statistic arrays into useful format.
204  */
205 static Selectivity
206 mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
207                                          float4 *numbers, int nnumbers)
208 {
209         float4          minfreq;
210         TextFreq   *lookup;
211         Selectivity selec;
212         int                     i;
213
214         /*
215          * There should be two more Numbers than Values, because the last two
216          * cells are taken for minimal and maximal frequency.  Punt if not.
217          */
218         if (nnumbers != nmcelem + 2)
219                 return tsquery_opr_selec_no_stats(query);
220
221         /*
222          * Transpose the data into a single array so we can use bsearch().
223          */
224         lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
225         for (i = 0; i < nmcelem; i++)
226         {
227                 /*
228                  * The text Datums came from an array, so it cannot be compressed or
229                  * stored out-of-line -- it's safe to use VARSIZE_ANY*.
230                  */
231                 Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
232                 lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
233                 lookup[i].frequency = numbers[i];
234         }
235
236         /*
237          * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
238          * the one before the last cell of the Numbers array. See ts_typanalyze.c
239          */
240         minfreq = numbers[nnumbers - 2];
241
242         selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
243                                                           nmcelem, minfreq);
244
245         pfree(lookup);
246
247         return selec;
248 }
249
250 /*
251  * Traverse the tsquery in preorder, calculating selectivity as:
252  *
253  *       selec(left_oper) * selec(right_oper) in AND nodes,
254  *
255  *       selec(left_oper) + selec(right_oper) -
256  *              selec(left_oper) * selec(right_oper) in OR nodes,
257  *
258  *       1 - select(oper) in NOT nodes
259  *
260  *       freq[val] in VAL nodes, if the value is in MCELEM
261  *       min(freq[MCELEM]) / 2 in VAL nodes, if it is not
262  *
263  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
264  * binary search for determining freq[MCELEM].
265  *
266  * If we don't have stats for the tsvector, we still use this logic,
267  * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes.  This case
268  * is signaled by lookup == NULL.
269  */
270 static Selectivity
271 tsquery_opr_selec(QueryItem *item, char *operand,
272                                   TextFreq *lookup, int length, float4 minfreq)
273 {
274         LexemeKey       key;
275         TextFreq   *searchres;
276         Selectivity selec,
277                                 s1,
278                                 s2;
279
280         /* since this function recurses, it could be driven to stack overflow */
281         check_stack_depth();
282
283         if (item->type == QI_VAL)
284         {
285                 QueryOperand *oper = (QueryOperand *) item;
286
287                 /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
288                 if (lookup == NULL)
289                         return (Selectivity) DEFAULT_TS_MATCH_SEL;
290
291                 /*
292                  * Prepare the key for bsearch().
293                  */
294                 key.lexeme = operand + oper->distance;
295                 key.length = oper->length;
296
297                 searchres = (TextFreq *) bsearch(&key, lookup, length,
298                                                                                  sizeof(TextFreq),
299                                                                                  compare_lexeme_textfreq);
300
301                 if (searchres)
302                 {
303                         /*
304                          * The element is in MCELEM.  Return precise selectivity (or at
305                          * least as precise as ANALYZE could find out).
306                          */
307                         return (Selectivity) searchres->frequency;
308                 }
309                 else
310                 {
311                         /*
312                          * The element is not in MCELEM.  Punt, but assume that the
313                          * selectivity cannot be more than minfreq / 2.
314                          */
315                         return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
316                 }
317         }
318
319         /* Current TSQuery node is an operator */
320         switch (item->qoperator.oper)
321         {
322                 case OP_NOT:
323                         selec = 1.0 - tsquery_opr_selec(item + 1, operand,
324                                                                                         lookup, length, minfreq);
325                         break;
326
327                 case OP_AND:
328                         s1 = tsquery_opr_selec(item + 1, operand,
329                                                                    lookup, length, minfreq);
330                         s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
331                                                                    lookup, length, minfreq);
332                         selec = s1 * s2;
333                         break;
334
335                 case OP_OR:
336                         s1 = tsquery_opr_selec(item + 1, operand,
337                                                                    lookup, length, minfreq);
338                         s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
339                                                                    lookup, length, minfreq);
340                         selec = s1 + s2 - s1 * s2;
341                         break;
342
343                 default:
344                         elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
345                         selec = 0;                      /* keep compiler quiet */
346                         break;
347         }
348
349         /* Clamp intermediate results to stay sane despite roundoff error */
350         CLAMP_PROBABILITY(selec);
351
352         return selec;
353 }
354
355 /*
356  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
357  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
358  * how ANALYZE code sorted data before storing it in a statistic tuple.
359  * See ts_typanalyze.c for details.
360  */
361 static int
362 compare_lexeme_textfreq(const void *e1, const void *e2)
363 {
364         const LexemeKey *key = (const LexemeKey *) e1;
365         const TextFreq *t = (const TextFreq *) e2;
366         int                     len1,
367                                 len2;
368
369         len1 = key->length;
370         len2 = VARSIZE_ANY_EXHDR(t->element);
371
372         /* Compare lengths first, possibly avoiding a strncmp call */
373         if (len1 > len2)
374                 return 1;
375         else if (len1 < len2)
376                 return -1;
377
378         /* Fall back on byte-for-byte comparison */
379         return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
380 }