OSDN Git Service

Fix crash in text similarity search with blank characters keyword.
[pgbigm/pg_bigm.git] / bigm_gin.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2007-2012, PostgreSQL Global Development Group
4  *
5  * Changelog:
6  *   2013/01/09
7  *   Support full text search using bigrams.
8  *   Author: NTT DATA Corporation
9  *
10  *-------------------------------------------------------------------------
11  */
12 #include "postgres.h"
13
14 #include "bigm.h"
15
16 #include "access/gin.h"
17 #include "access/gin_private.h"
18 #include "access/itup.h"
19 #include "access/skey.h"
20 #include "access/tuptoaster.h"
21 #include "catalog/pg_type.h"
22 #include "funcapi.h"
23 #include "mb/pg_wchar.h"
24 #include "storage/bufmgr.h"
25 #include "storage/bufpage.h"
26 #include "tsearch/ts_locale.h"
27 #include "utils/array.h"
28 #include "utils/builtins.h"
29
30
31 PG_FUNCTION_INFO_V1(gin_extract_value_bigm);
32 Datum           gin_extract_value_bigm(PG_FUNCTION_ARGS);
33
34 PG_FUNCTION_INFO_V1(gin_extract_query_bigm);
35 Datum           gin_extract_query_bigm(PG_FUNCTION_ARGS);
36
37 PG_FUNCTION_INFO_V1(gin_bigm_consistent);
38 Datum           gin_bigm_consistent(PG_FUNCTION_ARGS);
39
40 PG_FUNCTION_INFO_V1(gin_bigm_compare_partial);
41 Datum           gin_bigm_compare_partial(PG_FUNCTION_ARGS);
42
43 PG_FUNCTION_INFO_V1(pg_gin_pending_stats);
44 Datum           pg_gin_pending_stats(PG_FUNCTION_ARGS);
45
46 Datum
47 gin_extract_value_bigm(PG_FUNCTION_ARGS)
48 {
49         text       *val = (text *) PG_GETARG_TEXT_P(0);
50         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
51         Datum      *entries = NULL;
52         BIGM       *bgm;
53         int32           bgmlen;
54
55         *nentries = 0;
56
57         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
58         bgmlen = ARRNELEM(bgm);
59
60         if (bgmlen > 0)
61         {
62                 bigm       *ptr;
63                 int32           i;
64
65                 *nentries = bgmlen;
66                 entries = (Datum *) palloc(sizeof(Datum) * bgmlen);
67
68                 ptr = GETARR(bgm);
69                 for (i = 0; i < bgmlen; i++)
70                 {
71                         text            *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
72                         entries[i] = PointerGetDatum(item);
73                         ptr++;
74                 }
75         }
76
77         PG_RETURN_POINTER(entries);
78 }
79
80 Datum
81 gin_extract_query_bigm(PG_FUNCTION_ARGS)
82 {
83         text       *val = (text *) PG_GETARG_TEXT_P(0);
84         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
85         StrategyNumber strategy = PG_GETARG_UINT16(2);
86
87         bool   **pmatch = (bool **) PG_GETARG_POINTER(3);
88         Pointer   **extra_data = (Pointer **) PG_GETARG_POINTER(4);
89         /* bool   **nullFlags = (bool **) PG_GETARG_POINTER(5); */
90         int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
91         Datum      *entries = NULL;
92         BIGM       *bgm;
93         int32           bgmlen = 0;
94         bigm       *ptr;
95         int32           i;
96         bool            removeDups;
97
98         switch (strategy)
99         {
100                 case LikeStrategyNumber:
101                 {
102                         char    *str = VARDATA(val);
103                         int             slen = VARSIZE(val) - VARHDRSZ;
104                         bool    *recheck;
105
106                         /*
107                          * For wildcard search we extract all the bigrams that every
108                          * potentially-matching string must include.
109                          */
110                         bgm = generate_wildcard_bigm(str, slen, &removeDups);
111                         bgmlen = ARRNELEM(bgm);
112
113                         /*
114                          * Check whether the heap tuple fetched by index search needs to be
115                          * rechecked against the query. If the search word consists of one
116                          * or two characters and doesn't contain any space character, we can
117                          * guarantee that the index test would be exact. That is, the heap
118                          * tuple does match the query, so it doesn't need to be rechecked.
119                          */
120                         *extra_data = (Pointer *) palloc(sizeof(bool));
121                         recheck = (bool *) *extra_data;
122                         if (bgmlen == 1 && !removeDups)
123                         {
124                                 const char      *sp;
125
126                                 *recheck = false;
127                                 for (sp = str; (sp - str) < slen;)
128                                 {
129                                         if (t_isspace(sp))
130                                         {
131                                                 *recheck = true;
132                                                 break;
133                                         }
134
135                                         sp += IS_HIGHBIT_SET(*sp) ? pg_mblen(sp) : 1;
136                                 }
137                         }
138                         else
139                                 *recheck = true;
140                         break;
141                 }
142                 case SimilarityStrategyNumber:
143                 {
144                         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
145                         bgmlen = ARRNELEM(bgm);
146                         break;
147                 }
148                 default:
149                         elog(ERROR, "unrecognized strategy number: %d", strategy);
150                         bgm = NULL;                     /* keep compiler quiet */
151                         break;
152         }
153
154         *nentries = (bigm_gin_key_limit == 0) ?
155                 bgmlen : Min(bigm_gin_key_limit, bgmlen);
156         *pmatch = NULL;
157
158         if (*nentries > 0)
159         {
160                 entries = (Datum *) palloc(sizeof(Datum) * *nentries);
161                 ptr = GETARR(bgm);
162                 for (i = 0; i < *nentries; i++)
163                 {
164                         text            *item;
165
166                         if (ptr->pmatch)
167                         {
168                                 if (*pmatch == NULL)
169                                         *pmatch = (bool *) palloc0(sizeof(bool) * *nentries);
170                                 (*pmatch)[i] = true;
171                         }
172                         item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
173                         entries[i] = PointerGetDatum(item);
174                         ptr++;
175                 }
176         }
177
178         /*
179          * If no bigram was extracted then we have to scan all the index.
180          */
181         if (*nentries == 0)
182                 *searchMode = GIN_SEARCH_MODE_ALL;
183
184         PG_RETURN_POINTER(entries);
185 }
186
187 Datum
188 gin_bigm_consistent(PG_FUNCTION_ARGS)
189 {
190         bool       *check = (bool *) PG_GETARG_POINTER(0);
191         StrategyNumber strategy = PG_GETARG_UINT16(1);
192
193         /* text    *query = PG_GETARG_TEXT_P(2); */
194         int32           nkeys = PG_GETARG_INT32(3);
195
196         Pointer   *extra_data = (Pointer *) PG_GETARG_POINTER(4);
197         bool       *recheck = (bool *) PG_GETARG_POINTER(5);
198         bool            res;
199         int32           i;
200         int32           ntrue;
201
202         switch (strategy)
203         {
204                 case LikeStrategyNumber:
205                         /*
206                          * Don't recheck the heap tuple against the query if either
207                          * pg_bigm.enable_recheck is disabled or the search word is
208                          * the special one so that the index can return the exact
209                          * result.
210                          */
211                         Assert(extra_data != NULL);
212                         *recheck = bigm_enable_recheck &&
213                                 (*((bool *) extra_data) || (nkeys != 1));
214
215                         /* Check if all extracted bigrams are presented. */
216                         res = true;
217                         for (i = 0; i < nkeys; i++)
218                         {
219                                 if (!check[i])
220                                 {
221                                         res = false;
222                                         break;
223                                 }
224                         }
225                         break;
226                 case SimilarityStrategyNumber:
227                         /* Count the matches */
228                         *recheck = bigm_enable_recheck;
229                         ntrue = 0;
230                         for (i = 0; i < nkeys; i++)
231                         {
232                                 if (check[i])
233                                         ntrue++;
234                         }
235 #ifdef DIVUNION
236                         res = (nkeys == ntrue) ? true :
237                                 ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >=
238                                   (float4) bigm_similarity_limit) ? true : false);
239 #else
240                         res = (nkeys == 0) ? false :
241                                 ((((((float4) ntrue) / ((float4) nkeys))) >=
242                                   (float4) bigm_similarity_limit) ? true : false);
243 #endif
244                         break;
245                 default:
246                         elog(ERROR, "unrecognized strategy number: %d", strategy);
247                         res = false;            /* keep compiler quiet */
248                         break;
249         }
250
251         PG_RETURN_BOOL(res);
252 }
253
254 Datum
255 gin_bigm_compare_partial(PG_FUNCTION_ARGS)
256 {
257         text    *arg1 = PG_GETARG_TEXT_PP(0);
258         text    *arg2 = PG_GETARG_TEXT_PP(1);
259         char    *a1p;
260         char    *a2p;
261         int             mblen1;
262         int             mblen2;
263         int             res;
264
265         a1p = VARDATA_ANY(arg1);
266         a2p = VARDATA_ANY(arg2);
267
268         mblen1 = pg_mblen(a1p);
269         mblen2 = pg_mblen(a2p);
270
271         if (mblen1 != mblen2)
272                 PG_RETURN_INT32(1);
273
274         res = memcmp(a1p, a2p, mblen1) ? 1 : 0;
275         PG_RETURN_INT32(res);
276 }
277
278 /*
279  * Report both number of pages and number of heap tuples that
280  * are in the pending list.
281  */
282 Datum
283 pg_gin_pending_stats(PG_FUNCTION_ARGS)
284 {
285         Oid                     indexOid = PG_GETARG_OID(0);
286         Relation        indexRel;
287         Buffer          metabuffer;
288         Page            metapage;
289         GinMetaPageData *metadata;
290         Datum           values[2];
291         bool            isnull[2];
292         HeapTuple       tuple;
293         TupleDesc       tupdesc;
294
295         /*
296          * Obtain statistic information from the meta page
297          */
298         indexRel = index_open(indexOid, AccessShareLock);
299         metabuffer = ReadBuffer(indexRel, GIN_METAPAGE_BLKNO);
300         LockBuffer(metabuffer, GIN_SHARE);
301         metapage = BufferGetPage(metabuffer);
302         metadata = GinPageGetMeta(metapage);
303         index_close(indexRel, AccessShareLock);
304
305         /*
306          * Construct a tuple descriptor for the result row. This must
307          * match this function's pg_bigm--x.x.sql entry.
308          */
309         tupdesc = CreateTemplateTupleDesc(2, false);
310         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
311                                            "pages", INT4OID, -1, 0);
312         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
313                                            "tuples", INT8OID, -1, 0);
314         tupdesc = BlessTupleDesc(tupdesc);
315
316         /* pages */
317         values[0] = Int32GetDatum(metadata->nPendingPages);
318         isnull[0] = false;
319
320         /* tuples */
321         values[1] = Int64GetDatum(metadata->nPendingHeapTuples);
322         isnull[1] = false;
323
324         UnlockReleaseBuffer(metabuffer);
325
326         tuple = heap_form_tuple(tupdesc, values, isnull);
327         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
328 }