OSDN Git Service

Don't include access/tuptoaster.h in bigm_gin.c if PostgreSQL 13 or later.
[pgbigm/pg_bigm.git] / bigm_gin.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2007-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5  *
6  * Changelog:
7  *       2013/01/09
8  *       Support full text search using bigrams.
9  *       Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include "bigm.h"
16
17 #include "access/gin.h"
18 #include "access/gin_private.h"
19 #include "access/itup.h"
20 #if PG_VERSION_NUM >= 120000
21 #include "access/relation.h"
22 #endif
23 #include "access/skey.h"
24 #if PG_VERSION_NUM < 130000
25 #include "access/tuptoaster.h"
26 #endif
27 #include "access/xlog.h"
28 #if PG_VERSION_NUM > 90500
29 #include "catalog/pg_am.h"
30 #endif
31 #include "catalog/pg_type.h"
32 #include "funcapi.h"
33 #include "mb/pg_wchar.h"
34 #include "storage/bufmgr.h"
35 #include "storage/bufpage.h"
36 #include "tsearch/ts_locale.h"
37 #include "utils/array.h"
38 #include "utils/builtins.h"
39 #include "utils/rel.h"
40
41
42 PG_FUNCTION_INFO_V1(gin_extract_value_bigm);
43 PG_FUNCTION_INFO_V1(gin_extract_query_bigm);
44 PG_FUNCTION_INFO_V1(gin_bigm_consistent);
45 PG_FUNCTION_INFO_V1(gin_bigm_compare_partial);
46 PG_FUNCTION_INFO_V1(pg_gin_pending_stats);
47
48 /* triConsistent function is available only in 9.4 or later */
49 #if PG_VERSION_NUM >= 90400
50 PG_FUNCTION_INFO_V1(gin_bigm_triconsistent);
51 #endif
52
53 /*
54  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
55  * macro since 9.4, and hence the declaration of the function prototypes
56  * here is necessary only for 9.3 or before.
57  */
58 #if PG_VERSION_NUM < 90400
59 Datum           gin_extract_value_bigm(PG_FUNCTION_ARGS);
60 Datum           gin_extract_query_bigm(PG_FUNCTION_ARGS);
61 Datum           gin_bigm_consistent(PG_FUNCTION_ARGS);
62 Datum           gin_bigm_compare_partial(PG_FUNCTION_ARGS);
63 Datum           pg_gin_pending_stats(PG_FUNCTION_ARGS);
64 #endif
65
66 Datum
67 gin_extract_value_bigm(PG_FUNCTION_ARGS)
68 {
69         text       *val = (text *) PG_GETARG_TEXT_P(0);
70         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
71         Datum      *entries = NULL;
72         BIGM       *bgm;
73         int32           bgmlen;
74
75         *nentries = 0;
76
77         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
78         bgmlen = ARRNELEM(bgm);
79
80         if (bgmlen > 0)
81         {
82                 bigm       *ptr;
83                 int32           i;
84
85                 *nentries = bgmlen;
86                 entries = (Datum *) palloc(sizeof(Datum) * bgmlen);
87
88                 ptr = GETARR(bgm);
89                 for (i = 0; i < bgmlen; i++)
90                 {
91                         text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
92
93                         entries[i] = PointerGetDatum(item);
94                         ptr++;
95                 }
96         }
97
98         PG_RETURN_POINTER(entries);
99 }
100
101 Datum
102 gin_extract_query_bigm(PG_FUNCTION_ARGS)
103 {
104         text       *val = (text *) PG_GETARG_TEXT_P(0);
105         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
106         StrategyNumber strategy = PG_GETARG_UINT16(2);
107
108         bool      **pmatch = (bool **) PG_GETARG_POINTER(3);
109         Pointer   **extra_data = (Pointer **) PG_GETARG_POINTER(4);
110
111         /* bool   **nullFlags = (bool **) PG_GETARG_POINTER(5); */
112         int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
113         Datum      *entries = NULL;
114         BIGM       *bgm;
115         int32           bgmlen = 0;
116         bigm       *ptr;
117         int32           i;
118         bool            removeDups;
119
120         switch (strategy)
121         {
122                 case LikeStrategyNumber:
123                 {
124                         char       *str = VARDATA(val);
125                         int                     slen = VARSIZE(val) - VARHDRSZ;
126                         bool       *recheck;
127
128                         /*
129                          * For wildcard search we extract all the bigrams that every
130                          * potentially-matching string must include.
131                          */
132                         bgm = generate_wildcard_bigm(str, slen, &removeDups);
133                         bgmlen = ARRNELEM(bgm);
134
135                         /*
136                          * Check whether the heap tuple fetched by index search needs to
137                          * be rechecked against the query. If the search word consists of
138                          * one or two characters and doesn't contain any space character,
139                          * we can guarantee that the index test would be exact. That is,
140                          * the heap tuple does match the query, so it doesn't need to be
141                          * rechecked.
142                          */
143                         *extra_data = (Pointer *) palloc(sizeof(bool));
144                         recheck = (bool *) *extra_data;
145                         if (bgmlen == 1 && !removeDups)
146                         {
147                                 const char *sp;
148
149                                 *recheck = false;
150                                 for (sp = str; (sp - str) < slen;)
151                                 {
152                                         if (t_isspace(sp))
153                                         {
154                                                 *recheck = true;
155                                                 break;
156                                         }
157
158                                         sp += IS_HIGHBIT_SET(*sp) ? pg_mblen(sp) : 1;
159                                 }
160                         }
161                         else
162                                 *recheck = true;
163                         break;
164                 }
165                 case SimilarityStrategyNumber:
166                 {
167                         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
168                         bgmlen = ARRNELEM(bgm);
169                         break;
170                 }
171                 default:
172                         elog(ERROR, "unrecognized strategy number: %d", strategy);
173                         bgm = NULL;                     /* keep compiler quiet */
174                         break;
175         }
176
177         *nentries = (bigm_gin_key_limit == 0) ?
178                 bgmlen : Min(bigm_gin_key_limit, bgmlen);
179         *pmatch = NULL;
180
181         if (*nentries > 0)
182         {
183                 entries = (Datum *) palloc(sizeof(Datum) * *nentries);
184                 ptr = GETARR(bgm);
185                 for (i = 0; i < *nentries; i++)
186                 {
187                         text       *item;
188
189                         if (ptr->pmatch)
190                         {
191                                 if (*pmatch == NULL)
192                                         *pmatch = (bool *) palloc0(sizeof(bool) * *nentries);
193                                 (*pmatch)[i] = true;
194                         }
195                         item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
196                         entries[i] = PointerGetDatum(item);
197                         ptr++;
198                 }
199         }
200
201         /*
202          * If no bigram was extracted then we have to scan all the index.
203          */
204         if (*nentries == 0)
205                 *searchMode = GIN_SEARCH_MODE_ALL;
206
207         PG_RETURN_POINTER(entries);
208 }
209
210 Datum
211 gin_bigm_consistent(PG_FUNCTION_ARGS)
212 {
213         bool       *check = (bool *) PG_GETARG_POINTER(0);
214         StrategyNumber strategy = PG_GETARG_UINT16(1);
215
216         /* text    *query = PG_GETARG_TEXT_P(2); */
217         int32           nkeys = PG_GETARG_INT32(3);
218
219         Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4);
220         bool       *recheck = (bool *) PG_GETARG_POINTER(5);
221         bool            res;
222         int32           i;
223         int32           ntrue;
224
225         switch (strategy)
226         {
227                 case LikeStrategyNumber:
228
229                         /*
230                          * Don't recheck the heap tuple against the query if either
231                          * pg_bigm.enable_recheck is disabled or the search word is the
232                          * special one so that the index can return the exact result.
233                          */
234                         Assert(extra_data != NULL);
235                         *recheck = bigm_enable_recheck &&
236                                 (*((bool *) extra_data) || (nkeys != 1));
237
238                         /* Check if all extracted bigrams are presented. */
239                         res = true;
240                         for (i = 0; i < nkeys; i++)
241                         {
242                                 if (!check[i])
243                                 {
244                                         res = false;
245                                         break;
246                                 }
247                         }
248                         break;
249                 case SimilarityStrategyNumber:
250                         /* Count the matches */
251                         *recheck = bigm_enable_recheck;
252                         ntrue = 0;
253                         for (i = 0; i < nkeys; i++)
254                         {
255                                 if (check[i])
256                                         ntrue++;
257                         }
258
259                         /*--------------------
260                          * If DIVUNION is defined then similarity formula is:
261                          * c / (len1 + len2 - c)
262                          * where c is number of common bigrams and it stands as ntrue in
263                          * this code.  Here we don't know value of len2 but we can assume
264                          * that c (ntrue) is a lower bound of len2, so upper bound of
265                          * similarity is:
266                          * c / (len1 + c - c)  => c / len1
267                          * If DIVUNION is not defined then similarity formula is:
268                          * c / max(len1, len2)
269                          * And again, c (ntrue) is a lower bound of len2, but c <= len1
270                          * just by definition and, consequently, upper bound of
271                          * similarity is just c / len1.
272                          * So, independently on DIVUNION the upper bound formula is the same.
273                          */
274                         res = (nkeys == 0) ? false :
275                                 ((((((float4) ntrue) / ((float4) nkeys))) >=
276                                   (float4) bigm_similarity_limit) ? true : false);
277                         break;
278                 default:
279                         elog(ERROR, "unrecognized strategy number: %d", strategy);
280                         res = false;            /* keep compiler quiet */
281                         break;
282         }
283
284         PG_RETURN_BOOL(res);
285 }
286
/* triConsistent function is available only in 9.4 or later */
#if PG_VERSION_NUM >= 90400
/*
 * gin_bigm_triconsistent
 *
 * Ternary consistent function.
 *
 * Arguments:
 *	 0: check[], one GinTernaryValue per query key
 *	 1: strategy number
 *	 2: query (unused)
 *	 3: number of query keys
 *	 4: extra data produced by gin_extract_query_bigm()
 *
 * Returns GIN_FALSE when the item cannot match, GIN_TRUE when it certainly
 * matches and no recheck is wanted, and GIN_MAYBE otherwise.  GIN_TRUE is
 * returned only when the outcome does not depend on any GIN_MAYBE input, as
 * the triConsistent contract requires.  An unrecognized strategy raises an
 * error.
 */
Datum
gin_bigm_triconsistent(PG_FUNCTION_ARGS)
{
	GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
	StrategyNumber strategy = PG_GETARG_UINT16(1);

	/* text    *query = PG_GETARG_TEXT_P(2); */
	int32		nkeys = PG_GETARG_INT32(3);
	Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4);
	GinTernaryValue res = GIN_MAYBE;
	int32		i;
	int32		ntrue;
	int32		nmaybe;
	bool		has_maybe = false;

	switch (strategy)
	{
		case LikeStrategyNumber:

			/*
			 * Don't recheck the heap tuple against the query if either
			 * pg_bigm.enable_recheck is disabled or the search word is the
			 * special one so that the index can return the exact result.
			 */
			Assert(extra_data != NULL);
			res = (bigm_enable_recheck &&
				   (*((bool *) extra_data) || (nkeys != 1))) ?
				GIN_MAYBE : GIN_TRUE;

			/* Check if all extracted bigrams are presented. */
			for (i = 0; i < nkeys; i++)
			{
				if (check[i] == GIN_FALSE)
				{
					res = GIN_FALSE;
					break;
				}
				if (check[i] == GIN_MAYBE)
					has_maybe = true;
			}

			/*
			 * A key whose presence is unknown cannot support a definite
			 * answer, so fall back to GIN_MAYBE in that case.
			 */
			if (res == GIN_TRUE && has_maybe)
				res = GIN_MAYBE;
			break;
		case SimilarityStrategyNumber:
			/* Count definite and possible matches separately. */
			ntrue = 0;
			nmaybe = 0;
			for (i = 0; i < nkeys; i++)
			{
				if (check[i] == GIN_TRUE)
					ntrue++;
				else if (check[i] != GIN_FALSE)
					nmaybe++;
			}

			/*
			 * See comment in gin_bigm_consistent() about upper bound formula.
			 * The item is ruled out only if it misses the limit even when
			 * every possible match is counted.
			 */
			res = (nkeys == 0) ? GIN_FALSE :
				(((((float4) (ntrue + nmaybe)) / ((float4) nkeys)) >=
				  (float4) bigm_similarity_limit) ? GIN_MAYBE : GIN_FALSE);

			/*
			 * Without recheck, report a definite match only when the
			 * definite matches alone already reach the limit.
			 */
			if (res != GIN_FALSE && !bigm_enable_recheck &&
				((((float4) ntrue) / ((float4) nkeys)) >=
				 (float4) bigm_similarity_limit))
				res = GIN_TRUE;
			break;
		default:
			elog(ERROR, "unrecognized strategy number: %d", strategy);
			res = GIN_FALSE;	/* keep compiler quiet */
			break;
	}

	PG_RETURN_GIN_TERNARY_VALUE(res);
}
#endif	/* PG_VERSION_NUM >= 90400 */
352
353 Datum
354 gin_bigm_compare_partial(PG_FUNCTION_ARGS)
355 {
356         text       *arg1 = PG_GETARG_TEXT_PP(0);
357         text       *arg2 = PG_GETARG_TEXT_PP(1);
358         char       *a1p;
359         char       *a2p;
360         int                     mblen1;
361         int                     mblen2;
362         int                     res;
363
364         a1p = VARDATA_ANY(arg1);
365         a2p = VARDATA_ANY(arg2);
366
367         mblen1 = pg_mblen(a1p);
368         mblen2 = pg_mblen(a2p);
369
370         if (mblen1 != mblen2)
371                 PG_RETURN_INT32(1);
372
373         res = memcmp(a1p, a2p, mblen1) ? 1 : 0;
374         PG_RETURN_INT32(res);
375 }
376
377 /*
378  * Report both number of pages and number of heap tuples that
379  * are in the pending list.
380  */
381 Datum
382 pg_gin_pending_stats(PG_FUNCTION_ARGS)
383 {
384         Oid                     indexOid = PG_GETARG_OID(0);
385         Relation        indexRel;
386         Buffer          metabuffer;
387         Page            metapage;
388         GinMetaPageData *metadata;
389         Datum           values[2];
390         bool            isnull[2];
391         HeapTuple       tuple;
392         TupleDesc       tupdesc;
393
394         indexRel = relation_open(indexOid, AccessShareLock);
395
396         if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
397                 indexRel->rd_rel->relam != GIN_AM_OID)
398                 ereport(ERROR,
399                                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
400                                  errmsg("relation \"%s\" is not a GIN index",
401                                                 RelationGetRelationName(indexRel))));
402
403         /*
404          * Reject attempts to read non-local temporary relations; we would be
405          * likely to get wrong data since we have no visibility into the owning
406          * session's local buffers.
407          */
408         if (RELATION_IS_OTHER_TEMP(indexRel))
409                 ereport(ERROR,
410                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
411                                  errmsg("cannot access temporary indexes of other sessions")));
412
413         /*
414          * Obtain statistic information from the meta page
415          */
416         metabuffer = ReadBuffer(indexRel, GIN_METAPAGE_BLKNO);
417         LockBuffer(metabuffer, GIN_SHARE);
418         metapage = BufferGetPage(metabuffer);
419         metadata = GinPageGetMeta(metapage);
420
421         /*
422          * Construct a tuple descriptor for the result row. This must match this
423          * function's pg_bigm--x.x.sql entry.
424          */
425  #if PG_VERSION_NUM >= 120000
426         tupdesc = CreateTemplateTupleDesc(2);
427 #else
428         tupdesc = CreateTemplateTupleDesc(2, false);
429 #endif
430         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
431                                            "pages", INT4OID, -1, 0);
432         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
433                                            "tuples", INT8OID, -1, 0);
434         tupdesc = BlessTupleDesc(tupdesc);
435
436         /* pages */
437         values[0] = Int32GetDatum(metadata->nPendingPages);
438         isnull[0] = false;
439
440         /* tuples */
441         values[1] = Int64GetDatum(metadata->nPendingHeapTuples);
442         isnull[1] = false;
443
444         UnlockReleaseBuffer(metabuffer);
445         relation_close(indexRel, AccessShareLock);
446
447         tuple = heap_form_tuple(tupdesc, values, isnull);
448         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
449 }