OSDN Git Service

Switch CI from Travis CI to GitHub Actions (#12)
[pgbigm/pg_bigm.git] / bigm_gin.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2017-2023, pg_bigm Development Group
4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5  * Portions Copyright (c) 2007-2012, PostgreSQL Global Development Group
6  *
7  * Changelog:
8  *       2013/01/09
9  *       Support full text search using bigrams.
10  *       Author: NTT DATA Corporation
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include "bigm.h"
17
18 #include "access/gin.h"
19 #include "access/gin_private.h"
20 #include "access/itup.h"
21 #if PG_VERSION_NUM >= 120000
22 #include "access/relation.h"
23 #endif
24 #include "access/skey.h"
25 #if PG_VERSION_NUM < 130000
26 #include "access/tuptoaster.h"
27 #endif
28 #include "access/xlog.h"
29 #if PG_VERSION_NUM > 90500
30 #include "catalog/pg_am.h"
31 #endif
32 #include "catalog/pg_type.h"
33 #include "funcapi.h"
34 #include "mb/pg_wchar.h"
35 #include "storage/bufmgr.h"
36 #include "storage/bufpage.h"
37 #include "tsearch/ts_locale.h"
38 #include "utils/array.h"
39 #include "utils/builtins.h"
40 #include "utils/rel.h"
41
42
43 PG_FUNCTION_INFO_V1(gin_extract_value_bigm);
44 PG_FUNCTION_INFO_V1(gin_extract_query_bigm);
45 PG_FUNCTION_INFO_V1(gin_bigm_consistent);
46 PG_FUNCTION_INFO_V1(gin_bigm_compare_partial);
47 PG_FUNCTION_INFO_V1(pg_gin_pending_stats);
48
49 /* triConsistent function is available only in 9.4 or later */
50 #if PG_VERSION_NUM >= 90400
51 PG_FUNCTION_INFO_V1(gin_bigm_triconsistent);
52 #endif
53
54 /*
55  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
56  * macro since 9.4, and hence the declaration of the function prototypes
57  * here is necessary only for 9.3 or before.
58  */
59 #if PG_VERSION_NUM < 90400
60 Datum           gin_extract_value_bigm(PG_FUNCTION_ARGS);
61 Datum           gin_extract_query_bigm(PG_FUNCTION_ARGS);
62 Datum           gin_bigm_consistent(PG_FUNCTION_ARGS);
63 Datum           gin_bigm_compare_partial(PG_FUNCTION_ARGS);
64 Datum           pg_gin_pending_stats(PG_FUNCTION_ARGS);
65 #endif
66
67 Datum
68 gin_extract_value_bigm(PG_FUNCTION_ARGS)
69 {
70         text       *val = (text *) PG_GETARG_TEXT_P(0);
71         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
72         Datum      *entries = NULL;
73         BIGM       *bgm;
74         int32           bgmlen;
75
76         *nentries = 0;
77
78         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
79         bgmlen = ARRNELEM(bgm);
80
81         if (bgmlen > 0)
82         {
83                 bigm       *ptr;
84                 int32           i;
85
86                 *nentries = bgmlen;
87                 entries = (Datum *) palloc(sizeof(Datum) * bgmlen);
88
89                 ptr = GETARR(bgm);
90                 for (i = 0; i < bgmlen; i++)
91                 {
92                         text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
93
94                         entries[i] = PointerGetDatum(item);
95                         ptr++;
96                 }
97         }
98
99         PG_RETURN_POINTER(entries);
100 }
101
102 Datum
103 gin_extract_query_bigm(PG_FUNCTION_ARGS)
104 {
105         text       *val = (text *) PG_GETARG_TEXT_P(0);
106         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
107         StrategyNumber strategy = PG_GETARG_UINT16(2);
108
109         bool      **pmatch = (bool **) PG_GETARG_POINTER(3);
110         Pointer   **extra_data = (Pointer **) PG_GETARG_POINTER(4);
111
112         /* bool   **nullFlags = (bool **) PG_GETARG_POINTER(5); */
113         int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
114         Datum      *entries = NULL;
115         BIGM       *bgm;
116         int32           bgmlen = 0;
117         bigm       *ptr;
118         int32           i;
119         bool            removeDups;
120
121         switch (strategy)
122         {
123                 case LikeStrategyNumber:
124                 {
125                         char       *str = VARDATA(val);
126                         int                     slen = VARSIZE(val) - VARHDRSZ;
127                         bool       *recheck;
128
129                         /*
130                          * For wildcard search we extract all the bigrams that every
131                          * potentially-matching string must include.
132                          */
133                         bgm = generate_wildcard_bigm(str, slen, &removeDups);
134                         bgmlen = ARRNELEM(bgm);
135
136                         /*
137                          * Check whether the heap tuple fetched by index search needs to
138                          * be rechecked against the query. If the search word consists of
139                          * one or two characters and doesn't contain any space character,
140                          * we can guarantee that the index test would be exact. That is,
141                          * the heap tuple does match the query, so it doesn't need to be
142                          * rechecked.
143                          */
144                         *extra_data = (Pointer *) palloc(sizeof(bool));
145                         recheck = (bool *) *extra_data;
146                         if (bgmlen == 1 && !removeDups)
147                         {
148                                 const char *sp;
149
150                                 *recheck = false;
151                                 for (sp = str; (sp - str) < slen;)
152                                 {
153                                         if (t_isspace(sp))
154                                         {
155                                                 *recheck = true;
156                                                 break;
157                                         }
158
159                                         sp += IS_HIGHBIT_SET(*sp) ? pg_mblen(sp) : 1;
160                                 }
161                         }
162                         else
163                                 *recheck = true;
164                         break;
165                 }
166                 case SimilarityStrategyNumber:
167                 {
168                         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
169                         bgmlen = ARRNELEM(bgm);
170                         break;
171                 }
172                 default:
173                         elog(ERROR, "unrecognized strategy number: %d", strategy);
174                         bgm = NULL;                     /* keep compiler quiet */
175                         break;
176         }
177
178         *nentries = (bigm_gin_key_limit == 0) ?
179                 bgmlen : Min(bigm_gin_key_limit, bgmlen);
180         *pmatch = NULL;
181
182         if (*nentries > 0)
183         {
184                 entries = (Datum *) palloc(sizeof(Datum) * *nentries);
185                 ptr = GETARR(bgm);
186                 for (i = 0; i < *nentries; i++)
187                 {
188                         text       *item;
189
190                         if (ptr->pmatch)
191                         {
192                                 if (*pmatch == NULL)
193                                         *pmatch = (bool *) palloc0(sizeof(bool) * *nentries);
194                                 (*pmatch)[i] = true;
195                         }
196                         item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
197                         entries[i] = PointerGetDatum(item);
198                         ptr++;
199                 }
200         }
201
202         /*
203          * If no bigram was extracted then we have to scan all the index.
204          */
205         if (*nentries == 0)
206                 *searchMode = GIN_SEARCH_MODE_ALL;
207
208         PG_RETURN_POINTER(entries);
209 }
210
211 Datum
212 gin_bigm_consistent(PG_FUNCTION_ARGS)
213 {
214         bool       *check = (bool *) PG_GETARG_POINTER(0);
215         StrategyNumber strategy = PG_GETARG_UINT16(1);
216
217         /* text    *query = PG_GETARG_TEXT_P(2); */
218         int32           nkeys = PG_GETARG_INT32(3);
219
220         Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4);
221         bool       *recheck = (bool *) PG_GETARG_POINTER(5);
222         bool            res;
223         int32           i;
224         int32           ntrue;
225
226         switch (strategy)
227         {
228                 case LikeStrategyNumber:
229
230                         /*
231                          * Don't recheck the heap tuple against the query if either
232                          * pg_bigm.enable_recheck is disabled or the search word is the
233                          * special one so that the index can return the exact result.
234                          */
235                         Assert(extra_data != NULL);
236                         *recheck = bigm_enable_recheck &&
237                                 (*((bool *) extra_data) || (nkeys != 1));
238
239                         /* Check if all extracted bigrams are presented. */
240                         res = true;
241                         for (i = 0; i < nkeys; i++)
242                         {
243                                 if (!check[i])
244                                 {
245                                         res = false;
246                                         break;
247                                 }
248                         }
249                         break;
250                 case SimilarityStrategyNumber:
251                         /* Count the matches */
252                         *recheck = bigm_enable_recheck;
253                         ntrue = 0;
254                         for (i = 0; i < nkeys; i++)
255                         {
256                                 if (check[i])
257                                         ntrue++;
258                         }
259
260                         /*--------------------
261                          * If DIVUNION is defined then similarity formula is:
262                          * c / (len1 + len2 - c)
263                          * where c is number of common bigrams and it stands as ntrue in
264                          * this code.  Here we don't know value of len2 but we can assume
265                          * that c (ntrue) is a lower bound of len2, so upper bound of
266                          * similarity is:
267                          * c / (len1 + c - c)  => c / len1
268                          * If DIVUNION is not defined then similarity formula is:
269                          * c / max(len1, len2)
270                          * And again, c (ntrue) is a lower bound of len2, but c <= len1
271                          * just by definition and, consequently, upper bound of
272                          * similarity is just c / len1.
273                          * So, independently on DIVUNION the upper bound formula is the same.
274                          */
275                         res = (nkeys == 0) ? false :
276                                 ((((float4) ntrue) / ((float4) nkeys)) >=
277                                   (float4) bigm_similarity_limit);
278                         break;
279                 default:
280                         elog(ERROR, "unrecognized strategy number: %d", strategy);
281                         res = false;            /* keep compiler quiet */
282                         break;
283         }
284
285         PG_RETURN_BOOL(res);
286 }
287
/* triConsistent function is available only in 9.4 or later */
#if PG_VERSION_NUM >= 90400
/*
 * gin_bigm_triconsistent
 *
 * Ternary counterpart of gin_bigm_consistent(): check[] (argument 0) holds
 * GIN_TRUE / GIN_FALSE / GIN_MAYBE per key, and the result is GIN_FALSE
 * (cannot match), GIN_TRUE (matches, no recheck) or GIN_MAYBE (may match,
 * recheck needed).
 *
 *   LikeStrategyNumber       - GIN_FALSE as soon as one key is absent
 *   SimilarityStrategyNumber - GIN_FALSE when even counting GIN_MAYBE keys
 *                              as present cannot reach bigm_similarity_limit
 *
 * Any other strategy number raises an error.
 */
Datum
gin_bigm_triconsistent(PG_FUNCTION_ARGS)
{
	GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
	StrategyNumber strategy = PG_GETARG_UINT16(1);

	/* text    *query = PG_GETARG_TEXT_P(2); */
	int32		nkeys = PG_GETARG_INT32(3);
	Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4);
	GinTernaryValue res = GIN_MAYBE;
	int32		i,
				ntrue;

	switch (strategy)
	{
		case LikeStrategyNumber:

			/*
			 * Don't recheck the heap tuple against the query if either
			 * pg_bigm.enable_recheck is disabled or the search word is the
			 * special one so that the index can return the exact result.
			 */
			Assert(extra_data != NULL);
			res = (bigm_enable_recheck &&
				   (*((bool *) extra_data) || (nkeys != 1))) ?
				GIN_MAYBE : GIN_TRUE;

			/* Check if all extracted bigrams are presented. */
			for (i = 0; i < nkeys; i++)
			{
				if (check[i] == GIN_FALSE)
				{
					res = GIN_FALSE;
					break;
				}

				/*
				 * GIN_TRUE may be returned only when the item certainly
				 * matches whatever the GIN_MAYBE keys turn out to be.  A key
				 * that is merely "maybe present" cannot prove an exact LIKE
				 * match, so fall back to GIN_MAYBE.  When the user has
				 * turned pg_bigm.enable_recheck off, the unrechecked answer
				 * is kept on purpose.
				 */
				if (check[i] == GIN_MAYBE && bigm_enable_recheck)
					res = GIN_MAYBE;
			}
			break;
		case SimilarityStrategyNumber:
			/* Count the matches; GIN_MAYBE counts as a possible match. */
			ntrue = 0;
			for (i = 0; i < nkeys; i++)
			{
				if (check[i] != GIN_FALSE)
					ntrue++;
			}

			/*
			 * See comment in gin_bigm_consistent() about upper bound formula
			 */
			res = (nkeys == 0) ? GIN_FALSE :
				(((((float4) ntrue) / ((float4) nkeys)) >=
				  (float4) bigm_similarity_limit) ? GIN_MAYBE : GIN_FALSE);

			/* With rechecking disabled, report a possible match as exact. */
			if (res != GIN_FALSE && !bigm_enable_recheck)
				res = GIN_TRUE;
			break;
		default:
			elog(ERROR, "unrecognized strategy number: %d", strategy);
			res = GIN_FALSE;	/* keep compiler quiet */
			break;
	}

	PG_RETURN_GIN_TERNARY_VALUE(res);
}
#endif							/* PG_VERSION_NUM >= 90400 */
353
354 Datum
355 gin_bigm_compare_partial(PG_FUNCTION_ARGS)
356 {
357         text       *arg1 = PG_GETARG_TEXT_PP(0);
358         text       *arg2 = PG_GETARG_TEXT_PP(1);
359         char       *a1p;
360         char       *a2p;
361         int                     mblen1;
362         int                     mblen2;
363         int                     res;
364
365         a1p = VARDATA_ANY(arg1);
366         a2p = VARDATA_ANY(arg2);
367
368         mblen1 = pg_mblen(a1p);
369         mblen2 = pg_mblen(a2p);
370
371         if (mblen1 != mblen2)
372                 PG_RETURN_INT32(1);
373
374         res = memcmp(a1p, a2p, mblen1) ? 1 : 0;
375         PG_RETURN_INT32(res);
376 }
377
378 /*
379  * Report both number of pages and number of heap tuples that
380  * are in the pending list.
381  */
382 Datum
383 pg_gin_pending_stats(PG_FUNCTION_ARGS)
384 {
385         Oid                     indexOid = PG_GETARG_OID(0);
386         Relation        indexRel;
387         Buffer          metabuffer;
388         Page            metapage;
389         GinMetaPageData *metadata;
390         Datum           values[2];
391         bool            isnull[2];
392         HeapTuple       tuple;
393         TupleDesc       tupdesc;
394
395         indexRel = relation_open(indexOid, AccessShareLock);
396
397         if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
398                 indexRel->rd_rel->relam != GIN_AM_OID)
399                 ereport(ERROR,
400                                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
401                                  errmsg("relation \"%s\" is not a GIN index",
402                                                 RelationGetRelationName(indexRel))));
403
404         /*
405          * Reject attempts to read non-local temporary relations; we would be
406          * likely to get wrong data since we have no visibility into the owning
407          * session's local buffers.
408          */
409         if (RELATION_IS_OTHER_TEMP(indexRel))
410                 ereport(ERROR,
411                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
412                                  errmsg("cannot access temporary indexes of other sessions")));
413
414         /*
415          * Obtain statistic information from the meta page
416          */
417         metabuffer = ReadBuffer(indexRel, GIN_METAPAGE_BLKNO);
418         LockBuffer(metabuffer, GIN_SHARE);
419         metapage = BufferGetPage(metabuffer);
420         metadata = GinPageGetMeta(metapage);
421
422         /*
423          * Construct a tuple descriptor for the result row. This must match this
424          * function's pg_bigm--x.x.sql entry.
425          */
426  #if PG_VERSION_NUM >= 120000
427         tupdesc = CreateTemplateTupleDesc(2);
428 #else
429         tupdesc = CreateTemplateTupleDesc(2, false);
430 #endif
431         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
432                                            "pages", INT4OID, -1, 0);
433         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
434                                            "tuples", INT8OID, -1, 0);
435         tupdesc = BlessTupleDesc(tupdesc);
436
437         /* pages */
438         values[0] = Int32GetDatum(metadata->nPendingPages);
439         isnull[0] = false;
440
441         /* tuples */
442         values[1] = Int64GetDatum(metadata->nPendingHeapTuples);
443         isnull[1] = false;
444
445         UnlockReleaseBuffer(metabuffer);
446         relation_close(indexRel, AccessShareLock);
447
448         tuple = heap_form_tuple(tupdesc, values, isnull);
449         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
450 }