1 /*-------------------------------------------------------------------------
3 * Portions Copyright (c) 2017-2023, pg_bigm Development Group
4 * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5 * Portions Copyright (c) 2007-2012, PostgreSQL Global Development Group
9 * Support full text search using bigrams.
10 * Author: NTT DATA Corporation
12 *-------------------------------------------------------------------------
18 #include "access/gin.h"
19 #include "access/gin_private.h"
20 #include "access/itup.h"
21 #if PG_VERSION_NUM >= 120000
22 #include "access/relation.h"
24 #include "access/skey.h"
25 #if PG_VERSION_NUM < 130000
26 #include "access/tuptoaster.h"
28 #include "access/xlog.h"
29 #if PG_VERSION_NUM > 90500
30 #include "catalog/pg_am.h"
32 #include "catalog/pg_type.h"
34 #include "mb/pg_wchar.h"
35 #include "storage/bufmgr.h"
36 #include "storage/bufpage.h"
37 #include "tsearch/ts_locale.h"
38 #include "utils/array.h"
39 #include "utils/builtins.h"
40 #include "utils/rel.h"
43 PG_FUNCTION_INFO_V1(gin_extract_value_bigm);
44 PG_FUNCTION_INFO_V1(gin_extract_query_bigm);
45 PG_FUNCTION_INFO_V1(gin_bigm_consistent);
46 PG_FUNCTION_INFO_V1(gin_bigm_compare_partial);
47 PG_FUNCTION_INFO_V1(pg_gin_pending_stats);
49 /* triConsistent function is available only in 9.4 or later */
50 #if PG_VERSION_NUM >= 90400
51 PG_FUNCTION_INFO_V1(gin_bigm_triconsistent);
55 * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
56 * macro since 9.4, and hence the declaration of the function prototypes
57 * here is necessary only for 9.3 or before.
59 #if PG_VERSION_NUM < 90400
60 Datum gin_extract_value_bigm(PG_FUNCTION_ARGS);
61 Datum gin_extract_query_bigm(PG_FUNCTION_ARGS);
62 Datum gin_bigm_consistent(PG_FUNCTION_ARGS);
63 Datum gin_bigm_compare_partial(PG_FUNCTION_ARGS);
64 Datum pg_gin_pending_stats(PG_FUNCTION_ARGS);
/*
 * gin_extract_value_bigm
 *
 * Key-extraction routine for an indexed text value.
 *
 * Argument 0 is the text value; argument 1 is an out-pointer (nentries)
 * meant to receive the number of keys.  The value is split into bigrams by
 * generate_bigm(), each bigram is wrapped in its own text datum, and the
 * palloc'd array of those datums is returned.  `entries` starts out NULL.
 *
 * NOTE(review): the local declarations, the braces and the place where
 * the nentries out-parameter is assigned are not visible in this excerpt;
 * confirm against the full file.
 */
68 gin_extract_value_bigm(PG_FUNCTION_ARGS)
70 text *val = (text *) PG_GETARG_TEXT_P(0);
71 int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
72 Datum *entries = NULL;
/* Split the payload of val (varlena header excluded) into bigrams. */
78 bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
79 bgmlen = ARRNELEM(bgm);
/* One Datum slot per extracted bigram. */
87 entries = (Datum *) palloc(sizeof(Datum) * bgmlen);
/* Copy each bigram's bytes (ptr->str, ptr->bytelen) into a text datum. */
90 for (i = 0; i < bgmlen; i++)
92 text *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
94 entries[i] = PointerGetDatum(item);
99 PG_RETURN_POINTER(entries);
/*
 * gin_extract_query_bigm
 *
 * Key-extraction routine for a query string.
 *
 * Arguments, as fetched below:
 *   0  query text
 *   1  nentries    out: number of keys returned
 *   2  strategy    strategy number selecting LIKE or similarity handling
 *   3  pmatch      out: per-key bool array
 *   4  extra_data  out: auxiliary data; for LIKE it carries one bool
 *   5  nullFlags   not used (the fetch is commented out)
 *   6  searchMode  out: may be set to GIN_SEARCH_MODE_ALL
 *
 * For LikeStrategyNumber the bigrams come from generate_wildcard_bigm();
 * for SimilarityStrategyNumber they come from generate_bigm().  Any other
 * strategy raises an error.  The number of keys is capped by
 * bigm_gin_key_limit when that setting is non-zero.  Returns the palloc'd
 * Datum array of text keys.
 *
 * NOTE(review): the switch skeleton, several branch bodies and the local
 * declarations are not visible in this excerpt.
 */
103 gin_extract_query_bigm(PG_FUNCTION_ARGS)
105 text *val = (text *) PG_GETARG_TEXT_P(0);
106 int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
107 StrategyNumber strategy = PG_GETARG_UINT16(2);
109 bool **pmatch = (bool **) PG_GETARG_POINTER(3);
110 Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
112 /* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
113 int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
114 Datum *entries = NULL;
123 case LikeStrategyNumber:
125 char *str = VARDATA(val);
126 int slen = VARSIZE(val) - VARHDRSZ;
130 * For wildcard search we extract all the bigrams that every
131 * potentially-matching string must include.
133 bgm = generate_wildcard_bigm(str, slen, &removeDups);
134 bgmlen = ARRNELEM(bgm);
137 * Check whether the heap tuple fetched by index search needs to
138 * be rechecked against the query. If the search word consists of
139 * one or two characters and doesn't contain any space character,
140 * we can guarantee that the index test would be exact. That is,
141 * the heap tuple does match the query, so it doesn't need to be
/*
 * Allocate room for a single bool, hand it back through the extra_data
 * out-parameter, and alias it locally as `recheck`.
 */
144 *extra_data = (Pointer *) palloc(sizeof(bool));
145 recheck = (bool *) *extra_data;
/*
 * Exactly one bigram and removeDups (filled in by generate_wildcard_bigm
 * above) is false.  The body of this branch is not visible here.
 */
146 if (bgmlen == 1 && !removeDups)
/* Walk the query string character by character. */
151 for (sp = str; (sp - str) < slen;)
/* Step over a multibyte character when the high bit is set, else one byte. */
159 sp += IS_HIGHBIT_SET(*sp) ? pg_mblen(sp) : 1;
166 case SimilarityStrategyNumber:
168 bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
169 bgmlen = ARRNELEM(bgm);
173 elog(ERROR, "unrecognized strategy number: %d", strategy);
174 bgm = NULL; /* keep compiler quiet */
/* bigm_gin_key_limit == 0 means no cap; otherwise use at most that many. */
178 *nentries = (bigm_gin_key_limit == 0) ?
179 bgmlen : Min(bigm_gin_key_limit, bgmlen);
184 entries = (Datum *) palloc(sizeof(Datum) * *nentries);
186 for (i = 0; i < *nentries; i++)
/*
 * Zero-filled bool array with one element per key, returned through the
 * pmatch out-parameter.  Any guarding condition is not visible here.
 */
193 *pmatch = (bool *) palloc0(sizeof(bool) * *nentries);
196 item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
197 entries[i] = PointerGetDatum(item);
203 * If no bigram was extracted then we have to scan all the index.
206 *searchMode = GIN_SEARCH_MODE_ALL;
208 PG_RETURN_POINTER(entries);
/*
 * gin_bigm_consistent
 *
 * Boolean consistency check over the per-key match flags.
 *
 * Arguments, as fetched below:
 *   0  check       bool array with one flag per query key
 *   1  strategy    strategy number (LIKE or similarity)
 *   2  query       not used (the fetch is commented out)
 *   3  nkeys       number of query keys
 *   4  extra_data  auxiliary data; for LIKE a bool is read through it
 *   5  recheck     out: whether the heap tuple must be rechecked
 *
 * LIKE: a recheck is requested only when bigm_enable_recheck is on and
 * either the bool read through extra_data is true or nkeys is not 1.
 * Similarity: recheck follows bigm_enable_recheck, and the result is false
 * for nkeys == 0, otherwise whether ntrue / nkeys reaches
 * bigm_similarity_limit.  Any other strategy raises an error.
 *
 * NOTE(review): the loop bodies, the declarations and the final return are
 * not visible in this excerpt.
 */
212 gin_bigm_consistent(PG_FUNCTION_ARGS)
214 bool *check = (bool *) PG_GETARG_POINTER(0);
215 StrategyNumber strategy = PG_GETARG_UINT16(1);
217 /* text *query = PG_GETARG_TEXT_P(2); */
218 int32 nkeys = PG_GETARG_INT32(3);
220 Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
221 bool *recheck = (bool *) PG_GETARG_POINTER(5);
228 case LikeStrategyNumber:
231 * Don't recheck the heap tuple against the query if either
232 * pg_bigm.enable_recheck is disabled or the search word is the
233 * special one so that the index can return the exact result.
235 Assert(extra_data != NULL);
/* extra_data is reinterpreted as a pointer to a single bool here. */
236 *recheck = bigm_enable_recheck &&
237 (*((bool *) extra_data) || (nkeys != 1));
239 /* Check if all extracted bigrams are present. */
241 for (i = 0; i < nkeys; i++)
250 case SimilarityStrategyNumber:
251 /* Count the matches */
252 *recheck = bigm_enable_recheck;
254 for (i = 0; i < nkeys; i++)
260 /*--------------------
261 * If DIVUNION is defined then similarity formula is:
262 * c / (len1 + len2 - c)
263 * where c is number of common bigrams and it stands as ntrue in
264 * this code. Here we don't know value of len2 but we can assume
265 * that c (ntrue) is a lower bound of len2, so upper bound of
267 * c / (len1 + c - c) => c / len1
268 * If DIVUNION is not defined then similarity formula is:
269 * c / max(len1, len2)
270 * And again, c (ntrue) is a lower bound of len2, but c <= len1
271 * just by definition and, consequently, upper bound of
272 * similarity is just c / len1.
273 * So, independently of DIVUNION the upper bound formula is the same.
275 res = (nkeys == 0) ? false :
276 ((((float4) ntrue) / ((float4) nkeys)) >=
277 (float4) bigm_similarity_limit);
280 elog(ERROR, "unrecognized strategy number: %d", strategy);
281 res = false; /* keep compiler quiet */
288 /* triConsistent function is available only in 9.4 or later */
289 #if PG_VERSION_NUM >= 90400
/*
 * gin_bigm_triconsistent
 *
 * Ternary counterpart of gin_bigm_consistent(): works on GinTernaryValue
 * flags and returns GIN_TRUE, GIN_FALSE or GIN_MAYBE.
 *
 * Arguments, as fetched below:
 *   0  check       GinTernaryValue array with one entry per query key
 *   1  strategy    strategy number (LIKE or similarity)
 *   2  query       not used (the fetch is commented out)
 *   3  nkeys       number of query keys
 *   4  extra_data  auxiliary data; for LIKE a bool is read through it
 *
 * The result starts as GIN_MAYBE.  Any unrecognized strategy raises an
 * error.
 *
 * NOTE(review): the branch bodies and some declarations are not visible
 * in this excerpt.
 */
291 gin_bigm_triconsistent(PG_FUNCTION_ARGS)
293 GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
294 StrategyNumber strategy = PG_GETARG_UINT16(1);
296 /* text *query = PG_GETARG_TEXT_P(2); */
297 int32 nkeys = PG_GETARG_INT32(3);
298 Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
299 GinTernaryValue res = GIN_MAYBE;
305 case LikeStrategyNumber:
307 * Don't recheck the heap tuple against the query if either
308 * pg_bigm.enable_recheck is disabled or the search word is the
309 * special one so that the index can return the exact result.
/*
 * GIN_MAYBE when the recheck condition holds (rechecking enabled and
 * either the bool read through extra_data is true or nkeys is not 1),
 * GIN_TRUE otherwise.
 */
311 res = (bigm_enable_recheck &&
312 (*((bool *) extra_data) || (nkeys != 1))) ?
313 GIN_MAYBE : GIN_TRUE;
315 /* Check if all extracted bigrams are present. */
316 for (i = 0; i < nkeys; i++)
/* A key flagged GIN_FALSE; the body of this branch is not visible here. */
318 if (check[i] == GIN_FALSE)
325 case SimilarityStrategyNumber:
326 /* Count the matches */
328 for (i = 0; i < nkeys; i++)
/* Taken for every key not flagged GIN_FALSE (body not visible here). */
330 if (check[i] != GIN_FALSE)
335 * See comment in gin_bigm_consistent() about upper bound formula
/*
 * GIN_FALSE for nkeys == 0 or when ntrue / nkeys is below
 * bigm_similarity_limit; GIN_MAYBE otherwise.
 */
337 res = (nkeys == 0) ? GIN_FALSE :
338 (((((float4) ntrue) / ((float4) nkeys)) >=
339 (float4) bigm_similarity_limit) ? GIN_MAYBE : GIN_FALSE);
/* Not rejected and rechecking is disabled (body not visible here). */
341 if (res != GIN_FALSE && !bigm_enable_recheck)
345 elog(ERROR, "unrecognized strategy number: %d", strategy);
346 res = GIN_FALSE; /* keep compiler quiet */
350 PG_RETURN_GIN_TERNARY_VALUE(res);
352 #endif /* PG_VERSION_NUM >= 90400 */
/*
 * gin_bigm_compare_partial
 *
 * Compares the leading character of two text arguments.
 *
 * Arguments 0 and 1 are the two text values.  Only the first character of
 * each is examined, with its byte length taken from pg_mblen().  When the
 * two lengths are equal, the result is 0 if those bytes are identical and
 * 1 otherwise.
 *
 * NOTE(review): the body of the length-mismatch branch and the local
 * declarations are not visible in this excerpt.
 */
355 gin_bigm_compare_partial(PG_FUNCTION_ARGS)
357 text *arg1 = PG_GETARG_TEXT_PP(0);
358 text *arg2 = PG_GETARG_TEXT_PP(1);
/* Point at the payload bytes of each argument. */
365 a1p = VARDATA_ANY(arg1);
366 a2p = VARDATA_ANY(arg2);
/* Byte length of the first character of each string. */
368 mblen1 = pg_mblen(a1p);
369 mblen2 = pg_mblen(a2p);
/* First characters differ in byte length (body not visible here). */
371 if (mblen1 != mblen2)
/* Same length: 0 when the bytes are identical, 1 when they differ. */
374 res = memcmp(a1p, a2p, mblen1) ? 1 : 0;
375 PG_RETURN_INT32(res);
379 * Report both number of pages and number of heap tuples that
380 * are in the pending list.
/*
 * pg_gin_pending_stats
 *
 * Argument 0 is the OID of an index.  The relation is opened with
 * AccessShareLock and must be a GIN index that does not belong to another
 * session's temporary namespace; otherwise an error is reported.  The
 * pending-list counters are read from the GIN meta page under a share
 * lock and returned as a two-column row: "pages" (int4) and "tuples"
 * (int8).
 *
 * NOTE(review): the local declarations and the opening lines of the two
 * error reports are not visible in this excerpt.
 */
383 pg_gin_pending_stats(PG_FUNCTION_ARGS)
385 Oid indexOid = PG_GETARG_OID(0);
389 GinMetaPageData *metadata;
395 indexRel = relation_open(indexOid, AccessShareLock);
/* Accept only relations that are indexes using the GIN access method. */
397 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
398 indexRel->rd_rel->relam != GIN_AM_OID)
400 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
401 errmsg("relation \"%s\" is not a GIN index",
402 RelationGetRelationName(indexRel))));
405 * Reject attempts to read non-local temporary relations; we would be
406 * likely to get wrong data since we have no visibility into the owning
407 * session's local buffers.
409 if (RELATION_IS_OTHER_TEMP(indexRel))
411 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
412 errmsg("cannot access temporary indexes of other sessions")));
415 * Obtain statistic information from the meta page
417 metabuffer = ReadBuffer(indexRel, GIN_METAPAGE_BLKNO);
418 LockBuffer(metabuffer, GIN_SHARE);
419 metapage = BufferGetPage(metabuffer);
420 metadata = GinPageGetMeta(metapage);
423 * Construct a tuple descriptor for the result row. This must match this
424 * function's pg_bigm--x.x.sql entry.
/*
 * From version 120000 on CreateTemplateTupleDesc() is called with the
 * attribute count only; older versions pass an additional boolean.
 */
426 #if PG_VERSION_NUM >= 120000
427 tupdesc = CreateTemplateTupleDesc(2);
429 tupdesc = CreateTemplateTupleDesc(2, false);
431 TupleDescInitEntry(tupdesc, (AttrNumber) 1,
432 "pages", INT4OID, -1, 0);
433 TupleDescInitEntry(tupdesc, (AttrNumber) 2,
434 "tuples", INT8OID, -1, 0);
435 tupdesc = BlessTupleDesc(tupdesc);
/* Column 1: pending-list page count from the meta page. */
438 values[0] = Int32GetDatum(metadata->nPendingPages);
/* Column 2: pending-list heap tuple count from the meta page. */
442 values[1] = Int64GetDatum(metadata->nPendingHeapTuples);
/*
 * The counters have been copied into values[], so the meta page buffer
 * and the relation can be released before the tuple is built.
 */
445 UnlockReleaseBuffer(metabuffer);
446 relation_close(indexRel, AccessShareLock);
448 tuple = heap_form_tuple(tupdesc, values, isnull);
449 PG_RETURN_DATUM(HeapTupleGetDatum(tuple));