1 /*-------------------------------------------------------------------------
3 * Portions Copyright (c) 2017-2023, pg_bigm Development Group
4 * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5 * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
9 * Support full text search using bigrams.
10 * Author: NTT DATA Corporation
12 *-------------------------------------------------------------------------
20 #include "catalog/pg_type.h"
21 #include "tsearch/ts_locale.h"
22 #include "utils/array.h"
23 #if PG_VERSION_NUM >= 160000
24 #include "utils/guc.h"
25 #endif /* PG_VERSION_NUM */
26 #include "utils/memutils.h"
30 /* Last update date of pg_bigm */
31 #define BIGM_LAST_UPDATE "2020.02.28"
34 bool bigm_enable_recheck = false;
35 int bigm_gin_key_limit = 0;
36 double bigm_similarity_limit = 0.3;
37 char *bigm_last_update = NULL;
39 PG_FUNCTION_INFO_V1(show_bigm);
40 PG_FUNCTION_INFO_V1(bigmtextcmp);
41 PG_FUNCTION_INFO_V1(likequery);
42 PG_FUNCTION_INFO_V1(bigm_similarity);
43 PG_FUNCTION_INFO_V1(bigm_similarity_op);
46 * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
47 * macro since 9.4, and hence the declaration of the function prototypes
48 * here is necessary only for 9.3 or before.
50 #if PG_VERSION_NUM < 90400
51 Datum show_bigm(PG_FUNCTION_ARGS);
52 Datum bigmtextcmp(PG_FUNCTION_ARGS);
53 Datum likequery(PG_FUNCTION_ARGS);
54 Datum bigm_similarity(PG_FUNCTION_ARGS);
55 Datum bigm_similarity_op(PG_FUNCTION_ARGS);
64 /* Define custom GUC variables */
65 DefineCustomBoolVariable("pg_bigm.enable_recheck",
66 "Recheck that heap tuples fetched from index "
77 DefineCustomIntVariable("pg_bigm.gin_key_limit",
78 "Sets the maximum number of bi-gram keys allowed to "
79 "use for GIN index search.",
80 "Zero means no limit.",
90 DefineCustomRealVariable("pg_bigm.similarity_limit",
91 "Sets the similarity threshold used by the "
94 &bigm_similarity_limit,
103 /* Can't be set in postgresql.conf */
104 DefineCustomStringVariable("pg_bigm.last_update",
105 "Shows the last update date of pg_bigm.",
110 GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
115 EmitWarningsOnPlaceholders("pg_bigm");
124 comp_bigm(const void *a, const void *b, void *arg)
127 bool *haveDups = (bool *) arg;
138 unique_array(bigm *a, int len)
144 while (tmp - a < len)
145 if (CMPBIGM(tmp, curend))
149 memcpy(curend, tmp, BIGMSIZE);
155 return curend + 1 - a;
158 #define iswordchr(c) (!t_isspace(c))
161 * Finds first word in string, returns pointer to the word,
162 * endword points to the character after word
165 find_word(char *str, int lenstr, char **endword, int *charlen)
167 char *beginword = str;
169 while (beginword - str < lenstr && !iswordchr(beginword))
170 beginword += pg_mblen(beginword);
172 if (beginword - str >= lenstr)
175 *endword = beginword;
177 while (*endword - str < lenstr && iswordchr(*endword))
179 *endword += pg_mblen(*endword);
187 * The function is named compact_bigram to maintain consistency with pg_trgm,
188 * though it does not reduce multibyte characters to hash values like in
192 compact_bigram(bigm *bptr, char *str, int bytelen)
194 CPBIGM(bptr, str, bytelen);
198 * Adds bigrams from words (already padded).
201 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
207 compact_bigram(bptr, ptr, pg_mblen(str));
213 if (bytelen > charlen)
215 /* Find multibyte character boundaries and call compact_bigram */
216 int lenfirst = pg_mblen(str),
217 lenlast = pg_mblen(str + lenfirst);
219 while ((ptr - str) + lenfirst + lenlast <= bytelen)
221 compact_bigram(bptr, ptr, lenfirst + lenlast);
227 lenlast = pg_mblen(ptr + lenfirst);
232 /* Fast path when there are no multibyte characters */
233 Assert(bytelen == charlen);
235 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
237 CPBIGM(bptr, ptr, 2);
247 generate_bigm(char *str, int slen)
259 * Guard against possible overflow in the palloc requests below.
260 * We need to prevent integer overflow in the multiplications here.
262 if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
263 (Size) slen > MaxAllocSize - 4)
265 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
266 errmsg("out of memory")));
268 bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
269 SET_VARSIZE(bgm, VARHDRSZ);
271 if (slen + LPADDING + RPADDING < 2 || slen == 0)
276 buf = palloc(sizeof(char) * (slen + 4));
286 while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
288 bytelen = eword - bword;
289 memcpy(buf + LPADDING, bword, bytelen);
291 buf[LPADDING + bytelen] = ' ';
292 buf[LPADDING + bytelen + 1] = ' ';
297 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
298 charlen + LPADDING + RPADDING);
303 if ((len = bptr - GETARR(bgm)) == 0)
307 * Make bigrams unique.
311 bool haveDups = false;
313 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
315 len = unique_array(GETARR(bgm), len);
318 SET_VARSIZE(bgm, CALCGTSIZE(len));
324 * Extract the next non-wildcard part of a search string, i.e. a word bounded
325 * by '_' or '%' meta-characters, non-word characters or string end.
327 * str: source string, of length lenstr bytes (need not be null-terminated)
328 * buf: where to return the substring (must be long enough)
329 * *bytelen: receives byte length of the found substring
330 * *charlen: receives character length of the found substring
332 * Returns pointer to end+1 of the found substring in the source string.
333 * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
335 * If the found word is bounded by non-word characters or string boundaries
336 * then this function will include corresponding padding spaces into buf.
339 get_wildcard_part(const char *str, int lenstr,
340 char *buf, int *bytelen, int *charlen)
342 const char *beginword = str;
345 bool in_leading_wildcard_meta = false;
346 bool in_trailing_wildcard_meta = false;
347 bool in_escape = false;
351 * Find the first word character, remembering whether preceding character
352 * was wildcard meta-character. Note that the in_escape state persists
353 * from this loop to the next one, since we may exit at a word character
356 while (beginword - str < lenstr)
360 if (iswordchr(beginword))
363 in_leading_wildcard_meta = false;
367 if (ISESCAPECHAR(beginword))
369 else if (ISWILDCARDCHAR(beginword))
370 in_leading_wildcard_meta = true;
371 else if (iswordchr(beginword))
374 in_leading_wildcard_meta = false;
376 beginword += pg_mblen(beginword);
382 if (beginword - str >= lenstr)
386 * Add left padding spaces if preceding character wasn't wildcard
390 if (!in_leading_wildcard_meta)
405 * Copy data into buf until wildcard meta-character, non-word character or
406 * string boundary. Strip escapes during copy.
409 while (endword - str < lenstr)
411 clen = pg_mblen(endword);
414 if (iswordchr(endword))
416 memcpy(s, endword, clen);
423 * Back up endword to the escape character when stopping at an
424 * escaped char, so that subsequent get_wildcard_part will
425 * restart from the escape character. We assume here that
426 * escape chars are single-byte.
435 if (ISESCAPECHAR(endword))
437 else if (ISWILDCARDCHAR(endword))
439 in_trailing_wildcard_meta = true;
442 else if (iswordchr(endword))
444 memcpy(s, endword, clen);
455 * Add right padding spaces if next character isn't wildcard
458 if (!in_trailing_wildcard_meta)
477 * Generates bigrams for wildcard search string.
479 * Returns array of bigrams that must occur in any string that matches the
480 * wildcard string. For example, given pattern "a%bcd%" the bigrams
481 * " a", "bcd" would be extracted.
483 * Set 'removeDups' to true if duplicate bigrams are removed.
486 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
499 * Guard against possible overflow in the palloc requests below.
500 * We need to prevent integer overflow in the multiplications here.
502 if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
503 (Size) slen > MaxAllocSize - 4)
505 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
506 errmsg("out of memory")));
508 bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
509 SET_VARSIZE(bgm, VARHDRSZ);
511 if (slen + LPADDING + RPADDING < 2 || slen == 0)
516 buf = palloc(sizeof(char) * (slen + 4));
519 * Extract bigrams from each substring extracted by get_wildcard_part.
522 while ((eword = get_wildcard_part(eword, slen - (eword - str),
523 buf, &bytelen, &charlen)) != NULL)
528 bptr = make_bigrams(bptr, buf, bytelen, charlen);
533 if ((len = bptr - GETARR(bgm)) == 0)
537 * Make bigrams unique.
541 bool haveDups = false;
543 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
547 len = unique_array(GETARR(bgm), len);
551 SET_VARSIZE(bgm, CALCGTSIZE(len));
557 show_bigm(PG_FUNCTION_ARGS)
559 text *in = PG_GETARG_TEXT_P(0);
566 bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
567 d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
569 for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
571 text *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
573 d[i] = PointerGetDatum(item);
585 for (i = 0; i < ARRNELEM(bgm); i++)
586 pfree(DatumGetPointer(d[i]));
590 PG_FREE_IF_COPY(in, 0);
592 PG_RETURN_POINTER(a);
596 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
607 len1 = ARRNELEM(bgm1);
608 len2 = ARRNELEM(bgm2);
610 /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
611 if (len1 <= 0 || len2 <= 0)
614 while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
616 int res = CMPBIGM(ptr1, ptr2);
631 return ((float4) count) / ((float4) (len1 + len2 - count));
633 return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
638 bigm_similarity(PG_FUNCTION_ARGS)
640 text *in1 = PG_GETARG_TEXT_P(0);
641 text *in2 = PG_GETARG_TEXT_P(1);
646 bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
647 bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
649 res = cnt_sml_bigm(bgm1, bgm2);
653 PG_FREE_IF_COPY(in1, 0);
654 PG_FREE_IF_COPY(in2, 1);
656 PG_RETURN_FLOAT4(res);
660 bigm_similarity_op(PG_FUNCTION_ARGS)
662 float4 res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
664 PG_GETARG_DATUM(1)));
666 PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
670 likequery(PG_FUNCTION_ARGS)
672 text *query = PG_GETARG_TEXT_PP(0);
680 str = VARDATA_ANY(query);
681 len = VARSIZE_ANY_EXHDR(query);
686 result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
687 rp = VARDATA(result);
690 for (sp = str; (sp - str) < len;)
692 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
697 else if (IS_HIGHBIT_SET(*sp))
699 mblen = pg_mblen(sp);
700 memcpy(rp, sp, mblen);
709 SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
711 PG_RETURN_TEXT_P(result);
715 bigmtextcmp(PG_FUNCTION_ARGS)
717 text *arg1 = PG_GETARG_TEXT_PP(0);
718 text *arg2 = PG_GETARG_TEXT_PP(1);
719 char *a1p = VARDATA_ANY(arg1);
720 char *a2p = VARDATA_ANY(arg2);
721 int len1 = VARSIZE_ANY_EXHDR(arg1);
722 int len2 = VARSIZE_ANY_EXHDR(arg2);
724 PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));