bigm_op.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Portions Copyright (c) 2017-2023, pg_bigm Development Group
   4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
   5  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
   6  *
   7  * Changelog:
   8  *       2013/01/09
   9  *       Support full text search using bigrams.
  10  *       Author: NTT DATA Corporation
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include "postgres.h"
  15
  16 #include <ctype.h>
  17
  18 #include "bigm.h"
  19
  20 #include "catalog/pg_type.h"
  21 #include "tsearch/ts_locale.h"
  22 #include "utils/array.h"
  23 #if PG_VERSION_NUM >= 160000
  24 #include "utils/guc.h"
  25 #endif /* PG_VERSION_NUM */
  26 #include "utils/memutils.h"
  27
  28 PG_MODULE_MAGIC;
  29
  30 /* Last update date of pg_bigm */
  31 #define BIGM_LAST_UPDATE        "2020.02.28"
  32
  33 /* GUC variable */
  34 bool            bigm_enable_recheck = false;
  35 int                     bigm_gin_key_limit = 0;
  36 double          bigm_similarity_limit = 0.3;
  37 char       *bigm_last_update = NULL;
  38
  39 PG_FUNCTION_INFO_V1(show_bigm);
  40 PG_FUNCTION_INFO_V1(bigmtextcmp);
  41 PG_FUNCTION_INFO_V1(likequery);
  42 PG_FUNCTION_INFO_V1(bigm_similarity);
  43 PG_FUNCTION_INFO_V1(bigm_similarity_op);
  44
  45 /*
  46  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
  47  * macro since 9.4, and hence the declaration of the function prototypes
  48  * here is necessary only for 9.3 or before.
  49  */
  50 #if PG_VERSION_NUM < 90400
  51 Datum           show_bigm(PG_FUNCTION_ARGS);
  52 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
  53 Datum           likequery(PG_FUNCTION_ARGS);
  54 Datum           bigm_similarity(PG_FUNCTION_ARGS);
  55 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
  56 #endif
  57
  58 void            _PG_init(void);
  59 void            _PG_fini(void);
  60
  61 void
  62 _PG_init(void)
  63 {
  64         /* Define custom GUC variables */
  65         DefineCustomBoolVariable("pg_bigm.enable_recheck",
  66                                                          "Recheck that heap tuples fetched from index "
  67                                                          "match the query.",
  68                                                          NULL,
  69                                                          &bigm_enable_recheck,
  70                                                          true,
  71                                                          PGC_USERSET,
  72                                                          0,
  73                                                          NULL,
  74                                                          NULL,
  75                                                          NULL);
  76
  77         DefineCustomIntVariable("pg_bigm.gin_key_limit",
  78                                                         "Sets the maximum number of bi-gram keys allowed to "
  79                                                         "use for GIN index search.",
  80                                                         "Zero means no limit.",
  81                                                         &bigm_gin_key_limit,
  82                                                         0,
  83                                                         0, INT_MAX,
  84                                                         PGC_USERSET,
  85                                                         0,
  86                                                         NULL,
  87                                                         NULL,
  88                                                         NULL);
  89
  90         DefineCustomRealVariable("pg_bigm.similarity_limit",
  91                                                          "Sets the similarity threshold used by the "
  92                                                          "=% operator.",
  93                                                          NULL,
  94                                                          &bigm_similarity_limit,
  95                                                          0.3,
  96                                                          0.0, 1.0,
  97                                                          PGC_USERSET,
  98                                                          0,
  99                                                          NULL,
 100                                                          NULL,
 101                                                          NULL);
 102
 103         /* Can't be set in postgresql.conf */
 104         DefineCustomStringVariable("pg_bigm.last_update",
 105                                                            "Shows the last update date of pg_bigm.",
 106                                                            NULL,
 107                                                            &bigm_last_update,
 108                                                            BIGM_LAST_UPDATE,
 109                                                            PGC_INTERNAL,
 110                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
 111                                                            NULL,
 112                                                            NULL,
 113                                                            NULL);
 114
 115         EmitWarningsOnPlaceholders("pg_bigm");
 116 }
 117
 118 void
 119 _PG_fini(void)
 120 {
 121 }
 122
 123 static int
 124 comp_bigm(const void *a, const void *b, void *arg)
 125 {
 126         int                     res;
 127         bool       *haveDups = (bool *) arg;
 128
 129         res = CMPBIGM(a, b);
 130
 131         if (res == 0)
 132                 *haveDups = true;
 133
 134         return res;
 135 }
 136
 137 static int
 138 unique_array(bigm *a, int len)
 139 {
 140         bigm       *curend,
 141                            *tmp;
 142
 143         curend = tmp = a;
 144         while (tmp - a < len)
 145                 if (CMPBIGM(tmp, curend))
 146                 {
 147                         curend++;
 148                         if (curend != tmp)
 149                                 memcpy(curend, tmp, BIGMSIZE);
 150                         tmp++;
 151                 }
 152                 else
 153                         tmp++;
 154
 155         return curend + 1 - a;
 156 }
 157
 158 #define iswordchr(c)    (!t_isspace(c))
 159
 160 /*
 161  * Finds first word in string, returns pointer to the word,
 162  * endword points to the character after word
 163  */
 164 static char *
 165 find_word(char *str, int lenstr, char **endword, int *charlen)
 166 {
 167         char       *beginword = str;
 168
 169         while (beginword - str < lenstr && !iswordchr(beginword))
 170                 beginword += pg_mblen(beginword);
 171
 172         if (beginword - str >= lenstr)
 173                 return NULL;
 174
 175         *endword = beginword;
 176         *charlen = 0;
 177         while (*endword - str < lenstr && iswordchr(*endword))
 178         {
 179                 *endword += pg_mblen(*endword);
 180                 (*charlen)++;
 181         }
 182
 183         return beginword;
 184 }
 185
 186 /*
 187  * The function is named compact_bigram to maintain consistency with pg_trgm,
 188  * though it does not reduce multibyte characters to hash values like in
 189  * compact_trigram.
 190  */
 191 static void
 192 compact_bigram(bigm *bptr, char *str, int bytelen)
 193 {
 194         CPBIGM(bptr, str, bytelen);
 195 }
 196
 197 /*
 198  * Adds bigrams from words (already padded).
 199  */
 200 static bigm *
 201 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
 202 {
 203         char       *ptr = str;
 204
 205         if (charlen < 2)
 206         {
 207                 compact_bigram(bptr, ptr, pg_mblen(str));
 208                 bptr->pmatch = true;
 209                 bptr++;
 210                 return bptr;
 211         }
 212
 213         if (bytelen > charlen)
 214         {
 215                 /* Find multibyte character boundaries and call compact_bigram */
 216                 int                     lenfirst = pg_mblen(str),
 217                                         lenlast = pg_mblen(str + lenfirst);
 218
 219                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
 220                 {
 221                         compact_bigram(bptr, ptr, lenfirst + lenlast);
 222
 223                         ptr += lenfirst;
 224                         bptr++;
 225
 226                         lenfirst = lenlast;
 227                         lenlast = pg_mblen(ptr + lenfirst);
 228                 }
 229         }
 230         else
 231         {
 232                 /* Fast path when there are no multibyte characters */
 233                 Assert(bytelen == charlen);
 234
 235                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
 236                 {
 237                         CPBIGM(bptr, ptr, 2);
 238                         ptr++;
 239                         bptr++;
 240                 }
 241         }
 242
 243         return bptr;
 244 }
 245
 246 BIGM *
 247 generate_bigm(char *str, int slen)
 248 {
 249         BIGM       *bgm;
 250         char       *buf;
 251         bigm       *bptr;
 252         int                     len,
 253                                 charlen,
 254                                 bytelen;
 255         char       *bword,
 256                            *eword;
 257
 258         /*
 259          * Guard against possible overflow in the palloc requests below.
 260          * We need to prevent integer overflow in the multiplications here.
 261          */
 262         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
 263                 (Size) slen > MaxAllocSize - 4)
 264                 ereport(ERROR,
 265                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 266                                  errmsg("out of memory")));
 267
 268         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
 269         SET_VARSIZE(bgm, VARHDRSZ);
 270
 271         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 272                 return bgm;
 273
 274         bptr = GETARR(bgm);
 275
 276         buf = palloc(sizeof(char) * (slen + 4));
 277
 278         if (LPADDING > 0)
 279         {
 280                 *buf = ' ';
 281                 if (LPADDING > 1)
 282                         *(buf + 1) = ' ';
 283         }
 284
 285         eword = str;
 286         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 287         {
 288                 bytelen = eword - bword;
 289                 memcpy(buf + LPADDING, bword, bytelen);
 290
 291                 buf[LPADDING + bytelen] = ' ';
 292                 buf[LPADDING + bytelen + 1] = ' ';
 293
 294                 /*
 295                  * count bigrams
 296                  */
 297                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
 298                                                         charlen + LPADDING + RPADDING);
 299         }
 300
 301         pfree(buf);
 302
 303         if ((len = bptr - GETARR(bgm)) == 0)
 304                 return bgm;
 305
 306         /*
 307          * Make bigrams unique.
 308          */
 309         if (len > 1)
 310         {
 311                 bool            haveDups = false;
 312
 313                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 314                 if (haveDups)
 315                         len = unique_array(GETARR(bgm), len);
 316         }
 317
 318         SET_VARSIZE(bgm, CALCGTSIZE(len));
 319
 320         return bgm;
 321 }
 322
 323 /*
 324  * Extract the next non-wildcard part of a search string, i.e. a word bounded
 325  * by '_' or '%' meta-characters, non-word characters or string end.
 326  *
 327  * str: source string, of length lenstr bytes (need not be null-terminated)
 328  * buf: where to return the substring (must be long enough)
 329  * *bytelen: receives byte length of the found substring
 330  * *charlen: receives character length of the found substring
 331  *
 332  * Returns pointer to end+1 of the found substring in the source string.
 333  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 334  *
 335  * If the found word is bounded by non-word characters or string boundaries
 336  * then this function will include corresponding padding spaces into buf.
 337  */
 338 static const char *
 339 get_wildcard_part(const char *str, int lenstr,
 340                                   char *buf, int *bytelen, int *charlen)
 341 {
 342         const char *beginword = str;
 343         const char *endword;
 344         char       *s = buf;
 345         bool            in_leading_wildcard_meta = false;
 346         bool            in_trailing_wildcard_meta = false;
 347         bool            in_escape = false;
 348         int                     clen;
 349
 350         /*
 351          * Find the first word character, remembering whether preceding character
 352          * was wildcard meta-character.  Note that the in_escape state persists
 353          * from this loop to the next one, since we may exit at a word character
 354          * that is in_escape.
 355          */
 356         while (beginword - str < lenstr)
 357         {
 358                 if (in_escape)
 359                 {
 360                         if (iswordchr(beginword))
 361                                 break;
 362                         in_escape = false;
 363                         in_leading_wildcard_meta = false;
 364                 }
 365                 else
 366                 {
 367                         if (ISESCAPECHAR(beginword))
 368                                 in_escape = true;
 369                         else if (ISWILDCARDCHAR(beginword))
 370                                 in_leading_wildcard_meta = true;
 371                         else if (iswordchr(beginword))
 372                                 break;
 373                         else
 374                                 in_leading_wildcard_meta = false;
 375                 }
 376                 beginword += pg_mblen(beginword);
 377         }
 378
 379         /*
 380          * Handle string end.
 381          */
 382         if (beginword - str >= lenstr)
 383                 return NULL;
 384
 385         /*
 386          * Add left padding spaces if preceding character wasn't wildcard
 387          * meta-character.
 388          */
 389         *charlen = 0;
 390         if (!in_leading_wildcard_meta)
 391         {
 392                 if (LPADDING > 0)
 393                 {
 394                         *s++ = ' ';
 395                         (*charlen)++;
 396                         if (LPADDING > 1)
 397                         {
 398                                 *s++ = ' ';
 399                                 (*charlen)++;
 400                         }
 401                 }
 402         }
 403
 404         /*
 405          * Copy data into buf until wildcard meta-character, non-word character or
 406          * string boundary.  Strip escapes during copy.
 407          */
 408         endword = beginword;
 409         while (endword - str < lenstr)
 410         {
 411                 clen = pg_mblen(endword);
 412                 if (in_escape)
 413                 {
 414                         if (iswordchr(endword))
 415                         {
 416                                 memcpy(s, endword, clen);
 417                                 (*charlen)++;
 418                                 s += clen;
 419                         }
 420                         else
 421                         {
 422                                 /*
 423                                  * Back up endword to the escape character when stopping at an
 424                                  * escaped char, so that subsequent get_wildcard_part will
 425                                  * restart from the escape character.  We assume here that
 426                                  * escape chars are single-byte.
 427                                  */
 428                                 endword--;
 429                                 break;
 430                         }
 431                         in_escape = false;
 432                 }
 433                 else
 434                 {
 435                         if (ISESCAPECHAR(endword))
 436                                 in_escape = true;
 437                         else if (ISWILDCARDCHAR(endword))
 438                         {
 439                                 in_trailing_wildcard_meta = true;
 440                                 break;
 441                         }
 442                         else if (iswordchr(endword))
 443                         {
 444                                 memcpy(s, endword, clen);
 445                                 (*charlen)++;
 446                                 s += clen;
 447                         }
 448                         else
 449                                 break;
 450                 }
 451                 endword += clen;
 452         }
 453
 454         /*
 455          * Add right padding spaces if next character isn't wildcard
 456          * meta-character.
 457          */
 458         if (!in_trailing_wildcard_meta)
 459         {
 460                 if (RPADDING > 0)
 461                 {
 462                         *s++ = ' ';
 463                         (*charlen)++;
 464                         if (RPADDING > 1)
 465                         {
 466                                 *s++ = ' ';
 467                                 (*charlen)++;
 468                         }
 469                 }
 470         }
 471
 472         *bytelen = s - buf;
 473         return endword;
 474 }
 475
 476 /*
 477  * Generates bigrams for wildcard search string.
 478  *
 479  * Returns array of bigrams that must occur in any string that matches the
 480  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
 481  * " a", "bcd" would be extracted.
 482  *
 483  * Set 'removeDups' to true if duplicate bigrams are removed.
 484  */
 485 BIGM *
 486 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 487 {
 488         BIGM       *bgm;
 489         char       *buf;
 490         bigm       *bptr;
 491         int                     len,
 492                                 charlen,
 493                                 bytelen;
 494         const char *eword;
 495
 496         *removeDups = false;
 497
 498         /*
 499          * Guard against possible overflow in the palloc requests below.
 500          * We need to prevent integer overflow in the multiplications here.
 501          */
 502         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
 503                 (Size) slen > MaxAllocSize - 4)
 504                 ereport(ERROR,
 505                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 506                                  errmsg("out of memory")));
 507
 508         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
 509         SET_VARSIZE(bgm, VARHDRSZ);
 510
 511         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 512                 return bgm;
 513
 514         bptr = GETARR(bgm);
 515
 516         buf = palloc(sizeof(char) * (slen + 4));
 517
 518         /*
 519          * Extract bigrams from each substring extracted by get_wildcard_part.
 520          */
 521         eword = str;
 522         while ((eword = get_wildcard_part(eword, slen - (eword - str),
 523                                                                           buf, &bytelen, &charlen)) != NULL)
 524         {
 525                 /*
 526                  * count bigrams
 527                  */
 528                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
 529         }
 530
 531         pfree(buf);
 532
 533         if ((len = bptr - GETARR(bgm)) == 0)
 534                 return bgm;
 535
 536         /*
 537          * Make bigrams unique.
 538          */
 539         if (len > 1)
 540         {
 541                 bool            haveDups = false;
 542
 543                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 544                 if (haveDups)
 545                 {
 546                         *removeDups = true;
 547                         len = unique_array(GETARR(bgm), len);
 548                 }
 549         }
 550
 551         SET_VARSIZE(bgm, CALCGTSIZE(len));
 552
 553         return bgm;
 554 }
 555
 556 Datum
 557 show_bigm(PG_FUNCTION_ARGS)
 558 {
 559         text       *in = PG_GETARG_TEXT_P(0);
 560         BIGM       *bgm;
 561         Datum      *d;
 562         ArrayType  *a;
 563         bigm       *ptr;
 564         int                     i;
 565
 566         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
 567         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
 568
 569         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
 570         {
 571                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
 572
 573                 d[i] = PointerGetDatum(item);
 574         }
 575
 576         a = construct_array(
 577                                                 d,
 578                                                 ARRNELEM(bgm),
 579                                                 TEXTOID,
 580                                                 -1,
 581                                                 false,
 582                                                 'i'
 583                 );
 584
 585         for (i = 0; i < ARRNELEM(bgm); i++)
 586                 pfree(DatumGetPointer(d[i]));
 587
 588         pfree(d);
 589         pfree(bgm);
 590         PG_FREE_IF_COPY(in, 0);
 591
 592         PG_RETURN_POINTER(a);
 593 }
 594
 595 static float4
 596 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
 597 {
 598         bigm       *ptr1,
 599                            *ptr2;
 600         int                     count = 0;
 601         int                     len1,
 602                                 len2;
 603
 604         ptr1 = GETARR(bgm1);
 605         ptr2 = GETARR(bgm2);
 606
 607         len1 = ARRNELEM(bgm1);
 608         len2 = ARRNELEM(bgm2);
 609
 610         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 611         if (len1 <= 0 || len2 <= 0)
 612                 return (float4) 0.0;
 613
 614         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
 615         {
 616                 int                     res = CMPBIGM(ptr1, ptr2);
 617
 618                 if (res < 0)
 619                         ptr1++;
 620                 else if (res > 0)
 621                         ptr2++;
 622                 else
 623                 {
 624                         ptr1++;
 625                         ptr2++;
 626                         count++;
 627                 }
 628         }
 629
 630 #ifdef DIVUNION
 631         return ((float4) count) / ((float4) (len1 + len2 - count));
 632 #else
 633         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
 634 #endif
 635 }
 636
 637 Datum
 638 bigm_similarity(PG_FUNCTION_ARGS)
 639 {
 640         text       *in1 = PG_GETARG_TEXT_P(0);
 641         text       *in2 = PG_GETARG_TEXT_P(1);
 642         BIGM       *bgm1,
 643                            *bgm2;
 644         float4          res;
 645
 646         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 647         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
 648
 649         res = cnt_sml_bigm(bgm1, bgm2);
 650
 651         pfree(bgm1);
 652         pfree(bgm2);
 653         PG_FREE_IF_COPY(in1, 0);
 654         PG_FREE_IF_COPY(in2, 1);
 655
 656         PG_RETURN_FLOAT4(res);
 657 }
 658
 659 Datum
 660 bigm_similarity_op(PG_FUNCTION_ARGS)
 661 {
 662         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
 663                                                                                                                  PG_GETARG_DATUM(0),
 664                                                                                                                  PG_GETARG_DATUM(1)));
 665
 666         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
 667 }
 668
 669 Datum
 670 likequery(PG_FUNCTION_ARGS)
 671 {
 672         text       *query = PG_GETARG_TEXT_PP(0);
 673         const char *str;
 674         int                     len;
 675         const char *sp;
 676         text       *result;
 677         char       *rp;
 678         int                     mblen;
 679
 680         str = VARDATA_ANY(query);
 681         len = VARSIZE_ANY_EXHDR(query);
 682
 683         if (len == 0)
 684                 PG_RETURN_NULL();
 685
 686         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
 687         rp = VARDATA(result);
 688         *rp++ = '%';
 689
 690         for (sp = str; (sp - str) < len;)
 691         {
 692                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
 693                 {
 694                         *rp++ = '\\';
 695                         *rp++ = *sp++;
 696                 }
 697                 else if (IS_HIGHBIT_SET(*sp))
 698                 {
 699                         mblen = pg_mblen(sp);
 700                         memcpy(rp, sp, mblen);
 701                         rp += mblen;
 702                         sp += mblen;
 703                 }
 704                 else
 705                         *rp++ = *sp++;
 706         }
 707
 708         *rp++ = '%';
 709         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
 710
 711         PG_RETURN_TEXT_P(result);
 712 }
 713
 714 Datum
 715 bigmtextcmp(PG_FUNCTION_ARGS)
 716 {
 717         text       *arg1 = PG_GETARG_TEXT_PP(0);
 718         text       *arg2 = PG_GETARG_TEXT_PP(1);
 719         char       *a1p = VARDATA_ANY(arg1);
 720         char       *a2p = VARDATA_ANY(arg2);
 721         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
 722         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
 723
 724         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 725 }