bigm_op.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Portions Copyright (c) 2017-2020, pg_bigm Development Group
   4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
   5  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
   6  *
   7  * Changelog:
   8  *       2013/01/09
   9  *       Support full text search using bigrams.
  10  *       Author: NTT DATA Corporation
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include "postgres.h"
  15
  16 #include <ctype.h>
  17
  18 #include "bigm.h"
  19
  20 #include "catalog/pg_type.h"
  21 #include "tsearch/ts_locale.h"
  22 #include "utils/array.h"
  23 #include "utils/memutils.h"
  24
  25 PG_MODULE_MAGIC;
  26
  27 /* Last update date of pg_bigm */
  28 #define BIGM_LAST_UPDATE        "2019.10.03"
  29
  30 /* GUC variable */
  31 bool            bigm_enable_recheck = false;
  32 int                     bigm_gin_key_limit = 0;
  33 double          bigm_similarity_limit = 0.3;
  34 char       *bigm_last_update = NULL;
  35
  36 PG_FUNCTION_INFO_V1(show_bigm);
  37 PG_FUNCTION_INFO_V1(bigmtextcmp);
  38 PG_FUNCTION_INFO_V1(likequery);
  39 PG_FUNCTION_INFO_V1(bigm_similarity);
  40 PG_FUNCTION_INFO_V1(bigm_similarity_op);
  41
  42 /*
  43  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
  44  * macro since 9.4, and hence the declaration of the function prototypes
  45  * here is necessary only for 9.3 or before.
  46  */
  47 #if PG_VERSION_NUM < 90400
  48 Datum           show_bigm(PG_FUNCTION_ARGS);
  49 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
  50 Datum           likequery(PG_FUNCTION_ARGS);
  51 Datum           bigm_similarity(PG_FUNCTION_ARGS);
  52 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
  53 #endif
  54
  55 void            _PG_init(void);
  56 void            _PG_fini(void);
  57
  58 void
  59 _PG_init(void)
  60 {
  61         /* Define custom GUC variables */
  62         DefineCustomBoolVariable("pg_bigm.enable_recheck",
  63                                                          "Recheck that heap tuples fetched from index "
  64                                                          "match the query.",
  65                                                          NULL,
  66                                                          &bigm_enable_recheck,
  67                                                          true,
  68                                                          PGC_USERSET,
  69                                                          0,
  70                                                          NULL,
  71                                                          NULL,
  72                                                          NULL);
  73
  74         DefineCustomIntVariable("pg_bigm.gin_key_limit",
  75                                                         "Sets the maximum number of bi-gram keys allowed to "
  76                                                         "use for GIN index search.",
  77                                                         "Zero means no limit.",
  78                                                         &bigm_gin_key_limit,
  79                                                         0,
  80                                                         0, INT_MAX,
  81                                                         PGC_USERSET,
  82                                                         0,
  83                                                         NULL,
  84                                                         NULL,
  85                                                         NULL);
  86
  87         DefineCustomRealVariable("pg_bigm.similarity_limit",
  88                                                          "Sets the similarity threshold used by the "
  89                                                          "=% operator.",
  90                                                          NULL,
  91                                                          &bigm_similarity_limit,
  92                                                          0.3,
  93                                                          0.0, 1.0,
  94                                                          PGC_USERSET,
  95                                                          0,
  96                                                          NULL,
  97                                                          NULL,
  98                                                          NULL);
  99
 100         /* Can't be set in postgresql.conf */
 101         DefineCustomStringVariable("pg_bigm.last_update",
 102                                                            "Shows the last update date of pg_bigm.",
 103                                                            NULL,
 104                                                            &bigm_last_update,
 105                                                            BIGM_LAST_UPDATE,
 106                                                            PGC_INTERNAL,
 107                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
 108                                                            NULL,
 109                                                            NULL,
 110                                                            NULL);
 111
 112         EmitWarningsOnPlaceholders("pg_bigm");
 113 }
 114
 115 void
 116 _PG_fini(void)
 117 {
 118 }
 119
 120 static int
 121 comp_bigm(const void *a, const void *b, void *arg)
 122 {
 123         int                     res;
 124         bool       *haveDups = (bool *) arg;
 125
 126         res = CMPBIGM(a, b);
 127
 128         if (res == 0)
 129                 *haveDups = true;
 130
 131         return res;
 132 }
 133
 134 static int
 135 unique_array(bigm *a, int len)
 136 {
 137         bigm       *curend,
 138                            *tmp;
 139
 140         curend = tmp = a;
 141         while (tmp - a < len)
 142                 if (CMPBIGM(tmp, curend))
 143                 {
 144                         curend++;
 145                         if (curend != tmp)
 146                                 memcpy(curend, tmp, BIGMSIZE);
 147                         tmp++;
 148                 }
 149                 else
 150                         tmp++;
 151
 152         return curend + 1 - a;
 153 }
 154
 155 #define iswordchr(c)    (!t_isspace(c))
 156
 157 /*
 158  * Finds first word in string, returns pointer to the word,
 159  * endword points to the character after word
 160  */
 161 static char *
 162 find_word(char *str, int lenstr, char **endword, int *charlen)
 163 {
 164         char       *beginword = str;
 165
 166         while (beginword - str < lenstr && !iswordchr(beginword))
 167                 beginword += pg_mblen(beginword);
 168
 169         if (beginword - str >= lenstr)
 170                 return NULL;
 171
 172         *endword = beginword;
 173         *charlen = 0;
 174         while (*endword - str < lenstr && iswordchr(*endword))
 175         {
 176                 *endword += pg_mblen(*endword);
 177                 (*charlen)++;
 178         }
 179
 180         return beginword;
 181 }
 182
 183 /*
 184  * The function is named compact_bigram to maintain consistency with pg_trgm,
 185  * though it does not reduce multibyte characters to hash values like in
 186  * compact_trigram.
 187  */
 188 static void
 189 compact_bigram(bigm *bptr, char *str, int bytelen)
 190 {
 191         CPBIGM(bptr, str, bytelen);
 192 }
 193
 194 /*
 195  * Adds bigrams from words (already padded).
 196  */
 197 static bigm *
 198 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
 199 {
 200         char       *ptr = str;
 201
 202         if (charlen < 2)
 203         {
 204                 compact_bigram(bptr, ptr, pg_mblen(str));
 205                 bptr->pmatch = true;
 206                 bptr++;
 207                 return bptr;
 208         }
 209
 210         if (bytelen > charlen)
 211         {
 212                 /* Find multibyte character boundaries and call compact_bigram */
 213                 int                     lenfirst = pg_mblen(str),
 214                                         lenlast = pg_mblen(str + lenfirst);
 215
 216                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
 217                 {
 218                         compact_bigram(bptr, ptr, lenfirst + lenlast);
 219
 220                         ptr += lenfirst;
 221                         bptr++;
 222
 223                         lenfirst = lenlast;
 224                         lenlast = pg_mblen(ptr + lenfirst);
 225                 }
 226         }
 227         else
 228         {
 229                 /* Fast path when there are no multibyte characters */
 230                 Assert(bytelen == charlen);
 231
 232                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
 233                 {
 234                         CPBIGM(bptr, ptr, 2);
 235                         ptr++;
 236                         bptr++;
 237                 }
 238         }
 239
 240         return bptr;
 241 }
 242
 243 BIGM *
 244 generate_bigm(char *str, int slen)
 245 {
 246         BIGM       *bgm;
 247         char       *buf;
 248         bigm       *bptr;
 249         int                     len,
 250                                 charlen,
 251                                 bytelen;
 252         char       *bword,
 253                            *eword;
 254
 255         /*
 256          * Guard against possible overflow in the palloc requests below.
 257          * We need to prevent integer overflow in the multiplications here.
 258          */
 259         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
 260                 (Size) slen > MaxAllocSize - 4)
 261                 ereport(ERROR,
 262                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 263                                  errmsg("out of memory")));
 264
 265         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
 266         SET_VARSIZE(bgm, VARHDRSZ);
 267
 268         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 269                 return bgm;
 270
 271         bptr = GETARR(bgm);
 272
 273         buf = palloc(sizeof(char) * (slen + 4));
 274
 275         if (LPADDING > 0)
 276         {
 277                 *buf = ' ';
 278                 if (LPADDING > 1)
 279                         *(buf + 1) = ' ';
 280         }
 281
 282         eword = str;
 283         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 284         {
 285                 bytelen = eword - bword;
 286                 memcpy(buf + LPADDING, bword, bytelen);
 287
 288                 buf[LPADDING + bytelen] = ' ';
 289                 buf[LPADDING + bytelen + 1] = ' ';
 290
 291                 /*
 292                  * count bigrams
 293                  */
 294                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
 295                                                         charlen + LPADDING + RPADDING);
 296         }
 297
 298         pfree(buf);
 299
 300         if ((len = bptr - GETARR(bgm)) == 0)
 301                 return bgm;
 302
 303         /*
 304          * Make bigrams unique.
 305          */
 306         if (len > 1)
 307         {
 308                 bool            haveDups = false;
 309
 310                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 311                 if (haveDups)
 312                         len = unique_array(GETARR(bgm), len);
 313         }
 314
 315         SET_VARSIZE(bgm, CALCGTSIZE(len));
 316
 317         return bgm;
 318 }
 319
 320 /*
 321  * Extract the next non-wildcard part of a search string, i.e. a word bounded
 322  * by '_' or '%' meta-characters, non-word characters or string end.
 323  *
 324  * str: source string, of length lenstr bytes (need not be null-terminated)
 325  * buf: where to return the substring (must be long enough)
 326  * *bytelen: receives byte length of the found substring
 327  * *charlen: receives character length of the found substring
 328  *
 329  * Returns pointer to end+1 of the found substring in the source string.
 330  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 331  *
 332  * If the found word is bounded by non-word characters or string boundaries
 333  * then this function will include corresponding padding spaces into buf.
 334  */
 335 static const char *
 336 get_wildcard_part(const char *str, int lenstr,
 337                                   char *buf, int *bytelen, int *charlen)
 338 {
 339         const char *beginword = str;
 340         const char *endword;
 341         char       *s = buf;
 342         bool            in_leading_wildcard_meta = false;
 343         bool            in_trailing_wildcard_meta = false;
 344         bool            in_escape = false;
 345         int                     clen;
 346
 347         /*
 348          * Find the first word character, remembering whether preceding character
 349          * was wildcard meta-character.  Note that the in_escape state persists
 350          * from this loop to the next one, since we may exit at a word character
 351          * that is in_escape.
 352          */
 353         while (beginword - str < lenstr)
 354         {
 355                 if (in_escape)
 356                 {
 357                         if (iswordchr(beginword))
 358                                 break;
 359                         in_escape = false;
 360                         in_leading_wildcard_meta = false;
 361                 }
 362                 else
 363                 {
 364                         if (ISESCAPECHAR(beginword))
 365                                 in_escape = true;
 366                         else if (ISWILDCARDCHAR(beginword))
 367                                 in_leading_wildcard_meta = true;
 368                         else if (iswordchr(beginword))
 369                                 break;
 370                         else
 371                                 in_leading_wildcard_meta = false;
 372                 }
 373                 beginword += pg_mblen(beginword);
 374         }
 375
 376         /*
 377          * Handle string end.
 378          */
 379         if (beginword - str >= lenstr)
 380                 return NULL;
 381
 382         /*
 383          * Add left padding spaces if preceding character wasn't wildcard
 384          * meta-character.
 385          */
 386         *charlen = 0;
 387         if (!in_leading_wildcard_meta)
 388         {
 389                 if (LPADDING > 0)
 390                 {
 391                         *s++ = ' ';
 392                         (*charlen)++;
 393                         if (LPADDING > 1)
 394                         {
 395                                 *s++ = ' ';
 396                                 (*charlen)++;
 397                         }
 398                 }
 399         }
 400
 401         /*
 402          * Copy data into buf until wildcard meta-character, non-word character or
 403          * string boundary.  Strip escapes during copy.
 404          */
 405         endword = beginword;
 406         while (endword - str < lenstr)
 407         {
 408                 clen = pg_mblen(endword);
 409                 if (in_escape)
 410                 {
 411                         if (iswordchr(endword))
 412                         {
 413                                 memcpy(s, endword, clen);
 414                                 (*charlen)++;
 415                                 s += clen;
 416                         }
 417                         else
 418                         {
 419                                 /*
 420                                  * Back up endword to the escape character when stopping at an
 421                                  * escaped char, so that subsequent get_wildcard_part will
 422                                  * restart from the escape character.  We assume here that
 423                                  * escape chars are single-byte.
 424                                  */
 425                                 endword--;
 426                                 break;
 427                         }
 428                         in_escape = false;
 429                 }
 430                 else
 431                 {
 432                         if (ISESCAPECHAR(endword))
 433                                 in_escape = true;
 434                         else if (ISWILDCARDCHAR(endword))
 435                         {
 436                                 in_trailing_wildcard_meta = true;
 437                                 break;
 438                         }
 439                         else if (iswordchr(endword))
 440                         {
 441                                 memcpy(s, endword, clen);
 442                                 (*charlen)++;
 443                                 s += clen;
 444                         }
 445                         else
 446                                 break;
 447                 }
 448                 endword += clen;
 449         }
 450
 451         /*
 452          * Add right padding spaces if next character isn't wildcard
 453          * meta-character.
 454          */
 455         if (!in_trailing_wildcard_meta)
 456         {
 457                 if (RPADDING > 0)
 458                 {
 459                         *s++ = ' ';
 460                         (*charlen)++;
 461                         if (RPADDING > 1)
 462                         {
 463                                 *s++ = ' ';
 464                                 (*charlen)++;
 465                         }
 466                 }
 467         }
 468
 469         *bytelen = s - buf;
 470         return endword;
 471 }
 472
 473 /*
 474  * Generates bigrams for wildcard search string.
 475  *
 476  * Returns array of bigrams that must occur in any string that matches the
 477  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
 478  * " a", "bcd" would be extracted.
 479  *
 480  * Set 'removeDups' to true if duplicate bigrams are removed.
 481  */
 482 BIGM *
 483 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 484 {
 485         BIGM       *bgm;
 486         char       *buf;
 487         bigm       *bptr;
 488         int                     len,
 489                                 charlen,
 490                                 bytelen;
 491         const char *eword;
 492
 493         *removeDups = false;
 494
 495         /*
 496          * Guard against possible overflow in the palloc requests below.
 497          * We need to prevent integer overflow in the multiplications here.
 498          */
 499         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
 500                 (Size) slen > MaxAllocSize - 4)
 501                 ereport(ERROR,
 502                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 503                                  errmsg("out of memory")));
 504
 505         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
 506         SET_VARSIZE(bgm, VARHDRSZ);
 507
 508         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 509                 return bgm;
 510
 511         bptr = GETARR(bgm);
 512
 513         buf = palloc(sizeof(char) * (slen + 4));
 514
 515         /*
 516          * Extract bigrams from each substring extracted by get_wildcard_part.
 517          */
 518         eword = str;
 519         while ((eword = get_wildcard_part(eword, slen - (eword - str),
 520                                                                           buf, &bytelen, &charlen)) != NULL)
 521         {
 522                 /*
 523                  * count bigrams
 524                  */
 525                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
 526         }
 527
 528         pfree(buf);
 529
 530         if ((len = bptr - GETARR(bgm)) == 0)
 531                 return bgm;
 532
 533         /*
 534          * Make bigrams unique.
 535          */
 536         if (len > 1)
 537         {
 538                 bool            haveDups = false;
 539
 540                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 541                 if (haveDups)
 542                 {
 543                         *removeDups = true;
 544                         len = unique_array(GETARR(bgm), len);
 545                 }
 546         }
 547
 548         SET_VARSIZE(bgm, CALCGTSIZE(len));
 549
 550         return bgm;
 551 }
 552
 553 Datum
 554 show_bigm(PG_FUNCTION_ARGS)
 555 {
 556         text       *in = PG_GETARG_TEXT_P(0);
 557         BIGM       *bgm;
 558         Datum      *d;
 559         ArrayType  *a;
 560         bigm       *ptr;
 561         int                     i;
 562
 563         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
 564         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
 565
 566         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
 567         {
 568                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
 569
 570                 d[i] = PointerGetDatum(item);
 571         }
 572
 573         a = construct_array(
 574                                                 d,
 575                                                 ARRNELEM(bgm),
 576                                                 TEXTOID,
 577                                                 -1,
 578                                                 false,
 579                                                 'i'
 580                 );
 581
 582         for (i = 0; i < ARRNELEM(bgm); i++)
 583                 pfree(DatumGetPointer(d[i]));
 584
 585         pfree(d);
 586         pfree(bgm);
 587         PG_FREE_IF_COPY(in, 0);
 588
 589         PG_RETURN_POINTER(a);
 590 }
 591
 592 static float4
 593 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
 594 {
 595         bigm       *ptr1,
 596                            *ptr2;
 597         int                     count = 0;
 598         int                     len1,
 599                                 len2;
 600
 601         ptr1 = GETARR(bgm1);
 602         ptr2 = GETARR(bgm2);
 603
 604         len1 = ARRNELEM(bgm1);
 605         len2 = ARRNELEM(bgm2);
 606
 607         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 608         if (len1 <= 0 || len2 <= 0)
 609                 return (float4) 0.0;
 610
 611         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
 612         {
 613                 int                     res = CMPBIGM(ptr1, ptr2);
 614
 615                 if (res < 0)
 616                         ptr1++;
 617                 else if (res > 0)
 618                         ptr2++;
 619                 else
 620                 {
 621                         ptr1++;
 622                         ptr2++;
 623                         count++;
 624                 }
 625         }
 626
 627 #ifdef DIVUNION
 628         return ((float4) count) / ((float4) (len1 + len2 - count));
 629 #else
 630         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
 631 #endif
 632 }
 633
 634 Datum
 635 bigm_similarity(PG_FUNCTION_ARGS)
 636 {
 637         text       *in1 = PG_GETARG_TEXT_P(0);
 638         text       *in2 = PG_GETARG_TEXT_P(1);
 639         BIGM       *bgm1,
 640                            *bgm2;
 641         float4          res;
 642
 643         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 644         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
 645
 646         res = cnt_sml_bigm(bgm1, bgm2);
 647
 648         pfree(bgm1);
 649         pfree(bgm2);
 650         PG_FREE_IF_COPY(in1, 0);
 651         PG_FREE_IF_COPY(in2, 1);
 652
 653         PG_RETURN_FLOAT4(res);
 654 }
 655
 656 Datum
 657 bigm_similarity_op(PG_FUNCTION_ARGS)
 658 {
 659         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
 660                                                                                                                  PG_GETARG_DATUM(0),
 661                                                                                                                  PG_GETARG_DATUM(1)));
 662
 663         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
 664 }
 665
 666 Datum
 667 likequery(PG_FUNCTION_ARGS)
 668 {
 669         text       *query = PG_GETARG_TEXT_PP(0);
 670         const char *str;
 671         int                     len;
 672         const char *sp;
 673         text       *result;
 674         char       *rp;
 675         int                     mblen;
 676
 677         str = VARDATA_ANY(query);
 678         len = VARSIZE_ANY_EXHDR(query);
 679
 680         if (len == 0)
 681                 PG_RETURN_NULL();
 682
 683         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
 684         rp = VARDATA(result);
 685         *rp++ = '%';
 686
 687         for (sp = str; (sp - str) < len;)
 688         {
 689                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
 690                 {
 691                         *rp++ = '\\';
 692                         *rp++ = *sp++;
 693                 }
 694                 else if (IS_HIGHBIT_SET(*sp))
 695                 {
 696                         mblen = pg_mblen(sp);
 697                         memcpy(rp, sp, mblen);
 698                         rp += mblen;
 699                         sp += mblen;
 700                 }
 701                 else
 702                         *rp++ = *sp++;
 703         }
 704
 705         *rp++ = '%';
 706         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
 707
 708         PG_RETURN_TEXT_P(result);
 709 }
 710
 711 Datum
 712 bigmtextcmp(PG_FUNCTION_ARGS)
 713 {
 714         text       *arg1 = PG_GETARG_TEXT_PP(0);
 715         text       *arg2 = PG_GETARG_TEXT_PP(1);
 716         char       *a1p = VARDATA_ANY(arg1);
 717         char       *a2p = VARDATA_ANY(arg2);
 718         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
 719         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
 720
 721         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 722 }