bigm_op.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
   4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
   5  *
   6  * Changelog:
   7  *       2013/01/09
   8  *       Support full text search using bigrams.
   9  *       Author: NTT DATA Corporation
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "postgres.h"
  14
  15 #include <ctype.h>
  16
  17 #include "bigm.h"
  18
  19 #include "catalog/pg_type.h"
  20 #include "tsearch/ts_locale.h"
  21 #include "utils/array.h"
  22 #include "utils/memutils.h"
  23
  24 PG_MODULE_MAGIC;
  25
  26 /* Last update date of pg_bigm */
  27 #define BIGM_LAST_UPDATE        "2015.09.10"
  28
  29 /* GUC variable */
  30 bool            bigm_enable_recheck = false;
  31 int                     bigm_gin_key_limit = 0;
  32 double          bigm_similarity_limit = 0.3;
  33 char       *bigm_last_update = NULL;
  34
  35 PG_FUNCTION_INFO_V1(show_bigm);
  36 PG_FUNCTION_INFO_V1(bigmtextcmp);
  37 PG_FUNCTION_INFO_V1(likequery);
  38 PG_FUNCTION_INFO_V1(bigm_similarity);
  39 PG_FUNCTION_INFO_V1(bigm_similarity_op);
  40
  41 /*
  42  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
  43  * macro since 9.4, and hence the declaration of the function prototypes
  44  * here is necessary only for 9.3 or before.
  45  */
  46 #if PG_VERSION_NUM < 90400
  47 Datum           show_bigm(PG_FUNCTION_ARGS);
  48 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
  49 Datum           likequery(PG_FUNCTION_ARGS);
  50 Datum           bigm_similarity(PG_FUNCTION_ARGS);
  51 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
  52 #endif
  53
  54 void            _PG_init(void);
  55 void            _PG_fini(void);
  56
  57 void
  58 _PG_init(void)
  59 {
  60         /* Define custom GUC variables */
  61         DefineCustomBoolVariable("pg_bigm.enable_recheck",
  62                                                          "Recheck that heap tuples fetched from index "
  63                                                          "match the query.",
  64                                                          NULL,
  65                                                          &bigm_enable_recheck,
  66                                                          true,
  67                                                          PGC_USERSET,
  68                                                          0,
  69                                                          NULL,
  70                                                          NULL,
  71                                                          NULL);
  72
  73         DefineCustomIntVariable("pg_bigm.gin_key_limit",
  74                                                         "Sets the maximum number of bi-gram keys allowed to "
  75                                                         "use for GIN index search.",
  76                                                         "Zero means no limit.",
  77                                                         &bigm_gin_key_limit,
  78                                                         0,
  79                                                         0, INT_MAX,
  80                                                         PGC_USERSET,
  81                                                         0,
  82                                                         NULL,
  83                                                         NULL,
  84                                                         NULL);
  85
  86         DefineCustomRealVariable("pg_bigm.similarity_limit",
  87                                                          "Sets the similarity threshold used by the "
  88                                                          "=% operator.",
  89                                                          NULL,
  90                                                          &bigm_similarity_limit,
  91                                                          0.3,
  92                                                          0.0, 1.0,
  93                                                          PGC_USERSET,
  94                                                          0,
  95                                                          NULL,
  96                                                          NULL,
  97                                                          NULL);
  98
  99         /* Can't be set in postgresql.conf */
 100         DefineCustomStringVariable("pg_bigm.last_update",
 101                                                            "Shows the last update date of pg_bigm.",
 102                                                            NULL,
 103                                                            &bigm_last_update,
 104                                                            BIGM_LAST_UPDATE,
 105                                                            PGC_INTERNAL,
 106                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
 107                                                            NULL,
 108                                                            NULL,
 109                                                            NULL);
 110
 111         EmitWarningsOnPlaceholders("pg_bigm");
 112 }
 113
 114 void
 115 _PG_fini(void)
 116 {
 117 }
 118
 119 static int
 120 comp_bigm(const void *a, const void *b, void *arg)
 121 {
 122         int                     res;
 123         bool       *haveDups = (bool *) arg;
 124
 125         res = CMPBIGM(a, b);
 126
 127         if (res == 0)
 128                 *haveDups = true;
 129
 130         return res;
 131 }
 132
 133 static int
 134 unique_array(bigm *a, int len)
 135 {
 136         bigm       *curend,
 137                            *tmp;
 138
 139         curend = tmp = a;
 140         while (tmp - a < len)
 141                 if (CMPBIGM(tmp, curend))
 142                 {
 143                         curend++;
 144                         if (curend != tmp)
 145                                 memcpy(curend, tmp, BIGMSIZE);
 146                         tmp++;
 147                 }
 148                 else
 149                         tmp++;
 150
 151         return curend + 1 - a;
 152 }
 153
 154 #define iswordchr(c)    (!t_isspace(c))
 155
 156 /*
 157  * Finds first word in string, returns pointer to the word,
 158  * endword points to the character after word
 159  */
 160 static char *
 161 find_word(char *str, int lenstr, char **endword, int *charlen)
 162 {
 163         char       *beginword = str;
 164
 165         while (beginword - str < lenstr && !iswordchr(beginword))
 166                 beginword += pg_mblen(beginword);
 167
 168         if (beginword - str >= lenstr)
 169                 return NULL;
 170
 171         *endword = beginword;
 172         *charlen = 0;
 173         while (*endword - str < lenstr && iswordchr(*endword))
 174         {
 175                 *endword += pg_mblen(*endword);
 176                 (*charlen)++;
 177         }
 178
 179         return beginword;
 180 }
 181
 182 /*
 183  * The function is named compact_bigram to maintain consistency with pg_trgm,
 184  * though it does not reduce multibyte characters to hash values like in
 185  * compact_trigram.
 186  */
 187 static void
 188 compact_bigram(bigm *bptr, char *str, int bytelen)
 189 {
 190         CPBIGM(bptr, str, bytelen);
 191 }
 192
 193 /*
 194  * Adds bigrams from words (already padded).
 195  */
 196 static bigm *
 197 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
 198 {
 199         char       *ptr = str;
 200
 201         if (charlen < 2)
 202         {
 203                 compact_bigram(bptr, ptr, pg_mblen(str));
 204                 bptr->pmatch = true;
 205                 bptr++;
 206                 return bptr;
 207         }
 208
 209         if (bytelen > charlen)
 210         {
 211                 /* Find multibyte character boundaries and call compact_bigram */
 212                 int                     lenfirst = pg_mblen(str),
 213                                         lenlast = pg_mblen(str + lenfirst);
 214
 215                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
 216                 {
 217                         compact_bigram(bptr, ptr, lenfirst + lenlast);
 218
 219                         ptr += lenfirst;
 220                         bptr++;
 221
 222                         lenfirst = lenlast;
 223                         lenlast = pg_mblen(ptr + lenfirst);
 224                 }
 225         }
 226         else
 227         {
 228                 /* Fast path when there are no multibyte characters */
 229                 Assert(bytelen == charlen);
 230
 231                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
 232                 {
 233                         CPBIGM(bptr, ptr, 2);
 234                         ptr++;
 235                         bptr++;
 236                 }
 237         }
 238
 239         return bptr;
 240 }
 241
 242 BIGM *
 243 generate_bigm(char *str, int slen)
 244 {
 245         BIGM       *bgm;
 246         char       *buf;
 247         bigm       *bptr;
 248         int                     len,
 249                                 charlen,
 250                                 bytelen;
 251         char       *bword,
 252                            *eword;
 253
 254         /*
 255          * Guard against possible overflow in the palloc requests below.
 256          * We need to prevent integer overflow in the multiplications here.
 257          */
 258         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
 259                 (Size) slen > MaxAllocSize - 4)
 260                 ereport(ERROR,
 261                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 262                                  errmsg("out of memory")));
 263
 264         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
 265         SET_VARSIZE(bgm, VARHDRSZ);
 266
 267         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 268                 return bgm;
 269
 270         bptr = GETARR(bgm);
 271
 272         buf = palloc(sizeof(char) * (slen + 4));
 273
 274         if (LPADDING > 0)
 275         {
 276                 *buf = ' ';
 277                 if (LPADDING > 1)
 278                         *(buf + 1) = ' ';
 279         }
 280
 281         eword = str;
 282         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 283         {
 284                 bytelen = eword - bword;
 285                 memcpy(buf + LPADDING, bword, bytelen);
 286
 287                 buf[LPADDING + bytelen] = ' ';
 288                 buf[LPADDING + bytelen + 1] = ' ';
 289
 290                 /*
 291                  * count bigrams
 292                  */
 293                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
 294                                                         charlen + LPADDING + RPADDING);
 295         }
 296
 297         pfree(buf);
 298
 299         if ((len = bptr - GETARR(bgm)) == 0)
 300                 return bgm;
 301
 302         /*
 303          * Make bigrams unique.
 304          */
 305         if (len > 1)
 306         {
 307                 bool            haveDups = false;
 308
 309                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 310                 if (haveDups)
 311                         len = unique_array(GETARR(bgm), len);
 312         }
 313
 314         SET_VARSIZE(bgm, CALCGTSIZE(len));
 315
 316         return bgm;
 317 }
 318
 319 /*
 320  * Extract the next non-wildcard part of a search string, ie, a word bounded
 321  * by '_' or '%' meta-characters, non-word characters or string end.
 322  *
 323  * str: source string, of length lenstr bytes (need not be null-terminated)
 324  * buf: where to return the substring (must be long enough)
 325  * *bytelen: receives byte length of the found substring
 326  * *charlen: receives character length of the found substring
 327  *
 328  * Returns pointer to end+1 of the found substring in the source string.
 329  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 330  *
 331  * If the found word is bounded by non-word characters or string boundaries
 332  * then this function will include corresponding padding spaces into buf.
 333  */
 334 static const char *
 335 get_wildcard_part(const char *str, int lenstr,
 336                                   char *buf, int *bytelen, int *charlen)
 337 {
 338         const char *beginword = str;
 339         const char *endword;
 340         char       *s = buf;
 341         bool            in_leading_wildcard_meta = false;
 342         bool            in_trailing_wildcard_meta = false;
 343         bool            in_escape = false;
 344         int                     clen;
 345
 346         /*
 347          * Find the first word character, remembering whether preceding character
 348          * was wildcard meta-character.  Note that the in_escape state persists
 349          * from this loop to the next one, since we may exit at a word character
 350          * that is in_escape.
 351          */
 352         while (beginword - str < lenstr)
 353         {
 354                 if (in_escape)
 355                 {
 356                         if (iswordchr(beginword))
 357                                 break;
 358                         in_escape = false;
 359                         in_leading_wildcard_meta = false;
 360                 }
 361                 else
 362                 {
 363                         if (ISESCAPECHAR(beginword))
 364                                 in_escape = true;
 365                         else if (ISWILDCARDCHAR(beginword))
 366                                 in_leading_wildcard_meta = true;
 367                         else if (iswordchr(beginword))
 368                                 break;
 369                         else
 370                                 in_leading_wildcard_meta = false;
 371                 }
 372                 beginword += pg_mblen(beginword);
 373         }
 374
 375         /*
 376          * Handle string end.
 377          */
 378         if (beginword - str >= lenstr)
 379                 return NULL;
 380
 381         /*
 382          * Add left padding spaces if preceding character wasn't wildcard
 383          * meta-character.
 384          */
 385         *charlen = 0;
 386         if (!in_leading_wildcard_meta)
 387         {
 388                 if (LPADDING > 0)
 389                 {
 390                         *s++ = ' ';
 391                         (*charlen)++;
 392                         if (LPADDING > 1)
 393                         {
 394                                 *s++ = ' ';
 395                                 (*charlen)++;
 396                         }
 397                 }
 398         }
 399
 400         /*
 401          * Copy data into buf until wildcard meta-character, non-word character or
 402          * string boundary.  Strip escapes during copy.
 403          */
 404         endword = beginword;
 405         while (endword - str < lenstr)
 406         {
 407                 clen = pg_mblen(endword);
 408                 if (in_escape)
 409                 {
 410                         if (iswordchr(endword))
 411                         {
 412                                 memcpy(s, endword, clen);
 413                                 (*charlen)++;
 414                                 s += clen;
 415                         }
 416                         else
 417                         {
 418                                 /*
 419                                  * Back up endword to the escape character when stopping at an
 420                                  * escaped char, so that subsequent get_wildcard_part will
 421                                  * restart from the escape character.  We assume here that
 422                                  * escape chars are single-byte.
 423                                  */
 424                                 endword--;
 425                                 break;
 426                         }
 427                         in_escape = false;
 428                 }
 429                 else
 430                 {
 431                         if (ISESCAPECHAR(endword))
 432                                 in_escape = true;
 433                         else if (ISWILDCARDCHAR(endword))
 434                         {
 435                                 in_trailing_wildcard_meta = true;
 436                                 break;
 437                         }
 438                         else if (iswordchr(endword))
 439                         {
 440                                 memcpy(s, endword, clen);
 441                                 (*charlen)++;
 442                                 s += clen;
 443                         }
 444                         else
 445                                 break;
 446                 }
 447                 endword += clen;
 448         }
 449
 450         /*
 451          * Add right padding spaces if next character isn't wildcard
 452          * meta-character.
 453          */
 454         if (!in_trailing_wildcard_meta)
 455         {
 456                 if (RPADDING > 0)
 457                 {
 458                         *s++ = ' ';
 459                         (*charlen)++;
 460                         if (RPADDING > 1)
 461                         {
 462                                 *s++ = ' ';
 463                                 (*charlen)++;
 464                         }
 465                 }
 466         }
 467
 468         *bytelen = s - buf;
 469         return endword;
 470 }
 471
 472 /*
 473  * Generates bigrams for wildcard search string.
 474  *
 475  * Returns array of bigrams that must occur in any string that matches the
 476  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
 477  * " a", "bcd" would be extracted.
 478  *
 479  * Set 'removeDups' to true if duplicate bigrams are removed.
 480  */
 481 BIGM *
 482 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 483 {
 484         BIGM       *bgm;
 485         char       *buf;
 486         bigm       *bptr;
 487         int                     len,
 488                                 charlen,
 489                                 bytelen;
 490         const char *eword;
 491
 492         *removeDups = false;
 493
 494         /*
 495          * Guard against possible overflow in the palloc requests below.
 496          * We need to prevent integer overflow in the multiplications here.
 497          */
 498         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
 499                 (Size) slen > MaxAllocSize - 4)
 500                 ereport(ERROR,
 501                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 502                                  errmsg("out of memory")));
 503
 504         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
 505         SET_VARSIZE(bgm, VARHDRSZ);
 506
 507         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 508                 return bgm;
 509
 510         bptr = GETARR(bgm);
 511
 512         buf = palloc(sizeof(char) * (slen + 4));
 513
 514         /*
 515          * Extract bigrams from each substring extracted by get_wildcard_part.
 516          */
 517         eword = str;
 518         while ((eword = get_wildcard_part(eword, slen - (eword - str),
 519                                                                           buf, &bytelen, &charlen)) != NULL)
 520         {
 521                 /*
 522                  * count bigrams
 523                  */
 524                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
 525         }
 526
 527         pfree(buf);
 528
 529         if ((len = bptr - GETARR(bgm)) == 0)
 530                 return bgm;
 531
 532         /*
 533          * Make bigrams unique.
 534          */
 535         if (len > 1)
 536         {
 537                 bool            haveDups = false;
 538
 539                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 540                 if (haveDups)
 541                 {
 542                         *removeDups = true;
 543                         len = unique_array(GETARR(bgm), len);
 544                 }
 545         }
 546
 547         SET_VARSIZE(bgm, CALCGTSIZE(len));
 548
 549         return bgm;
 550 }
 551
 552 Datum
 553 show_bigm(PG_FUNCTION_ARGS)
 554 {
 555         text       *in = PG_GETARG_TEXT_P(0);
 556         BIGM       *bgm;
 557         Datum      *d;
 558         ArrayType  *a;
 559         bigm       *ptr;
 560         int                     i;
 561
 562         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
 563         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
 564
 565         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
 566         {
 567                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
 568
 569                 d[i] = PointerGetDatum(item);
 570         }
 571
 572         a = construct_array(
 573                                                 d,
 574                                                 ARRNELEM(bgm),
 575                                                 TEXTOID,
 576                                                 -1,
 577                                                 false,
 578                                                 'i'
 579                 );
 580
 581         for (i = 0; i < ARRNELEM(bgm); i++)
 582                 pfree(DatumGetPointer(d[i]));
 583
 584         pfree(d);
 585         pfree(bgm);
 586         PG_FREE_IF_COPY(in, 0);
 587
 588         PG_RETURN_POINTER(a);
 589 }
 590
 591 static float4
 592 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
 593 {
 594         bigm       *ptr1,
 595                            *ptr2;
 596         int                     count = 0;
 597         int                     len1,
 598                                 len2;
 599
 600         ptr1 = GETARR(bgm1);
 601         ptr2 = GETARR(bgm2);
 602
 603         len1 = ARRNELEM(bgm1);
 604         len2 = ARRNELEM(bgm2);
 605
 606         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 607         if (len1 <= 0 || len2 <= 0)
 608                 return (float4) 0.0;
 609
 610         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
 611         {
 612                 int                     res = CMPBIGM(ptr1, ptr2);
 613
 614                 if (res < 0)
 615                         ptr1++;
 616                 else if (res > 0)
 617                         ptr2++;
 618                 else
 619                 {
 620                         ptr1++;
 621                         ptr2++;
 622                         count++;
 623                 }
 624         }
 625
 626 #ifdef DIVUNION
 627         return ((float4) count) / ((float4) (len1 + len2 - count));
 628 #else
 629         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
 630 #endif
 631 }
 632
 633 Datum
 634 bigm_similarity(PG_FUNCTION_ARGS)
 635 {
 636         text       *in1 = PG_GETARG_TEXT_P(0);
 637         text       *in2 = PG_GETARG_TEXT_P(1);
 638         BIGM       *bgm1,
 639                            *bgm2;
 640         float4          res;
 641
 642         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 643         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
 644
 645         res = cnt_sml_bigm(bgm1, bgm2);
 646
 647         pfree(bgm1);
 648         pfree(bgm2);
 649         PG_FREE_IF_COPY(in1, 0);
 650         PG_FREE_IF_COPY(in2, 1);
 651
 652         PG_RETURN_FLOAT4(res);
 653 }
 654
 655 Datum
 656 bigm_similarity_op(PG_FUNCTION_ARGS)
 657 {
 658         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
 659                                                                                                                  PG_GETARG_DATUM(0),
 660                                                                                                                  PG_GETARG_DATUM(1)));
 661
 662         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
 663 }
 664
 665 Datum
 666 likequery(PG_FUNCTION_ARGS)
 667 {
 668         text       *query = PG_GETARG_TEXT_PP(0);
 669         const char *str;
 670         int                     len;
 671         const char *sp;
 672         text       *result;
 673         char       *rp;
 674         int                     mblen;
 675
 676         str = VARDATA_ANY(query);
 677         len = VARSIZE_ANY_EXHDR(query);
 678
 679         if (len == 0)
 680                 PG_RETURN_NULL();
 681
 682         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
 683         rp = VARDATA(result);
 684         *rp++ = '%';
 685
 686         for (sp = str; (sp - str) < len;)
 687         {
 688                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
 689                 {
 690                         *rp++ = '\\';
 691                         *rp++ = *sp++;
 692                 }
 693                 else if (IS_HIGHBIT_SET(*sp))
 694                 {
 695                         mblen = pg_mblen(sp);
 696                         memcpy(rp, sp, mblen);
 697                         rp += mblen;
 698                         sp += mblen;
 699                 }
 700                 else
 701                         *rp++ = *sp++;
 702         }
 703
 704         *rp++ = '%';
 705         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
 706
 707         PG_RETURN_TEXT_P(result);
 708 }
 709
 710 inline int
 711 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
 712 {
 713         int                     i;
 714         int                     len = Min(len1, len2);
 715
 716         for (i = 0; i < len; i++, arg1++, arg2++)
 717         {
 718                 if (*arg1 == *arg2)
 719                         continue;
 720                 if (*arg1 < *arg2)
 721                         return -1;
 722                 else
 723                         return 1;
 724         }
 725
 726         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
 727 }
 728
 729 Datum
 730 bigmtextcmp(PG_FUNCTION_ARGS)
 731 {
 732         text       *arg1 = PG_GETARG_TEXT_PP(0);
 733         text       *arg2 = PG_GETARG_TEXT_PP(1);
 734         char       *a1p = VARDATA_ANY(arg1);
 735         char       *a2p = VARDATA_ANY(arg2);
 736         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
 737         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
 738
 739         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 740 }