bigm_op.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
   4  *
   5  * Changelog:
   6  *   2013/01/09
   7  *   Support full text search using bigrams.
   8  *   Author: NTT DATA Corporation
   9  *
  10  *-------------------------------------------------------------------------
  11  */
  12 #include "postgres.h"
  13
  14 #include <ctype.h>
  15
  16 #include "bigm.h"
  17
  18 #include "catalog/pg_type.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "utils/array.h"
  21
  22
  23 PG_MODULE_MAGIC;
  24
  25 /* Last update date of pg_bigm */
  26 #define BIGM_LAST_UPDATE        "2013.11.22"
  27
  28 /* GUC variable */
  29 bool    bigm_enable_recheck = false;
  30 int             bigm_gin_key_limit = 0;
  31 double  bigm_similarity_limit = 0.3;
  32 char    *bigm_last_update = NULL;
  33
  34 PG_FUNCTION_INFO_V1(show_bigm);
  35 Datum           show_bigm(PG_FUNCTION_ARGS);
  36
  37 PG_FUNCTION_INFO_V1(bigmtextcmp);
  38 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
  39
  40 PG_FUNCTION_INFO_V1(likequery);
  41 Datum           likequery(PG_FUNCTION_ARGS);
  42
  43 PG_FUNCTION_INFO_V1(bigm_similarity);
  44 Datum           bigm_similarity(PG_FUNCTION_ARGS);
  45
  46 PG_FUNCTION_INFO_V1(bigm_similarity_op);
  47 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
  48
  49 void            _PG_init(void);
  50 void            _PG_fini(void);
  51
  52 void
  53 _PG_init(void)
  54 {
  55         /* Define custom GUC variables */
  56         DefineCustomBoolVariable("pg_bigm.enable_recheck",
  57                                                          "Recheck that heap tuples fetched from index "
  58                                                          "match the query.",
  59                                                          NULL,
  60                                                          &bigm_enable_recheck,
  61                                                          true,
  62                                                          PGC_USERSET,
  63                                                          0,
  64                                                          NULL,
  65                                                          NULL,
  66                                                          NULL);
  67
  68         DefineCustomIntVariable("pg_bigm.gin_key_limit",
  69                                                         "Sets the maximum number of bi-gram keys allowed to "
  70                                                         "use for GIN index search.",
  71                                                         "Zero means no limit.",
  72                                                         &bigm_gin_key_limit,
  73                                                         0,
  74                                                         0, INT_MAX,
  75                                                         PGC_USERSET,
  76                                                         0,
  77                                                         NULL,
  78                                                         NULL,
  79                                                         NULL);
  80
  81         DefineCustomRealVariable("pg_bigm.similarity_limit",
  82                                                          "Sets the similarity threshold used by the "
  83                                                          "=% operator.",
  84                                                          NULL,
  85                                                          &bigm_similarity_limit,
  86                                                          0.3,
  87                                                          0.0, 1.0,
  88                                                          PGC_USERSET,
  89                                                          0,
  90                                                          NULL,
  91                                                          NULL,
  92                                                          NULL);
  93
  94         /* Can't be set in postgresql.conf */
  95         DefineCustomStringVariable("pg_bigm.last_update",
  96                                                            "Shows the last update date of pg_bigm.",
  97                                                            NULL,
  98                                                            &bigm_last_update,
  99                                                            BIGM_LAST_UPDATE,
 100                                                            PGC_INTERNAL,
 101                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
 102                                                            NULL,
 103                                                            NULL,
 104                                                            NULL);
 105
 106         EmitWarningsOnPlaceholders("pg_bigm");
 107 }
 108
 109 void
 110 _PG_fini(void)
 111 {
 112 }
 113
 114 static int
 115 comp_bigm(const void *a, const void *b, void *arg)
 116 {
 117         int             res;
 118         bool    *haveDups = (bool *) arg;
 119
 120         res = CMPBIGM(a, b);
 121
 122         if (res == 0)
 123                 *haveDups = true;
 124
 125         return res;
 126 }
 127
 128 static int
 129 unique_array(bigm *a, int len)
 130 {
 131         bigm       *curend,
 132                            *tmp;
 133
 134         curend = tmp = a;
 135         while (tmp - a < len)
 136                 if (CMPBIGM(tmp, curend))
 137                 {
 138                         curend++;
 139                         if (curend != tmp)
 140                                 memcpy(curend, tmp, BIGMSIZE);
 141                         tmp++;
 142                 }
 143                 else
 144                         tmp++;
 145
 146         return curend + 1 - a;
 147 }
 148
 149 #define iswordchr(c)    (!t_isspace(c))
 150
 151 /*
 152  * Finds first word in string, returns pointer to the word,
 153  * endword points to the character after word
 154  */
 155 static char *
 156 find_word(char *str, int lenstr, char **endword, int *charlen)
 157 {
 158         char       *beginword = str;
 159
 160         while (beginword - str < lenstr && !iswordchr(beginword))
 161                 beginword += pg_mblen(beginword);
 162
 163         if (beginword - str >= lenstr)
 164                 return NULL;
 165
 166         *endword = beginword;
 167         *charlen = 0;
 168         while (*endword - str < lenstr && iswordchr(*endword))
 169         {
 170                 *endword += pg_mblen(*endword);
 171                 (*charlen)++;
 172         }
 173
 174         return beginword;
 175 }
 176
 177 /*
 178  * The function is named compact_bigram to maintain consistency with pg_trgm,
 179  * though it does not reduce multibyte characters to hash values like in
 180  * compact_trigram.
 181  */
 182 static void
 183 compact_bigram(bigm *bptr, char *str, int bytelen)
 184 {
 185         CPBIGM(bptr, str, bytelen);
 186 }
 187
 188 /*
 189  * Adds bigrams from words (already padded).
 190  */
 191 static bigm *
 192 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
 193 {
 194         char       *ptr = str;
 195
 196         if (charlen < 2)
 197         {
 198                 compact_bigram(bptr, ptr, pg_mblen(str));
 199                 bptr->pmatch = true;
 200                 bptr++;
 201                 return bptr;
 202         }
 203
 204         if (bytelen > charlen)
 205         {
 206                 /* Find multibyte character boundaries and call compact_bigram */
 207                 int                     lenfirst = pg_mblen(str),
 208                                         lenlast = pg_mblen(str + lenfirst);
 209
 210                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
 211                 {
 212                         compact_bigram(bptr, ptr, lenfirst + lenlast);
 213
 214                         ptr += lenfirst;
 215                         bptr++;
 216
 217                         lenfirst = lenlast;
 218                         lenlast = pg_mblen(ptr + lenfirst);
 219                 }
 220         }
 221         else
 222         {
 223                 /* Fast path when there are no multibyte characters */
 224                 Assert(bytelen == charlen);
 225
 226                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
 227                 {
 228                         CPBIGM(bptr, ptr, 2);
 229                         ptr++;
 230                         bptr++;
 231                 }
 232         }
 233
 234         return bptr;
 235 }
 236
 237 BIGM *
 238 generate_bigm(char *str, int slen)
 239 {
 240         BIGM       *bgm;
 241         char       *buf;
 242         bigm       *bptr;
 243         int                     len,
 244                                 charlen,
 245                                 bytelen;
 246         char       *bword,
 247                            *eword;
 248
 249         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
 250         SET_VARSIZE(bgm, VARHDRSZ);
 251
 252         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 253                 return bgm;
 254
 255         bptr = GETARR(bgm);
 256
 257         buf = palloc(sizeof(char) * (slen + 4));
 258
 259         if (LPADDING > 0)
 260         {
 261                 *buf = ' ';
 262                 if (LPADDING > 1)
 263                         *(buf + 1) = ' ';
 264         }
 265
 266         eword = str;
 267         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 268         {
 269                 bytelen = eword - bword;
 270                 memcpy(buf + LPADDING, bword, bytelen);
 271
 272                 buf[LPADDING + bytelen] = ' ';
 273                 buf[LPADDING + bytelen + 1] = ' ';
 274
 275                 /*
 276                  * count bigrams
 277                  */
 278                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
 279                                                          charlen + LPADDING + RPADDING);
 280         }
 281
 282         pfree(buf);
 283
 284         if ((len = bptr - GETARR(bgm)) == 0)
 285                 return bgm;
 286
 287         if (len > 0)
 288         {
 289                 bool    haveDups = false;
 290
 291                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 292                 if (haveDups)
 293                         len = unique_array(GETARR(bgm), len);
 294         }
 295
 296         SET_VARSIZE(bgm, CALCGTSIZE(len));
 297
 298         return bgm;
 299 }
 300
 301 /*
 302  * Extract the next non-wildcard part of a search string, ie, a word bounded
 303  * by '_' or '%' meta-characters, non-word characters or string end.
 304  *
 305  * str: source string, of length lenstr bytes (need not be null-terminated)
 306  * buf: where to return the substring (must be long enough)
 307  * *bytelen: receives byte length of the found substring
 308  * *charlen: receives character length of the found substring
 309  *
 310  * Returns pointer to end+1 of the found substring in the source string.
 311  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 312  *
 313  * If the found word is bounded by non-word characters or string boundaries
 314  * then this function will include corresponding padding spaces into buf.
 315  */
 316 static const char *
 317 get_wildcard_part(const char *str, int lenstr,
 318                                   char *buf, int *bytelen, int *charlen)
 319 {
 320         const char *beginword = str;
 321         const char *endword;
 322         char       *s = buf;
 323         bool        in_leading_wildcard_meta = false;
 324         bool        in_trailing_wildcard_meta = false;
 325         bool            in_escape = false;
 326         int                     clen;
 327
 328         /*
 329          * Find the first word character, remembering whether preceding character
 330          * was wildcard meta-character.  Note that the in_escape state persists
 331          * from this loop to the next one, since we may exit at a word character
 332          * that is in_escape.
 333          */
 334         while (beginword - str < lenstr)
 335         {
 336                 if (in_escape)
 337                 {
 338                         if (iswordchr(beginword))
 339                                 break;
 340                         in_escape = false;
 341                         in_leading_wildcard_meta = false;
 342                 }
 343                 else
 344                 {
 345                         if (ISESCAPECHAR(beginword))
 346                                 in_escape = true;
 347                         else if (ISWILDCARDCHAR(beginword))
 348                                 in_leading_wildcard_meta = true;
 349                         else if (iswordchr(beginword))
 350                                 break;
 351                         else
 352                                 in_leading_wildcard_meta = false;
 353                 }
 354                 beginword += pg_mblen(beginword);
 355         }
 356
 357         /*
 358          * Handle string end.
 359          */
 360         if (beginword - str >= lenstr)
 361                 return NULL;
 362
 363         /*
 364          * Add left padding spaces if preceding character wasn't wildcard
 365          * meta-character.
 366          */
 367         *charlen = 0;
 368         if (!in_leading_wildcard_meta)
 369         {
 370                 if (LPADDING > 0)
 371                 {
 372                         *s++ = ' ';
 373                         (*charlen)++;
 374                         if (LPADDING > 1)
 375                         {
 376                                 *s++ = ' ';
 377                                 (*charlen)++;
 378                         }
 379                 }
 380         }
 381
 382         /*
 383          * Copy data into buf until wildcard meta-character, non-word character or
 384          * string boundary.  Strip escapes during copy.
 385          */
 386         endword = beginword;
 387         while (endword - str < lenstr)
 388         {
 389                 clen = pg_mblen(endword);
 390                 if (in_escape)
 391                 {
 392                         if (iswordchr(endword))
 393                         {
 394                                 memcpy(s, endword, clen);
 395                                 (*charlen)++;
 396                                 s += clen;
 397                         }
 398                         else
 399                         {
 400                                 /*
 401                                  * Back up endword to the escape character when stopping at
 402                                  * an escaped char, so that subsequent get_wildcard_part will
 403                                  * restart from the escape character.  We assume here that
 404                                  * escape chars are single-byte.
 405                                  */
 406                                 endword--;
 407                                 break;
 408                         }
 409                         in_escape = false;
 410                 }
 411                 else
 412                 {
 413                         if (ISESCAPECHAR(endword))
 414                                 in_escape = true;
 415                         else if (ISWILDCARDCHAR(endword))
 416                         {
 417                                 in_trailing_wildcard_meta = true;
 418                                 break;
 419                         }
 420                         else if (iswordchr(endword))
 421                         {
 422                                 memcpy(s, endword, clen);
 423                                 (*charlen)++;
 424                                 s += clen;
 425                         }
 426                         else
 427                                 break;
 428                 }
 429                 endword += clen;
 430         }
 431
 432         /*
 433          * Add right padding spaces if next character isn't wildcard
 434          * meta-character.
 435          */
 436         if (!in_trailing_wildcard_meta)
 437         {
 438                 if (RPADDING > 0)
 439                 {
 440                         *s++ = ' ';
 441                         (*charlen)++;
 442                         if (RPADDING > 1)
 443                         {
 444                                 *s++ = ' ';
 445                                 (*charlen)++;
 446                         }
 447                 }
 448         }
 449
 450         *bytelen = s - buf;
 451         return endword;
 452 }
 453
 454 /*
 455  * Generates bigrams for wildcard search string.
 456  *
 457  * Returns array of bigrams that must occur in any string that matches the
 458  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
 459  * " a", "bcd" would be extracted.
 460  *
 461  * Set 'removeDups' to true if duplicate bigrams are removed.
 462  */
 463 BIGM *
 464 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 465 {
 466         BIGM       *bgm;
 467         char       *buf;
 468         bigm       *bptr;
 469         int                     len,
 470                                 charlen,
 471                                 bytelen;
 472         const char *eword;
 473
 474         *removeDups = false;
 475
 476         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
 477         SET_VARSIZE(bgm, VARHDRSZ);
 478
 479         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 480                 return bgm;
 481
 482         bptr = GETARR(bgm);
 483
 484         buf = palloc(sizeof(char) * (slen + 4));
 485
 486         /*
 487          * Extract bigrams from each substring extracted by get_wildcard_part.
 488          */
 489         eword = str;
 490         while ((eword = get_wildcard_part(eword, slen - (eword - str),
 491                                                                           buf, &bytelen, &charlen)) != NULL)
 492         {
 493                 /*
 494                  * count bigrams
 495                  */
 496                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
 497         }
 498
 499         pfree(buf);
 500
 501         if ((len = bptr - GETARR(bgm)) == 0)
 502                 return bgm;
 503
 504         /*
 505          * Make bigrams unique.
 506          */
 507         if (len > 0)
 508         {
 509                 bool    haveDups = false;
 510
 511                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 512                 if (haveDups)
 513                 {
 514                         *removeDups = true;
 515                         len = unique_array(GETARR(bgm), len);
 516                 }
 517         }
 518
 519         SET_VARSIZE(bgm, CALCGTSIZE(len));
 520
 521         return bgm;
 522 }
 523
 524 Datum
 525 show_bigm(PG_FUNCTION_ARGS)
 526 {
 527         text       *in = PG_GETARG_TEXT_P(0);
 528         BIGM       *bgm;
 529         Datum      *d;
 530         ArrayType  *a;
 531         bigm       *ptr;
 532         int                     i;
 533
 534         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
 535         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
 536
 537         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
 538         {
 539                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
 540                 d[i] = PointerGetDatum(item);
 541         }
 542
 543         a = construct_array(
 544                                                 d,
 545                                                 ARRNELEM(bgm),
 546                                                 TEXTOID,
 547                                                 -1,
 548                                                 false,
 549                                                 'i'
 550                 );
 551
 552         for (i = 0; i < ARRNELEM(bgm); i++)
 553                 pfree(DatumGetPointer(d[i]));
 554
 555         pfree(d);
 556         pfree(bgm);
 557         PG_FREE_IF_COPY(in, 0);
 558
 559         PG_RETURN_POINTER(a);
 560 }
 561
 562 static float4
 563 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
 564 {
 565         bigm       *ptr1,
 566                            *ptr2;
 567         int                     count = 0;
 568         int                     len1,
 569                                 len2;
 570
 571         ptr1 = GETARR(bgm1);
 572         ptr2 = GETARR(bgm2);
 573
 574         len1 = ARRNELEM(bgm1);
 575         len2 = ARRNELEM(bgm2);
 576
 577         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 578         if (len1 <= 0 || len2 <= 0)
 579                 return (float4) 0.0;
 580
 581         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
 582         {
 583                 int             res = CMPBIGM(ptr1, ptr2);
 584
 585                 if (res < 0)
 586                         ptr1++;
 587                 else if (res > 0)
 588                         ptr2++;
 589                 else
 590                 {
 591                         ptr1++;
 592                         ptr2++;
 593                         count++;
 594                 }
 595         }
 596
 597 #ifdef DIVUNION
 598         return ((float4) count) / ((float4) (len1 + len2 - count));
 599 #else
 600         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
 601 #endif
 602 }
 603
 604 Datum
 605 bigm_similarity(PG_FUNCTION_ARGS)
 606 {
 607         text       *in1 = PG_GETARG_TEXT_P(0);
 608         text       *in2 = PG_GETARG_TEXT_P(1);
 609         BIGM       *bgm1,
 610                            *bgm2;
 611         float4          res;
 612
 613         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 614         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
 615
 616         res = cnt_sml_bigm(bgm1, bgm2);
 617
 618         pfree(bgm1);
 619         pfree(bgm2);
 620         PG_FREE_IF_COPY(in1, 0);
 621         PG_FREE_IF_COPY(in2, 1);
 622
 623         PG_RETURN_FLOAT4(res);
 624 }
 625
 626 Datum
 627 bigm_similarity_op(PG_FUNCTION_ARGS)
 628 {
 629         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
 630                                                                                                                  PG_GETARG_DATUM(0),
 631                                                                                                                  PG_GETARG_DATUM(1)));
 632
 633         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
 634 }
 635
 636 Datum
 637 likequery(PG_FUNCTION_ARGS)
 638 {
 639         text       *query = PG_GETARG_TEXT_PP(0);
 640         const char *str;
 641         int                     len;
 642         const char *sp;
 643         text       *result;
 644         char       *rp;
 645         int                     mblen;
 646
 647         str = VARDATA_ANY(query);
 648         len = VARSIZE_ANY_EXHDR(query);
 649
 650         if (len == 0)
 651                 PG_RETURN_NULL();
 652
 653         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
 654         rp = VARDATA(result);
 655         *rp++ = '%';
 656
 657         for (sp = str; (sp - str) < len;)
 658         {
 659                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
 660                 {
 661                         *rp++ = '\\';
 662                         *rp++ = *sp++;
 663                 }
 664                 else if (IS_HIGHBIT_SET(*sp))
 665                 {
 666                         mblen = pg_mblen(sp);
 667                         memcpy(rp, sp, mblen);
 668                         rp += mblen;
 669                         sp += mblen;
 670                 }
 671                 else
 672                         *rp++ = *sp++;
 673         }
 674
 675         *rp++ = '%';
 676         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
 677
 678         PG_RETURN_TEXT_P(result);
 679 }
 680
 681 inline int
 682 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
 683 {
 684         int                     i;
 685         int                     len = Min(len1, len2);
 686
 687         for (i = 0; i < len; i++, arg1++, arg2++)
 688         {
 689                 if (*arg1 == *arg2)
 690                         continue;
 691                 if (*arg1 < *arg2)
 692                         return -1;
 693                 else
 694                         return 1;
 695         }
 696
 697         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
 698 }
 699
 700 Datum
 701 bigmtextcmp(PG_FUNCTION_ARGS)
 702 {
 703         text    *arg1 = PG_GETARG_TEXT_PP(0);
 704         text    *arg2 = PG_GETARG_TEXT_PP(1);
 705         char    *a1p = VARDATA_ANY(arg1);
 706         char    *a2p = VARDATA_ANY(arg2);
 707         int             len1 = VARSIZE_ANY_EXHDR(arg1);
 708         int             len2 = VARSIZE_ANY_EXHDR(arg2);
 709
 710         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 711 }