bigm_op.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
   4  *
   5  * Changelog:
   6  *   2013/01/09
   7  *   Support full text search using bigrams.
   8  *   Author: NTT DATA Corporation
   9  *
  10  *-------------------------------------------------------------------------
  11  */
  12 #include "postgres.h"
  13
  14 #include <ctype.h>
  15
  16 #include "bigm.h"
  17
  18 #include "catalog/pg_type.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "utils/array.h"
  21
  22
  23 PG_MODULE_MAGIC;
  24
  25 /* Last update date of pg_bigm */
  26 #define BIGM_LAST_UPDATE        "2013.11.22"
  27
  28 /* GUC variable */
  29 bool    bigm_enable_recheck = false;
  30 int             bigm_gin_key_limit = 0;
  31 double  bigm_similarity_limit = 0.3;
  32 char    *bigm_last_update = NULL;
  33
  34 PG_FUNCTION_INFO_V1(show_bigm);
  35 Datum           show_bigm(PG_FUNCTION_ARGS);
  36
  37 PG_FUNCTION_INFO_V1(bigmtextcmp);
  38 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
  39
  40 PG_FUNCTION_INFO_V1(likequery);
  41 Datum           likequery(PG_FUNCTION_ARGS);
  42
  43 PG_FUNCTION_INFO_V1(bigm_similarity);
  44 Datum           bigm_similarity(PG_FUNCTION_ARGS);
  45
  46 PG_FUNCTION_INFO_V1(bigm_similarity_op);
  47 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
  48
  49 void            _PG_init(void);
  50 void            _PG_fini(void);
  51
  52 void
  53 _PG_init(void)
  54 {
  55         /* Define custom GUC variables */
  56         DefineCustomBoolVariable("pg_bigm.enable_recheck",
  57                                                          "Recheck that heap tuples fetched from index "
  58                                                          "match the query.",
  59                                                          NULL,
  60                                                          &bigm_enable_recheck,
  61                                                          true,
  62                                                          PGC_USERSET,
  63                                                          0,
  64                                                          NULL,
  65                                                          NULL,
  66                                                          NULL);
  67
  68         DefineCustomIntVariable("pg_bigm.gin_key_limit",
  69                                                         "Sets the maximum number of bi-gram keys allowed to "
  70                                                         "use for GIN index search.",
  71                                                         "Zero means no limit.",
  72                                                         &bigm_gin_key_limit,
  73                                                         0,
  74                                                         0, INT_MAX,
  75                                                         PGC_USERSET,
  76                                                         0,
  77                                                         NULL,
  78                                                         NULL,
  79                                                         NULL);
  80
  81         DefineCustomRealVariable("pg_bigm.similarity_limit",
  82                                                          "Sets the similarity threshold used by the "
  83                                                          "=% operator.",
  84                                                          NULL,
  85                                                          &bigm_similarity_limit,
  86                                                          0.3,
  87                                                          0.0, 1.0,
  88                                                          PGC_USERSET,
  89                                                          0,
  90                                                          NULL,
  91                                                          NULL,
  92                                                          NULL);
  93
  94         /* Can't be set in postgresql.conf */
  95         DefineCustomStringVariable("pg_bigm.last_update",
  96                                                            "Shows the last update date of pg_bigm.",
  97                                                            NULL,
  98                                                            &bigm_last_update,
  99                                                            BIGM_LAST_UPDATE,
 100                                                            PGC_INTERNAL,
 101                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
 102                                                            NULL,
 103                                                            NULL,
 104                                                            NULL);
 105
 106         EmitWarningsOnPlaceholders("pg_bigm");
 107 }
 108
 109 void
 110 _PG_fini(void)
 111 {
 112 }
 113
 114 static int
 115 comp_bigm(const void *a, const void *b, void *arg)
 116 {
 117         int             res;
 118         bool    *haveDups = (bool *) arg;
 119
 120         res = CMPBIGM(a, b);
 121
 122         if (res == 0)
 123                 *haveDups = true;
 124
 125         return res;
 126 }
 127
 128 static int
 129 unique_array(bigm *a, int len)
 130 {
 131         bigm       *curend,
 132                            *tmp;
 133
 134         curend = tmp = a;
 135         while (tmp - a < len)
 136                 if (CMPBIGM(tmp, curend))
 137                 {
 138                         curend++;
 139                         if (curend != tmp)
 140                                 memcpy(curend, tmp, BIGMSIZE);
 141                         tmp++;
 142                 }
 143                 else
 144                         tmp++;
 145
 146         return curend + 1 - a;
 147 }
 148
 149 #define iswordchr(c)    (!t_isspace(c))
 150
 151 /*
 152  * Finds first word in string, returns pointer to the word,
 153  * endword points to the character after word
 154  */
 155 static char *
 156 find_word(char *str, int lenstr, char **endword, int *charlen)
 157 {
 158         char       *beginword = str;
 159
 160         while (beginword - str < lenstr && !iswordchr(beginword))
 161                 beginword += pg_mblen(beginword);
 162
 163         if (beginword - str >= lenstr)
 164                 return NULL;
 165
 166         *endword = beginword;
 167         *charlen = 0;
 168         while (*endword - str < lenstr && iswordchr(*endword))
 169         {
 170                 *endword += pg_mblen(*endword);
 171                 (*charlen)++;
 172         }
 173
 174         return beginword;
 175 }
 176
 177 /*
 178  * The function is named compact_bigram to maintain consistency with pg_trgm,
 179  * though it does not reduce multibyte characters to hash values like in
 180  * compact_trigram.
 181  */
 182 static void
 183 compact_bigram(bigm *bptr, char *str, int bytelen)
 184 {
 185         CPBIGM(bptr, str, bytelen);
 186 }
 187
 188 /*
 189  * Adds bigrams from words (already padded).
 190  */
 191 static bigm *
 192 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
 193 {
 194         char       *ptr = str;
 195
 196         if (charlen < 2)
 197         {
 198                 compact_bigram(bptr, ptr, pg_mblen(str));
 199                 bptr->pmatch = true;
 200                 bptr++;
 201                 return bptr;
 202         }
 203
 204         if (bytelen > charlen)
 205         {
 206                 /* Find multibyte character boundaries and call compact_bigram */
 207                 int                     lenfirst = pg_mblen(str),
 208                                         lenlast = pg_mblen(str + lenfirst);
 209
 210                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
 211                 {
 212                         compact_bigram(bptr, ptr, lenfirst + lenlast);
 213
 214                         ptr += lenfirst;
 215                         bptr++;
 216
 217                         lenfirst = lenlast;
 218                         lenlast = pg_mblen(ptr + lenfirst);
 219                 }
 220         }
 221         else
 222         {
 223                 /* Fast path when there are no multibyte characters */
 224                 Assert(bytelen == charlen);
 225
 226                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
 227                 {
 228                         CPBIGM(bptr, ptr, 2);
 229                         ptr++;
 230                         bptr++;
 231                 }
 232         }
 233
 234         return bptr;
 235 }
 236
 237 BIGM *
 238 generate_bigm(char *str, int slen)
 239 {
 240         BIGM       *bgm;
 241         char       *buf;
 242         bigm       *bptr;
 243         int                     len,
 244                                 charlen,
 245                                 bytelen;
 246         char       *bword,
 247                            *eword;
 248
 249         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
 250         SET_VARSIZE(bgm, VARHDRSZ);
 251
 252         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 253                 return bgm;
 254
 255         bptr = GETARR(bgm);
 256
 257         buf = palloc(sizeof(char) * (slen + 4));
 258
 259         if (LPADDING > 0)
 260         {
 261                 *buf = ' ';
 262                 if (LPADDING > 1)
 263                         *(buf + 1) = ' ';
 264         }
 265
 266         eword = str;
 267         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 268         {
 269                 bytelen = eword - bword;
 270                 memcpy(buf + LPADDING, bword, bytelen);
 271
 272                 buf[LPADDING + bytelen] = ' ';
 273                 buf[LPADDING + bytelen + 1] = ' ';
 274
 275                 /*
 276                  * count bigrams
 277                  */
 278                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
 279                                                          charlen + LPADDING + RPADDING);
 280         }
 281
 282         pfree(buf);
 283
 284         if ((len = bptr - GETARR(bgm)) == 0)
 285                 return bgm;
 286
 287         /*
 288          * Make bigrams unique.
 289          */
 290         if (len > 1)
 291         {
 292                 bool    haveDups = false;
 293
 294                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 295                 if (haveDups)
 296                         len = unique_array(GETARR(bgm), len);
 297         }
 298
 299         SET_VARSIZE(bgm, CALCGTSIZE(len));
 300
 301         return bgm;
 302 }
 303
 304 /*
 305  * Extract the next non-wildcard part of a search string, ie, a word bounded
 306  * by '_' or '%' meta-characters, non-word characters or string end.
 307  *
 308  * str: source string, of length lenstr bytes (need not be null-terminated)
 309  * buf: where to return the substring (must be long enough)
 310  * *bytelen: receives byte length of the found substring
 311  * *charlen: receives character length of the found substring
 312  *
 313  * Returns pointer to end+1 of the found substring in the source string.
 314  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 315  *
 316  * If the found word is bounded by non-word characters or string boundaries
 317  * then this function will include corresponding padding spaces into buf.
 318  */
 319 static const char *
 320 get_wildcard_part(const char *str, int lenstr,
 321                                   char *buf, int *bytelen, int *charlen)
 322 {
 323         const char *beginword = str;
 324         const char *endword;
 325         char       *s = buf;
 326         bool        in_leading_wildcard_meta = false;
 327         bool        in_trailing_wildcard_meta = false;
 328         bool            in_escape = false;
 329         int                     clen;
 330
 331         /*
 332          * Find the first word character, remembering whether preceding character
 333          * was wildcard meta-character.  Note that the in_escape state persists
 334          * from this loop to the next one, since we may exit at a word character
 335          * that is in_escape.
 336          */
 337         while (beginword - str < lenstr)
 338         {
 339                 if (in_escape)
 340                 {
 341                         if (iswordchr(beginword))
 342                                 break;
 343                         in_escape = false;
 344                         in_leading_wildcard_meta = false;
 345                 }
 346                 else
 347                 {
 348                         if (ISESCAPECHAR(beginword))
 349                                 in_escape = true;
 350                         else if (ISWILDCARDCHAR(beginword))
 351                                 in_leading_wildcard_meta = true;
 352                         else if (iswordchr(beginword))
 353                                 break;
 354                         else
 355                                 in_leading_wildcard_meta = false;
 356                 }
 357                 beginword += pg_mblen(beginword);
 358         }
 359
 360         /*
 361          * Handle string end.
 362          */
 363         if (beginword - str >= lenstr)
 364                 return NULL;
 365
 366         /*
 367          * Add left padding spaces if preceding character wasn't wildcard
 368          * meta-character.
 369          */
 370         *charlen = 0;
 371         if (!in_leading_wildcard_meta)
 372         {
 373                 if (LPADDING > 0)
 374                 {
 375                         *s++ = ' ';
 376                         (*charlen)++;
 377                         if (LPADDING > 1)
 378                         {
 379                                 *s++ = ' ';
 380                                 (*charlen)++;
 381                         }
 382                 }
 383         }
 384
 385         /*
 386          * Copy data into buf until wildcard meta-character, non-word character or
 387          * string boundary.  Strip escapes during copy.
 388          */
 389         endword = beginword;
 390         while (endword - str < lenstr)
 391         {
 392                 clen = pg_mblen(endword);
 393                 if (in_escape)
 394                 {
 395                         if (iswordchr(endword))
 396                         {
 397                                 memcpy(s, endword, clen);
 398                                 (*charlen)++;
 399                                 s += clen;
 400                         }
 401                         else
 402                         {
 403                                 /*
 404                                  * Back up endword to the escape character when stopping at
 405                                  * an escaped char, so that subsequent get_wildcard_part will
 406                                  * restart from the escape character.  We assume here that
 407                                  * escape chars are single-byte.
 408                                  */
 409                                 endword--;
 410                                 break;
 411                         }
 412                         in_escape = false;
 413                 }
 414                 else
 415                 {
 416                         if (ISESCAPECHAR(endword))
 417                                 in_escape = true;
 418                         else if (ISWILDCARDCHAR(endword))
 419                         {
 420                                 in_trailing_wildcard_meta = true;
 421                                 break;
 422                         }
 423                         else if (iswordchr(endword))
 424                         {
 425                                 memcpy(s, endword, clen);
 426                                 (*charlen)++;
 427                                 s += clen;
 428                         }
 429                         else
 430                                 break;
 431                 }
 432                 endword += clen;
 433         }
 434
 435         /*
 436          * Add right padding spaces if next character isn't wildcard
 437          * meta-character.
 438          */
 439         if (!in_trailing_wildcard_meta)
 440         {
 441                 if (RPADDING > 0)
 442                 {
 443                         *s++ = ' ';
 444                         (*charlen)++;
 445                         if (RPADDING > 1)
 446                         {
 447                                 *s++ = ' ';
 448                                 (*charlen)++;
 449                         }
 450                 }
 451         }
 452
 453         *bytelen = s - buf;
 454         return endword;
 455 }
 456
 457 /*
 458  * Generates bigrams for wildcard search string.
 459  *
 460  * Returns array of bigrams that must occur in any string that matches the
 461  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
 462  * " a", "bcd" would be extracted.
 463  *
 464  * Set 'removeDups' to true if duplicate bigrams are removed.
 465  */
 466 BIGM *
 467 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 468 {
 469         BIGM       *bgm;
 470         char       *buf;
 471         bigm       *bptr;
 472         int                     len,
 473                                 charlen,
 474                                 bytelen;
 475         const char *eword;
 476
 477         *removeDups = false;
 478
 479         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
 480         SET_VARSIZE(bgm, VARHDRSZ);
 481
 482         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 483                 return bgm;
 484
 485         bptr = GETARR(bgm);
 486
 487         buf = palloc(sizeof(char) * (slen + 4));
 488
 489         /*
 490          * Extract bigrams from each substring extracted by get_wildcard_part.
 491          */
 492         eword = str;
 493         while ((eword = get_wildcard_part(eword, slen - (eword - str),
 494                                                                           buf, &bytelen, &charlen)) != NULL)
 495         {
 496                 /*
 497                  * count bigrams
 498                  */
 499                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
 500         }
 501
 502         pfree(buf);
 503
 504         if ((len = bptr - GETARR(bgm)) == 0)
 505                 return bgm;
 506
 507         /*
 508          * Make bigrams unique.
 509          */
 510         if (len > 1)
 511         {
 512                 bool    haveDups = false;
 513
 514                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 515                 if (haveDups)
 516                 {
 517                         *removeDups = true;
 518                         len = unique_array(GETARR(bgm), len);
 519                 }
 520         }
 521
 522         SET_VARSIZE(bgm, CALCGTSIZE(len));
 523
 524         return bgm;
 525 }
 526
 527 Datum
 528 show_bigm(PG_FUNCTION_ARGS)
 529 {
 530         text       *in = PG_GETARG_TEXT_P(0);
 531         BIGM       *bgm;
 532         Datum      *d;
 533         ArrayType  *a;
 534         bigm       *ptr;
 535         int                     i;
 536
 537         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
 538         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
 539
 540         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
 541         {
 542                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
 543                 d[i] = PointerGetDatum(item);
 544         }
 545
 546         a = construct_array(
 547                                                 d,
 548                                                 ARRNELEM(bgm),
 549                                                 TEXTOID,
 550                                                 -1,
 551                                                 false,
 552                                                 'i'
 553                 );
 554
 555         for (i = 0; i < ARRNELEM(bgm); i++)
 556                 pfree(DatumGetPointer(d[i]));
 557
 558         pfree(d);
 559         pfree(bgm);
 560         PG_FREE_IF_COPY(in, 0);
 561
 562         PG_RETURN_POINTER(a);
 563 }
 564
 565 static float4
 566 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
 567 {
 568         bigm       *ptr1,
 569                            *ptr2;
 570         int                     count = 0;
 571         int                     len1,
 572                                 len2;
 573
 574         ptr1 = GETARR(bgm1);
 575         ptr2 = GETARR(bgm2);
 576
 577         len1 = ARRNELEM(bgm1);
 578         len2 = ARRNELEM(bgm2);
 579
 580         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 581         if (len1 <= 0 || len2 <= 0)
 582                 return (float4) 0.0;
 583
 584         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
 585         {
 586                 int             res = CMPBIGM(ptr1, ptr2);
 587
 588                 if (res < 0)
 589                         ptr1++;
 590                 else if (res > 0)
 591                         ptr2++;
 592                 else
 593                 {
 594                         ptr1++;
 595                         ptr2++;
 596                         count++;
 597                 }
 598         }
 599
 600 #ifdef DIVUNION
 601         return ((float4) count) / ((float4) (len1 + len2 - count));
 602 #else
 603         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
 604 #endif
 605 }
 606
 607 Datum
 608 bigm_similarity(PG_FUNCTION_ARGS)
 609 {
 610         text       *in1 = PG_GETARG_TEXT_P(0);
 611         text       *in2 = PG_GETARG_TEXT_P(1);
 612         BIGM       *bgm1,
 613                            *bgm2;
 614         float4          res;
 615
 616         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 617         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
 618
 619         res = cnt_sml_bigm(bgm1, bgm2);
 620
 621         pfree(bgm1);
 622         pfree(bgm2);
 623         PG_FREE_IF_COPY(in1, 0);
 624         PG_FREE_IF_COPY(in2, 1);
 625
 626         PG_RETURN_FLOAT4(res);
 627 }
 628
 629 Datum
 630 bigm_similarity_op(PG_FUNCTION_ARGS)
 631 {
 632         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
 633                                                                                                                  PG_GETARG_DATUM(0),
 634                                                                                                                  PG_GETARG_DATUM(1)));
 635
 636         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
 637 }
 638
 639 Datum
 640 likequery(PG_FUNCTION_ARGS)
 641 {
 642         text       *query = PG_GETARG_TEXT_PP(0);
 643         const char *str;
 644         int                     len;
 645         const char *sp;
 646         text       *result;
 647         char       *rp;
 648         int                     mblen;
 649
 650         str = VARDATA_ANY(query);
 651         len = VARSIZE_ANY_EXHDR(query);
 652
 653         if (len == 0)
 654                 PG_RETURN_NULL();
 655
 656         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
 657         rp = VARDATA(result);
 658         *rp++ = '%';
 659
 660         for (sp = str; (sp - str) < len;)
 661         {
 662                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
 663                 {
 664                         *rp++ = '\\';
 665                         *rp++ = *sp++;
 666                 }
 667                 else if (IS_HIGHBIT_SET(*sp))
 668                 {
 669                         mblen = pg_mblen(sp);
 670                         memcpy(rp, sp, mblen);
 671                         rp += mblen;
 672                         sp += mblen;
 673                 }
 674                 else
 675                         *rp++ = *sp++;
 676         }
 677
 678         *rp++ = '%';
 679         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
 680
 681         PG_RETURN_TEXT_P(result);
 682 }
 683
 684 inline int
 685 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
 686 {
 687         int                     i;
 688         int                     len = Min(len1, len2);
 689
 690         for (i = 0; i < len; i++, arg1++, arg2++)
 691         {
 692                 if (*arg1 == *arg2)
 693                         continue;
 694                 if (*arg1 < *arg2)
 695                         return -1;
 696                 else
 697                         return 1;
 698         }
 699
 700         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
 701 }
 702
 703 Datum
 704 bigmtextcmp(PG_FUNCTION_ARGS)
 705 {
 706         text    *arg1 = PG_GETARG_TEXT_PP(0);
 707         text    *arg2 = PG_GETARG_TEXT_PP(1);
 708         char    *a1p = VARDATA_ANY(arg1);
 709         char    *a2p = VARDATA_ANY(arg2);
 710         int             len1 = VARSIZE_ANY_EXHDR(arg1);
 711         int             len2 = VARSIZE_ANY_EXHDR(arg2);
 712
 713         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 714 }