OSDN Git Service

Fix crash in text similarity search with blank characters keyword.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  *
5  * Changelog:
6  *   2013/01/09
7  *   Support full text search using bigrams.
8  *   Author: NTT DATA Corporation
9  *
10  *-------------------------------------------------------------------------
11  */
12 #include "postgres.h"
13
14 #include <ctype.h>
15
16 #include "bigm.h"
17
18 #include "catalog/pg_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "utils/array.h"
21
22
23 PG_MODULE_MAGIC;
24
25 /* Last update date of pg_bigm */
26 #define BIGM_LAST_UPDATE        "2013.04.05"
27
28 /* GUC variable */
29 bool    bigm_enable_recheck = false;
30 int             bigm_gin_key_limit = 0;
31 double  bigm_similarity_limit = 0.3;
32 char    *bigm_last_update = NULL;
33
34 PG_FUNCTION_INFO_V1(show_bigm);
35 Datum           show_bigm(PG_FUNCTION_ARGS);
36
37 PG_FUNCTION_INFO_V1(bigmtextcmp);
38 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
39
40 PG_FUNCTION_INFO_V1(likequery);
41 Datum           likequery(PG_FUNCTION_ARGS);
42
43 PG_FUNCTION_INFO_V1(bigm_similarity);
44 Datum           bigm_similarity(PG_FUNCTION_ARGS);
45
46 PG_FUNCTION_INFO_V1(bigm_similarity_op);
47 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
48
49 void            _PG_init(void);
50 void            _PG_fini(void);
51
52 void
53 _PG_init(void)
54 {
55         /* Define custom GUC variables */
56         DefineCustomBoolVariable("pg_bigm.enable_recheck",
57                                                          "Recheck that heap tuples fetched from index "
58                                                          "match the query.",
59                                                          NULL,
60                                                          &bigm_enable_recheck,
61                                                          true,
62                                                          PGC_USERSET,
63                                                          0,
64                                                          NULL,
65                                                          NULL,
66                                                          NULL);
67
68         DefineCustomIntVariable("pg_bigm.gin_key_limit",
69                                                         "Sets the maximum number of bi-gram keys allowed to "
70                                                         "use for GIN index search.",
71                                                         "Zero means no limit.",
72                                                         &bigm_gin_key_limit,
73                                                         0,
74                                                         0, INT_MAX,
75                                                         PGC_USERSET,
76                                                         0,
77                                                         NULL,
78                                                         NULL,
79                                                         NULL);
80
81         DefineCustomRealVariable("pg_bigm.similarity_limit",
82                                                          "Sets the similarity threshold used by the "
83                                                          "=% operator.",
84                                                          NULL,
85                                                          &bigm_similarity_limit,
86                                                          0.3,
87                                                          0.0, 1.0,
88                                                          PGC_USERSET,
89                                                          0,
90                                                          NULL,
91                                                          NULL,
92                                                          NULL);
93
94         /* Can't be set in postgresql.conf */
95         DefineCustomStringVariable("pg_bigm.last_update",
96                                                            "Shows the last update date of pg_bigm.",
97                                                            NULL,
98                                                            &bigm_last_update,
99                                                            BIGM_LAST_UPDATE,
100                                                            PGC_INTERNAL,
101                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
102                                                            NULL,
103                                                            NULL,
104                                                            NULL);
105
106         EmitWarningsOnPlaceholders("pg_bigm");
107 }
108
109 void
110 _PG_fini(void)
111 {
112 }
113
114 static int
115 comp_bigm(const void *a, const void *b, void *arg)
116 {
117         int             res;
118         bool    *haveDups = (bool *) arg;
119
120         res = CMPBIGM(a, b);
121
122         if (res == 0)
123                 *haveDups = true;
124
125         return res;
126 }
127
128 static int
129 unique_array(bigm *a, int len)
130 {
131         bigm       *curend,
132                            *tmp;
133
134         curend = tmp = a;
135         while (tmp - a < len)
136                 if (CMPBIGM(tmp, curend))
137                 {
138                         curend++;
139                         if (curend != tmp)
140                                 memcpy(curend, tmp, BIGMSIZE);
141                         tmp++;
142                 }
143                 else
144                         tmp++;
145
146         return curend + 1 - a;
147 }
148
149 #define iswordchr(c)    (!t_isspace(c))
150
151 /*
152  * Finds first word in string, returns pointer to the word,
153  * endword points to the character after word
154  */
155 static char *
156 find_word(char *str, int lenstr, char **endword, int *charlen)
157 {
158         char       *beginword = str;
159
160         while (beginword - str < lenstr && !iswordchr(beginword))
161                 beginword += pg_mblen(beginword);
162
163         if (beginword - str >= lenstr)
164                 return NULL;
165
166         *endword = beginword;
167         *charlen = 0;
168         while (*endword - str < lenstr && iswordchr(*endword))
169         {
170                 *endword += pg_mblen(*endword);
171                 (*charlen)++;
172         }
173
174         return beginword;
175 }
176
177 /* 
178  * The function is named compact_bigram to maintain consistency with pg_trgm,
179  * though it does not reduce multibyte characters to hash values like in
180  * compact_trigram.
181  */
182 static void
183 compact_bigram(bigm *bptr, char *str, int bytelen)
184 {
185         CPBIGM(bptr, str, bytelen);
186 }
187
188 /*
189  * Adds bigrams from words (already padded).
190  */
191 static bigm *
192 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
193 {
194         char       *ptr = str;
195
196         if (charlen < 2)
197         {
198                 compact_bigram(bptr, ptr, pg_mblen(str));
199                 bptr->pmatch = true;
200                 bptr++;
201                 return bptr;
202         }
203
204         if (bytelen > charlen)
205         {
206                 /* Find multibyte character boundaries and call compact_bigram */
207                 int                     lenfirst = pg_mblen(str),
208                                         lenlast = pg_mblen(str + lenfirst);
209
210                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
211                 {
212                         compact_bigram(bptr, ptr, lenfirst + lenlast);
213
214                         ptr += lenfirst;
215                         bptr++;
216
217                         lenfirst = lenlast;
218                         lenlast = pg_mblen(ptr + lenfirst);
219                 }
220         }
221         else
222         {
223                 /* Fast path when there are no multibyte characters */
224                 Assert(bytelen == charlen);
225
226                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
227                 {
228                         CPBIGM(bptr, ptr, 2);
229                         ptr++;
230                         bptr++;
231                 }
232         }
233
234         return bptr;
235 }
236
237 BIGM *
238 generate_bigm(char *str, int slen)
239 {
240         BIGM       *bgm;
241         char       *buf;
242         bigm       *bptr;
243         int                     len,
244                                 charlen,
245                                 bytelen;
246         char       *bword,
247                            *eword;
248
249         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
250         SET_VARSIZE(bgm, VARHDRSZ);
251
252         if (slen + LPADDING + RPADDING < 2 || slen == 0)
253                 return bgm;
254
255         bptr = GETARR(bgm);
256
257         buf = palloc(sizeof(char) * (slen + 4));
258
259         if (LPADDING > 0)
260         {
261                 *buf = ' ';
262                 if (LPADDING > 1)
263                         *(buf + 1) = ' ';
264         }
265
266         eword = str;
267         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
268         {
269                 bytelen = eword - bword;
270                 memcpy(buf + LPADDING, bword, bytelen);
271
272                 buf[LPADDING + bytelen] = ' ';
273                 buf[LPADDING + bytelen + 1] = ' ';
274
275                 /*
276                  * count bigrams
277                  */
278                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
279                                                          charlen + LPADDING + RPADDING);
280         }
281
282         pfree(buf);
283
284         if ((len = bptr - GETARR(bgm)) == 0)
285                 return bgm;
286
287         if (len > 0)
288         {
289                 bool    haveDups = false;
290
291                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
292                 if (haveDups)
293                         len = unique_array(GETARR(bgm), len);
294         }
295
296         SET_VARSIZE(bgm, CALCGTSIZE(len));
297
298         return bgm;
299 }
300
301 /*
302  * Extract the next non-wildcard part of a search string, ie, a word bounded
303  * by '_' or '%' meta-characters, non-word characters or string end.
304  *
305  * str: source string, of length lenstr bytes (need not be null-terminated)
306  * buf: where to return the substring (must be long enough)
307  * *bytelen: receives byte length of the found substring
308  * *charlen: receives character length of the found substring
309  *
310  * Returns pointer to end+1 of the found substring in the source string.
311  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
312  *
313  * If the found word is bounded by non-word characters or string boundaries
314  * then this function will include corresponding padding spaces into buf.
315  */
316 static const char *
317 get_wildcard_part(const char *str, int lenstr,
318                                   char *buf, int *bytelen, int *charlen)
319 {
320         const char *beginword = str;
321         const char *endword;
322         char       *s = buf;
323         bool        in_leading_wildcard_meta = false;
324         bool        in_trailing_wildcard_meta = false;
325         bool            in_escape = false;
326         int                     clen;
327
328         /*
329          * Find the first word character, remembering whether preceding character
330          * was wildcard meta-character.  Note that the in_escape state persists
331          * from this loop to the next one, since we may exit at a word character
332          * that is in_escape.
333          */
334         while (beginword - str < lenstr)
335         {
336                 if (in_escape)
337                 {
338                         if (iswordchr(beginword))
339                                 break;
340                         in_escape = false;
341                         in_leading_wildcard_meta = false;
342                 }
343                 else
344                 {
345                         if (ISESCAPECHAR(beginword))
346                                 in_escape = true;
347                         else if (ISWILDCARDCHAR(beginword))
348                                 in_leading_wildcard_meta = true;
349                         else if (iswordchr(beginword))
350                                 break;
351                         else
352                                 in_leading_wildcard_meta = false;
353                 }
354                 beginword += pg_mblen(beginword);
355         }
356
357         /*
358          * Handle string end.
359          */
360         if (beginword - str >= lenstr)
361                 return NULL;
362
363         /*
364          * Add left padding spaces if preceding character wasn't wildcard
365          * meta-character.
366          */
367         *charlen = 0;
368         if (!in_leading_wildcard_meta)
369         {
370                 if (LPADDING > 0)
371                 {
372                         *s++ = ' ';
373                         (*charlen)++;
374                         if (LPADDING > 1)
375                         {
376                                 *s++ = ' ';
377                                 (*charlen)++;
378                         }
379                 }
380         }
381
382         /*
383          * Copy data into buf until wildcard meta-character, non-word character or
384          * string boundary.  Strip escapes during copy.
385          */
386         endword = beginword;
387         while (endword - str < lenstr)
388         {
389                 clen = pg_mblen(endword);
390                 if (in_escape)
391                 {
392                         if (iswordchr(endword))
393                         {
394                                 memcpy(s, endword, clen);
395                                 (*charlen)++;
396                                 s += clen;
397                         }
398                         else
399                         {
400                                 /*
401                                  * Back up endword to the escape character when stopping at
402                                  * an escaped char, so that subsequent get_wildcard_part will
403                                  * restart from the escape character.  We assume here that
404                                  * escape chars are single-byte.
405                                  */
406                                 endword--;
407                                 break;
408                         }
409                         in_escape = false;
410                 }
411                 else
412                 {
413                         if (ISESCAPECHAR(endword))
414                                 in_escape = true;
415                         else if (ISWILDCARDCHAR(endword))
416                         {
417                                 in_trailing_wildcard_meta = true;
418                                 break;
419                         }
420                         else if (iswordchr(endword))
421                         {
422                                 memcpy(s, endword, clen);
423                                 (*charlen)++;
424                                 s += clen;
425                         }
426                         else
427                                 break;
428                 }
429                 endword += clen;
430         }
431
432         /*
433          * Add right padding spaces if next character isn't wildcard
434          * meta-character.
435          */
436         if (!in_trailing_wildcard_meta)
437         {
438                 if (RPADDING > 0)
439                 {
440                         *s++ = ' ';
441                         (*charlen)++;
442                         if (RPADDING > 1)
443                         {
444                                 *s++ = ' ';
445                                 (*charlen)++;
446                         }
447                 }
448         }
449
450         *bytelen = s - buf;
451         return endword;
452 }
453
454 /*
455  * Generates bigrams for wildcard search string.
456  *
457  * Returns array of bigrams that must occur in any string that matches the
458  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
459  * " a", "bcd" would be extracted.
460  *
461  * Set 'removeDups' to true if duplicate bigrams are removed.
462  */
463 BIGM *
464 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
465 {
466         BIGM       *bgm;
467         char       *buf;
468         bigm       *bptr;
469         int                     len,
470                                 charlen,
471                                 bytelen;
472         const char *eword;
473
474         *removeDups = false;
475
476         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
477         SET_VARSIZE(bgm, VARHDRSZ);
478
479         if (slen + LPADDING + RPADDING < 2 || slen == 0)
480                 return bgm;
481
482         bptr = GETARR(bgm);
483
484         buf = palloc(sizeof(char) * (slen + 4));
485
486         /*
487          * Extract bigrams from each substring extracted by get_wildcard_part.
488          */
489         eword = str;
490         while ((eword = get_wildcard_part(eword, slen - (eword - str),
491                                                                           buf, &bytelen, &charlen)) != NULL)
492         {
493                 /*
494                  * count bigrams
495                  */
496                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
497         }
498
499         pfree(buf);
500
501         if ((len = bptr - GETARR(bgm)) == 0)
502                 return bgm;
503
504         /*
505          * Make bigrams unique.
506          */
507         if (len > 0)
508         {
509                 bool    haveDups = false;
510
511                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
512                 if (haveDups)
513                 {
514                         *removeDups = true;
515                         len = unique_array(GETARR(bgm), len);
516                 }
517         }
518
519         SET_VARSIZE(bgm, CALCGTSIZE(len));
520
521         return bgm;
522 }
523
524 Datum
525 show_bigm(PG_FUNCTION_ARGS)
526 {
527         text       *in = PG_GETARG_TEXT_P(0);
528         BIGM       *bgm;
529         Datum      *d;
530         ArrayType  *a;
531         bigm       *ptr;
532         int                     i;
533
534         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
535         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
536
537         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
538         {
539                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
540                 d[i] = PointerGetDatum(item);
541         }
542
543         a = construct_array(
544                                                 d,
545                                                 ARRNELEM(bgm),
546                                                 TEXTOID,
547                                                 -1,
548                                                 false,
549                                                 'i'
550                 );
551
552         for (i = 0; i < ARRNELEM(bgm); i++)
553                 pfree(DatumGetPointer(d[i]));
554
555         pfree(d);
556         pfree(bgm);
557         PG_FREE_IF_COPY(in, 0);
558
559         PG_RETURN_POINTER(a);
560 }
561
562 static float4
563 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
564 {
565         bigm       *ptr1,
566                            *ptr2;
567         int                     count = 0;
568         int                     len1,
569                                 len2;
570
571         ptr1 = GETARR(bgm1);
572         ptr2 = GETARR(bgm2);
573
574         len1 = ARRNELEM(bgm1);
575         len2 = ARRNELEM(bgm2);
576
577         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
578         if (len1 <= 0 || len2 <= 0)
579                 return (float4) 0.0;
580
581         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
582         {
583                 int             res = CMPBIGM(ptr1, ptr2);
584
585                 if (res < 0)
586                         ptr1++;
587                 else if (res > 0)
588                         ptr2++;
589                 else
590                 {
591                         ptr1++;
592                         ptr2++;
593                         count++;
594                 }
595         }
596
597 #ifdef DIVUNION
598         return ((float4) count) / ((float4) (len1 + len2 - count));
599 #else
600         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
601 #endif
602 }
603
604 Datum
605 bigm_similarity(PG_FUNCTION_ARGS)
606 {
607         text       *in1 = PG_GETARG_TEXT_P(0);
608         text       *in2 = PG_GETARG_TEXT_P(1);
609         BIGM       *bgm1,
610                            *bgm2;
611         float4          res;
612
613         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
614         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
615
616         res = cnt_sml_bigm(bgm1, bgm2);
617
618         pfree(bgm1);
619         pfree(bgm2);
620         PG_FREE_IF_COPY(in1, 0);
621         PG_FREE_IF_COPY(in2, 1);
622
623         PG_RETURN_FLOAT4(res);
624 }
625
626 Datum
627 bigm_similarity_op(PG_FUNCTION_ARGS)
628 {
629         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
630                                                                                                                  PG_GETARG_DATUM(0),
631                                                                                                                  PG_GETARG_DATUM(1)));
632
633         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
634 }
635
636 Datum
637 likequery(PG_FUNCTION_ARGS)
638 {
639         text       *query = PG_GETARG_TEXT_PP(0);
640         const char *str;
641         int                     len;
642         const char *sp;
643         text       *result;
644         char       *rp;
645         int                     mblen;
646
647         str = VARDATA_ANY(query);
648         len = VARSIZE_ANY_EXHDR(query);
649
650         if (len == 0)
651                 PG_RETURN_NULL();
652
653         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
654         rp = VARDATA(result);
655         *rp++ = '%';
656
657         for (sp = str; (sp - str) < len;)
658         {
659                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
660                 {
661                         *rp++ = '\\';
662                         *rp++ = *sp++;
663                 }
664                 else if (IS_HIGHBIT_SET(*sp))
665                 {
666                         mblen = pg_mblen(sp);
667                         memcpy(rp, sp, mblen);
668                         rp += mblen;
669                         sp += mblen;
670                 }
671                 else
672                         *rp++ = *sp++;
673         }
674
675         *rp++ = '%';
676         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
677
678         PG_RETURN_TEXT_P(result);
679 }
680
681 inline int
682 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
683 {
684         int                     i;
685         int                     len = Min(len1, len2);
686
687         for (i = 0; i < len; i++, arg1++, arg2++)
688         {
689                 if (*arg1 == *arg2)
690                         continue;
691                 if (*arg1 < *arg2)
692                         return -1;
693                 else
694                         return 1;
695         }
696
697         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
698 }
699
700 Datum
701 bigmtextcmp(PG_FUNCTION_ARGS)
702 {
703         text    *arg1 = PG_GETARG_TEXT_PP(0);
704         text    *arg2 = PG_GETARG_TEXT_PP(1);
705         char    *a1p = VARDATA_ANY(arg1);
706         char    *a2p = VARDATA_ANY(arg2);
707         int             len1 = VARSIZE_ANY_EXHDR(arg1);
708         int             len2 = VARSIZE_ANY_EXHDR(arg2);
709
710         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
711 }