OSDN Git Service

Support triConsistent function for better performance.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2015, NTT DATA Corporation
5  *
6  * Changelog:
7  *       2013/01/09
8  *       Support full text search using bigrams.
9  *       Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <ctype.h>
16
17 #include "bigm.h"
18
19 #include "catalog/pg_type.h"
20 #include "tsearch/ts_locale.h"
21 #include "utils/array.h"
22 #include "utils/memutils.h"
23
24 PG_MODULE_MAGIC;
25
26 /* Last update date of pg_bigm */
27 #define BIGM_LAST_UPDATE        "2013.11.22"
28
29 /* GUC variable */
30 bool            bigm_enable_recheck = false;
31 int                     bigm_gin_key_limit = 0;
32 double          bigm_similarity_limit = 0.3;
33 char       *bigm_last_update = NULL;
34
35 PG_FUNCTION_INFO_V1(show_bigm);
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 PG_FUNCTION_INFO_V1(likequery);
38 PG_FUNCTION_INFO_V1(bigm_similarity);
39 PG_FUNCTION_INFO_V1(bigm_similarity_op);
40
41 void            _PG_init(void);
42 void            _PG_fini(void);
43
44 void
45 _PG_init(void)
46 {
47         /* Define custom GUC variables */
48         DefineCustomBoolVariable("pg_bigm.enable_recheck",
49                                                          "Recheck that heap tuples fetched from index "
50                                                          "match the query.",
51                                                          NULL,
52                                                          &bigm_enable_recheck,
53                                                          true,
54                                                          PGC_USERSET,
55                                                          0,
56                                                          NULL,
57                                                          NULL,
58                                                          NULL);
59
60         DefineCustomIntVariable("pg_bigm.gin_key_limit",
61                                                         "Sets the maximum number of bi-gram keys allowed to "
62                                                         "use for GIN index search.",
63                                                         "Zero means no limit.",
64                                                         &bigm_gin_key_limit,
65                                                         0,
66                                                         0, INT_MAX,
67                                                         PGC_USERSET,
68                                                         0,
69                                                         NULL,
70                                                         NULL,
71                                                         NULL);
72
73         DefineCustomRealVariable("pg_bigm.similarity_limit",
74                                                          "Sets the similarity threshold used by the "
75                                                          "=% operator.",
76                                                          NULL,
77                                                          &bigm_similarity_limit,
78                                                          0.3,
79                                                          0.0, 1.0,
80                                                          PGC_USERSET,
81                                                          0,
82                                                          NULL,
83                                                          NULL,
84                                                          NULL);
85
86         /* Can't be set in postgresql.conf */
87         DefineCustomStringVariable("pg_bigm.last_update",
88                                                            "Shows the last update date of pg_bigm.",
89                                                            NULL,
90                                                            &bigm_last_update,
91                                                            BIGM_LAST_UPDATE,
92                                                            PGC_INTERNAL,
93                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
94                                                            NULL,
95                                                            NULL,
96                                                            NULL);
97
98         EmitWarningsOnPlaceholders("pg_bigm");
99 }
100
101 void
102 _PG_fini(void)
103 {
104 }
105
106 static int
107 comp_bigm(const void *a, const void *b, void *arg)
108 {
109         int                     res;
110         bool       *haveDups = (bool *) arg;
111
112         res = CMPBIGM(a, b);
113
114         if (res == 0)
115                 *haveDups = true;
116
117         return res;
118 }
119
120 static int
121 unique_array(bigm *a, int len)
122 {
123         bigm       *curend,
124                            *tmp;
125
126         curend = tmp = a;
127         while (tmp - a < len)
128                 if (CMPBIGM(tmp, curend))
129                 {
130                         curend++;
131                         if (curend != tmp)
132                                 memcpy(curend, tmp, BIGMSIZE);
133                         tmp++;
134                 }
135                 else
136                         tmp++;
137
138         return curend + 1 - a;
139 }
140
141 #define iswordchr(c)    (!t_isspace(c))
142
143 /*
144  * Finds first word in string, returns pointer to the word,
145  * endword points to the character after word
146  */
147 static char *
148 find_word(char *str, int lenstr, char **endword, int *charlen)
149 {
150         char       *beginword = str;
151
152         while (beginword - str < lenstr && !iswordchr(beginword))
153                 beginword += pg_mblen(beginword);
154
155         if (beginword - str >= lenstr)
156                 return NULL;
157
158         *endword = beginword;
159         *charlen = 0;
160         while (*endword - str < lenstr && iswordchr(*endword))
161         {
162                 *endword += pg_mblen(*endword);
163                 (*charlen)++;
164         }
165
166         return beginword;
167 }
168
169 /*
170  * The function is named compact_bigram to maintain consistency with pg_trgm,
171  * though it does not reduce multibyte characters to hash values like in
172  * compact_trigram.
173  */
174 static void
175 compact_bigram(bigm *bptr, char *str, int bytelen)
176 {
177         CPBIGM(bptr, str, bytelen);
178 }
179
180 /*
181  * Adds bigrams from words (already padded).
182  */
183 static bigm *
184 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
185 {
186         char       *ptr = str;
187
188         if (charlen < 2)
189         {
190                 compact_bigram(bptr, ptr, pg_mblen(str));
191                 bptr->pmatch = true;
192                 bptr++;
193                 return bptr;
194         }
195
196         if (bytelen > charlen)
197         {
198                 /* Find multibyte character boundaries and call compact_bigram */
199                 int                     lenfirst = pg_mblen(str),
200                                         lenlast = pg_mblen(str + lenfirst);
201
202                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
203                 {
204                         compact_bigram(bptr, ptr, lenfirst + lenlast);
205
206                         ptr += lenfirst;
207                         bptr++;
208
209                         lenfirst = lenlast;
210                         lenlast = pg_mblen(ptr + lenfirst);
211                 }
212         }
213         else
214         {
215                 /* Fast path when there are no multibyte characters */
216                 Assert(bytelen == charlen);
217
218                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
219                 {
220                         CPBIGM(bptr, ptr, 2);
221                         ptr++;
222                         bptr++;
223                 }
224         }
225
226         return bptr;
227 }
228
229 BIGM *
230 generate_bigm(char *str, int slen)
231 {
232         BIGM       *bgm;
233         char       *buf;
234         bigm       *bptr;
235         int                     len,
236                                 charlen,
237                                 bytelen;
238         char       *bword,
239                            *eword;
240
241         /*
242          * Guard against possible overflow in the palloc requests below.
243          * We need to prevent integer overflow in the multiplications here.
244          */
245         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
246                 (Size) slen > MaxAllocSize - 4)
247                 ereport(ERROR,
248                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
249                                  errmsg("out of memory")));
250
251         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
252         SET_VARSIZE(bgm, VARHDRSZ);
253
254         if (slen + LPADDING + RPADDING < 2 || slen == 0)
255                 return bgm;
256
257         bptr = GETARR(bgm);
258
259         buf = palloc(sizeof(char) * (slen + 4));
260
261         if (LPADDING > 0)
262         {
263                 *buf = ' ';
264                 if (LPADDING > 1)
265                         *(buf + 1) = ' ';
266         }
267
268         eword = str;
269         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
270         {
271                 bytelen = eword - bword;
272                 memcpy(buf + LPADDING, bword, bytelen);
273
274                 buf[LPADDING + bytelen] = ' ';
275                 buf[LPADDING + bytelen + 1] = ' ';
276
277                 /*
278                  * count bigrams
279                  */
280                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
281                                                         charlen + LPADDING + RPADDING);
282         }
283
284         pfree(buf);
285
286         if ((len = bptr - GETARR(bgm)) == 0)
287                 return bgm;
288
289         /*
290          * Make bigrams unique.
291          */
292         if (len > 1)
293         {
294                 bool            haveDups = false;
295
296                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
297                 if (haveDups)
298                         len = unique_array(GETARR(bgm), len);
299         }
300
301         SET_VARSIZE(bgm, CALCGTSIZE(len));
302
303         return bgm;
304 }
305
306 /*
307  * Extract the next non-wildcard part of a search string, ie, a word bounded
308  * by '_' or '%' meta-characters, non-word characters or string end.
309  *
310  * str: source string, of length lenstr bytes (need not be null-terminated)
311  * buf: where to return the substring (must be long enough)
312  * *bytelen: receives byte length of the found substring
313  * *charlen: receives character length of the found substring
314  *
315  * Returns pointer to end+1 of the found substring in the source string.
316  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
317  *
318  * If the found word is bounded by non-word characters or string boundaries
319  * then this function will include corresponding padding spaces into buf.
320  */
321 static const char *
322 get_wildcard_part(const char *str, int lenstr,
323                                   char *buf, int *bytelen, int *charlen)
324 {
325         const char *beginword = str;
326         const char *endword;
327         char       *s = buf;
328         bool            in_leading_wildcard_meta = false;
329         bool            in_trailing_wildcard_meta = false;
330         bool            in_escape = false;
331         int                     clen;
332
333         /*
334          * Find the first word character, remembering whether preceding character
335          * was wildcard meta-character.  Note that the in_escape state persists
336          * from this loop to the next one, since we may exit at a word character
337          * that is in_escape.
338          */
339         while (beginword - str < lenstr)
340         {
341                 if (in_escape)
342                 {
343                         if (iswordchr(beginword))
344                                 break;
345                         in_escape = false;
346                         in_leading_wildcard_meta = false;
347                 }
348                 else
349                 {
350                         if (ISESCAPECHAR(beginword))
351                                 in_escape = true;
352                         else if (ISWILDCARDCHAR(beginword))
353                                 in_leading_wildcard_meta = true;
354                         else if (iswordchr(beginword))
355                                 break;
356                         else
357                                 in_leading_wildcard_meta = false;
358                 }
359                 beginword += pg_mblen(beginword);
360         }
361
362         /*
363          * Handle string end.
364          */
365         if (beginword - str >= lenstr)
366                 return NULL;
367
368         /*
369          * Add left padding spaces if preceding character wasn't wildcard
370          * meta-character.
371          */
372         *charlen = 0;
373         if (!in_leading_wildcard_meta)
374         {
375                 if (LPADDING > 0)
376                 {
377                         *s++ = ' ';
378                         (*charlen)++;
379                         if (LPADDING > 1)
380                         {
381                                 *s++ = ' ';
382                                 (*charlen)++;
383                         }
384                 }
385         }
386
387         /*
388          * Copy data into buf until wildcard meta-character, non-word character or
389          * string boundary.  Strip escapes during copy.
390          */
391         endword = beginword;
392         while (endword - str < lenstr)
393         {
394                 clen = pg_mblen(endword);
395                 if (in_escape)
396                 {
397                         if (iswordchr(endword))
398                         {
399                                 memcpy(s, endword, clen);
400                                 (*charlen)++;
401                                 s += clen;
402                         }
403                         else
404                         {
405                                 /*
406                                  * Back up endword to the escape character when stopping at an
407                                  * escaped char, so that subsequent get_wildcard_part will
408                                  * restart from the escape character.  We assume here that
409                                  * escape chars are single-byte.
410                                  */
411                                 endword--;
412                                 break;
413                         }
414                         in_escape = false;
415                 }
416                 else
417                 {
418                         if (ISESCAPECHAR(endword))
419                                 in_escape = true;
420                         else if (ISWILDCARDCHAR(endword))
421                         {
422                                 in_trailing_wildcard_meta = true;
423                                 break;
424                         }
425                         else if (iswordchr(endword))
426                         {
427                                 memcpy(s, endword, clen);
428                                 (*charlen)++;
429                                 s += clen;
430                         }
431                         else
432                                 break;
433                 }
434                 endword += clen;
435         }
436
437         /*
438          * Add right padding spaces if next character isn't wildcard
439          * meta-character.
440          */
441         if (!in_trailing_wildcard_meta)
442         {
443                 if (RPADDING > 0)
444                 {
445                         *s++ = ' ';
446                         (*charlen)++;
447                         if (RPADDING > 1)
448                         {
449                                 *s++ = ' ';
450                                 (*charlen)++;
451                         }
452                 }
453         }
454
455         *bytelen = s - buf;
456         return endword;
457 }
458
459 /*
460  * Generates bigrams for wildcard search string.
461  *
462  * Returns array of bigrams that must occur in any string that matches the
463  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
464  * " a", "bcd" would be extracted.
465  *
466  * Set 'removeDups' to true if duplicate bigrams are removed.
467  */
468 BIGM *
469 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
470 {
471         BIGM       *bgm;
472         char       *buf;
473         bigm       *bptr;
474         int                     len,
475                                 charlen,
476                                 bytelen;
477         const char *eword;
478
479         *removeDups = false;
480
481         /*
482          * Guard against possible overflow in the palloc requests below.
483          * We need to prevent integer overflow in the multiplications here.
484          */
485         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
486                 (Size) slen > MaxAllocSize - 4)
487                 ereport(ERROR,
488                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
489                                  errmsg("out of memory")));
490
491         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
492         SET_VARSIZE(bgm, VARHDRSZ);
493
494         if (slen + LPADDING + RPADDING < 2 || slen == 0)
495                 return bgm;
496
497         bptr = GETARR(bgm);
498
499         buf = palloc(sizeof(char) * (slen + 4));
500
501         /*
502          * Extract bigrams from each substring extracted by get_wildcard_part.
503          */
504         eword = str;
505         while ((eword = get_wildcard_part(eword, slen - (eword - str),
506                                                                           buf, &bytelen, &charlen)) != NULL)
507         {
508                 /*
509                  * count bigrams
510                  */
511                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
512         }
513
514         pfree(buf);
515
516         if ((len = bptr - GETARR(bgm)) == 0)
517                 return bgm;
518
519         /*
520          * Make bigrams unique.
521          */
522         if (len > 1)
523         {
524                 bool            haveDups = false;
525
526                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
527                 if (haveDups)
528                 {
529                         *removeDups = true;
530                         len = unique_array(GETARR(bgm), len);
531                 }
532         }
533
534         SET_VARSIZE(bgm, CALCGTSIZE(len));
535
536         return bgm;
537 }
538
539 Datum
540 show_bigm(PG_FUNCTION_ARGS)
541 {
542         text       *in = PG_GETARG_TEXT_P(0);
543         BIGM       *bgm;
544         Datum      *d;
545         ArrayType  *a;
546         bigm       *ptr;
547         int                     i;
548
549         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
550         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
551
552         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
553         {
554                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
555
556                 d[i] = PointerGetDatum(item);
557         }
558
559         a = construct_array(
560                                                 d,
561                                                 ARRNELEM(bgm),
562                                                 TEXTOID,
563                                                 -1,
564                                                 false,
565                                                 'i'
566                 );
567
568         for (i = 0; i < ARRNELEM(bgm); i++)
569                 pfree(DatumGetPointer(d[i]));
570
571         pfree(d);
572         pfree(bgm);
573         PG_FREE_IF_COPY(in, 0);
574
575         PG_RETURN_POINTER(a);
576 }
577
578 static float4
579 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
580 {
581         bigm       *ptr1,
582                            *ptr2;
583         int                     count = 0;
584         int                     len1,
585                                 len2;
586
587         ptr1 = GETARR(bgm1);
588         ptr2 = GETARR(bgm2);
589
590         len1 = ARRNELEM(bgm1);
591         len2 = ARRNELEM(bgm2);
592
593         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
594         if (len1 <= 0 || len2 <= 0)
595                 return (float4) 0.0;
596
597         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
598         {
599                 int                     res = CMPBIGM(ptr1, ptr2);
600
601                 if (res < 0)
602                         ptr1++;
603                 else if (res > 0)
604                         ptr2++;
605                 else
606                 {
607                         ptr1++;
608                         ptr2++;
609                         count++;
610                 }
611         }
612
613 #ifdef DIVUNION
614         return ((float4) count) / ((float4) (len1 + len2 - count));
615 #else
616         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
617 #endif
618 }
619
620 Datum
621 bigm_similarity(PG_FUNCTION_ARGS)
622 {
623         text       *in1 = PG_GETARG_TEXT_P(0);
624         text       *in2 = PG_GETARG_TEXT_P(1);
625         BIGM       *bgm1,
626                            *bgm2;
627         float4          res;
628
629         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
630         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
631
632         res = cnt_sml_bigm(bgm1, bgm2);
633
634         pfree(bgm1);
635         pfree(bgm2);
636         PG_FREE_IF_COPY(in1, 0);
637         PG_FREE_IF_COPY(in2, 1);
638
639         PG_RETURN_FLOAT4(res);
640 }
641
642 Datum
643 bigm_similarity_op(PG_FUNCTION_ARGS)
644 {
645         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
646                                                                                                                  PG_GETARG_DATUM(0),
647                                                                                                                  PG_GETARG_DATUM(1)));
648
649         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
650 }
651
652 Datum
653 likequery(PG_FUNCTION_ARGS)
654 {
655         text       *query = PG_GETARG_TEXT_PP(0);
656         const char *str;
657         int                     len;
658         const char *sp;
659         text       *result;
660         char       *rp;
661         int                     mblen;
662
663         str = VARDATA_ANY(query);
664         len = VARSIZE_ANY_EXHDR(query);
665
666         if (len == 0)
667                 PG_RETURN_NULL();
668
669         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
670         rp = VARDATA(result);
671         *rp++ = '%';
672
673         for (sp = str; (sp - str) < len;)
674         {
675                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
676                 {
677                         *rp++ = '\\';
678                         *rp++ = *sp++;
679                 }
680                 else if (IS_HIGHBIT_SET(*sp))
681                 {
682                         mblen = pg_mblen(sp);
683                         memcpy(rp, sp, mblen);
684                         rp += mblen;
685                         sp += mblen;
686                 }
687                 else
688                         *rp++ = *sp++;
689         }
690
691         *rp++ = '%';
692         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
693
694         PG_RETURN_TEXT_P(result);
695 }
696
697 inline int
698 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
699 {
700         int                     i;
701         int                     len = Min(len1, len2);
702
703         for (i = 0; i < len; i++, arg1++, arg2++)
704         {
705                 if (*arg1 == *arg2)
706                         continue;
707                 if (*arg1 < *arg2)
708                         return -1;
709                 else
710                         return 1;
711         }
712
713         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
714 }
715
716 Datum
717 bigmtextcmp(PG_FUNCTION_ARGS)
718 {
719         text       *arg1 = PG_GETARG_TEXT_PP(0);
720         text       *arg2 = PG_GETARG_TEXT_PP(1);
721         char       *a1p = VARDATA_ANY(arg1);
722         char       *a2p = VARDATA_ANY(arg2);
723         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
724         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
725
726         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
727 }