OSDN Git Service

Switch CI from Travis CI to GitHub Actions (#12)
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2017-2023, pg_bigm Development Group
4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
6  *
7  * Changelog:
8  *       2013/01/09
9  *       Support full text search using bigrams.
10  *       Author: NTT DATA Corporation
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include <ctype.h>
17
18 #include "bigm.h"
19
20 #include "catalog/pg_type.h"
21 #include "tsearch/ts_locale.h"
22 #include "utils/array.h"
23 #if PG_VERSION_NUM >= 160000
24 #include "utils/guc.h"
25 #endif /* PG_VERSION_NUM */
26 #include "utils/memutils.h"
27
28 PG_MODULE_MAGIC;
29
30 /* Last update date of pg_bigm */
31 #define BIGM_LAST_UPDATE        "2020.02.28"
32
33 /* GUC variable */
34 bool            bigm_enable_recheck = false;
35 int                     bigm_gin_key_limit = 0;
36 double          bigm_similarity_limit = 0.3;
37 char       *bigm_last_update = NULL;
38
39 PG_FUNCTION_INFO_V1(show_bigm);
40 PG_FUNCTION_INFO_V1(bigmtextcmp);
41 PG_FUNCTION_INFO_V1(likequery);
42 PG_FUNCTION_INFO_V1(bigm_similarity);
43 PG_FUNCTION_INFO_V1(bigm_similarity_op);
44
45 /*
46  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
47  * macro since 9.4, and hence the declaration of the function prototypes
48  * here is necessary only for 9.3 or before.
49  */
50 #if PG_VERSION_NUM < 90400
51 Datum           show_bigm(PG_FUNCTION_ARGS);
52 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
53 Datum           likequery(PG_FUNCTION_ARGS);
54 Datum           bigm_similarity(PG_FUNCTION_ARGS);
55 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
56 #endif
57
58 void            _PG_init(void);
59 void            _PG_fini(void);
60
61 void
62 _PG_init(void)
63 {
64         /* Define custom GUC variables */
65         DefineCustomBoolVariable("pg_bigm.enable_recheck",
66                                                          "Recheck that heap tuples fetched from index "
67                                                          "match the query.",
68                                                          NULL,
69                                                          &bigm_enable_recheck,
70                                                          true,
71                                                          PGC_USERSET,
72                                                          0,
73                                                          NULL,
74                                                          NULL,
75                                                          NULL);
76
77         DefineCustomIntVariable("pg_bigm.gin_key_limit",
78                                                         "Sets the maximum number of bi-gram keys allowed to "
79                                                         "use for GIN index search.",
80                                                         "Zero means no limit.",
81                                                         &bigm_gin_key_limit,
82                                                         0,
83                                                         0, INT_MAX,
84                                                         PGC_USERSET,
85                                                         0,
86                                                         NULL,
87                                                         NULL,
88                                                         NULL);
89
90         DefineCustomRealVariable("pg_bigm.similarity_limit",
91                                                          "Sets the similarity threshold used by the "
92                                                          "=% operator.",
93                                                          NULL,
94                                                          &bigm_similarity_limit,
95                                                          0.3,
96                                                          0.0, 1.0,
97                                                          PGC_USERSET,
98                                                          0,
99                                                          NULL,
100                                                          NULL,
101                                                          NULL);
102
103         /* Can't be set in postgresql.conf */
104         DefineCustomStringVariable("pg_bigm.last_update",
105                                                            "Shows the last update date of pg_bigm.",
106                                                            NULL,
107                                                            &bigm_last_update,
108                                                            BIGM_LAST_UPDATE,
109                                                            PGC_INTERNAL,
110                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
111                                                            NULL,
112                                                            NULL,
113                                                            NULL);
114
115         EmitWarningsOnPlaceholders("pg_bigm");
116 }
117
118 void
119 _PG_fini(void)
120 {
121 }
122
123 static int
124 comp_bigm(const void *a, const void *b, void *arg)
125 {
126         int                     res;
127         bool       *haveDups = (bool *) arg;
128
129         res = CMPBIGM(a, b);
130
131         if (res == 0)
132                 *haveDups = true;
133
134         return res;
135 }
136
137 static int
138 unique_array(bigm *a, int len)
139 {
140         bigm       *curend,
141                            *tmp;
142
143         curend = tmp = a;
144         while (tmp - a < len)
145                 if (CMPBIGM(tmp, curend))
146                 {
147                         curend++;
148                         if (curend != tmp)
149                                 memcpy(curend, tmp, BIGMSIZE);
150                         tmp++;
151                 }
152                 else
153                         tmp++;
154
155         return curend + 1 - a;
156 }
157
158 #define iswordchr(c)    (!t_isspace(c))
159
160 /*
161  * Finds first word in string, returns pointer to the word,
162  * endword points to the character after word
163  */
164 static char *
165 find_word(char *str, int lenstr, char **endword, int *charlen)
166 {
167         char       *beginword = str;
168
169         while (beginword - str < lenstr && !iswordchr(beginword))
170                 beginword += pg_mblen(beginword);
171
172         if (beginword - str >= lenstr)
173                 return NULL;
174
175         *endword = beginword;
176         *charlen = 0;
177         while (*endword - str < lenstr && iswordchr(*endword))
178         {
179                 *endword += pg_mblen(*endword);
180                 (*charlen)++;
181         }
182
183         return beginword;
184 }
185
186 /*
187  * The function is named compact_bigram to maintain consistency with pg_trgm,
188  * though it does not reduce multibyte characters to hash values like in
189  * compact_trigram.
190  */
191 static void
192 compact_bigram(bigm *bptr, char *str, int bytelen)
193 {
194         CPBIGM(bptr, str, bytelen);
195 }
196
197 /*
198  * Adds bigrams from words (already padded).
199  */
200 static bigm *
201 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
202 {
203         char       *ptr = str;
204
205         if (charlen < 2)
206         {
207                 compact_bigram(bptr, ptr, pg_mblen(str));
208                 bptr->pmatch = true;
209                 bptr++;
210                 return bptr;
211         }
212
213         if (bytelen > charlen)
214         {
215                 /* Find multibyte character boundaries and call compact_bigram */
216                 int                     lenfirst = pg_mblen(str),
217                                         lenlast = pg_mblen(str + lenfirst);
218
219                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
220                 {
221                         compact_bigram(bptr, ptr, lenfirst + lenlast);
222
223                         ptr += lenfirst;
224                         bptr++;
225
226                         lenfirst = lenlast;
227                         lenlast = pg_mblen(ptr + lenfirst);
228                 }
229         }
230         else
231         {
232                 /* Fast path when there are no multibyte characters */
233                 Assert(bytelen == charlen);
234
235                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
236                 {
237                         CPBIGM(bptr, ptr, 2);
238                         ptr++;
239                         bptr++;
240                 }
241         }
242
243         return bptr;
244 }
245
246 BIGM *
247 generate_bigm(char *str, int slen)
248 {
249         BIGM       *bgm;
250         char       *buf;
251         bigm       *bptr;
252         int                     len,
253                                 charlen,
254                                 bytelen;
255         char       *bword,
256                            *eword;
257
258         /*
259          * Guard against possible overflow in the palloc requests below.
260          * We need to prevent integer overflow in the multiplications here.
261          */
262         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
263                 (Size) slen > MaxAllocSize - 4)
264                 ereport(ERROR,
265                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
266                                  errmsg("out of memory")));
267
268         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
269         SET_VARSIZE(bgm, VARHDRSZ);
270
271         if (slen + LPADDING + RPADDING < 2 || slen == 0)
272                 return bgm;
273
274         bptr = GETARR(bgm);
275
276         buf = palloc(sizeof(char) * (slen + 4));
277
278         if (LPADDING > 0)
279         {
280                 *buf = ' ';
281                 if (LPADDING > 1)
282                         *(buf + 1) = ' ';
283         }
284
285         eword = str;
286         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
287         {
288                 bytelen = eword - bword;
289                 memcpy(buf + LPADDING, bword, bytelen);
290
291                 buf[LPADDING + bytelen] = ' ';
292                 buf[LPADDING + bytelen + 1] = ' ';
293
294                 /*
295                  * count bigrams
296                  */
297                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
298                                                         charlen + LPADDING + RPADDING);
299         }
300
301         pfree(buf);
302
303         if ((len = bptr - GETARR(bgm)) == 0)
304                 return bgm;
305
306         /*
307          * Make bigrams unique.
308          */
309         if (len > 1)
310         {
311                 bool            haveDups = false;
312
313                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
314                 if (haveDups)
315                         len = unique_array(GETARR(bgm), len);
316         }
317
318         SET_VARSIZE(bgm, CALCGTSIZE(len));
319
320         return bgm;
321 }
322
323 /*
324  * Extract the next non-wildcard part of a search string, i.e. a word bounded
325  * by '_' or '%' meta-characters, non-word characters or string end.
326  *
327  * str: source string, of length lenstr bytes (need not be null-terminated)
328  * buf: where to return the substring (must be long enough)
329  * *bytelen: receives byte length of the found substring
330  * *charlen: receives character length of the found substring
331  *
332  * Returns pointer to end+1 of the found substring in the source string.
333  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
334  *
335  * If the found word is bounded by non-word characters or string boundaries
336  * then this function will include corresponding padding spaces into buf.
337  */
338 static const char *
339 get_wildcard_part(const char *str, int lenstr,
340                                   char *buf, int *bytelen, int *charlen)
341 {
342         const char *beginword = str;
343         const char *endword;
344         char       *s = buf;
345         bool            in_leading_wildcard_meta = false;
346         bool            in_trailing_wildcard_meta = false;
347         bool            in_escape = false;
348         int                     clen;
349
350         /*
351          * Find the first word character, remembering whether preceding character
352          * was wildcard meta-character.  Note that the in_escape state persists
353          * from this loop to the next one, since we may exit at a word character
354          * that is in_escape.
355          */
356         while (beginword - str < lenstr)
357         {
358                 if (in_escape)
359                 {
360                         if (iswordchr(beginword))
361                                 break;
362                         in_escape = false;
363                         in_leading_wildcard_meta = false;
364                 }
365                 else
366                 {
367                         if (ISESCAPECHAR(beginword))
368                                 in_escape = true;
369                         else if (ISWILDCARDCHAR(beginword))
370                                 in_leading_wildcard_meta = true;
371                         else if (iswordchr(beginword))
372                                 break;
373                         else
374                                 in_leading_wildcard_meta = false;
375                 }
376                 beginword += pg_mblen(beginword);
377         }
378
379         /*
380          * Handle string end.
381          */
382         if (beginword - str >= lenstr)
383                 return NULL;
384
385         /*
386          * Add left padding spaces if preceding character wasn't wildcard
387          * meta-character.
388          */
389         *charlen = 0;
390         if (!in_leading_wildcard_meta)
391         {
392                 if (LPADDING > 0)
393                 {
394                         *s++ = ' ';
395                         (*charlen)++;
396                         if (LPADDING > 1)
397                         {
398                                 *s++ = ' ';
399                                 (*charlen)++;
400                         }
401                 }
402         }
403
404         /*
405          * Copy data into buf until wildcard meta-character, non-word character or
406          * string boundary.  Strip escapes during copy.
407          */
408         endword = beginword;
409         while (endword - str < lenstr)
410         {
411                 clen = pg_mblen(endword);
412                 if (in_escape)
413                 {
414                         if (iswordchr(endword))
415                         {
416                                 memcpy(s, endword, clen);
417                                 (*charlen)++;
418                                 s += clen;
419                         }
420                         else
421                         {
422                                 /*
423                                  * Back up endword to the escape character when stopping at an
424                                  * escaped char, so that subsequent get_wildcard_part will
425                                  * restart from the escape character.  We assume here that
426                                  * escape chars are single-byte.
427                                  */
428                                 endword--;
429                                 break;
430                         }
431                         in_escape = false;
432                 }
433                 else
434                 {
435                         if (ISESCAPECHAR(endword))
436                                 in_escape = true;
437                         else if (ISWILDCARDCHAR(endword))
438                         {
439                                 in_trailing_wildcard_meta = true;
440                                 break;
441                         }
442                         else if (iswordchr(endword))
443                         {
444                                 memcpy(s, endword, clen);
445                                 (*charlen)++;
446                                 s += clen;
447                         }
448                         else
449                                 break;
450                 }
451                 endword += clen;
452         }
453
454         /*
455          * Add right padding spaces if next character isn't wildcard
456          * meta-character.
457          */
458         if (!in_trailing_wildcard_meta)
459         {
460                 if (RPADDING > 0)
461                 {
462                         *s++ = ' ';
463                         (*charlen)++;
464                         if (RPADDING > 1)
465                         {
466                                 *s++ = ' ';
467                                 (*charlen)++;
468                         }
469                 }
470         }
471
472         *bytelen = s - buf;
473         return endword;
474 }
475
476 /*
477  * Generates bigrams for wildcard search string.
478  *
479  * Returns array of bigrams that must occur in any string that matches the
480  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
481  * " a", "bcd" would be extracted.
482  *
483  * Set 'removeDups' to true if duplicate bigrams are removed.
484  */
485 BIGM *
486 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
487 {
488         BIGM       *bgm;
489         char       *buf;
490         bigm       *bptr;
491         int                     len,
492                                 charlen,
493                                 bytelen;
494         const char *eword;
495
496         *removeDups = false;
497
498         /*
499          * Guard against possible overflow in the palloc requests below.
500          * We need to prevent integer overflow in the multiplications here.
501          */
502         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
503                 (Size) slen > MaxAllocSize - 4)
504                 ereport(ERROR,
505                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
506                                  errmsg("out of memory")));
507
508         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
509         SET_VARSIZE(bgm, VARHDRSZ);
510
511         if (slen + LPADDING + RPADDING < 2 || slen == 0)
512                 return bgm;
513
514         bptr = GETARR(bgm);
515
516         buf = palloc(sizeof(char) * (slen + 4));
517
518         /*
519          * Extract bigrams from each substring extracted by get_wildcard_part.
520          */
521         eword = str;
522         while ((eword = get_wildcard_part(eword, slen - (eword - str),
523                                                                           buf, &bytelen, &charlen)) != NULL)
524         {
525                 /*
526                  * count bigrams
527                  */
528                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
529         }
530
531         pfree(buf);
532
533         if ((len = bptr - GETARR(bgm)) == 0)
534                 return bgm;
535
536         /*
537          * Make bigrams unique.
538          */
539         if (len > 1)
540         {
541                 bool            haveDups = false;
542
543                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
544                 if (haveDups)
545                 {
546                         *removeDups = true;
547                         len = unique_array(GETARR(bgm), len);
548                 }
549         }
550
551         SET_VARSIZE(bgm, CALCGTSIZE(len));
552
553         return bgm;
554 }
555
556 Datum
557 show_bigm(PG_FUNCTION_ARGS)
558 {
559         text       *in = PG_GETARG_TEXT_P(0);
560         BIGM       *bgm;
561         Datum      *d;
562         ArrayType  *a;
563         bigm       *ptr;
564         int                     i;
565
566         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
567         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
568
569         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
570         {
571                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
572
573                 d[i] = PointerGetDatum(item);
574         }
575
576         a = construct_array(
577                                                 d,
578                                                 ARRNELEM(bgm),
579                                                 TEXTOID,
580                                                 -1,
581                                                 false,
582                                                 'i'
583                 );
584
585         for (i = 0; i < ARRNELEM(bgm); i++)
586                 pfree(DatumGetPointer(d[i]));
587
588         pfree(d);
589         pfree(bgm);
590         PG_FREE_IF_COPY(in, 0);
591
592         PG_RETURN_POINTER(a);
593 }
594
595 static float4
596 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
597 {
598         bigm       *ptr1,
599                            *ptr2;
600         int                     count = 0;
601         int                     len1,
602                                 len2;
603
604         ptr1 = GETARR(bgm1);
605         ptr2 = GETARR(bgm2);
606
607         len1 = ARRNELEM(bgm1);
608         len2 = ARRNELEM(bgm2);
609
610         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
611         if (len1 <= 0 || len2 <= 0)
612                 return (float4) 0.0;
613
614         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
615         {
616                 int                     res = CMPBIGM(ptr1, ptr2);
617
618                 if (res < 0)
619                         ptr1++;
620                 else if (res > 0)
621                         ptr2++;
622                 else
623                 {
624                         ptr1++;
625                         ptr2++;
626                         count++;
627                 }
628         }
629
630 #ifdef DIVUNION
631         return ((float4) count) / ((float4) (len1 + len2 - count));
632 #else
633         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
634 #endif
635 }
636
637 Datum
638 bigm_similarity(PG_FUNCTION_ARGS)
639 {
640         text       *in1 = PG_GETARG_TEXT_P(0);
641         text       *in2 = PG_GETARG_TEXT_P(1);
642         BIGM       *bgm1,
643                            *bgm2;
644         float4          res;
645
646         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
647         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
648
649         res = cnt_sml_bigm(bgm1, bgm2);
650
651         pfree(bgm1);
652         pfree(bgm2);
653         PG_FREE_IF_COPY(in1, 0);
654         PG_FREE_IF_COPY(in2, 1);
655
656         PG_RETURN_FLOAT4(res);
657 }
658
659 Datum
660 bigm_similarity_op(PG_FUNCTION_ARGS)
661 {
662         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
663                                                                                                                  PG_GETARG_DATUM(0),
664                                                                                                                  PG_GETARG_DATUM(1)));
665
666         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
667 }
668
669 Datum
670 likequery(PG_FUNCTION_ARGS)
671 {
672         text       *query = PG_GETARG_TEXT_PP(0);
673         const char *str;
674         int                     len;
675         const char *sp;
676         text       *result;
677         char       *rp;
678         int                     mblen;
679
680         str = VARDATA_ANY(query);
681         len = VARSIZE_ANY_EXHDR(query);
682
683         if (len == 0)
684                 PG_RETURN_NULL();
685
686         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
687         rp = VARDATA(result);
688         *rp++ = '%';
689
690         for (sp = str; (sp - str) < len;)
691         {
692                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
693                 {
694                         *rp++ = '\\';
695                         *rp++ = *sp++;
696                 }
697                 else if (IS_HIGHBIT_SET(*sp))
698                 {
699                         mblen = pg_mblen(sp);
700                         memcpy(rp, sp, mblen);
701                         rp += mblen;
702                         sp += mblen;
703                 }
704                 else
705                         *rp++ = *sp++;
706         }
707
708         *rp++ = '%';
709         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
710
711         PG_RETURN_TEXT_P(result);
712 }
713
714 Datum
715 bigmtextcmp(PG_FUNCTION_ARGS)
716 {
717         text       *arg1 = PG_GETARG_TEXT_PP(0);
718         text       *arg2 = PG_GETARG_TEXT_PP(1);
719         char       *a1p = VARDATA_ANY(arg1);
720         char       *a2p = VARDATA_ANY(arg2);
721         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
722         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
723
724         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
725 }