OSDN Git Service

Update copyright for 2016.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5  *
6  * Changelog:
7  *       2013/01/09
8  *       Support full text search using bigrams.
9  *       Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <ctype.h>
16
17 #include "bigm.h"
18
19 #include "catalog/pg_type.h"
20 #include "tsearch/ts_locale.h"
21 #include "utils/array.h"
22 #include "utils/memutils.h"
23
24 PG_MODULE_MAGIC;
25
26 /* Last update date of pg_bigm */
27 #define BIGM_LAST_UPDATE        "2015.09.10"
28
29 /* GUC variable */
30 bool            bigm_enable_recheck = false;
31 int                     bigm_gin_key_limit = 0;
32 double          bigm_similarity_limit = 0.3;
33 char       *bigm_last_update = NULL;
34
35 PG_FUNCTION_INFO_V1(show_bigm);
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 PG_FUNCTION_INFO_V1(likequery);
38 PG_FUNCTION_INFO_V1(bigm_similarity);
39 PG_FUNCTION_INFO_V1(bigm_similarity_op);
40
41 /*
42  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
43  * macro since 9.4, and hence the declaration of the function prototypes
44  * here is necessary only for 9.3 or before.
45  */
46 #if PG_VERSION_NUM < 90400
47 Datum           show_bigm(PG_FUNCTION_ARGS);
48 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
49 Datum           likequery(PG_FUNCTION_ARGS);
50 Datum           bigm_similarity(PG_FUNCTION_ARGS);
51 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
52 #endif
53
54 void            _PG_init(void);
55 void            _PG_fini(void);
56
57 void
58 _PG_init(void)
59 {
60         /* Define custom GUC variables */
61         DefineCustomBoolVariable("pg_bigm.enable_recheck",
62                                                          "Recheck that heap tuples fetched from index "
63                                                          "match the query.",
64                                                          NULL,
65                                                          &bigm_enable_recheck,
66                                                          true,
67                                                          PGC_USERSET,
68                                                          0,
69                                                          NULL,
70                                                          NULL,
71                                                          NULL);
72
73         DefineCustomIntVariable("pg_bigm.gin_key_limit",
74                                                         "Sets the maximum number of bi-gram keys allowed to "
75                                                         "use for GIN index search.",
76                                                         "Zero means no limit.",
77                                                         &bigm_gin_key_limit,
78                                                         0,
79                                                         0, INT_MAX,
80                                                         PGC_USERSET,
81                                                         0,
82                                                         NULL,
83                                                         NULL,
84                                                         NULL);
85
86         DefineCustomRealVariable("pg_bigm.similarity_limit",
87                                                          "Sets the similarity threshold used by the "
88                                                          "=% operator.",
89                                                          NULL,
90                                                          &bigm_similarity_limit,
91                                                          0.3,
92                                                          0.0, 1.0,
93                                                          PGC_USERSET,
94                                                          0,
95                                                          NULL,
96                                                          NULL,
97                                                          NULL);
98
99         /* Can't be set in postgresql.conf */
100         DefineCustomStringVariable("pg_bigm.last_update",
101                                                            "Shows the last update date of pg_bigm.",
102                                                            NULL,
103                                                            &bigm_last_update,
104                                                            BIGM_LAST_UPDATE,
105                                                            PGC_INTERNAL,
106                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
107                                                            NULL,
108                                                            NULL,
109                                                            NULL);
110
111         EmitWarningsOnPlaceholders("pg_bigm");
112 }
113
114 void
115 _PG_fini(void)
116 {
117 }
118
119 static int
120 comp_bigm(const void *a, const void *b, void *arg)
121 {
122         int                     res;
123         bool       *haveDups = (bool *) arg;
124
125         res = CMPBIGM(a, b);
126
127         if (res == 0)
128                 *haveDups = true;
129
130         return res;
131 }
132
133 static int
134 unique_array(bigm *a, int len)
135 {
136         bigm       *curend,
137                            *tmp;
138
139         curend = tmp = a;
140         while (tmp - a < len)
141                 if (CMPBIGM(tmp, curend))
142                 {
143                         curend++;
144                         if (curend != tmp)
145                                 memcpy(curend, tmp, BIGMSIZE);
146                         tmp++;
147                 }
148                 else
149                         tmp++;
150
151         return curend + 1 - a;
152 }
153
154 #define iswordchr(c)    (!t_isspace(c))
155
156 /*
157  * Finds first word in string, returns pointer to the word,
158  * endword points to the character after word
159  */
160 static char *
161 find_word(char *str, int lenstr, char **endword, int *charlen)
162 {
163         char       *beginword = str;
164
165         while (beginword - str < lenstr && !iswordchr(beginword))
166                 beginword += pg_mblen(beginword);
167
168         if (beginword - str >= lenstr)
169                 return NULL;
170
171         *endword = beginword;
172         *charlen = 0;
173         while (*endword - str < lenstr && iswordchr(*endword))
174         {
175                 *endword += pg_mblen(*endword);
176                 (*charlen)++;
177         }
178
179         return beginword;
180 }
181
182 /*
183  * The function is named compact_bigram to maintain consistency with pg_trgm,
184  * though it does not reduce multibyte characters to hash values like in
185  * compact_trigram.
186  */
187 static void
188 compact_bigram(bigm *bptr, char *str, int bytelen)
189 {
190         CPBIGM(bptr, str, bytelen);
191 }
192
193 /*
194  * Adds bigrams from words (already padded).
195  */
196 static bigm *
197 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
198 {
199         char       *ptr = str;
200
201         if (charlen < 2)
202         {
203                 compact_bigram(bptr, ptr, pg_mblen(str));
204                 bptr->pmatch = true;
205                 bptr++;
206                 return bptr;
207         }
208
209         if (bytelen > charlen)
210         {
211                 /* Find multibyte character boundaries and call compact_bigram */
212                 int                     lenfirst = pg_mblen(str),
213                                         lenlast = pg_mblen(str + lenfirst);
214
215                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
216                 {
217                         compact_bigram(bptr, ptr, lenfirst + lenlast);
218
219                         ptr += lenfirst;
220                         bptr++;
221
222                         lenfirst = lenlast;
223                         lenlast = pg_mblen(ptr + lenfirst);
224                 }
225         }
226         else
227         {
228                 /* Fast path when there are no multibyte characters */
229                 Assert(bytelen == charlen);
230
231                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
232                 {
233                         CPBIGM(bptr, ptr, 2);
234                         ptr++;
235                         bptr++;
236                 }
237         }
238
239         return bptr;
240 }
241
242 BIGM *
243 generate_bigm(char *str, int slen)
244 {
245         BIGM       *bgm;
246         char       *buf;
247         bigm       *bptr;
248         int                     len,
249                                 charlen,
250                                 bytelen;
251         char       *bword,
252                            *eword;
253
254         /*
255          * Guard against possible overflow in the palloc requests below.
256          * We need to prevent integer overflow in the multiplications here.
257          */
258         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
259                 (Size) slen > MaxAllocSize - 4)
260                 ereport(ERROR,
261                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
262                                  errmsg("out of memory")));
263
264         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
265         SET_VARSIZE(bgm, VARHDRSZ);
266
267         if (slen + LPADDING + RPADDING < 2 || slen == 0)
268                 return bgm;
269
270         bptr = GETARR(bgm);
271
272         buf = palloc(sizeof(char) * (slen + 4));
273
274         if (LPADDING > 0)
275         {
276                 *buf = ' ';
277                 if (LPADDING > 1)
278                         *(buf + 1) = ' ';
279         }
280
281         eword = str;
282         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
283         {
284                 bytelen = eword - bword;
285                 memcpy(buf + LPADDING, bword, bytelen);
286
287                 buf[LPADDING + bytelen] = ' ';
288                 buf[LPADDING + bytelen + 1] = ' ';
289
290                 /*
291                  * count bigrams
292                  */
293                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
294                                                         charlen + LPADDING + RPADDING);
295         }
296
297         pfree(buf);
298
299         if ((len = bptr - GETARR(bgm)) == 0)
300                 return bgm;
301
302         /*
303          * Make bigrams unique.
304          */
305         if (len > 1)
306         {
307                 bool            haveDups = false;
308
309                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
310                 if (haveDups)
311                         len = unique_array(GETARR(bgm), len);
312         }
313
314         SET_VARSIZE(bgm, CALCGTSIZE(len));
315
316         return bgm;
317 }
318
319 /*
320  * Extract the next non-wildcard part of a search string, ie, a word bounded
321  * by '_' or '%' meta-characters, non-word characters or string end.
322  *
323  * str: source string, of length lenstr bytes (need not be null-terminated)
324  * buf: where to return the substring (must be long enough)
325  * *bytelen: receives byte length of the found substring
326  * *charlen: receives character length of the found substring
327  *
328  * Returns pointer to end+1 of the found substring in the source string.
329  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
330  *
331  * If the found word is bounded by non-word characters or string boundaries
332  * then this function will include corresponding padding spaces into buf.
333  */
334 static const char *
335 get_wildcard_part(const char *str, int lenstr,
336                                   char *buf, int *bytelen, int *charlen)
337 {
338         const char *beginword = str;
339         const char *endword;
340         char       *s = buf;
341         bool            in_leading_wildcard_meta = false;
342         bool            in_trailing_wildcard_meta = false;
343         bool            in_escape = false;
344         int                     clen;
345
346         /*
347          * Find the first word character, remembering whether preceding character
348          * was wildcard meta-character.  Note that the in_escape state persists
349          * from this loop to the next one, since we may exit at a word character
350          * that is in_escape.
351          */
352         while (beginword - str < lenstr)
353         {
354                 if (in_escape)
355                 {
356                         if (iswordchr(beginword))
357                                 break;
358                         in_escape = false;
359                         in_leading_wildcard_meta = false;
360                 }
361                 else
362                 {
363                         if (ISESCAPECHAR(beginword))
364                                 in_escape = true;
365                         else if (ISWILDCARDCHAR(beginword))
366                                 in_leading_wildcard_meta = true;
367                         else if (iswordchr(beginword))
368                                 break;
369                         else
370                                 in_leading_wildcard_meta = false;
371                 }
372                 beginword += pg_mblen(beginword);
373         }
374
375         /*
376          * Handle string end.
377          */
378         if (beginword - str >= lenstr)
379                 return NULL;
380
381         /*
382          * Add left padding spaces if preceding character wasn't wildcard
383          * meta-character.
384          */
385         *charlen = 0;
386         if (!in_leading_wildcard_meta)
387         {
388                 if (LPADDING > 0)
389                 {
390                         *s++ = ' ';
391                         (*charlen)++;
392                         if (LPADDING > 1)
393                         {
394                                 *s++ = ' ';
395                                 (*charlen)++;
396                         }
397                 }
398         }
399
400         /*
401          * Copy data into buf until wildcard meta-character, non-word character or
402          * string boundary.  Strip escapes during copy.
403          */
404         endword = beginword;
405         while (endword - str < lenstr)
406         {
407                 clen = pg_mblen(endword);
408                 if (in_escape)
409                 {
410                         if (iswordchr(endword))
411                         {
412                                 memcpy(s, endword, clen);
413                                 (*charlen)++;
414                                 s += clen;
415                         }
416                         else
417                         {
418                                 /*
419                                  * Back up endword to the escape character when stopping at an
420                                  * escaped char, so that subsequent get_wildcard_part will
421                                  * restart from the escape character.  We assume here that
422                                  * escape chars are single-byte.
423                                  */
424                                 endword--;
425                                 break;
426                         }
427                         in_escape = false;
428                 }
429                 else
430                 {
431                         if (ISESCAPECHAR(endword))
432                                 in_escape = true;
433                         else if (ISWILDCARDCHAR(endword))
434                         {
435                                 in_trailing_wildcard_meta = true;
436                                 break;
437                         }
438                         else if (iswordchr(endword))
439                         {
440                                 memcpy(s, endword, clen);
441                                 (*charlen)++;
442                                 s += clen;
443                         }
444                         else
445                                 break;
446                 }
447                 endword += clen;
448         }
449
450         /*
451          * Add right padding spaces if next character isn't wildcard
452          * meta-character.
453          */
454         if (!in_trailing_wildcard_meta)
455         {
456                 if (RPADDING > 0)
457                 {
458                         *s++ = ' ';
459                         (*charlen)++;
460                         if (RPADDING > 1)
461                         {
462                                 *s++ = ' ';
463                                 (*charlen)++;
464                         }
465                 }
466         }
467
468         *bytelen = s - buf;
469         return endword;
470 }
471
472 /*
473  * Generates bigrams for wildcard search string.
474  *
475  * Returns array of bigrams that must occur in any string that matches the
476  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
477  * " a", "bcd" would be extracted.
478  *
479  * Set 'removeDups' to true if duplicate bigrams are removed.
480  */
481 BIGM *
482 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
483 {
484         BIGM       *bgm;
485         char       *buf;
486         bigm       *bptr;
487         int                     len,
488                                 charlen,
489                                 bytelen;
490         const char *eword;
491
492         *removeDups = false;
493
494         /*
495          * Guard against possible overflow in the palloc requests below.
496          * We need to prevent integer overflow in the multiplications here.
497          */
498         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
499                 (Size) slen > MaxAllocSize - 4)
500                 ereport(ERROR,
501                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502                                  errmsg("out of memory")));
503
504         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
505         SET_VARSIZE(bgm, VARHDRSZ);
506
507         if (slen + LPADDING + RPADDING < 2 || slen == 0)
508                 return bgm;
509
510         bptr = GETARR(bgm);
511
512         buf = palloc(sizeof(char) * (slen + 4));
513
514         /*
515          * Extract bigrams from each substring extracted by get_wildcard_part.
516          */
517         eword = str;
518         while ((eword = get_wildcard_part(eword, slen - (eword - str),
519                                                                           buf, &bytelen, &charlen)) != NULL)
520         {
521                 /*
522                  * count bigrams
523                  */
524                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
525         }
526
527         pfree(buf);
528
529         if ((len = bptr - GETARR(bgm)) == 0)
530                 return bgm;
531
532         /*
533          * Make bigrams unique.
534          */
535         if (len > 1)
536         {
537                 bool            haveDups = false;
538
539                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
540                 if (haveDups)
541                 {
542                         *removeDups = true;
543                         len = unique_array(GETARR(bgm), len);
544                 }
545         }
546
547         SET_VARSIZE(bgm, CALCGTSIZE(len));
548
549         return bgm;
550 }
551
552 Datum
553 show_bigm(PG_FUNCTION_ARGS)
554 {
555         text       *in = PG_GETARG_TEXT_P(0);
556         BIGM       *bgm;
557         Datum      *d;
558         ArrayType  *a;
559         bigm       *ptr;
560         int                     i;
561
562         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
563         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
564
565         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
566         {
567                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
568
569                 d[i] = PointerGetDatum(item);
570         }
571
572         a = construct_array(
573                                                 d,
574                                                 ARRNELEM(bgm),
575                                                 TEXTOID,
576                                                 -1,
577                                                 false,
578                                                 'i'
579                 );
580
581         for (i = 0; i < ARRNELEM(bgm); i++)
582                 pfree(DatumGetPointer(d[i]));
583
584         pfree(d);
585         pfree(bgm);
586         PG_FREE_IF_COPY(in, 0);
587
588         PG_RETURN_POINTER(a);
589 }
590
591 static float4
592 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
593 {
594         bigm       *ptr1,
595                            *ptr2;
596         int                     count = 0;
597         int                     len1,
598                                 len2;
599
600         ptr1 = GETARR(bgm1);
601         ptr2 = GETARR(bgm2);
602
603         len1 = ARRNELEM(bgm1);
604         len2 = ARRNELEM(bgm2);
605
606         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
607         if (len1 <= 0 || len2 <= 0)
608                 return (float4) 0.0;
609
610         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
611         {
612                 int                     res = CMPBIGM(ptr1, ptr2);
613
614                 if (res < 0)
615                         ptr1++;
616                 else if (res > 0)
617                         ptr2++;
618                 else
619                 {
620                         ptr1++;
621                         ptr2++;
622                         count++;
623                 }
624         }
625
626 #ifdef DIVUNION
627         return ((float4) count) / ((float4) (len1 + len2 - count));
628 #else
629         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
630 #endif
631 }
632
633 Datum
634 bigm_similarity(PG_FUNCTION_ARGS)
635 {
636         text       *in1 = PG_GETARG_TEXT_P(0);
637         text       *in2 = PG_GETARG_TEXT_P(1);
638         BIGM       *bgm1,
639                            *bgm2;
640         float4          res;
641
642         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
643         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
644
645         res = cnt_sml_bigm(bgm1, bgm2);
646
647         pfree(bgm1);
648         pfree(bgm2);
649         PG_FREE_IF_COPY(in1, 0);
650         PG_FREE_IF_COPY(in2, 1);
651
652         PG_RETURN_FLOAT4(res);
653 }
654
655 Datum
656 bigm_similarity_op(PG_FUNCTION_ARGS)
657 {
658         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
659                                                                                                                  PG_GETARG_DATUM(0),
660                                                                                                                  PG_GETARG_DATUM(1)));
661
662         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
663 }
664
665 Datum
666 likequery(PG_FUNCTION_ARGS)
667 {
668         text       *query = PG_GETARG_TEXT_PP(0);
669         const char *str;
670         int                     len;
671         const char *sp;
672         text       *result;
673         char       *rp;
674         int                     mblen;
675
676         str = VARDATA_ANY(query);
677         len = VARSIZE_ANY_EXHDR(query);
678
679         if (len == 0)
680                 PG_RETURN_NULL();
681
682         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
683         rp = VARDATA(result);
684         *rp++ = '%';
685
686         for (sp = str; (sp - str) < len;)
687         {
688                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
689                 {
690                         *rp++ = '\\';
691                         *rp++ = *sp++;
692                 }
693                 else if (IS_HIGHBIT_SET(*sp))
694                 {
695                         mblen = pg_mblen(sp);
696                         memcpy(rp, sp, mblen);
697                         rp += mblen;
698                         sp += mblen;
699                 }
700                 else
701                         *rp++ = *sp++;
702         }
703
704         *rp++ = '%';
705         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
706
707         PG_RETURN_TEXT_P(result);
708 }
709
710 inline int
711 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
712 {
713         int                     i;
714         int                     len = Min(len1, len2);
715
716         for (i = 0; i < len; i++, arg1++, arg2++)
717         {
718                 if (*arg1 == *arg2)
719                         continue;
720                 if (*arg1 < *arg2)
721                         return -1;
722                 else
723                         return 1;
724         }
725
726         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
727 }
728
729 Datum
730 bigmtextcmp(PG_FUNCTION_ARGS)
731 {
732         text       *arg1 = PG_GETARG_TEXT_PP(0);
733         text       *arg2 = PG_GETARG_TEXT_PP(1);
734         char       *a1p = VARDATA_ANY(arg1);
735         char       *a2p = VARDATA_ANY(arg2);
736         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
737         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
738
739         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
740 }