OSDN Git Service

Adjust blank lines around PG_MODULE_MAGIC defines, for consistency.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2015, NTT DATA Corporation
5  *
6  * Changelog:
7  *       2013/01/09
8  *       Support full text search using bigrams.
9  *       Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <ctype.h>
16
17 #include "bigm.h"
18
19 #include "catalog/pg_type.h"
20 #include "tsearch/ts_locale.h"
21 #include "utils/array.h"
22
23 PG_MODULE_MAGIC;
24
25 /* Last update date of pg_bigm */
26 #define BIGM_LAST_UPDATE        "2013.11.22"
27
28 /* GUC variable */
29 bool            bigm_enable_recheck = false;
30 int                     bigm_gin_key_limit = 0;
31 double          bigm_similarity_limit = 0.3;
32 char       *bigm_last_update = NULL;
33
34 PG_FUNCTION_INFO_V1(show_bigm);
35 PG_FUNCTION_INFO_V1(bigmtextcmp);
36 PG_FUNCTION_INFO_V1(likequery);
37 PG_FUNCTION_INFO_V1(bigm_similarity);
38 PG_FUNCTION_INFO_V1(bigm_similarity_op);
39
40 /*
41  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
42  * macro since 9.4, and hence the declaration of the function prototypes
43  * here is necessary only for 9.3 or before.
44  */
45 #if PG_VERSION_NUM < 90400
46 Datum           show_bigm(PG_FUNCTION_ARGS);
47 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
48 Datum           likequery(PG_FUNCTION_ARGS);
49 Datum           bigm_similarity(PG_FUNCTION_ARGS);
50 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
51 #endif
52
53 void            _PG_init(void);
54 void            _PG_fini(void);
55
56 void
57 _PG_init(void)
58 {
59         /* Define custom GUC variables */
60         DefineCustomBoolVariable("pg_bigm.enable_recheck",
61                                                          "Recheck that heap tuples fetched from index "
62                                                          "match the query.",
63                                                          NULL,
64                                                          &bigm_enable_recheck,
65                                                          true,
66                                                          PGC_USERSET,
67                                                          0,
68                                                          NULL,
69                                                          NULL,
70                                                          NULL);
71
72         DefineCustomIntVariable("pg_bigm.gin_key_limit",
73                                                         "Sets the maximum number of bi-gram keys allowed to "
74                                                         "use for GIN index search.",
75                                                         "Zero means no limit.",
76                                                         &bigm_gin_key_limit,
77                                                         0,
78                                                         0, INT_MAX,
79                                                         PGC_USERSET,
80                                                         0,
81                                                         NULL,
82                                                         NULL,
83                                                         NULL);
84
85         DefineCustomRealVariable("pg_bigm.similarity_limit",
86                                                          "Sets the similarity threshold used by the "
87                                                          "=% operator.",
88                                                          NULL,
89                                                          &bigm_similarity_limit,
90                                                          0.3,
91                                                          0.0, 1.0,
92                                                          PGC_USERSET,
93                                                          0,
94                                                          NULL,
95                                                          NULL,
96                                                          NULL);
97
98         /* Can't be set in postgresql.conf */
99         DefineCustomStringVariable("pg_bigm.last_update",
100                                                            "Shows the last update date of pg_bigm.",
101                                                            NULL,
102                                                            &bigm_last_update,
103                                                            BIGM_LAST_UPDATE,
104                                                            PGC_INTERNAL,
105                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
106                                                            NULL,
107                                                            NULL,
108                                                            NULL);
109
110         EmitWarningsOnPlaceholders("pg_bigm");
111 }
112
113 void
114 _PG_fini(void)
115 {
116 }
117
118 static int
119 comp_bigm(const void *a, const void *b, void *arg)
120 {
121         int                     res;
122         bool       *haveDups = (bool *) arg;
123
124         res = CMPBIGM(a, b);
125
126         if (res == 0)
127                 *haveDups = true;
128
129         return res;
130 }
131
132 static int
133 unique_array(bigm *a, int len)
134 {
135         bigm       *curend,
136                            *tmp;
137
138         curend = tmp = a;
139         while (tmp - a < len)
140                 if (CMPBIGM(tmp, curend))
141                 {
142                         curend++;
143                         if (curend != tmp)
144                                 memcpy(curend, tmp, BIGMSIZE);
145                         tmp++;
146                 }
147                 else
148                         tmp++;
149
150         return curend + 1 - a;
151 }
152
153 #define iswordchr(c)    (!t_isspace(c))
154
155 /*
156  * Finds first word in string, returns pointer to the word,
157  * endword points to the character after word
158  */
159 static char *
160 find_word(char *str, int lenstr, char **endword, int *charlen)
161 {
162         char       *beginword = str;
163
164         while (beginword - str < lenstr && !iswordchr(beginword))
165                 beginword += pg_mblen(beginword);
166
167         if (beginword - str >= lenstr)
168                 return NULL;
169
170         *endword = beginword;
171         *charlen = 0;
172         while (*endword - str < lenstr && iswordchr(*endword))
173         {
174                 *endword += pg_mblen(*endword);
175                 (*charlen)++;
176         }
177
178         return beginword;
179 }
180
181 /*
182  * The function is named compact_bigram to maintain consistency with pg_trgm,
183  * though it does not reduce multibyte characters to hash values like in
184  * compact_trigram.
185  */
186 static void
187 compact_bigram(bigm *bptr, char *str, int bytelen)
188 {
189         CPBIGM(bptr, str, bytelen);
190 }
191
192 /*
193  * Adds bigrams from words (already padded).
194  */
195 static bigm *
196 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
197 {
198         char       *ptr = str;
199
200         if (charlen < 2)
201         {
202                 compact_bigram(bptr, ptr, pg_mblen(str));
203                 bptr->pmatch = true;
204                 bptr++;
205                 return bptr;
206         }
207
208         if (bytelen > charlen)
209         {
210                 /* Find multibyte character boundaries and call compact_bigram */
211                 int                     lenfirst = pg_mblen(str),
212                                         lenlast = pg_mblen(str + lenfirst);
213
214                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
215                 {
216                         compact_bigram(bptr, ptr, lenfirst + lenlast);
217
218                         ptr += lenfirst;
219                         bptr++;
220
221                         lenfirst = lenlast;
222                         lenlast = pg_mblen(ptr + lenfirst);
223                 }
224         }
225         else
226         {
227                 /* Fast path when there are no multibyte characters */
228                 Assert(bytelen == charlen);
229
230                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
231                 {
232                         CPBIGM(bptr, ptr, 2);
233                         ptr++;
234                         bptr++;
235                 }
236         }
237
238         return bptr;
239 }
240
241 BIGM *
242 generate_bigm(char *str, int slen)
243 {
244         BIGM       *bgm;
245         char       *buf;
246         bigm       *bptr;
247         int                     len,
248                                 charlen,
249                                 bytelen;
250         char       *bword,
251                            *eword;
252
253         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (Size) (slen / 2 + 1) * 3);
254         SET_VARSIZE(bgm, VARHDRSZ);
255
256         if (slen + LPADDING + RPADDING < 2 || slen == 0)
257                 return bgm;
258
259         bptr = GETARR(bgm);
260
261         buf = palloc(sizeof(char) * (slen + 4));
262
263         if (LPADDING > 0)
264         {
265                 *buf = ' ';
266                 if (LPADDING > 1)
267                         *(buf + 1) = ' ';
268         }
269
270         eword = str;
271         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
272         {
273                 bytelen = eword - bword;
274                 memcpy(buf + LPADDING, bword, bytelen);
275
276                 buf[LPADDING + bytelen] = ' ';
277                 buf[LPADDING + bytelen + 1] = ' ';
278
279                 /*
280                  * count bigrams
281                  */
282                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
283                                                         charlen + LPADDING + RPADDING);
284         }
285
286         pfree(buf);
287
288         if ((len = bptr - GETARR(bgm)) == 0)
289                 return bgm;
290
291         /*
292          * Make bigrams unique.
293          */
294         if (len > 1)
295         {
296                 bool            haveDups = false;
297
298                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
299                 if (haveDups)
300                         len = unique_array(GETARR(bgm), len);
301         }
302
303         SET_VARSIZE(bgm, CALCGTSIZE(len));
304
305         return bgm;
306 }
307
308 /*
309  * Extract the next non-wildcard part of a search string, ie, a word bounded
310  * by '_' or '%' meta-characters, non-word characters or string end.
311  *
312  * str: source string, of length lenstr bytes (need not be null-terminated)
313  * buf: where to return the substring (must be long enough)
314  * *bytelen: receives byte length of the found substring
315  * *charlen: receives character length of the found substring
316  *
317  * Returns pointer to end+1 of the found substring in the source string.
318  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
319  *
320  * If the found word is bounded by non-word characters or string boundaries
321  * then this function will include corresponding padding spaces into buf.
322  */
323 static const char *
324 get_wildcard_part(const char *str, int lenstr,
325                                   char *buf, int *bytelen, int *charlen)
326 {
327         const char *beginword = str;
328         const char *endword;
329         char       *s = buf;
330         bool            in_leading_wildcard_meta = false;
331         bool            in_trailing_wildcard_meta = false;
332         bool            in_escape = false;
333         int                     clen;
334
335         /*
336          * Find the first word character, remembering whether preceding character
337          * was wildcard meta-character.  Note that the in_escape state persists
338          * from this loop to the next one, since we may exit at a word character
339          * that is in_escape.
340          */
341         while (beginword - str < lenstr)
342         {
343                 if (in_escape)
344                 {
345                         if (iswordchr(beginword))
346                                 break;
347                         in_escape = false;
348                         in_leading_wildcard_meta = false;
349                 }
350                 else
351                 {
352                         if (ISESCAPECHAR(beginword))
353                                 in_escape = true;
354                         else if (ISWILDCARDCHAR(beginword))
355                                 in_leading_wildcard_meta = true;
356                         else if (iswordchr(beginword))
357                                 break;
358                         else
359                                 in_leading_wildcard_meta = false;
360                 }
361                 beginword += pg_mblen(beginword);
362         }
363
364         /*
365          * Handle string end.
366          */
367         if (beginword - str >= lenstr)
368                 return NULL;
369
370         /*
371          * Add left padding spaces if preceding character wasn't wildcard
372          * meta-character.
373          */
374         *charlen = 0;
375         if (!in_leading_wildcard_meta)
376         {
377                 if (LPADDING > 0)
378                 {
379                         *s++ = ' ';
380                         (*charlen)++;
381                         if (LPADDING > 1)
382                         {
383                                 *s++ = ' ';
384                                 (*charlen)++;
385                         }
386                 }
387         }
388
389         /*
390          * Copy data into buf until wildcard meta-character, non-word character or
391          * string boundary.  Strip escapes during copy.
392          */
393         endword = beginword;
394         while (endword - str < lenstr)
395         {
396                 clen = pg_mblen(endword);
397                 if (in_escape)
398                 {
399                         if (iswordchr(endword))
400                         {
401                                 memcpy(s, endword, clen);
402                                 (*charlen)++;
403                                 s += clen;
404                         }
405                         else
406                         {
407                                 /*
408                                  * Back up endword to the escape character when stopping at an
409                                  * escaped char, so that subsequent get_wildcard_part will
410                                  * restart from the escape character.  We assume here that
411                                  * escape chars are single-byte.
412                                  */
413                                 endword--;
414                                 break;
415                         }
416                         in_escape = false;
417                 }
418                 else
419                 {
420                         if (ISESCAPECHAR(endword))
421                                 in_escape = true;
422                         else if (ISWILDCARDCHAR(endword))
423                         {
424                                 in_trailing_wildcard_meta = true;
425                                 break;
426                         }
427                         else if (iswordchr(endword))
428                         {
429                                 memcpy(s, endword, clen);
430                                 (*charlen)++;
431                                 s += clen;
432                         }
433                         else
434                                 break;
435                 }
436                 endword += clen;
437         }
438
439         /*
440          * Add right padding spaces if next character isn't wildcard
441          * meta-character.
442          */
443         if (!in_trailing_wildcard_meta)
444         {
445                 if (RPADDING > 0)
446                 {
447                         *s++ = ' ';
448                         (*charlen)++;
449                         if (RPADDING > 1)
450                         {
451                                 *s++ = ' ';
452                                 (*charlen)++;
453                         }
454                 }
455         }
456
457         *bytelen = s - buf;
458         return endword;
459 }
460
461 /*
462  * Generates bigrams for wildcard search string.
463  *
464  * Returns array of bigrams that must occur in any string that matches the
465  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
466  * " a", "bcd" would be extracted.
467  *
468  * Set 'removeDups' to true if duplicate bigrams are removed.
469  */
470 BIGM *
471 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
472 {
473         BIGM       *bgm;
474         char       *buf;
475         bigm       *bptr;
476         int                     len,
477                                 charlen,
478                                 bytelen;
479         const char *eword;
480
481         *removeDups = false;
482
483         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (Size) (slen / 2 + 1) * 3);
484         SET_VARSIZE(bgm, VARHDRSZ);
485
486         if (slen + LPADDING + RPADDING < 2 || slen == 0)
487                 return bgm;
488
489         bptr = GETARR(bgm);
490
491         buf = palloc(sizeof(char) * (slen + 4));
492
493         /*
494          * Extract bigrams from each substring extracted by get_wildcard_part.
495          */
496         eword = str;
497         while ((eword = get_wildcard_part(eword, slen - (eword - str),
498                                                                           buf, &bytelen, &charlen)) != NULL)
499         {
500                 /*
501                  * count bigrams
502                  */
503                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
504         }
505
506         pfree(buf);
507
508         if ((len = bptr - GETARR(bgm)) == 0)
509                 return bgm;
510
511         /*
512          * Make bigrams unique.
513          */
514         if (len > 1)
515         {
516                 bool            haveDups = false;
517
518                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
519                 if (haveDups)
520                 {
521                         *removeDups = true;
522                         len = unique_array(GETARR(bgm), len);
523                 }
524         }
525
526         SET_VARSIZE(bgm, CALCGTSIZE(len));
527
528         return bgm;
529 }
530
531 Datum
532 show_bigm(PG_FUNCTION_ARGS)
533 {
534         text       *in = PG_GETARG_TEXT_P(0);
535         BIGM       *bgm;
536         Datum      *d;
537         ArrayType  *a;
538         bigm       *ptr;
539         int                     i;
540
541         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
542         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
543
544         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
545         {
546                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
547
548                 d[i] = PointerGetDatum(item);
549         }
550
551         a = construct_array(
552                                                 d,
553                                                 ARRNELEM(bgm),
554                                                 TEXTOID,
555                                                 -1,
556                                                 false,
557                                                 'i'
558                 );
559
560         for (i = 0; i < ARRNELEM(bgm); i++)
561                 pfree(DatumGetPointer(d[i]));
562
563         pfree(d);
564         pfree(bgm);
565         PG_FREE_IF_COPY(in, 0);
566
567         PG_RETURN_POINTER(a);
568 }
569
570 static float4
571 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
572 {
573         bigm       *ptr1,
574                            *ptr2;
575         int                     count = 0;
576         int                     len1,
577                                 len2;
578
579         ptr1 = GETARR(bgm1);
580         ptr2 = GETARR(bgm2);
581
582         len1 = ARRNELEM(bgm1);
583         len2 = ARRNELEM(bgm2);
584
585         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
586         if (len1 <= 0 || len2 <= 0)
587                 return (float4) 0.0;
588
589         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
590         {
591                 int                     res = CMPBIGM(ptr1, ptr2);
592
593                 if (res < 0)
594                         ptr1++;
595                 else if (res > 0)
596                         ptr2++;
597                 else
598                 {
599                         ptr1++;
600                         ptr2++;
601                         count++;
602                 }
603         }
604
605 #ifdef DIVUNION
606         return ((float4) count) / ((float4) (len1 + len2 - count));
607 #else
608         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
609 #endif
610 }
611
612 Datum
613 bigm_similarity(PG_FUNCTION_ARGS)
614 {
615         text       *in1 = PG_GETARG_TEXT_P(0);
616         text       *in2 = PG_GETARG_TEXT_P(1);
617         BIGM       *bgm1,
618                            *bgm2;
619         float4          res;
620
621         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
622         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
623
624         res = cnt_sml_bigm(bgm1, bgm2);
625
626         pfree(bgm1);
627         pfree(bgm2);
628         PG_FREE_IF_COPY(in1, 0);
629         PG_FREE_IF_COPY(in2, 1);
630
631         PG_RETURN_FLOAT4(res);
632 }
633
634 Datum
635 bigm_similarity_op(PG_FUNCTION_ARGS)
636 {
637         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
638                                                                                                                  PG_GETARG_DATUM(0),
639                                                                                                                  PG_GETARG_DATUM(1)));
640
641         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
642 }
643
644 Datum
645 likequery(PG_FUNCTION_ARGS)
646 {
647         text       *query = PG_GETARG_TEXT_PP(0);
648         const char *str;
649         int                     len;
650         const char *sp;
651         text       *result;
652         char       *rp;
653         int                     mblen;
654
655         str = VARDATA_ANY(query);
656         len = VARSIZE_ANY_EXHDR(query);
657
658         if (len == 0)
659                 PG_RETURN_NULL();
660
661         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
662         rp = VARDATA(result);
663         *rp++ = '%';
664
665         for (sp = str; (sp - str) < len;)
666         {
667                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
668                 {
669                         *rp++ = '\\';
670                         *rp++ = *sp++;
671                 }
672                 else if (IS_HIGHBIT_SET(*sp))
673                 {
674                         mblen = pg_mblen(sp);
675                         memcpy(rp, sp, mblen);
676                         rp += mblen;
677                         sp += mblen;
678                 }
679                 else
680                         *rp++ = *sp++;
681         }
682
683         *rp++ = '%';
684         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
685
686         PG_RETURN_TEXT_P(result);
687 }
688
689 inline int
690 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
691 {
692         int                     i;
693         int                     len = Min(len1, len2);
694
695         for (i = 0; i < len; i++, arg1++, arg2++)
696         {
697                 if (*arg1 == *arg2)
698                         continue;
699                 if (*arg1 < *arg2)
700                         return -1;
701                 else
702                         return 1;
703         }
704
705         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
706 }
707
708 Datum
709 bigmtextcmp(PG_FUNCTION_ARGS)
710 {
711         text       *arg1 = PG_GETARG_TEXT_PP(0);
712         text       *arg2 = PG_GETARG_TEXT_PP(1);
713         char       *a1p = VARDATA_ANY(arg1);
714         char       *a2p = VARDATA_ANY(arg2);
715         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
716         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
717
718         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
719 }