OSDN Git Service

pgindent run.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2015, NTT DATA Corporation
5  *
6  * Changelog:
7  *       2013/01/09
8  *       Support full text search using bigrams.
9  *       Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <ctype.h>
16
17 #include "bigm.h"
18
19 #include "catalog/pg_type.h"
20 #include "tsearch/ts_locale.h"
21 #include "utils/array.h"
22
23
24 PG_MODULE_MAGIC;
25
26 /* Last update date of pg_bigm */
27 #define BIGM_LAST_UPDATE        "2013.11.22"
28
29 /* GUC variable */
30 bool            bigm_enable_recheck = false;
31 int                     bigm_gin_key_limit = 0;
32 double          bigm_similarity_limit = 0.3;
33 char       *bigm_last_update = NULL;
34
35 PG_FUNCTION_INFO_V1(show_bigm);
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 PG_FUNCTION_INFO_V1(likequery);
38 PG_FUNCTION_INFO_V1(bigm_similarity);
39 PG_FUNCTION_INFO_V1(bigm_similarity_op);
40
41 /*
42  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
43  * macro since 9.4, and hence the declaration of the function prototypes
44  * here is necessary only for 9.3 or before.
45  */
46 #if PG_VERSION_NUM < 90400
47 Datum           show_bigm(PG_FUNCTION_ARGS);
48 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
49 Datum           likequery(PG_FUNCTION_ARGS);
50 Datum           bigm_similarity(PG_FUNCTION_ARGS);
51 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
52 #endif
53
54 void            _PG_init(void);
55 void            _PG_fini(void);
56
57 void
58 _PG_init(void)
59 {
60         /* Define custom GUC variables */
61         DefineCustomBoolVariable("pg_bigm.enable_recheck",
62                                                          "Recheck that heap tuples fetched from index "
63                                                          "match the query.",
64                                                          NULL,
65                                                          &bigm_enable_recheck,
66                                                          true,
67                                                          PGC_USERSET,
68                                                          0,
69                                                          NULL,
70                                                          NULL,
71                                                          NULL);
72
73         DefineCustomIntVariable("pg_bigm.gin_key_limit",
74                                                         "Sets the maximum number of bi-gram keys allowed to "
75                                                         "use for GIN index search.",
76                                                         "Zero means no limit.",
77                                                         &bigm_gin_key_limit,
78                                                         0,
79                                                         0, INT_MAX,
80                                                         PGC_USERSET,
81                                                         0,
82                                                         NULL,
83                                                         NULL,
84                                                         NULL);
85
86         DefineCustomRealVariable("pg_bigm.similarity_limit",
87                                                          "Sets the similarity threshold used by the "
88                                                          "=% operator.",
89                                                          NULL,
90                                                          &bigm_similarity_limit,
91                                                          0.3,
92                                                          0.0, 1.0,
93                                                          PGC_USERSET,
94                                                          0,
95                                                          NULL,
96                                                          NULL,
97                                                          NULL);
98
99         /* Can't be set in postgresql.conf */
100         DefineCustomStringVariable("pg_bigm.last_update",
101                                                            "Shows the last update date of pg_bigm.",
102                                                            NULL,
103                                                            &bigm_last_update,
104                                                            BIGM_LAST_UPDATE,
105                                                            PGC_INTERNAL,
106                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
107                                                            NULL,
108                                                            NULL,
109                                                            NULL);
110
111         EmitWarningsOnPlaceholders("pg_bigm");
112 }
113
114 void
115 _PG_fini(void)
116 {
117 }
118
119 static int
120 comp_bigm(const void *a, const void *b, void *arg)
121 {
122         int                     res;
123         bool       *haveDups = (bool *) arg;
124
125         res = CMPBIGM(a, b);
126
127         if (res == 0)
128                 *haveDups = true;
129
130         return res;
131 }
132
133 static int
134 unique_array(bigm *a, int len)
135 {
136         bigm       *curend,
137                            *tmp;
138
139         curend = tmp = a;
140         while (tmp - a < len)
141                 if (CMPBIGM(tmp, curend))
142                 {
143                         curend++;
144                         if (curend != tmp)
145                                 memcpy(curend, tmp, BIGMSIZE);
146                         tmp++;
147                 }
148                 else
149                         tmp++;
150
151         return curend + 1 - a;
152 }
153
154 #define iswordchr(c)    (!t_isspace(c))
155
156 /*
157  * Finds first word in string, returns pointer to the word,
158  * endword points to the character after word
159  */
160 static char *
161 find_word(char *str, int lenstr, char **endword, int *charlen)
162 {
163         char       *beginword = str;
164
165         while (beginword - str < lenstr && !iswordchr(beginword))
166                 beginword += pg_mblen(beginword);
167
168         if (beginword - str >= lenstr)
169                 return NULL;
170
171         *endword = beginword;
172         *charlen = 0;
173         while (*endword - str < lenstr && iswordchr(*endword))
174         {
175                 *endword += pg_mblen(*endword);
176                 (*charlen)++;
177         }
178
179         return beginword;
180 }
181
182 /*
183  * The function is named compact_bigram to maintain consistency with pg_trgm,
184  * though it does not reduce multibyte characters to hash values like in
185  * compact_trigram.
186  */
187 static void
188 compact_bigram(bigm *bptr, char *str, int bytelen)
189 {
190         CPBIGM(bptr, str, bytelen);
191 }
192
193 /*
194  * Adds bigrams from words (already padded).
195  */
196 static bigm *
197 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
198 {
199         char       *ptr = str;
200
201         if (charlen < 2)
202         {
203                 compact_bigram(bptr, ptr, pg_mblen(str));
204                 bptr->pmatch = true;
205                 bptr++;
206                 return bptr;
207         }
208
209         if (bytelen > charlen)
210         {
211                 /* Find multibyte character boundaries and call compact_bigram */
212                 int                     lenfirst = pg_mblen(str),
213                                         lenlast = pg_mblen(str + lenfirst);
214
215                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
216                 {
217                         compact_bigram(bptr, ptr, lenfirst + lenlast);
218
219                         ptr += lenfirst;
220                         bptr++;
221
222                         lenfirst = lenlast;
223                         lenlast = pg_mblen(ptr + lenfirst);
224                 }
225         }
226         else
227         {
228                 /* Fast path when there are no multibyte characters */
229                 Assert(bytelen == charlen);
230
231                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
232                 {
233                         CPBIGM(bptr, ptr, 2);
234                         ptr++;
235                         bptr++;
236                 }
237         }
238
239         return bptr;
240 }
241
242 BIGM *
243 generate_bigm(char *str, int slen)
244 {
245         BIGM       *bgm;
246         char       *buf;
247         bigm       *bptr;
248         int                     len,
249                                 charlen,
250                                 bytelen;
251         char       *bword,
252                            *eword;
253
254         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (Size) (slen / 2 + 1) * 3);
255         SET_VARSIZE(bgm, VARHDRSZ);
256
257         if (slen + LPADDING + RPADDING < 2 || slen == 0)
258                 return bgm;
259
260         bptr = GETARR(bgm);
261
262         buf = palloc(sizeof(char) * (slen + 4));
263
264         if (LPADDING > 0)
265         {
266                 *buf = ' ';
267                 if (LPADDING > 1)
268                         *(buf + 1) = ' ';
269         }
270
271         eword = str;
272         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
273         {
274                 bytelen = eword - bword;
275                 memcpy(buf + LPADDING, bword, bytelen);
276
277                 buf[LPADDING + bytelen] = ' ';
278                 buf[LPADDING + bytelen + 1] = ' ';
279
280                 /*
281                  * count bigrams
282                  */
283                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
284                                                         charlen + LPADDING + RPADDING);
285         }
286
287         pfree(buf);
288
289         if ((len = bptr - GETARR(bgm)) == 0)
290                 return bgm;
291
292         /*
293          * Make bigrams unique.
294          */
295         if (len > 1)
296         {
297                 bool            haveDups = false;
298
299                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
300                 if (haveDups)
301                         len = unique_array(GETARR(bgm), len);
302         }
303
304         SET_VARSIZE(bgm, CALCGTSIZE(len));
305
306         return bgm;
307 }
308
309 /*
310  * Extract the next non-wildcard part of a search string, ie, a word bounded
311  * by '_' or '%' meta-characters, non-word characters or string end.
312  *
313  * str: source string, of length lenstr bytes (need not be null-terminated)
314  * buf: where to return the substring (must be long enough)
315  * *bytelen: receives byte length of the found substring
316  * *charlen: receives character length of the found substring
317  *
318  * Returns pointer to end+1 of the found substring in the source string.
319  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
320  *
321  * If the found word is bounded by non-word characters or string boundaries
322  * then this function will include corresponding padding spaces into buf.
323  */
324 static const char *
325 get_wildcard_part(const char *str, int lenstr,
326                                   char *buf, int *bytelen, int *charlen)
327 {
328         const char *beginword = str;
329         const char *endword;
330         char       *s = buf;
331         bool            in_leading_wildcard_meta = false;
332         bool            in_trailing_wildcard_meta = false;
333         bool            in_escape = false;
334         int                     clen;
335
336         /*
337          * Find the first word character, remembering whether preceding character
338          * was wildcard meta-character.  Note that the in_escape state persists
339          * from this loop to the next one, since we may exit at a word character
340          * that is in_escape.
341          */
342         while (beginword - str < lenstr)
343         {
344                 if (in_escape)
345                 {
346                         if (iswordchr(beginword))
347                                 break;
348                         in_escape = false;
349                         in_leading_wildcard_meta = false;
350                 }
351                 else
352                 {
353                         if (ISESCAPECHAR(beginword))
354                                 in_escape = true;
355                         else if (ISWILDCARDCHAR(beginword))
356                                 in_leading_wildcard_meta = true;
357                         else if (iswordchr(beginword))
358                                 break;
359                         else
360                                 in_leading_wildcard_meta = false;
361                 }
362                 beginword += pg_mblen(beginword);
363         }
364
365         /*
366          * Handle string end.
367          */
368         if (beginword - str >= lenstr)
369                 return NULL;
370
371         /*
372          * Add left padding spaces if preceding character wasn't wildcard
373          * meta-character.
374          */
375         *charlen = 0;
376         if (!in_leading_wildcard_meta)
377         {
378                 if (LPADDING > 0)
379                 {
380                         *s++ = ' ';
381                         (*charlen)++;
382                         if (LPADDING > 1)
383                         {
384                                 *s++ = ' ';
385                                 (*charlen)++;
386                         }
387                 }
388         }
389
390         /*
391          * Copy data into buf until wildcard meta-character, non-word character or
392          * string boundary.  Strip escapes during copy.
393          */
394         endword = beginword;
395         while (endword - str < lenstr)
396         {
397                 clen = pg_mblen(endword);
398                 if (in_escape)
399                 {
400                         if (iswordchr(endword))
401                         {
402                                 memcpy(s, endword, clen);
403                                 (*charlen)++;
404                                 s += clen;
405                         }
406                         else
407                         {
408                                 /*
409                                  * Back up endword to the escape character when stopping at an
410                                  * escaped char, so that subsequent get_wildcard_part will
411                                  * restart from the escape character.  We assume here that
412                                  * escape chars are single-byte.
413                                  */
414                                 endword--;
415                                 break;
416                         }
417                         in_escape = false;
418                 }
419                 else
420                 {
421                         if (ISESCAPECHAR(endword))
422                                 in_escape = true;
423                         else if (ISWILDCARDCHAR(endword))
424                         {
425                                 in_trailing_wildcard_meta = true;
426                                 break;
427                         }
428                         else if (iswordchr(endword))
429                         {
430                                 memcpy(s, endword, clen);
431                                 (*charlen)++;
432                                 s += clen;
433                         }
434                         else
435                                 break;
436                 }
437                 endword += clen;
438         }
439
440         /*
441          * Add right padding spaces if next character isn't wildcard
442          * meta-character.
443          */
444         if (!in_trailing_wildcard_meta)
445         {
446                 if (RPADDING > 0)
447                 {
448                         *s++ = ' ';
449                         (*charlen)++;
450                         if (RPADDING > 1)
451                         {
452                                 *s++ = ' ';
453                                 (*charlen)++;
454                         }
455                 }
456         }
457
458         *bytelen = s - buf;
459         return endword;
460 }
461
462 /*
463  * Generates bigrams for wildcard search string.
464  *
465  * Returns array of bigrams that must occur in any string that matches the
466  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
467  * " a", "bcd" would be extracted.
468  *
469  * Set 'removeDups' to true if duplicate bigrams are removed.
470  */
471 BIGM *
472 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
473 {
474         BIGM       *bgm;
475         char       *buf;
476         bigm       *bptr;
477         int                     len,
478                                 charlen,
479                                 bytelen;
480         const char *eword;
481
482         *removeDups = false;
483
484         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (Size) (slen / 2 + 1) * 3);
485         SET_VARSIZE(bgm, VARHDRSZ);
486
487         if (slen + LPADDING + RPADDING < 2 || slen == 0)
488                 return bgm;
489
490         bptr = GETARR(bgm);
491
492         buf = palloc(sizeof(char) * (slen + 4));
493
494         /*
495          * Extract bigrams from each substring extracted by get_wildcard_part.
496          */
497         eword = str;
498         while ((eword = get_wildcard_part(eword, slen - (eword - str),
499                                                                           buf, &bytelen, &charlen)) != NULL)
500         {
501                 /*
502                  * count bigrams
503                  */
504                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
505         }
506
507         pfree(buf);
508
509         if ((len = bptr - GETARR(bgm)) == 0)
510                 return bgm;
511
512         /*
513          * Make bigrams unique.
514          */
515         if (len > 1)
516         {
517                 bool            haveDups = false;
518
519                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
520                 if (haveDups)
521                 {
522                         *removeDups = true;
523                         len = unique_array(GETARR(bgm), len);
524                 }
525         }
526
527         SET_VARSIZE(bgm, CALCGTSIZE(len));
528
529         return bgm;
530 }
531
532 Datum
533 show_bigm(PG_FUNCTION_ARGS)
534 {
535         text       *in = PG_GETARG_TEXT_P(0);
536         BIGM       *bgm;
537         Datum      *d;
538         ArrayType  *a;
539         bigm       *ptr;
540         int                     i;
541
542         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
543         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
544
545         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
546         {
547                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
548
549                 d[i] = PointerGetDatum(item);
550         }
551
552         a = construct_array(
553                                                 d,
554                                                 ARRNELEM(bgm),
555                                                 TEXTOID,
556                                                 -1,
557                                                 false,
558                                                 'i'
559                 );
560
561         for (i = 0; i < ARRNELEM(bgm); i++)
562                 pfree(DatumGetPointer(d[i]));
563
564         pfree(d);
565         pfree(bgm);
566         PG_FREE_IF_COPY(in, 0);
567
568         PG_RETURN_POINTER(a);
569 }
570
571 static float4
572 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
573 {
574         bigm       *ptr1,
575                            *ptr2;
576         int                     count = 0;
577         int                     len1,
578                                 len2;
579
580         ptr1 = GETARR(bgm1);
581         ptr2 = GETARR(bgm2);
582
583         len1 = ARRNELEM(bgm1);
584         len2 = ARRNELEM(bgm2);
585
586         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
587         if (len1 <= 0 || len2 <= 0)
588                 return (float4) 0.0;
589
590         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
591         {
592                 int                     res = CMPBIGM(ptr1, ptr2);
593
594                 if (res < 0)
595                         ptr1++;
596                 else if (res > 0)
597                         ptr2++;
598                 else
599                 {
600                         ptr1++;
601                         ptr2++;
602                         count++;
603                 }
604         }
605
606 #ifdef DIVUNION
607         return ((float4) count) / ((float4) (len1 + len2 - count));
608 #else
609         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
610 #endif
611 }
612
613 Datum
614 bigm_similarity(PG_FUNCTION_ARGS)
615 {
616         text       *in1 = PG_GETARG_TEXT_P(0);
617         text       *in2 = PG_GETARG_TEXT_P(1);
618         BIGM       *bgm1,
619                            *bgm2;
620         float4          res;
621
622         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
623         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
624
625         res = cnt_sml_bigm(bgm1, bgm2);
626
627         pfree(bgm1);
628         pfree(bgm2);
629         PG_FREE_IF_COPY(in1, 0);
630         PG_FREE_IF_COPY(in2, 1);
631
632         PG_RETURN_FLOAT4(res);
633 }
634
635 Datum
636 bigm_similarity_op(PG_FUNCTION_ARGS)
637 {
638         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
639                                                                                                                  PG_GETARG_DATUM(0),
640                                                                                                                  PG_GETARG_DATUM(1)));
641
642         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
643 }
644
645 Datum
646 likequery(PG_FUNCTION_ARGS)
647 {
648         text       *query = PG_GETARG_TEXT_PP(0);
649         const char *str;
650         int                     len;
651         const char *sp;
652         text       *result;
653         char       *rp;
654         int                     mblen;
655
656         str = VARDATA_ANY(query);
657         len = VARSIZE_ANY_EXHDR(query);
658
659         if (len == 0)
660                 PG_RETURN_NULL();
661
662         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
663         rp = VARDATA(result);
664         *rp++ = '%';
665
666         for (sp = str; (sp - str) < len;)
667         {
668                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
669                 {
670                         *rp++ = '\\';
671                         *rp++ = *sp++;
672                 }
673                 else if (IS_HIGHBIT_SET(*sp))
674                 {
675                         mblen = pg_mblen(sp);
676                         memcpy(rp, sp, mblen);
677                         rp += mblen;
678                         sp += mblen;
679                 }
680                 else
681                         *rp++ = *sp++;
682         }
683
684         *rp++ = '%';
685         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
686
687         PG_RETURN_TEXT_P(result);
688 }
689
690 inline int
691 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
692 {
693         int                     i;
694         int                     len = Min(len1, len2);
695
696         for (i = 0; i < len; i++, arg1++, arg2++)
697         {
698                 if (*arg1 == *arg2)
699                         continue;
700                 if (*arg1 < *arg2)
701                         return -1;
702                 else
703                         return 1;
704         }
705
706         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
707 }
708
709 Datum
710 bigmtextcmp(PG_FUNCTION_ARGS)
711 {
712         text       *arg1 = PG_GETARG_TEXT_PP(0);
713         text       *arg2 = PG_GETARG_TEXT_PP(1);
714         char       *a1p = VARDATA_ANY(arg1);
715         char       *a2p = VARDATA_ANY(arg2);
716         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
717         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
718
719         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
720 }