OSDN Git Service

84515b59f1f7842b5516cd142d3d8388df210cc9
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  *
5  * Changelog:
6  *   2013/01/09
7  *   Support full text search using bigrams.
8  *   Author: NTT DATA Corporation
9  *
10  *-------------------------------------------------------------------------
11  */
12 #include "postgres.h"
13
14 #include <ctype.h>
15
16 #include "bigm.h"
17
18 #include "catalog/pg_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "utils/array.h"
21
22
23 PG_MODULE_MAGIC;
24
25 /* Last update date of pg_bigm */
26 #define BIGM_LAST_UPDATE        "2013.11.22"
27
28 /* GUC variable */
29 bool    bigm_enable_recheck = false;
30 int             bigm_gin_key_limit = 0;
31 double  bigm_similarity_limit = 0.3;
32 char    *bigm_last_update = NULL;
33
34 PG_FUNCTION_INFO_V1(show_bigm);
35 Datum           show_bigm(PG_FUNCTION_ARGS);
36
37 PG_FUNCTION_INFO_V1(bigmtextcmp);
38 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
39
40 PG_FUNCTION_INFO_V1(likequery);
41 Datum           likequery(PG_FUNCTION_ARGS);
42
43 PG_FUNCTION_INFO_V1(bigm_similarity);
44 Datum           bigm_similarity(PG_FUNCTION_ARGS);
45
46 PG_FUNCTION_INFO_V1(bigm_similarity_op);
47 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
48
49 void            _PG_init(void);
50 void            _PG_fini(void);
51
52 void
53 _PG_init(void)
54 {
55         /* Define custom GUC variables */
56         DefineCustomBoolVariable("pg_bigm.enable_recheck",
57                                                          "Recheck that heap tuples fetched from index "
58                                                          "match the query.",
59                                                          NULL,
60                                                          &bigm_enable_recheck,
61                                                          true,
62                                                          PGC_USERSET,
63                                                          0,
64                                                          NULL,
65                                                          NULL,
66                                                          NULL);
67
68         DefineCustomIntVariable("pg_bigm.gin_key_limit",
69                                                         "Sets the maximum number of bi-gram keys allowed to "
70                                                         "use for GIN index search.",
71                                                         "Zero means no limit.",
72                                                         &bigm_gin_key_limit,
73                                                         0,
74                                                         0, INT_MAX,
75                                                         PGC_USERSET,
76                                                         0,
77                                                         NULL,
78                                                         NULL,
79                                                         NULL);
80
81         DefineCustomRealVariable("pg_bigm.similarity_limit",
82                                                          "Sets the similarity threshold used by the "
83                                                          "=% operator.",
84                                                          NULL,
85                                                          &bigm_similarity_limit,
86                                                          0.3,
87                                                          0.0, 1.0,
88                                                          PGC_USERSET,
89                                                          0,
90                                                          NULL,
91                                                          NULL,
92                                                          NULL);
93
94         /* Can't be set in postgresql.conf */
95         DefineCustomStringVariable("pg_bigm.last_update",
96                                                            "Shows the last update date of pg_bigm.",
97                                                            NULL,
98                                                            &bigm_last_update,
99                                                            BIGM_LAST_UPDATE,
100                                                            PGC_INTERNAL,
101                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
102                                                            NULL,
103                                                            NULL,
104                                                            NULL);
105
106         EmitWarningsOnPlaceholders("pg_bigm");
107 }
108
109 void
110 _PG_fini(void)
111 {
112 }
113
114 static int
115 comp_bigm(const void *a, const void *b, void *arg)
116 {
117         int             res;
118         bool    *haveDups = (bool *) arg;
119
120         res = CMPBIGM(a, b);
121
122         if (res == 0)
123                 *haveDups = true;
124
125         return res;
126 }
127
128 static int
129 unique_array(bigm *a, int len)
130 {
131         bigm       *curend,
132                            *tmp;
133
134         curend = tmp = a;
135         while (tmp - a < len)
136                 if (CMPBIGM(tmp, curend))
137                 {
138                         curend++;
139                         if (curend != tmp)
140                                 memcpy(curend, tmp, BIGMSIZE);
141                         tmp++;
142                 }
143                 else
144                         tmp++;
145
146         return curend + 1 - a;
147 }
148
149 #define iswordchr(c)    (!t_isspace(c))
150
151 /*
152  * Finds first word in string, returns pointer to the word,
153  * endword points to the character after word
154  */
155 static char *
156 find_word(char *str, int lenstr, char **endword, int *charlen)
157 {
158         char       *beginword = str;
159
160         while (beginword - str < lenstr && !iswordchr(beginword))
161                 beginword += pg_mblen(beginword);
162
163         if (beginword - str >= lenstr)
164                 return NULL;
165
166         *endword = beginword;
167         *charlen = 0;
168         while (*endword - str < lenstr && iswordchr(*endword))
169         {
170                 *endword += pg_mblen(*endword);
171                 (*charlen)++;
172         }
173
174         return beginword;
175 }
176
177 /* 
178  * The function is named compact_bigram to maintain consistency with pg_trgm,
179  * though it does not reduce multibyte characters to hash values like in
180  * compact_trigram.
181  */
182 static void
183 compact_bigram(bigm *bptr, char *str, int bytelen)
184 {
185         CPBIGM(bptr, str, bytelen);
186 }
187
188 /*
189  * Adds bigrams from words (already padded).
190  */
191 static bigm *
192 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
193 {
194         char       *ptr = str;
195
196         if (charlen < 2)
197         {
198                 compact_bigram(bptr, ptr, pg_mblen(str));
199                 bptr->pmatch = true;
200                 bptr++;
201                 return bptr;
202         }
203
204         if (bytelen > charlen)
205         {
206                 /* Find multibyte character boundaries and call compact_bigram */
207                 int                     lenfirst = pg_mblen(str),
208                                         lenlast = pg_mblen(str + lenfirst);
209
210                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
211                 {
212                         compact_bigram(bptr, ptr, lenfirst + lenlast);
213
214                         ptr += lenfirst;
215                         bptr++;
216
217                         lenfirst = lenlast;
218                         lenlast = pg_mblen(ptr + lenfirst);
219                 }
220         }
221         else
222         {
223                 /* Fast path when there are no multibyte characters */
224                 Assert(bytelen == charlen);
225
226                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
227                 {
228                         CPBIGM(bptr, ptr, 2);
229                         ptr++;
230                         bptr++;
231                 }
232         }
233
234         return bptr;
235 }
236
237 BIGM *
238 generate_bigm(char *str, int slen)
239 {
240         BIGM       *bgm;
241         char       *buf;
242         bigm       *bptr;
243         int                     len,
244                                 charlen,
245                                 bytelen;
246         char       *bword,
247                            *eword;
248
249         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
250         SET_VARSIZE(bgm, VARHDRSZ);
251
252         if (slen + LPADDING + RPADDING < 2 || slen == 0)
253                 return bgm;
254
255         bptr = GETARR(bgm);
256
257         buf = palloc(sizeof(char) * (slen + 4));
258
259         if (LPADDING > 0)
260         {
261                 *buf = ' ';
262                 if (LPADDING > 1)
263                         *(buf + 1) = ' ';
264         }
265
266         eword = str;
267         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
268         {
269                 bytelen = eword - bword;
270                 memcpy(buf + LPADDING, bword, bytelen);
271
272                 buf[LPADDING + bytelen] = ' ';
273                 buf[LPADDING + bytelen + 1] = ' ';
274
275                 /*
276                  * count bigrams
277                  */
278                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
279                                                          charlen + LPADDING + RPADDING);
280         }
281
282         pfree(buf);
283
284         if ((len = bptr - GETARR(bgm)) == 0)
285                 return bgm;
286
287         /*
288          * Make bigrams unique.
289          */
290         if (len > 1)
291         {
292                 bool    haveDups = false;
293
294                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
295                 if (haveDups)
296                         len = unique_array(GETARR(bgm), len);
297         }
298
299         SET_VARSIZE(bgm, CALCGTSIZE(len));
300
301         return bgm;
302 }
303
304 /*
305  * Extract the next non-wildcard part of a search string, ie, a word bounded
306  * by '_' or '%' meta-characters, non-word characters or string end.
307  *
308  * str: source string, of length lenstr bytes (need not be null-terminated)
309  * buf: where to return the substring (must be long enough)
310  * *bytelen: receives byte length of the found substring
311  * *charlen: receives character length of the found substring
312  *
313  * Returns pointer to end+1 of the found substring in the source string.
314  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
315  *
316  * If the found word is bounded by non-word characters or string boundaries
317  * then this function will include corresponding padding spaces into buf.
318  */
319 static const char *
320 get_wildcard_part(const char *str, int lenstr,
321                                   char *buf, int *bytelen, int *charlen)
322 {
323         const char *beginword = str;
324         const char *endword;
325         char       *s = buf;
326         bool        in_leading_wildcard_meta = false;
327         bool        in_trailing_wildcard_meta = false;
328         bool            in_escape = false;
329         int                     clen;
330
331         /*
332          * Find the first word character, remembering whether preceding character
333          * was wildcard meta-character.  Note that the in_escape state persists
334          * from this loop to the next one, since we may exit at a word character
335          * that is in_escape.
336          */
337         while (beginword - str < lenstr)
338         {
339                 if (in_escape)
340                 {
341                         if (iswordchr(beginword))
342                                 break;
343                         in_escape = false;
344                         in_leading_wildcard_meta = false;
345                 }
346                 else
347                 {
348                         if (ISESCAPECHAR(beginword))
349                                 in_escape = true;
350                         else if (ISWILDCARDCHAR(beginword))
351                                 in_leading_wildcard_meta = true;
352                         else if (iswordchr(beginword))
353                                 break;
354                         else
355                                 in_leading_wildcard_meta = false;
356                 }
357                 beginword += pg_mblen(beginword);
358         }
359
360         /*
361          * Handle string end.
362          */
363         if (beginword - str >= lenstr)
364                 return NULL;
365
366         /*
367          * Add left padding spaces if preceding character wasn't wildcard
368          * meta-character.
369          */
370         *charlen = 0;
371         if (!in_leading_wildcard_meta)
372         {
373                 if (LPADDING > 0)
374                 {
375                         *s++ = ' ';
376                         (*charlen)++;
377                         if (LPADDING > 1)
378                         {
379                                 *s++ = ' ';
380                                 (*charlen)++;
381                         }
382                 }
383         }
384
385         /*
386          * Copy data into buf until wildcard meta-character, non-word character or
387          * string boundary.  Strip escapes during copy.
388          */
389         endword = beginword;
390         while (endword - str < lenstr)
391         {
392                 clen = pg_mblen(endword);
393                 if (in_escape)
394                 {
395                         if (iswordchr(endword))
396                         {
397                                 memcpy(s, endword, clen);
398                                 (*charlen)++;
399                                 s += clen;
400                         }
401                         else
402                         {
403                                 /*
404                                  * Back up endword to the escape character when stopping at
405                                  * an escaped char, so that subsequent get_wildcard_part will
406                                  * restart from the escape character.  We assume here that
407                                  * escape chars are single-byte.
408                                  */
409                                 endword--;
410                                 break;
411                         }
412                         in_escape = false;
413                 }
414                 else
415                 {
416                         if (ISESCAPECHAR(endword))
417                                 in_escape = true;
418                         else if (ISWILDCARDCHAR(endword))
419                         {
420                                 in_trailing_wildcard_meta = true;
421                                 break;
422                         }
423                         else if (iswordchr(endword))
424                         {
425                                 memcpy(s, endword, clen);
426                                 (*charlen)++;
427                                 s += clen;
428                         }
429                         else
430                                 break;
431                 }
432                 endword += clen;
433         }
434
435         /*
436          * Add right padding spaces if next character isn't wildcard
437          * meta-character.
438          */
439         if (!in_trailing_wildcard_meta)
440         {
441                 if (RPADDING > 0)
442                 {
443                         *s++ = ' ';
444                         (*charlen)++;
445                         if (RPADDING > 1)
446                         {
447                                 *s++ = ' ';
448                                 (*charlen)++;
449                         }
450                 }
451         }
452
453         *bytelen = s - buf;
454         return endword;
455 }
456
457 /*
458  * Generates bigrams for wildcard search string.
459  *
460  * Returns array of bigrams that must occur in any string that matches the
461  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
462  * " a", "bcd" would be extracted.
463  *
464  * Set 'removeDups' to true if duplicate bigrams are removed.
465  */
466 BIGM *
467 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
468 {
469         BIGM       *bgm;
470         char       *buf;
471         bigm       *bptr;
472         int                     len,
473                                 charlen,
474                                 bytelen;
475         const char *eword;
476
477         *removeDups = false;
478
479         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
480         SET_VARSIZE(bgm, VARHDRSZ);
481
482         if (slen + LPADDING + RPADDING < 2 || slen == 0)
483                 return bgm;
484
485         bptr = GETARR(bgm);
486
487         buf = palloc(sizeof(char) * (slen + 4));
488
489         /*
490          * Extract bigrams from each substring extracted by get_wildcard_part.
491          */
492         eword = str;
493         while ((eword = get_wildcard_part(eword, slen - (eword - str),
494                                                                           buf, &bytelen, &charlen)) != NULL)
495         {
496                 /*
497                  * count bigrams
498                  */
499                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
500         }
501
502         pfree(buf);
503
504         if ((len = bptr - GETARR(bgm)) == 0)
505                 return bgm;
506
507         /*
508          * Make bigrams unique.
509          */
510         if (len > 1)
511         {
512                 bool    haveDups = false;
513
514                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
515                 if (haveDups)
516                 {
517                         *removeDups = true;
518                         len = unique_array(GETARR(bgm), len);
519                 }
520         }
521
522         SET_VARSIZE(bgm, CALCGTSIZE(len));
523
524         return bgm;
525 }
526
527 Datum
528 show_bigm(PG_FUNCTION_ARGS)
529 {
530         text       *in = PG_GETARG_TEXT_P(0);
531         BIGM       *bgm;
532         Datum      *d;
533         ArrayType  *a;
534         bigm       *ptr;
535         int                     i;
536
537         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
538         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
539
540         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
541         {
542                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
543                 d[i] = PointerGetDatum(item);
544         }
545
546         a = construct_array(
547                                                 d,
548                                                 ARRNELEM(bgm),
549                                                 TEXTOID,
550                                                 -1,
551                                                 false,
552                                                 'i'
553                 );
554
555         for (i = 0; i < ARRNELEM(bgm); i++)
556                 pfree(DatumGetPointer(d[i]));
557
558         pfree(d);
559         pfree(bgm);
560         PG_FREE_IF_COPY(in, 0);
561
562         PG_RETURN_POINTER(a);
563 }
564
565 static float4
566 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
567 {
568         bigm       *ptr1,
569                            *ptr2;
570         int                     count = 0;
571         int                     len1,
572                                 len2;
573
574         ptr1 = GETARR(bgm1);
575         ptr2 = GETARR(bgm2);
576
577         len1 = ARRNELEM(bgm1);
578         len2 = ARRNELEM(bgm2);
579
580         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
581         if (len1 <= 0 || len2 <= 0)
582                 return (float4) 0.0;
583
584         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
585         {
586                 int             res = CMPBIGM(ptr1, ptr2);
587
588                 if (res < 0)
589                         ptr1++;
590                 else if (res > 0)
591                         ptr2++;
592                 else
593                 {
594                         ptr1++;
595                         ptr2++;
596                         count++;
597                 }
598         }
599
600 #ifdef DIVUNION
601         return ((float4) count) / ((float4) (len1 + len2 - count));
602 #else
603         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
604 #endif
605 }
606
607 Datum
608 bigm_similarity(PG_FUNCTION_ARGS)
609 {
610         text       *in1 = PG_GETARG_TEXT_P(0);
611         text       *in2 = PG_GETARG_TEXT_P(1);
612         BIGM       *bgm1,
613                            *bgm2;
614         float4          res;
615
616         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
617         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
618
619         res = cnt_sml_bigm(bgm1, bgm2);
620
621         pfree(bgm1);
622         pfree(bgm2);
623         PG_FREE_IF_COPY(in1, 0);
624         PG_FREE_IF_COPY(in2, 1);
625
626         PG_RETURN_FLOAT4(res);
627 }
628
629 Datum
630 bigm_similarity_op(PG_FUNCTION_ARGS)
631 {
632         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
633                                                                                                                  PG_GETARG_DATUM(0),
634                                                                                                                  PG_GETARG_DATUM(1)));
635
636         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
637 }
638
639 Datum
640 likequery(PG_FUNCTION_ARGS)
641 {
642         text       *query = PG_GETARG_TEXT_PP(0);
643         const char *str;
644         int                     len;
645         const char *sp;
646         text       *result;
647         char       *rp;
648         int                     mblen;
649
650         str = VARDATA_ANY(query);
651         len = VARSIZE_ANY_EXHDR(query);
652
653         if (len == 0)
654                 PG_RETURN_NULL();
655
656         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
657         rp = VARDATA(result);
658         *rp++ = '%';
659
660         for (sp = str; (sp - str) < len;)
661         {
662                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
663                 {
664                         *rp++ = '\\';
665                         *rp++ = *sp++;
666                 }
667                 else if (IS_HIGHBIT_SET(*sp))
668                 {
669                         mblen = pg_mblen(sp);
670                         memcpy(rp, sp, mblen);
671                         rp += mblen;
672                         sp += mblen;
673                 }
674                 else
675                         *rp++ = *sp++;
676         }
677
678         *rp++ = '%';
679         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
680
681         PG_RETURN_TEXT_P(result);
682 }
683
684 inline int
685 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
686 {
687         int                     i;
688         int                     len = Min(len1, len2);
689
690         for (i = 0; i < len; i++, arg1++, arg2++)
691         {
692                 if (*arg1 == *arg2)
693                         continue;
694                 if (*arg1 < *arg2)
695                         return -1;
696                 else
697                         return 1;
698         }
699
700         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
701 }
702
703 Datum
704 bigmtextcmp(PG_FUNCTION_ARGS)
705 {
706         text    *arg1 = PG_GETARG_TEXT_PP(0);
707         text    *arg2 = PG_GETARG_TEXT_PP(1);
708         char    *a1p = VARDATA_ANY(arg1);
709         char    *a2p = VARDATA_ANY(arg2);
710         int             len1 = VARSIZE_ANY_EXHDR(arg1);
711         int             len2 = VARSIZE_ANY_EXHDR(arg2);
712
713         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
714 }