OSDN Git Service

Update the last release date of pg_bigm.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  *
5  * Changelog:
6  *   2013/01/09
7  *   Support full text search using bigrams.
8  *   Author: NTT DATA Corporation
9  *
10  *-------------------------------------------------------------------------
11  */
12 #include "postgres.h"
13
14 #include <ctype.h>
15
16 #include "bigm.h"
17
18 #include "catalog/pg_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "utils/array.h"
21
22
23 PG_MODULE_MAGIC;
24
25 /* Last update date of pg_bigm */
26 #define BIGM_LAST_UPDATE        "2013.04.05"
27
28 /* GUC variable */
29 bool    bigm_enable_recheck = false;
30 int             bigm_gin_key_limit = 0;
31 char    *bigm_last_update = NULL;
32
33 PG_FUNCTION_INFO_V1(show_bigm);
34 Datum           show_bigm(PG_FUNCTION_ARGS);
35
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
38
39 PG_FUNCTION_INFO_V1(likequery);
40 Datum           likequery(PG_FUNCTION_ARGS);
41
42 void            _PG_init(void);
43 void            _PG_fini(void);
44
45 void
46 _PG_init(void)
47 {
48         /* Define custom GUC variables */
49         DefineCustomBoolVariable("pg_bigm.enable_recheck",
50                                                          "Recheck that heap tuples fetched from index "
51                                                          "match the query.",
52                                                          NULL,
53                                                          &bigm_enable_recheck,
54                                                          true,
55                                                          PGC_USERSET,
56                                                          0,
57                                                          NULL,
58                                                          NULL,
59                                                          NULL);
60
61         DefineCustomIntVariable("pg_bigm.gin_key_limit",
62                                                         "Sets the maximum number of bi-gram keys allowed to "
63                                                         "use for GIN index search.",
64                                                         "Zero means no limit.",
65                                                         &bigm_gin_key_limit,
66                                                         0,
67                                                         0, INT_MAX,
68                                                         PGC_USERSET,
69                                                         0,
70                                                         NULL,
71                                                         NULL,
72                                                         NULL);
73
74         /* Can't be set in postgresql.conf */
75         DefineCustomStringVariable("pg_bigm.last_update",
76                                                            "Shows the last update date of pg_bigm.",
77                                                            NULL,
78                                                            &bigm_last_update,
79                                                            BIGM_LAST_UPDATE,
80                                                            PGC_INTERNAL,
81                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
82                                                            NULL,
83                                                            NULL,
84                                                            NULL);
85
86         EmitWarningsOnPlaceholders("pg_bigm");
87 }
88
89 void
90 _PG_fini(void)
91 {
92 }
93
94 static int
95 comp_bigm(const void *a, const void *b, void *arg)
96 {
97         int             res;
98         bool    *haveDups = (bool *) arg;
99
100         res = CMPBIGM(a, b);
101
102         if (res == 0)
103                 *haveDups = true;
104
105         return res;
106 }
107
108 static int
109 unique_array(bigm *a, int len)
110 {
111         bigm       *curend,
112                            *tmp;
113
114         curend = tmp = a;
115         while (tmp - a < len)
116                 if (CMPBIGM(tmp, curend))
117                 {
118                         curend++;
119                         if (curend != tmp)
120                                 memcpy(curend, tmp, BIGMSIZE);
121                         tmp++;
122                 }
123                 else
124                         tmp++;
125
126         return curend + 1 - a;
127 }
128
129 #define iswordchr(c)    (!t_isspace(c))
130
131 /*
132  * Finds first word in string, returns pointer to the word,
133  * endword points to the character after word
134  */
135 static char *
136 find_word(char *str, int lenstr, char **endword, int *charlen)
137 {
138         char       *beginword = str;
139
140         while (beginword - str < lenstr && !iswordchr(beginword))
141                 beginword += pg_mblen(beginword);
142
143         if (beginword - str >= lenstr)
144                 return NULL;
145
146         *endword = beginword;
147         *charlen = 0;
148         while (*endword - str < lenstr && iswordchr(*endword))
149         {
150                 *endword += pg_mblen(*endword);
151                 (*charlen)++;
152         }
153
154         return beginword;
155 }
156
157 #ifdef USE_WIDE_UPPER_LOWER
158 static void
159 cnt_bigram(bigm *bptr, char *str, int bytelen)
160 {
161         CPBIGM(bptr, str, bytelen);
162 }
163 #endif
164
165 /*
166  * Adds bigrams from words (already padded).
167  */
168 static bigm *
169 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
170 {
171         char       *ptr = str;
172
173         if (charlen < 2)
174         {
175 #ifdef USE_WIDE_UPPER_LOWER
176                 cnt_bigram(bptr, ptr, pg_mblen(str));
177 #else
178                 CPBIGM(bptr, ptr, 1);
179 #endif
180                 bptr->pmatch = true;
181                 bptr++;
182                 return bptr;
183         }
184
185 #ifdef USE_WIDE_UPPER_LOWER
186         if (pg_database_encoding_max_length() > 1)
187         {
188                 int                     lenfirst = pg_mblen(str),
189                                         lenlast = pg_mblen(str + lenfirst);
190
191                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
192                 {
193                         cnt_bigram(bptr, ptr, lenfirst + lenlast);
194
195                         ptr += lenfirst;
196                         bptr++;
197
198                         lenfirst = lenlast;
199                         lenlast = pg_mblen(ptr + lenfirst);
200                 }
201         }
202         else
203 #endif
204         {
205                 Assert(bytelen == charlen);
206
207                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
208                 {
209                         CPBIGM(bptr, ptr, 2);
210                         ptr++;
211                         bptr++;
212                 }
213         }
214
215         return bptr;
216 }
217
218 BIGM *
219 generate_bigm(char *str, int slen)
220 {
221         BIGM       *bgm;
222         char       *buf;
223         bigm       *bptr;
224         int                     len,
225                                 charlen,
226                                 bytelen;
227         char       *bword,
228                            *eword;
229
230         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
231         SET_VARSIZE(bgm, VARHDRSZ);
232
233         if (slen + LPADDING + RPADDING < 2 || slen == 0)
234                 return bgm;
235
236         bptr = GETARR(bgm);
237
238         buf = palloc(sizeof(char) * (slen + 4));
239
240         if (LPADDING > 0)
241         {
242                 *buf = ' ';
243                 if (LPADDING > 1)
244                         *(buf + 1) = ' ';
245         }
246
247         eword = str;
248         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
249         {
250                 bytelen = eword - bword;
251                 memcpy(buf + LPADDING, bword, bytelen);
252
253                 buf[LPADDING + bytelen] = ' ';
254                 buf[LPADDING + bytelen + 1] = ' ';
255
256                 /*
257                  * count bigrams
258                  */
259                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
260                                                          charlen + LPADDING + RPADDING);
261         }
262
263         pfree(buf);
264
265         if ((len = bptr - GETARR(bgm)) == 0)
266                 return bgm;
267
268         if (len > 0)
269         {
270                 bool    haveDups = false;
271
272                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
273                 if (haveDups)
274                         len = unique_array(GETARR(bgm), len);
275         }
276
277         SET_VARSIZE(bgm, CALCGTSIZE(len));
278
279         return bgm;
280 }
281
282 /*
283  * Extract the next non-wildcard part of a search string, ie, a word bounded
284  * by '_' or '%' meta-characters, non-word characters or string end.
285  *
286  * str: source string, of length lenstr bytes (need not be null-terminated)
287  * buf: where to return the substring (must be long enough)
288  * *bytelen: receives byte length of the found substring
289  * *charlen: receives character length of the found substring
290  *
291  * Returns pointer to end+1 of the found substring in the source string.
292  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
293  *
294  * If the found word is bounded by non-word characters or string boundaries
295  * then this function will include corresponding padding spaces into buf.
296  */
297 static const char *
298 get_wildcard_part(const char *str, int lenstr,
299                                   char *buf, int *bytelen, int *charlen)
300 {
301         const char *beginword = str;
302         const char *endword;
303         char       *s = buf;
304         bool        in_leading_wildcard_meta = false;
305         bool        in_trailing_wildcard_meta = false;
306         bool            in_escape = false;
307         int                     clen;
308
309         /*
310          * Find the first word character, remembering whether preceding character
311          * was wildcard meta-character.  Note that the in_escape state persists
312          * from this loop to the next one, since we may exit at a word character
313          * that is in_escape.
314          */
315         while (beginword - str < lenstr)
316         {
317                 if (in_escape)
318                 {
319                         if (iswordchr(beginword))
320                                 break;
321                         in_escape = false;
322                         in_leading_wildcard_meta = false;
323                 }
324                 else
325                 {
326                         if (ISESCAPECHAR(beginword))
327                                 in_escape = true;
328                         else if (ISWILDCARDCHAR(beginword))
329                                 in_leading_wildcard_meta = true;
330                         else if (iswordchr(beginword))
331                                 break;
332                         else
333                                 in_leading_wildcard_meta = false;
334                 }
335                 beginword += pg_mblen(beginword);
336         }
337
338         /*
339          * Handle string end.
340          */
341         if (beginword - str >= lenstr)
342                 return NULL;
343
344         /*
345          * Add left padding spaces if preceding character wasn't wildcard
346          * meta-character.
347          */
348         *charlen = 0;
349         if (!in_leading_wildcard_meta)
350         {
351                 if (LPADDING > 0)
352                 {
353                         *s++ = ' ';
354                         (*charlen)++;
355                         if (LPADDING > 1)
356                         {
357                                 *s++ = ' ';
358                                 (*charlen)++;
359                         }
360                 }
361         }
362
363         /*
364          * Copy data into buf until wildcard meta-character, non-word character or
365          * string boundary.  Strip escapes during copy.
366          */
367         endword = beginword;
368         while (endword - str < lenstr)
369         {
370                 clen = pg_mblen(endword);
371                 if (in_escape)
372                 {
373                         if (iswordchr(endword))
374                         {
375                                 memcpy(s, endword, clen);
376                                 (*charlen)++;
377                                 s += clen;
378                         }
379                         else
380                         {
381                                 /*
382                                  * Back up endword to the escape character when stopping at
383                                  * an escaped char, so that subsequent get_wildcard_part will
384                                  * restart from the escape character.  We assume here that
385                                  * escape chars are single-byte.
386                                  */
387                                 endword--;
388                                 break;
389                         }
390                         in_escape = false;
391                 }
392                 else
393                 {
394                         if (ISESCAPECHAR(endword))
395                                 in_escape = true;
396                         else if (ISWILDCARDCHAR(endword))
397                         {
398                                 in_trailing_wildcard_meta = true;
399                                 break;
400                         }
401                         else if (iswordchr(endword))
402                         {
403                                 memcpy(s, endword, clen);
404                                 (*charlen)++;
405                                 s += clen;
406                         }
407                         else
408                                 break;
409                 }
410                 endword += clen;
411         }
412
413         /*
414          * Add right padding spaces if next character isn't wildcard
415          * meta-character.
416          */
417         if (!in_trailing_wildcard_meta)
418         {
419                 if (RPADDING > 0)
420                 {
421                         *s++ = ' ';
422                         (*charlen)++;
423                         if (RPADDING > 1)
424                         {
425                                 *s++ = ' ';
426                                 (*charlen)++;
427                         }
428                 }
429         }
430
431         *bytelen = s - buf;
432         return endword;
433 }
434
435 /*
436  * Generates bigrams for wildcard search string.
437  *
438  * Returns array of bigrams that must occur in any string that matches the
439  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
440  * " a", "bcd" would be extracted.
441  *
442  * Set 'removeDups' to true if duplicate bigrams are removed.
443  */
444 BIGM *
445 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
446 {
447         BIGM       *bgm;
448         char       *buf;
449         bigm       *bptr;
450         int                     len,
451                                 charlen,
452                                 bytelen;
453         const char *eword;
454
455         *removeDups = false;
456
457         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
458         SET_VARSIZE(bgm, VARHDRSZ);
459
460         if (slen + LPADDING + RPADDING < 2 || slen == 0)
461                 return bgm;
462
463         bptr = GETARR(bgm);
464
465         buf = palloc(sizeof(char) * (slen + 4));
466
467         /*
468          * Extract bigrams from each substring extracted by get_wildcard_part.
469          */
470         eword = str;
471         while ((eword = get_wildcard_part(eword, slen - (eword - str),
472                                                                           buf, &bytelen, &charlen)) != NULL)
473         {
474                 /*
475                  * count bigrams
476                  */
477                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
478         }
479
480         pfree(buf);
481
482         if ((len = bptr - GETARR(bgm)) == 0)
483                 return bgm;
484
485         /*
486          * Make bigrams unique.
487          */
488         if (len > 0)
489         {
490                 bool    haveDups = false;
491
492                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
493                 if (haveDups)
494                 {
495                         *removeDups = true;
496                         len = unique_array(GETARR(bgm), len);
497                 }
498         }
499
500         SET_VARSIZE(bgm, CALCGTSIZE(len));
501
502         return bgm;
503 }
504
505 Datum
506 show_bigm(PG_FUNCTION_ARGS)
507 {
508         text       *in = PG_GETARG_TEXT_P(0);
509         BIGM       *bgm;
510         Datum      *d;
511         ArrayType  *a;
512         bigm       *ptr;
513         int                     i;
514
515         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
516         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
517
518         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
519         {
520                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
521                 d[i] = PointerGetDatum(item);
522         }
523
524         a = construct_array(
525                                                 d,
526                                                 ARRNELEM(bgm),
527                                                 TEXTOID,
528                                                 -1,
529                                                 false,
530                                                 'i'
531                 );
532
533         for (i = 0; i < ARRNELEM(bgm); i++)
534                 pfree(DatumGetPointer(d[i]));
535
536         pfree(d);
537         pfree(bgm);
538         PG_FREE_IF_COPY(in, 0);
539
540         PG_RETURN_POINTER(a);
541 }
542
543 Datum
544 likequery(PG_FUNCTION_ARGS)
545 {
546         text       *query = PG_GETARG_TEXT_PP(0);
547         const char *str;
548         int                     len;
549         const char *sp;
550         text       *result;
551         char       *rp;
552         int                     mblen;
553
554         str = VARDATA_ANY(query);
555         len = VARSIZE_ANY_EXHDR(query);
556
557         if (len == 0)
558                 PG_RETURN_NULL();
559
560         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
561         rp = VARDATA(result);
562         *rp++ = '%';
563
564         for (sp = str; (sp - str) < len;)
565         {
566                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
567                 {
568                         *rp++ = '\\';
569                         *rp++ = *sp++;
570                 }
571                 else if (IS_HIGHBIT_SET(*sp))
572                 {
573                         mblen = pg_mblen(sp);
574                         memcpy(rp, sp, mblen);
575                         rp += mblen;
576                         sp += mblen;
577                 }
578                 else
579                         *rp++ = *sp++;
580         }
581
582         *rp++ = '%';
583         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
584
585         PG_RETURN_TEXT_P(result);
586 }
587
588 inline int
589 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
590 {
591         int                     i;
592         int                     len = Min(len1, len2);
593
594         for (i = 0; i < len; i++, arg1++, arg2++)
595         {
596                 if (*arg1 == *arg2)
597                         continue;
598                 if (*arg1 < *arg2)
599                         return -1;
600                 else
601                         return 1;
602         }
603
604         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
605 }
606
607 Datum
608 bigmtextcmp(PG_FUNCTION_ARGS)
609 {
610         text    *arg1 = PG_GETARG_TEXT_PP(0);
611         text    *arg2 = PG_GETARG_TEXT_PP(1);
612         char    *a1p = VARDATA_ANY(arg1);
613         char    *a2p = VARDATA_ANY(arg2);
614         int             len1 = VARSIZE_ANY_EXHDR(arg1);
615         int             len2 = VARSIZE_ANY_EXHDR(arg2);
616
617         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
618 }