1 /*-------------------------------------------------------------------------
3 * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
7 * Support full text search using bigrams.
8 * Author: NTT DATA Corporation
10 *-------------------------------------------------------------------------
18 #include "catalog/pg_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "utils/array.h"
25 /* Last update date of pg_bigm */
26 #define BIGM_LAST_UPDATE "2013.04.05"
29 bool bigm_enable_recheck = false;
30 int bigm_gin_key_limit = 0;
31 char *bigm_last_update = NULL;
33 PG_FUNCTION_INFO_V1(show_bigm);
34 Datum show_bigm(PG_FUNCTION_ARGS);
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 Datum bigmtextcmp(PG_FUNCTION_ARGS);
39 PG_FUNCTION_INFO_V1(likequery);
40 Datum likequery(PG_FUNCTION_ARGS);
48 /* Define custom GUC variables */
49 DefineCustomBoolVariable("pg_bigm.enable_recheck",
50 "Recheck that heap tuples fetched from index "
61 DefineCustomIntVariable("pg_bigm.gin_key_limit",
62 "Sets the maximum number of bi-gram keys allowed to "
63 "use for GIN index search.",
64 "Zero means no limit.",
74 /* Can't be set in postgresql.conf */
75 DefineCustomStringVariable("pg_bigm.last_update",
76 "Shows the last update date of pg_bigm.",
81 GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
86 EmitWarningsOnPlaceholders("pg_bigm");
95 comp_bigm(const void *a, const void *b, void *arg)
98 bool *haveDups = (bool *) arg;
109 unique_array(bigm *a, int len)
115 while (tmp - a < len)
116 if (CMPBIGM(tmp, curend))
120 memcpy(curend, tmp, BIGMSIZE);
126 return curend + 1 - a;
129 #define iswordchr(c) (!t_isspace(c))
132 * Finds the first word in the string and returns a pointer to it;
133 * *endword is set to point to the character just after the word.
136 find_word(char *str, int lenstr, char **endword, int *charlen)
138 char *beginword = str;
140 while (beginword - str < lenstr && !iswordchr(beginword))
141 beginword += pg_mblen(beginword);
143 if (beginword - str >= lenstr)
146 *endword = beginword;
148 while (*endword - str < lenstr && iswordchr(*endword))
150 *endword += pg_mblen(*endword);
157 #ifdef USE_WIDE_UPPER_LOWER
159 cnt_bigram(bigm *bptr, char *str, int bytelen)
161 CPBIGM(bptr, str, bytelen);
166 * Adds bigrams from words (already padded).
169 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
175 #ifdef USE_WIDE_UPPER_LOWER
176 cnt_bigram(bptr, ptr, pg_mblen(str));
178 CPBIGM(bptr, ptr, 1);
185 #ifdef USE_WIDE_UPPER_LOWER
186 if (pg_database_encoding_max_length() > 1)
188 int lenfirst = pg_mblen(str),
189 lenlast = pg_mblen(str + lenfirst);
191 while ((ptr - str) + lenfirst + lenlast <= bytelen)
193 cnt_bigram(bptr, ptr, lenfirst + lenlast);
199 lenlast = pg_mblen(ptr + lenfirst);
205 Assert(bytelen == charlen);
207 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
209 CPBIGM(bptr, ptr, 2);
219 generate_bigm(char *str, int slen)
230 bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
231 SET_VARSIZE(bgm, VARHDRSZ);
233 if (slen + LPADDING + RPADDING < 2 || slen == 0)
238 buf = palloc(sizeof(char) * (slen + 4));
248 while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
250 bytelen = eword - bword;
251 memcpy(buf + LPADDING, bword, bytelen);
253 buf[LPADDING + bytelen] = ' ';
254 buf[LPADDING + bytelen + 1] = ' ';
259 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
260 charlen + LPADDING + RPADDING);
265 if ((len = bptr - GETARR(bgm)) == 0)
270 bool haveDups = false;
272 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
274 len = unique_array(GETARR(bgm), len);
277 SET_VARSIZE(bgm, CALCGTSIZE(len));
283 * Extract the next non-wildcard part of a search string, ie, a word bounded
284 * by '_' or '%' meta-characters, non-word characters or string end.
286 * str: source string, of length lenstr bytes (need not be null-terminated)
287 * buf: where to return the substring (must be long enough)
288 * *bytelen: receives byte length of the found substring
289 * *charlen: receives character length of the found substring
291 * Returns pointer to end+1 of the found substring in the source string.
292 * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
294 * If the found word is bounded by non-word characters or string boundaries
295 * then this function will include corresponding padding spaces into buf.
298 get_wildcard_part(const char *str, int lenstr,
299 char *buf, int *bytelen, int *charlen)
301 const char *beginword = str;
304 bool in_leading_wildcard_meta = false;
305 bool in_trailing_wildcard_meta = false;
306 bool in_escape = false;
310 * Find the first word character, remembering whether preceding character
311 * was wildcard meta-character. Note that the in_escape state persists
312 * from this loop to the next one, since we may exit at a word character
315 while (beginword - str < lenstr)
319 if (iswordchr(beginword))
322 in_leading_wildcard_meta = false;
326 if (ISESCAPECHAR(beginword))
328 else if (ISWILDCARDCHAR(beginword))
329 in_leading_wildcard_meta = true;
330 else if (iswordchr(beginword))
333 in_leading_wildcard_meta = false;
335 beginword += pg_mblen(beginword);
341 if (beginword - str >= lenstr)
345 * Add left padding spaces if preceding character wasn't wildcard
349 if (!in_leading_wildcard_meta)
364 * Copy data into buf until wildcard meta-character, non-word character or
365 * string boundary. Strip escapes during copy.
368 while (endword - str < lenstr)
370 clen = pg_mblen(endword);
373 if (iswordchr(endword))
375 memcpy(s, endword, clen);
382 * Back up endword to the escape character when stopping at
383 * an escaped char, so that subsequent get_wildcard_part will
384 * restart from the escape character. We assume here that
385 * escape chars are single-byte.
394 if (ISESCAPECHAR(endword))
396 else if (ISWILDCARDCHAR(endword))
398 in_trailing_wildcard_meta = true;
401 else if (iswordchr(endword))
403 memcpy(s, endword, clen);
414 * Add right padding spaces if next character isn't wildcard
417 if (!in_trailing_wildcard_meta)
436 * Generates bigrams for wildcard search string.
438 * Returns array of bigrams that must occur in any string that matches the
439 * wildcard string. For example, given pattern "a%bcd%" the bigrams
440 * " a", "bc", "cd" would be extracted.
442 * Sets *removeDups to true if duplicate bigrams were removed.
445 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
457 bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
458 SET_VARSIZE(bgm, VARHDRSZ);
460 if (slen + LPADDING + RPADDING < 2 || slen == 0)
465 buf = palloc(sizeof(char) * (slen + 4));
468 * Extract bigrams from each substring extracted by get_wildcard_part.
471 while ((eword = get_wildcard_part(eword, slen - (eword - str),
472 buf, &bytelen, &charlen)) != NULL)
477 bptr = make_bigrams(bptr, buf, bytelen, charlen);
482 if ((len = bptr - GETARR(bgm)) == 0)
486 * Make bigrams unique.
490 bool haveDups = false;
492 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
496 len = unique_array(GETARR(bgm), len);
500 SET_VARSIZE(bgm, CALCGTSIZE(len));
506 show_bigm(PG_FUNCTION_ARGS)
508 text *in = PG_GETARG_TEXT_P(0);
515 bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
516 d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
518 for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
520 text *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
521 d[i] = PointerGetDatum(item);
533 for (i = 0; i < ARRNELEM(bgm); i++)
534 pfree(DatumGetPointer(d[i]));
538 PG_FREE_IF_COPY(in, 0);
540 PG_RETURN_POINTER(a);
544 likequery(PG_FUNCTION_ARGS)
546 text *query = PG_GETARG_TEXT_PP(0);
554 str = VARDATA_ANY(query);
555 len = VARSIZE_ANY_EXHDR(query);
560 result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
561 rp = VARDATA(result);
564 for (sp = str; (sp - str) < len;)
566 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
571 else if (IS_HIGHBIT_SET(*sp))
573 mblen = pg_mblen(sp);
574 memcpy(rp, sp, mblen);
583 SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
585 PG_RETURN_TEXT_P(result);
589 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
592 int len = Min(len1, len2);
594 for (i = 0; i < len; i++, arg1++, arg2++)
604 return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
608 bigmtextcmp(PG_FUNCTION_ARGS)
610 text *arg1 = PG_GETARG_TEXT_PP(0);
611 text *arg2 = PG_GETARG_TEXT_PP(1);
612 char *a1p = VARDATA_ANY(arg1);
613 char *a2p = VARDATA_ANY(arg2);
614 int len1 = VARSIZE_ANY_EXHDR(arg1);
615 int len2 = VARSIZE_ANY_EXHDR(arg2);
617 PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));