/*-------------------------------------------------------------------------
*
+ * Portions Copyright (c) 2017-2020, pg_bigm Development Group
+ * Portions Copyright (c) 2013-2016, NTT DATA Corporation
* Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
*
* Changelog:
- * 2013/01/09
- * Support full text search using bigrams.
- * Author: NTT DATA Corporation
+ * 2013/01/09
+ * Support full text search using bigrams.
+ * Author: NTT DATA Corporation
*
*-------------------------------------------------------------------------
*/
#include "catalog/pg_type.h"
#include "tsearch/ts_locale.h"
#include "utils/array.h"
-
+#include "utils/memutils.h"
PG_MODULE_MAGIC;
/* Last update date of pg_bigm */
-#define BIGM_LAST_UPDATE "2013.04.05"
+#define BIGM_LAST_UPDATE "2020.02.28"
/* GUC variable */
-bool bigm_enable_recheck = false;
-int bigm_gin_key_limit = 0;
-char *bigm_last_update = NULL;
+bool bigm_enable_recheck = false;
+int bigm_gin_key_limit = 0;
+double bigm_similarity_limit = 0.3;
+char *bigm_last_update = NULL;
PG_FUNCTION_INFO_V1(show_bigm);
-Datum show_bigm(PG_FUNCTION_ARGS);
-
PG_FUNCTION_INFO_V1(bigmtextcmp);
-Datum bigmtextcmp(PG_FUNCTION_ARGS);
-
PG_FUNCTION_INFO_V1(likequery);
+PG_FUNCTION_INFO_V1(bigm_similarity);
+PG_FUNCTION_INFO_V1(bigm_similarity_op);
+
+/*
+ * Since PostgreSQL 9.4 the PG_FUNCTION_INFO_V1 macro declares each function's
+ * prototype itself, so the explicit prototypes below are needed only when
+ * building against 9.3 or earlier.
+ */
+#if PG_VERSION_NUM < 90400
+Datum show_bigm(PG_FUNCTION_ARGS);
+Datum bigmtextcmp(PG_FUNCTION_ARGS);
Datum likequery(PG_FUNCTION_ARGS);
+Datum bigm_similarity(PG_FUNCTION_ARGS);
+Datum bigm_similarity_op(PG_FUNCTION_ARGS);
+#endif
void _PG_init(void);
void _PG_fini(void);
NULL,
NULL);
+ DefineCustomRealVariable("pg_bigm.similarity_limit",
+ "Sets the similarity threshold used by the "
+ "=% operator.",
+ NULL,
+ &bigm_similarity_limit,
+ 0.3,
+ 0.0, 1.0,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
/* Can't be set in postgresql.conf */
DefineCustomStringVariable("pg_bigm.last_update",
"Shows the last update date of pg_bigm.",
static int
comp_bigm(const void *a, const void *b, void *arg)
{
- int res;
- bool *haveDups = (bool *) arg;
+ int res;
+ bool *haveDups = (bool *) arg;
res = CMPBIGM(a, b);
return beginword;
}
-/*
+/*
* The function is named compact_bigram to maintain consistency with pg_trgm,
* though it does not reduce multibyte characters to hash values like in
* compact_trigram.
char *bword,
*eword;
- bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
+ /*
+ * Guard against possible overflow in the palloc requests below.
+ * We need to prevent integer overflow in the multiplications here.
+ */
+ if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
+ (Size) slen > MaxAllocSize - 4)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("out of memory")));
+
+ bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
SET_VARSIZE(bgm, VARHDRSZ);
if (slen + LPADDING + RPADDING < 2 || slen == 0)
* count bigrams
*/
bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
- charlen + LPADDING + RPADDING);
+ charlen + LPADDING + RPADDING);
}
pfree(buf);
if ((len = bptr - GETARR(bgm)) == 0)
return bgm;
- if (len > 0)
+ /*
+ * Make bigrams unique.
+ */
+ if (len > 1)
{
- bool haveDups = false;
+ bool haveDups = false;
qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
if (haveDups)
}
/*
- * Extract the next non-wildcard part of a search string, ie, a word bounded
+ * Extract the next non-wildcard part of a search string, i.e. a word bounded
* by '_' or '%' meta-characters, non-word characters or string end.
*
* str: source string, of length lenstr bytes (need not be null-terminated)
const char *beginword = str;
const char *endword;
char *s = buf;
- bool in_leading_wildcard_meta = false;
- bool in_trailing_wildcard_meta = false;
+ bool in_leading_wildcard_meta = false;
+ bool in_trailing_wildcard_meta = false;
bool in_escape = false;
int clen;
else
{
/*
- * Back up endword to the escape character when stopping at
- * an escaped char, so that subsequent get_wildcard_part will
+ * Back up endword to the escape character when stopping at an
+ * escaped char, so that subsequent get_wildcard_part will
* restart from the escape character. We assume here that
* escape chars are single-byte.
*/
*removeDups = false;
- bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
+ /*
+ * Guard against possible overflow in the palloc requests below.
+ * We need to prevent integer overflow in the multiplications here.
+ */
+ if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
+ (Size) slen > MaxAllocSize - 4)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("out of memory")));
+
+ bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
SET_VARSIZE(bgm, VARHDRSZ);
if (slen + LPADDING + RPADDING < 2 || slen == 0)
/*
* Make bigrams unique.
*/
- if (len > 0)
+ if (len > 1)
{
- bool haveDups = false;
+ bool haveDups = false;
qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
if (haveDups)
for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
{
text *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
+
d[i] = PointerGetDatum(item);
}
PG_RETURN_POINTER(a);
}
+/*
+ * cnt_sml_bigm
+ *		Similarity of two bigram arrays, as a float4.
+ *
+ * Counts the elements that compare equal under CMPBIGM while stepping through
+ * both arrays in parallel, then divides that count by
+ *	- the element count of the larger array (default build), or
+ *	- the number of elements in the union of the two arrays (when DIVUNION is
+ *	  defined).
+ *
+ * The parallel walk advances whichever side compares lower, so it only finds
+ * every match if both arrays are ordered by CMPBIGM; callers are expected to
+ * pass arrays in that order.
+ *
+ * Returns 0 if either array is empty.
+ */
+static float4
+cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
+{
+	int			n1 = ARRNELEM(bgm1);
+	int			n2 = ARRNELEM(bgm2);
+	bigm	   *lhs = GETARR(bgm1);
+	bigm	   *rhs = GETARR(bgm2);
+	bigm	   *lhs_end;
+	bigm	   *rhs_end;
+	int			common = 0;
+	int			denom;
+
+	/*
+	 * An empty side cannot share anything with the other one.  Bailing out
+	 * here also keeps the division below from ever seeing a zero divisor.
+	 */
+	if (n1 <= 0 || n2 <= 0)
+		return (float4) 0.0;
+
+	lhs_end = lhs + n1;
+	rhs_end = rhs + n2;
+
+	while (lhs < lhs_end && rhs < rhs_end)
+	{
+		int			cmp = CMPBIGM(lhs, rhs);
+
+		if (cmp == 0)
+		{
+			/* same bigram on both sides: count it and step past it */
+			common++;
+			lhs++;
+			rhs++;
+		}
+		else if (cmp < 0)
+			lhs++;
+		else
+			rhs++;
+	}
+
+#ifdef DIVUNION
+	denom = n1 + n2 - common;
+#else
+	denom = (n1 > n2) ? n1 : n2;
+#endif
+
+	return ((float4) common) / ((float4) denom);
+}
+
+/*
+ * bigm_similarity
+ *		SQL-callable function: similarity of two text values.
+ *
+ * Arguments: 0 and 1 are the text values to compare.
+ * Returns: float4 computed by cnt_sml_bigm() from the bigram arrays that
+ * generate_bigm() builds for each argument.
+ *
+ * The arguments are fetched with PG_GETARG_TEXT_PP, the same way
+ * bigmtextcmp() does, so a value stored with a short varlena header is used
+ * in place instead of being copied just to widen its header.  Because the
+ * header size is then not fixed, the data pointer and length must be taken
+ * with VARDATA_ANY / VARSIZE_ANY_EXHDR rather than VARDATA / VARSIZE.
+ */
+Datum
+bigm_similarity(PG_FUNCTION_ARGS)
+{
+	text	   *in1 = PG_GETARG_TEXT_PP(0);
+	text	   *in2 = PG_GETARG_TEXT_PP(1);
+	BIGM	   *bgm1,
+			   *bgm2;
+	float4		res;
+
+	bgm1 = generate_bigm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
+	bgm2 = generate_bigm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
+
+	res = cnt_sml_bigm(bgm1, bgm2);
+
+	/* Release the bigram arrays and any detoasted copies of the inputs. */
+	pfree(bgm1);
+	pfree(bgm2);
+	PG_FREE_IF_COPY(in1, 0);
+	PG_FREE_IF_COPY(in2, 1);
+
+	PG_RETURN_FLOAT4(res);
+}
+
+/*
+ * bigm_similarity_op
+ *		Boolean form of bigm_similarity().
+ *
+ * Passes both arguments through to bigm_similarity() unchanged and returns
+ * true when the resulting similarity is greater than or equal to
+ * bigm_similarity_limit (the pg_bigm.similarity_limit setting), compared at
+ * float4 precision.
+ */
+Datum
+bigm_similarity_op(PG_FUNCTION_ARGS)
+{
+	Datum		lhs = PG_GETARG_DATUM(0);
+	Datum		rhs = PG_GETARG_DATUM(1);
+	float4		sml;
+	float4		threshold;
+
+	sml = DatumGetFloat4(DirectFunctionCall2(bigm_similarity, lhs, rhs));
+	threshold = (float4) bigm_similarity_limit;
+
+	PG_RETURN_BOOL(sml >= threshold);
+}
+
Datum
likequery(PG_FUNCTION_ARGS)
{
if (len == 0)
PG_RETURN_NULL();
- result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
+ result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
rp = VARDATA(result);
*rp++ = '%';
PG_RETURN_TEXT_P(result);
}
-inline int
-bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
-{
- int i;
- int len = Min(len1, len2);
-
- for (i = 0; i < len; i++, arg1++, arg2++)
- {
- if (*arg1 == *arg2)
- continue;
- if (*arg1 < *arg2)
- return -1;
- else
- return 1;
- }
-
- return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
-}
-
Datum
bigmtextcmp(PG_FUNCTION_ARGS)
{
- text *arg1 = PG_GETARG_TEXT_PP(0);
- text *arg2 = PG_GETARG_TEXT_PP(1);
- char *a1p = VARDATA_ANY(arg1);
- char *a2p = VARDATA_ANY(arg2);
- int len1 = VARSIZE_ANY_EXHDR(arg1);
- int len2 = VARSIZE_ANY_EXHDR(arg2);
+ text *arg1 = PG_GETARG_TEXT_PP(0); /* first text operand; compared with arg2 by bigmstrcmp() */
+ text *arg2 = PG_GETARG_TEXT_PP(1); /* second text operand */
+ char *a1p = VARDATA_ANY(arg1); /* start of arg1's data, past its varlena header */
+ char *a2p = VARDATA_ANY(arg2); /* start of arg2's data, past its varlena header */
+ int len1 = VARSIZE_ANY_EXHDR(arg1); /* length of arg1's data in bytes, header excluded */
+ int len2 = VARSIZE_ANY_EXHDR(arg2); /* length of arg2's data in bytes, header excluded */
PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
}