OSDN Git Service

Update BIGM_LAST_UPDATE to '2020.02.28'.
[pgbigm/pg_bigm.git] / bigm_op.c
index 4b4207a..a9ae73a 100644 (file)
--- a/bigm_op.c
+++ b/bigm_op.c
@@ -1,11 +1,13 @@
 /*-------------------------------------------------------------------------
  *
+ * Portions Copyright (c) 2017-2020, pg_bigm Development Group
+ * Portions Copyright (c) 2013-2016, NTT DATA Corporation
  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
  *
  * Changelog:
- *   2013/01/09
- *   Support full text search using bigrams.
- *   Author: NTT DATA Corporation
+ *      2013/01/09
+ *      Support full text search using bigrams.
+ *      Author: NTT DATA Corporation
  *
  *-------------------------------------------------------------------------
  */
 #include "catalog/pg_type.h"
 #include "tsearch/ts_locale.h"
 #include "utils/array.h"
-
+#include "utils/memutils.h"
 
 PG_MODULE_MAGIC;
 
 /* Last update date of pg_bigm */
-#define        BIGM_LAST_UPDATE        "2013.04.05"
+#define BIGM_LAST_UPDATE       "2020.02.28"
 
 /* GUC variable */
-bool   bigm_enable_recheck = false;
-int            bigm_gin_key_limit = 0;
-char   *bigm_last_update = NULL;
+bool           bigm_enable_recheck = false;
+int                    bigm_gin_key_limit = 0;
+double         bigm_similarity_limit = 0.3;     /* pg_bigm.similarity_limit: cutoff applied by bigm_similarity_op */
+char      *bigm_last_update = NULL;
 
 PG_FUNCTION_INFO_V1(show_bigm);
-Datum          show_bigm(PG_FUNCTION_ARGS);
-
 PG_FUNCTION_INFO_V1(bigmtextcmp);
-Datum          bigmtextcmp(PG_FUNCTION_ARGS);
-
 PG_FUNCTION_INFO_V1(likequery);
+PG_FUNCTION_INFO_V1(bigm_similarity);
+PG_FUNCTION_INFO_V1(bigm_similarity_op);
+
+/*
+ * Since PostgreSQL 9.4 the PG_FUNCTION_INFO_V1 macro declares the function
+ * prototype itself, so the explicit prototypes below are needed only when
+ * building against 9.3 or earlier.
+ */
+#if PG_VERSION_NUM < 90400
+Datum          show_bigm(PG_FUNCTION_ARGS);
+Datum          bigmtextcmp(PG_FUNCTION_ARGS);
 Datum          likequery(PG_FUNCTION_ARGS);
+Datum          bigm_similarity(PG_FUNCTION_ARGS);
+Datum          bigm_similarity_op(PG_FUNCTION_ARGS);
+#endif
 
 void           _PG_init(void);
 void           _PG_fini(void);
@@ -71,6 +84,19 @@ _PG_init(void)
                                                        NULL,
                                                        NULL);
 
+       DefineCustomRealVariable("pg_bigm.similarity_limit",
+                                                        "Sets the similarity threshold used by the "
+                                                        "=% operator.",
+                                                        NULL,
+                                                        &bigm_similarity_limit,
+                                                        0.3,
+                                                        0.0, 1.0,
+                                                        PGC_USERSET,
+                                                        0,
+                                                        NULL,
+                                                        NULL,
+                                                        NULL);
+
        /* Can't be set in postgresql.conf */
        DefineCustomStringVariable("pg_bigm.last_update",
                                                           "Shows the last update date of pg_bigm.",
@@ -94,8 +120,8 @@ _PG_fini(void)
 static int
 comp_bigm(const void *a, const void *b, void *arg)
 {
-       int             res;
-       bool    *haveDups = (bool *) arg;
+       int                     res;
+       bool       *haveDups = (bool *) arg;
 
        res = CMPBIGM(a, b);
 
@@ -154,7 +180,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
        return beginword;
 }
 
-/* 
+/*
  * The function is named compact_bigram to maintain consistency with pg_trgm,
  * though it does not reduce multibyte characters to hash values like in
  * compact_trigram.
@@ -226,7 +252,17 @@ generate_bigm(char *str, int slen)
        char       *bword,
                           *eword;
 
-       bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
+       /*
+        * Guard against possible overflow in the palloc requests below.
+        * We need to prevent integer overflow in the multiplications here.
+        */
+       if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
+               (Size) slen > MaxAllocSize - 4)
+               ereport(ERROR,
+                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                errmsg("out of memory")));
+
+       bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
        SET_VARSIZE(bgm, VARHDRSZ);
 
        if (slen + LPADDING + RPADDING < 2 || slen == 0)
@@ -256,7 +292,7 @@ generate_bigm(char *str, int slen)
                 * count bigrams
                 */
                bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
-                                                        charlen + LPADDING + RPADDING);
+                                                       charlen + LPADDING + RPADDING);
        }
 
        pfree(buf);
@@ -264,9 +300,12 @@ generate_bigm(char *str, int slen)
        if ((len = bptr - GETARR(bgm)) == 0)
                return bgm;
 
-       if (len > 0)
+       /*
+        * Make bigrams unique.
+        */
+       if (len > 1)
        {
-               bool    haveDups = false;
+               bool            haveDups = false;
 
                qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
                if (haveDups)
@@ -279,7 +318,7 @@ generate_bigm(char *str, int slen)
 }
 
 /*
- * Extract the next non-wildcard part of a search string, ie, a word bounded
+ * Extract the next non-wildcard part of a search string, i.e. a word bounded
  * by '_' or '%' meta-characters, non-word characters or string end.
  *
  * str: source string, of length lenstr bytes (need not be null-terminated)
@@ -300,8 +339,8 @@ get_wildcard_part(const char *str, int lenstr,
        const char *beginword = str;
        const char *endword;
        char       *s = buf;
-       bool        in_leading_wildcard_meta = false;
-       bool        in_trailing_wildcard_meta = false;
+       bool            in_leading_wildcard_meta = false;
+       bool            in_trailing_wildcard_meta = false;
        bool            in_escape = false;
        int                     clen;
 
@@ -378,8 +417,8 @@ get_wildcard_part(const char *str, int lenstr,
                        else
                        {
                                /*
-                                * Back up endword to the escape character when stopping at
-                                * an escaped char, so that subsequent get_wildcard_part will
+                                * Back up endword to the escape character when stopping at an
+                                * escaped char, so that subsequent get_wildcard_part will
                                 * restart from the escape character.  We assume here that
                                 * escape chars are single-byte.
                                 */
@@ -453,7 +492,17 @@ generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 
        *removeDups = false;
 
-       bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
+       /*
+        * Guard against possible overflow in the palloc requests below.
+        * We need to prevent integer overflow in the multiplications here.
+        */
+       if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
+               (Size) slen > MaxAllocSize - 4)
+               ereport(ERROR,
+                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                errmsg("out of memory")));
+
+       bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
        SET_VARSIZE(bgm, VARHDRSZ);
 
        if (slen + LPADDING + RPADDING < 2 || slen == 0)
@@ -484,9 +533,9 @@ generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
        /*
         * Make bigrams unique.
         */
-       if (len > 0)
+       if (len > 1)
        {
-               bool    haveDups = false;
+               bool            haveDups = false;
 
                qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
                if (haveDups)
@@ -517,6 +566,7 @@ show_bigm(PG_FUNCTION_ARGS)
        for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
        {
                text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
+
                d[i] = PointerGetDatum(item);
        }
 
@@ -539,6 +589,80 @@ show_bigm(PG_FUNCTION_ARGS)
        PG_RETURN_POINTER(a);
 }
 
+static float4
+cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)   /* fraction of bigram entries the two arrays have in common */
+{
+       bigm       *ptr1,
+                          *ptr2;
+       int                     count = 0;      /* entries that compare equal in both arrays */
+       int                     len1,
+                               len2;
+
+       ptr1 = GETARR(bgm1);
+       ptr2 = GETARR(bgm2);
+
+       len1 = ARRNELEM(bgm1);
+       len2 = ARRNELEM(bgm2);
+
+       /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
+       if (len1 <= 0 || len2 <= 0)
+               return (float4) 0.0;
+
+       while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
+       {       /* NOTE(review): lockstep walk is only correct if both arrays are sorted by CMPBIGM -- confirm callers */
+               int                     res = CMPBIGM(ptr1, ptr2);
+
+               if (res < 0)
+                       ptr1++; /* *ptr1 orders first: skip it */
+               else if (res > 0)
+                       ptr2++; /* *ptr2 orders first: skip it */
+               else
+               {
+                       ptr1++;
+                       ptr2++;
+                       count++;
+               }
+       }
+
+#ifdef DIVUNION
+       return ((float4) count) / ((float4) (len1 + len2 - count));     /* over both lengths combined, matches counted once */
+#else
+       return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));     /* over the longer array's length */
+#endif
+}
+
+Datum
+bigm_similarity(PG_FUNCTION_ARGS)      /* SQL-callable: float4 bigram similarity of two text arguments */
+{
+       text       *in1 = PG_GETARG_TEXT_PP(0); /* packed (short-header) values accepted as-is: no copy just to widen the header */
+       text       *in2 = PG_GETARG_TEXT_PP(1);
+       BIGM       *bgm1,
+                          *bgm2;
+       float4          res;
+
+       bgm1 = generate_bigm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1)); /* _ANY macros handle either header form */
+       bgm2 = generate_bigm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
+
+       res = cnt_sml_bigm(bgm1, bgm2);
+
+       pfree(bgm1);    /* bigram arrays are scratch data for this call only */
+       pfree(bgm2);
+       PG_FREE_IF_COPY(in1, 0);        /* releases the argument only if fetching it made a copy */
+       PG_FREE_IF_COPY(in2, 1);
+
+       PG_RETURN_FLOAT4(res);
+}
+
+Datum
+bigm_similarity_op(PG_FUNCTION_ARGS)   /* true when bigm_similarity(arg0, arg1) reaches bigm_similarity_limit */
+{
+       float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
+                                                                                                                PG_GETARG_DATUM(0),
+                                                                                                                PG_GETARG_DATUM(1)));
+
+       PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);  /* limit is stored as double; narrowed so both sides are float4 */
+}
+
 Datum
 likequery(PG_FUNCTION_ARGS)
 {
@@ -556,7 +680,7 @@ likequery(PG_FUNCTION_ARGS)
        if (len == 0)
                PG_RETURN_NULL();
 
-       result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
+       result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
        rp = VARDATA(result);
        *rp++ = '%';
 
@@ -584,34 +708,15 @@ likequery(PG_FUNCTION_ARGS)
        PG_RETURN_TEXT_P(result);
 }
 
-inline int
-bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
-{
-       int                     i;
-       int                     len = Min(len1, len2);
-
-       for (i = 0; i < len; i++, arg1++, arg2++)
-       {
-               if (*arg1 == *arg2)
-                       continue;
-               if (*arg1 < *arg2)
-                       return -1;
-               else
-                       return 1;
-       }
-
-       return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
-}
-
 Datum
 bigmtextcmp(PG_FUNCTION_ARGS)
 {
-       text    *arg1 = PG_GETARG_TEXT_PP(0);
-       text    *arg2 = PG_GETARG_TEXT_PP(1);
-       char    *a1p = VARDATA_ANY(arg1);
-       char    *a2p = VARDATA_ANY(arg2);
-       int             len1 = VARSIZE_ANY_EXHDR(arg1);
-       int             len2 = VARSIZE_ANY_EXHDR(arg2);
+       text       *arg1 = PG_GETARG_TEXT_PP(0);        /* per PostgreSQL's fmgr macros, may keep a short varlena header */
+       text       *arg2 = PG_GETARG_TEXT_PP(1);
+       char       *a1p = VARDATA_ANY(arg1);    /* payload start for either header form */
+       char       *a2p = VARDATA_ANY(arg2);
+       int                     len1 = VARSIZE_ANY_EXHDR(arg1); /* payload length in bytes, header excluded */
+       int                     len2 = VARSIZE_ANY_EXHDR(arg2);
 
        PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 }