OSDN Git Service

Support text similarity search.
author MasaoFujii <masao.fujii@gmail.com>
Fri, 4 Oct 2013 03:41:22 +0000 (12:41 +0900)
committer MasaoFujii <masao.fujii@gmail.com>
Fri, 4 Oct 2013 03:41:22 +0000 (12:41 +0900)
Amit Langote, reviewed by Beena Emerson, and further edited by me.

bigm.h
bigm_gin.c
bigm_op.c
pg_bigm--1.1.sql
pg_bigm.control

diff --git a/bigm.h b/bigm.h
index b6fbcb4..5d8d014 100644 (file)
--- a/bigm.h
+++ b/bigm.h
@@ -17,8 +17,9 @@
 #include "utils/builtins.h"
 
 /* GUC variable */
-extern bool    bigm_enable_recheck;
-extern int     bigm_gin_key_limit;
+extern bool            bigm_enable_recheck;
+extern int             bigm_gin_key_limit;
+extern double  bigm_similarity_limit;
 
 /* options */
 #define LPADDING               1
@@ -26,6 +27,7 @@ extern int    bigm_gin_key_limit;
 
 /* operator strategy numbers */
 #define LikeStrategyNumber                     1
+#define SimilarityStrategyNumber       2
 
 typedef struct
 {
diff --git a/bigm_gin.c b/bigm_gin.c
index 5bcbd4b..da6d9dc 100644 (file)
--- a/bigm_gin.c
+++ b/bigm_gin.c
@@ -139,6 +139,12 @@ gin_extract_query_bigm(PG_FUNCTION_ARGS)
                                *recheck = true;
                        break;
                }
+               case SimilarityStrategyNumber:
+               {
+                       bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+                       bgmlen = ARRNELEM(bgm);
+                       break;
+               }
                default:
                        elog(ERROR, "unrecognized strategy number: %d", strategy);
                        bgm = NULL;                     /* keep compiler quiet */
@@ -191,6 +197,7 @@ gin_bigm_consistent(PG_FUNCTION_ARGS)
        bool       *recheck = (bool *) PG_GETARG_POINTER(5);
        bool            res;
        int32           i;
+       int32           ntrue;
 
        Assert(nkeys > 0);
 
@@ -217,6 +224,24 @@ gin_bigm_consistent(PG_FUNCTION_ARGS)
                                }
                        }
                        break;
+               case SimilarityStrategyNumber:
+                       /* Count the matches */
+                       ntrue = 0;
+                       for (i = 0; i < nkeys; i++)
+                       {
+                               if (check[i])
+                                       ntrue++;
+                       }
+#ifdef DIVUNION
+                       res = (nkeys == ntrue) ? true :
+                               ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >=
+                                 (float4) bigm_similarity_limit) ? true : false);
+#else
+                       res = (nkeys == 0) ? false :
+                               ((((((float4) ntrue) / ((float4) nkeys))) >=
+                                 (float4) bigm_similarity_limit) ? true : false);
+#endif
+                       break;
                default:
                        elog(ERROR, "unrecognized strategy number: %d", strategy);
                        res = false;            /* keep compiler quiet */
diff --git a/bigm_op.c b/bigm_op.c
index 77d2fe5..90b009f 100644 (file)
--- a/bigm_op.c
+++ b/bigm_op.c
@@ -28,6 +28,7 @@ PG_MODULE_MAGIC;
 /* GUC variable */
 bool   bigm_enable_recheck = false;
 int            bigm_gin_key_limit = 0;
+double bigm_similarity_limit = 0.3;
 char   *bigm_last_update = NULL;
 
 PG_FUNCTION_INFO_V1(show_bigm);
@@ -42,6 +43,9 @@ Datum         likequery(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(bigm_similarity);
 Datum          bigm_similarity(PG_FUNCTION_ARGS);
 
+PG_FUNCTION_INFO_V1(bigm_similarity_op);
+Datum          bigm_similarity_op(PG_FUNCTION_ARGS);
+
 void           _PG_init(void);
 void           _PG_fini(void);
 
@@ -74,6 +78,19 @@ _PG_init(void)
                                                        NULL,
                                                        NULL);
 
+       DefineCustomRealVariable("pg_bigm.similarity_limit",
+                                                        "Sets the similarity threshold used by the "
+                                                        "=% operator.",
+                                                        NULL,
+                                                        &bigm_similarity_limit,
+                                                        0.3,
+                                                        0.0, 1.0,
+                                                        PGC_USERSET,
+                                                        0,
+                                                        NULL,
+                                                        NULL,
+                                                        NULL);
+
        /* Can't be set in postgresql.conf */
        DefineCustomStringVariable("pg_bigm.last_update",
                                                           "Shows the last update date of pg_bigm.",
@@ -607,6 +624,16 @@ bigm_similarity(PG_FUNCTION_ARGS)
 }
 
 Datum
+bigm_similarity_op(PG_FUNCTION_ARGS)	/* boolean similarity test: true when bigm_similarity(arg 0, arg 1) >= bigm_similarity_limit */
+{
+       float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,	/* score both arguments by calling bigm_similarity directly */
+                                                                                                                PG_GETARG_DATUM(0),
+                                                                                                                PG_GETARG_DATUM(1)));
+
+       PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);	/* the double GUC value is narrowed to float4 before comparing; the bound is inclusive */
+}
+
+Datum
 likequery(PG_FUNCTION_ARGS)
 {
        text       *query = PG_GETARG_TEXT_PP(0);
diff --git a/pg_bigm--1.1.sql b/pg_bigm--1.1.sql
index 763e936..196be9e 100644 (file)
--- a/pg_bigm--1.1.sql
+++ b/pg_bigm--1.1.sql
@@ -11,6 +11,20 @@ RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 
+CREATE FUNCTION bigm_similarity_op(text,text)  -- boolean function used as the PROCEDURE of the =% operator
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;  -- STABLE rather than IMMUTABLE: the result depends on the pg_bigm.similarity_limit setting
+
+CREATE OPERATOR =% (  -- text similarity operator, evaluated by bigm_similarity_op
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = bigm_similarity_op,
+        COMMUTATOR = '=%',  -- declared as its own commutator
+        RESTRICT = contsel,  -- restriction selectivity estimator
+        JOIN = contjoinsel  -- join selectivity estimator
+);
+
 -- support functions for gin
 CREATE FUNCTION gin_extract_value_bigm(text, internal)
 RETURNS internal
@@ -42,6 +56,7 @@ CREATE OPERATOR CLASS gin_bigm_ops
 FOR TYPE text USING gin
 AS
         OPERATOR        1       pg_catalog.~~ (text, text),
+        OPERATOR        2       =% (text, text),
         FUNCTION        1       bigmtextcmp (text, text),
         FUNCTION        2       gin_extract_value_bigm (text, internal),
         FUNCTION        3       gin_extract_query_bigm (text, internal, int2, internal, internal, internal, internal),
diff --git a/pg_bigm.control b/pg_bigm.control
index 5f3e553..ec861e2 100644 (file)
--- a/pg_bigm.control
+++ b/pg_bigm.control
@@ -1,5 +1,5 @@
 # pg_bigm extension
-comment = 'text index searching based on bigrams'
+comment = 'text similarity measurement and index searching based on bigrams'
 default_version = '1.1'
 module_pathname = '$libdir/pg_bigm'
 relocatable = true