OSDN Git Service

Add the news about the release of RPM files.
[pgbigm/pg_bigm.git] / bigm_gin.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2007-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2015, NTT DATA Corporation
5  *
6  * Changelog:
7  *   2013/01/09
8  *   Support full text search using bigrams.
9  *   Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include "bigm.h"
16
17 #include "access/gin.h"
18 #include "access/gin_private.h"
19 #include "access/itup.h"
20 #include "access/skey.h"
21 #include "access/tuptoaster.h"
22 #include "catalog/pg_type.h"
23 #include "funcapi.h"
24 #include "mb/pg_wchar.h"
25 #include "storage/bufmgr.h"
26 #include "storage/bufpage.h"
27 #include "tsearch/ts_locale.h"
28 #include "utils/array.h"
29 #include "utils/builtins.h"
30
31
32 PG_FUNCTION_INFO_V1(gin_extract_value_bigm);
33 Datum           gin_extract_value_bigm(PG_FUNCTION_ARGS);
34
35 PG_FUNCTION_INFO_V1(gin_extract_query_bigm);
36 Datum           gin_extract_query_bigm(PG_FUNCTION_ARGS);
37
38 PG_FUNCTION_INFO_V1(gin_bigm_consistent);
39 Datum           gin_bigm_consistent(PG_FUNCTION_ARGS);
40
41 PG_FUNCTION_INFO_V1(gin_bigm_compare_partial);
42 Datum           gin_bigm_compare_partial(PG_FUNCTION_ARGS);
43
44 PG_FUNCTION_INFO_V1(pg_gin_pending_stats);
45 Datum           pg_gin_pending_stats(PG_FUNCTION_ARGS);
46
47 Datum
48 gin_extract_value_bigm(PG_FUNCTION_ARGS)
49 {
50         text       *val = (text *) PG_GETARG_TEXT_P(0);
51         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
52         Datum      *entries = NULL;
53         BIGM       *bgm;
54         int32           bgmlen;
55
56         *nentries = 0;
57
58         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
59         bgmlen = ARRNELEM(bgm);
60
61         if (bgmlen > 0)
62         {
63                 bigm       *ptr;
64                 int32           i;
65
66                 *nentries = bgmlen;
67                 entries = (Datum *) palloc(sizeof(Datum) * bgmlen);
68
69                 ptr = GETARR(bgm);
70                 for (i = 0; i < bgmlen; i++)
71                 {
72                         text            *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
73                         entries[i] = PointerGetDatum(item);
74                         ptr++;
75                 }
76         }
77
78         PG_RETURN_POINTER(entries);
79 }
80
81 Datum
82 gin_extract_query_bigm(PG_FUNCTION_ARGS)
83 {
84         text       *val = (text *) PG_GETARG_TEXT_P(0);
85         int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
86         StrategyNumber strategy = PG_GETARG_UINT16(2);
87
88         bool   **pmatch = (bool **) PG_GETARG_POINTER(3);
89         Pointer   **extra_data = (Pointer **) PG_GETARG_POINTER(4);
90         /* bool   **nullFlags = (bool **) PG_GETARG_POINTER(5); */
91         int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
92         Datum      *entries = NULL;
93         BIGM       *bgm;
94         int32           bgmlen = 0;
95         bigm       *ptr;
96         int32           i;
97         bool            removeDups;
98
99         switch (strategy)
100         {
101                 case LikeStrategyNumber:
102                 {
103                         char    *str = VARDATA(val);
104                         int             slen = VARSIZE(val) - VARHDRSZ;
105                         bool    *recheck;
106
107                         /*
108                          * For wildcard search we extract all the bigrams that every
109                          * potentially-matching string must include.
110                          */
111                         bgm = generate_wildcard_bigm(str, slen, &removeDups);
112                         bgmlen = ARRNELEM(bgm);
113
114                         /*
115                          * Check whether the heap tuple fetched by index search needs to be
116                          * rechecked against the query. If the search word consists of one
117                          * or two characters and doesn't contain any space character, we can
118                          * guarantee that the index test would be exact. That is, the heap
119                          * tuple does match the query, so it doesn't need to be rechecked.
120                          */
121                         *extra_data = (Pointer *) palloc(sizeof(bool));
122                         recheck = (bool *) *extra_data;
123                         if (bgmlen == 1 && !removeDups)
124                         {
125                                 const char      *sp;
126
127                                 *recheck = false;
128                                 for (sp = str; (sp - str) < slen;)
129                                 {
130                                         if (t_isspace(sp))
131                                         {
132                                                 *recheck = true;
133                                                 break;
134                                         }
135
136                                         sp += IS_HIGHBIT_SET(*sp) ? pg_mblen(sp) : 1;
137                                 }
138                         }
139                         else
140                                 *recheck = true;
141                         break;
142                 }
143                 case SimilarityStrategyNumber:
144                 {
145                         bgm = generate_bigm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
146                         bgmlen = ARRNELEM(bgm);
147                         break;
148                 }
149                 default:
150                         elog(ERROR, "unrecognized strategy number: %d", strategy);
151                         bgm = NULL;                     /* keep compiler quiet */
152                         break;
153         }
154
155         *nentries = (bigm_gin_key_limit == 0) ?
156                 bgmlen : Min(bigm_gin_key_limit, bgmlen);
157         *pmatch = NULL;
158
159         if (*nentries > 0)
160         {
161                 entries = (Datum *) palloc(sizeof(Datum) * *nentries);
162                 ptr = GETARR(bgm);
163                 for (i = 0; i < *nentries; i++)
164                 {
165                         text            *item;
166
167                         if (ptr->pmatch)
168                         {
169                                 if (*pmatch == NULL)
170                                         *pmatch = (bool *) palloc0(sizeof(bool) * *nentries);
171                                 (*pmatch)[i] = true;
172                         }
173                         item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
174                         entries[i] = PointerGetDatum(item);
175                         ptr++;
176                 }
177         }
178
179         /*
180          * If no bigram was extracted then we have to scan all the index.
181          */
182         if (*nentries == 0)
183                 *searchMode = GIN_SEARCH_MODE_ALL;
184
185         PG_RETURN_POINTER(entries);
186 }
187
188 Datum
189 gin_bigm_consistent(PG_FUNCTION_ARGS)
190 {
191         bool       *check = (bool *) PG_GETARG_POINTER(0);
192         StrategyNumber strategy = PG_GETARG_UINT16(1);
193
194         /* text    *query = PG_GETARG_TEXT_P(2); */
195         int32           nkeys = PG_GETARG_INT32(3);
196
197         Pointer   *extra_data = (Pointer *) PG_GETARG_POINTER(4);
198         bool       *recheck = (bool *) PG_GETARG_POINTER(5);
199         bool            res;
200         int32           i;
201         int32           ntrue;
202
203         switch (strategy)
204         {
205                 case LikeStrategyNumber:
206                         /*
207                          * Don't recheck the heap tuple against the query if either
208                          * pg_bigm.enable_recheck is disabled or the search word is
209                          * the special one so that the index can return the exact
210                          * result.
211                          */
212                         Assert(extra_data != NULL);
213                         *recheck = bigm_enable_recheck &&
214                                 (*((bool *) extra_data) || (nkeys != 1));
215
216                         /* Check if all extracted bigrams are presented. */
217                         res = true;
218                         for (i = 0; i < nkeys; i++)
219                         {
220                                 if (!check[i])
221                                 {
222                                         res = false;
223                                         break;
224                                 }
225                         }
226                         break;
227                 case SimilarityStrategyNumber:
228                         /* Count the matches */
229                         *recheck = bigm_enable_recheck;
230                         ntrue = 0;
231                         for (i = 0; i < nkeys; i++)
232                         {
233                                 if (check[i])
234                                         ntrue++;
235                         }
236 #ifdef DIVUNION
237                         res = (nkeys == ntrue) ? true :
238                                 ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >=
239                                   (float4) bigm_similarity_limit) ? true : false);
240 #else
241                         res = (nkeys == 0) ? false :
242                                 ((((((float4) ntrue) / ((float4) nkeys))) >=
243                                   (float4) bigm_similarity_limit) ? true : false);
244 #endif
245                         break;
246                 default:
247                         elog(ERROR, "unrecognized strategy number: %d", strategy);
248                         res = false;            /* keep compiler quiet */
249                         break;
250         }
251
252         PG_RETURN_BOOL(res);
253 }
254
255 Datum
256 gin_bigm_compare_partial(PG_FUNCTION_ARGS)
257 {
258         text    *arg1 = PG_GETARG_TEXT_PP(0);
259         text    *arg2 = PG_GETARG_TEXT_PP(1);
260         char    *a1p;
261         char    *a2p;
262         int             mblen1;
263         int             mblen2;
264         int             res;
265
266         a1p = VARDATA_ANY(arg1);
267         a2p = VARDATA_ANY(arg2);
268
269         mblen1 = pg_mblen(a1p);
270         mblen2 = pg_mblen(a2p);
271
272         if (mblen1 != mblen2)
273                 PG_RETURN_INT32(1);
274
275         res = memcmp(a1p, a2p, mblen1) ? 1 : 0;
276         PG_RETURN_INT32(res);
277 }
278
279 /*
280  * Report both number of pages and number of heap tuples that
281  * are in the pending list.
282  */
283 Datum
284 pg_gin_pending_stats(PG_FUNCTION_ARGS)
285 {
286         Oid                     indexOid = PG_GETARG_OID(0);
287         Relation        indexRel;
288         Buffer          metabuffer;
289         Page            metapage;
290         GinMetaPageData *metadata;
291         Datum           values[2];
292         bool            isnull[2];
293         HeapTuple       tuple;
294         TupleDesc       tupdesc;
295
296         /*
297          * Obtain statistic information from the meta page
298          */
299         indexRel = index_open(indexOid, AccessShareLock);
300         metabuffer = ReadBuffer(indexRel, GIN_METAPAGE_BLKNO);
301         LockBuffer(metabuffer, GIN_SHARE);
302         metapage = BufferGetPage(metabuffer);
303         metadata = GinPageGetMeta(metapage);
304         index_close(indexRel, AccessShareLock);
305
306         /*
307          * Construct a tuple descriptor for the result row. This must
308          * match this function's pg_bigm--x.x.sql entry.
309          */
310         tupdesc = CreateTemplateTupleDesc(2, false);
311         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
312                                            "pages", INT4OID, -1, 0);
313         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
314                                            "tuples", INT8OID, -1, 0);
315         tupdesc = BlessTupleDesc(tupdesc);
316
317         /* pages */
318         values[0] = Int32GetDatum(metadata->nPendingPages);
319         isnull[0] = false;
320
321         /* tuples */
322         values[1] = Int64GetDatum(metadata->nPendingHeapTuples);
323         isnull[1] = false;
324
325         UnlockReleaseBuffer(metabuffer);
326
327         tuple = heap_form_tuple(tupdesc, values, isnull);
328         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
329 }