OSDN Git Service

Update the last release date of ludia_funcs.
[ludiafuncs/ludia_funcs.git] / ludia_funcs.c
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2006-2013, NTT DATA Corporation
4  * All rights reserved.
5  *
6  * Changelog:
7  *   2013/01/09
8  *   Update Ludia functions so that they are available with PostgreSQL9.1.
9  *   Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <limits.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19
20 #include "catalog/pg_type.h"
21 #include "fmgr.h"
22 #include "funcapi.h"
23 #include "ludia_funcs.h"
24 #include "mb/pg_wchar.h"
25 #include "senna.h"
26 #include "storage/fd.h"
27 #include "utils/builtins.h"
28 #include "utils/guc.h"
29 #include "miscadmin.h"
30
31 PG_MODULE_MAGIC;
32
/*
 * Last update date of ludia_funcs. Used as the boot value of the
 * ludia_funcs.last_update GUC variable.
 */
#define PGS2_LAST_UPDATE	"2013.04.05"

/* GUC variables */
#ifdef PGS2_DEBUG
static bool	pgs2_enable_debug = false;		/* ludia_funcs.enable_debug */
#endif
static char	*pgs2_last_update = NULL;		/* ludia_funcs.last_update */
static int	norm_cache_limit = -1;			/* ludia_funcs.norm_cache_limit */
static bool	escape_snippet_keyword = false;	/* ludia_funcs.escape_snippet_keyword */

#define SEN_NORMALIZE_FLAGS 0
/* Maximum number of expressions passed to sen_query_open() */
#define SEN_MAX_N_EXPRS		32

/* upper limit for GUC variables measured in kilobytes of memory */
/* note that various places assume the byte size fits in a "long" variable */
#if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4
#define MAX_KILOBYTES	INT_MAX
#else
#define MAX_KILOBYTES	(INT_MAX / 1024)
#endif

/*
 * Single-byte character tests used when escaping snippet keywords. Each
 * macro takes a pointer and examines only the byte it points to.
 * ISSENNAOPSCHAR() evaluates its argument more than once, so do not pass
 * an expression with side effects.
 */
#define ISBACKSLASHCHAR(x) (*(x) == '\\')
#define ISDOUBLEQUOTECHAR(x) (*(x) == '"')
/* '+', '-' and ' ' are treated as Senna search operator characters */
#define ISSENNAOPSCHAR(x) (*(x) == '+' || *(x) == '-' || *(x) == ' ')
58
/* SQL-callable functions, declared with the version-1 calling convention */
PG_FUNCTION_INFO_V1(pgs2snippet1);
Datum	pgs2snippet1(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(pgs2norm);
Datum	pgs2norm(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(pgs2textporter1);
Datum	pgs2textporter1(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(pgs2seninfo);
Datum	pgs2seninfo(PG_FUNCTION_ARGS);

/* File-local helpers */
static sen_encoding	GetSennaEncoding(void);
static sen_query	*GetSennaQuery(char *str, size_t len);
static bool			EscapeSnippetKeyword(char **s, size_t *slen);
71
#ifdef TEXTPORTER
/* Directory in which pgs2textporter1() creates its temporary files */
#define TEXTPORTER_TMPDIR			"/tmp"
/* Fixed arguments passed to ExecTextPorter() by pgs2textporter1() */
#define TEXTPORTER_GROUPNAME		"UTF-8"
#define TEXTPORTER_DEFLANGNAME		"Japanese"
#define TEXTPORTER_BBIGENDIAN		1
#define TEXTPORTER_OPTION			0x00000020	/* DMC_GETTEXT_OPT_LF */
/* Decimal spelling of TEXTPORTER_OPTION, used as the GUC boot value */
#define TEXTPORTER_OPTION_STRING	"32"
#define TEXTPORTER_OPTION1			0x00010000	/* DMC_GETTEXT_OPT1_TXCONV */
#define TEXTPORTER_SIZE				0
#define TEXTPORTER_CSV_C			0

/* GUC variables for pgs2textporter1 */
static int	textporter_error = ERROR;	/* ludia_funcs.textporter_error */
/* Parsed value of ludia_funcs.textporter_option, set by the assign hook */
static unsigned int	textporter_option = TEXTPORTER_OPTION;

/*
 * This variable is a dummy that doesn't do anything, except in some
 * cases provides the value for SHOW to display.  The real state is
 * elsewhere and is kept in sync by assign_hooks.
 */
static char	*textporter_option_string;

/* Message levels accepted by ludia_funcs.textporter_error */
static const struct config_enum_entry textporter_error_options[] = {
	{"debug1", DEBUG1, false},
	{"log", LOG, false},
	{"info", INFO, false},
	{"notice", NOTICE, false},
	{"warning", WARNING, false},
	{"error", ERROR, false},
	{NULL, 0, false}
};

static void CleanupTextPorterTmpFiles(void);

/* Check/assign hooks for ludia_funcs.textporter_option */
static bool check_textporter_option(char **newval, void **extra, GucSource source);
static void assign_textporter_option(const char *newval, void *extra);
#endif	/* TEXTPORTER */
109
/* Prototypes for the module initialization and finalization functions */
void	_PG_init(void);
void	_PG_fini(void);
112
/*
 * Module initialization.
 *
 * Defines the ludia_funcs.* custom GUC variables, removes leftover
 * TextPorter temporary files (only when built with TEXTPORTER) and
 * initializes Senna. Raises ERROR if sen_init() does not return
 * sen_success.
 */
void
_PG_init(void)
{
	sen_rc		rc;

#ifdef PGS2_DEBUG
	/* Define custom GUC variable for debugging */
	DefineCustomBoolVariable("ludia_funcs.enable_debug",
							 "Emit ludia_funcs debugging output.",
							 NULL,
							 &pgs2_enable_debug,
							 false,
							 PGC_USERSET,
							 0,
							 NULL,
							 NULL,
							 NULL);
#endif

	/*
	 * Read-only variable that reports PGS2_LAST_UPDATE.
	 * Can't be set in postgresql.conf
	 */
	DefineCustomStringVariable("ludia_funcs.last_update",
							   "Shows the last update date of ludia_funcs.",
							   NULL,
							   &pgs2_last_update,
							   PGS2_LAST_UPDATE,
							   PGC_INTERNAL,
							   GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
							   NULL,
							   NULL,
							   NULL);

#ifdef TEXTPORTER
	/* Define custom GUC variables */
	DefineCustomEnumVariable("ludia_funcs.textporter_error",
							 "Sets the message levels that are emitted "
							 "when textporter fails.",
							 NULL,
							 &textporter_error,
							 ERROR,
							 textporter_error_options,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	/*
	 * The string value is validated and converted to an unsigned int by
	 * check_textporter_option() and stored by assign_textporter_option().
	 */
	DefineCustomStringVariable("ludia_funcs.textporter_option",
							   "Sets the option used to get text data "
							   "from TextPorter",
							   NULL,
							   &textporter_option_string,
							   TEXTPORTER_OPTION_STRING,
							   PGC_SUSET,
							   0,
							   check_textporter_option,
							   assign_textporter_option,
							   NULL);

	/*
	 * Clean up remaining textporter temporary files.
	 *
	 * NOTE(review): this unlinks every matching file in TEXTPORTER_TMPDIR.
	 * If this function runs while another backend is inside
	 * pgs2textporter1(), that backend's in-use temporary file would
	 * presumably be removed as well -- confirm how the library is loaded.
	 */
	CleanupTextPorterTmpFiles();
#endif	/* TEXTPORTER */

	/*
	 * A value of 0 means no limit on the cache size. A value of -1 means
	 * that work_mem is used as the upper size limit of the cache.
	 */
	DefineCustomIntVariable("ludia_funcs.norm_cache_limit",
							"Sets the maximum memory to be used for caching "
							"the result of pgs2norm()",
							NULL,
							&norm_cache_limit,
							-1,
							-1,
							MAX_KILOBYTES,
							PGC_USERSET,
							GUC_UNIT_KB,
							NULL,
							NULL,
							NULL);

	/* Controls whether GetSennaQuery() escapes the keyword before use */
	DefineCustomBoolVariable("ludia_funcs.escape_snippet_keyword",
							 "Escapes snippet keyword string.",
							 NULL,
							 &escape_snippet_keyword,
							 false,
							 PGC_USERSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	EmitWarningsOnPlaceholders("ludia_funcs");

	/* Initialize Senna */
	rc = sen_init();
	if (rc != sen_success)
		ereport(ERROR,
				(errmsg("sen_init() failed: %d", rc)));
}
212
/*
 * Module finalization. Performs no work; in particular nothing here
 * undoes the sen_init() call made by _PG_init().
 */
void
_PG_fini(void)
{
}
217
218 #ifdef TEXTPORTER
/*
 * Unlink the file named by 'path'. A failure is reported as a WARNING
 * (including the errno text via %m) rather than an ERROR, so callers can
 * use this in cleanup paths. 'path' is evaluated more than once.
 */
#define REMOVE_TMPFILE(path)											\
	do {																\
		if (unlink(path) != 0)											\
			ereport(WARNING,											\
					(errcode_for_file_access(),							\
					 errmsg("could not remove temporary file \"%s\": %m", path))); \
	} while(0)
226
227 Datum
228 pgs2textporter1(PG_FUNCTION_ARGS)
229 {
230         char    *appfile = text_to_cstring(PG_GETARG_TEXT_P(0));
231         char    txtfile[] = TEXTPORTER_TMPDIR "/ludia_funcs_XXXXXX";
232         int             tmpfd;
233         int             ret;
234         FILE    *fp;
235         text    *result = NULL;
236         struct stat     statbuf;
237         bool    return_null = false;
238
239         /* Confirm that database encoding is UTF-8 */
240         GetSennaEncoding();
241
242         PG_TRY();
243         {
244                 /*
245                  * Generate a unique temporary filename where text data gotten
246                  * from application file by TextPorter is stored temporarily.
247                  */
248                 tmpfd = mkstemp(txtfile);
249                 if (tmpfd < 0)
250                         ereport(ERROR,
251                                         (errcode_for_file_access(),
252                                          errmsg("could not generate a unique temporary filename: %m")));
253                 if (close(tmpfd) != 0)
254                         ereport(ERROR,
255                                         (errcode_for_file_access(),
256                                          errmsg("could not close temporary file \"%s\": %m", txtfile)));
257
258                 /*
259                  * Run TextPorter to read text data from application file (appfile)
260                  * to temporary file (txtfile).
261                  */
262                 ret = ExecTextPorter((unsigned char *)appfile,
263                                                          (unsigned char *)txtfile,
264                                                          (unsigned char *)TEXTPORTER_GROUPNAME,
265                                                          (unsigned char *)TEXTPORTER_DEFLANGNAME,
266                                                          TEXTPORTER_BBIGENDIAN, textporter_option,
267                                                          TEXTPORTER_OPTION1, TEXTPORTER_SIZE,
268                                                          TEXTPORTER_CSV_C);
269                 if (ret != 0)
270                 {
271                         ereport(textporter_error,
272                                         (errmsg("could not get text from application file \"%s\"",
273                                                         appfile),
274                                          errdetail("DMC_GetText_V5() failed with errcode %d",
275                                                            ret)));
276
277                         /* Return NULL if textporter_error is set to other than ERROR */
278                         return_null = true;
279                 }
280                 else
281                 {
282                         /* Read text data from temporary file to memory */
283                         if (stat(txtfile, &statbuf))
284                                 ereport(ERROR,
285                                                 (errcode_for_file_access(),
286                                                  errmsg("could not stat file \"%s\": %m", txtfile)));
287                         result = (text *) palloc(statbuf.st_size + VARHDRSZ);
288
289                         fp = AllocateFile(txtfile, "r");
290                         if (fp == NULL)
291                                 ereport(ERROR,
292                                                 (errcode_for_file_access(),
293                                                  errmsg("could not open file \"%s\": %m", txtfile)));
294
295                         if (fread(VARDATA(result), 1, statbuf.st_size, fp) != statbuf.st_size ||
296                                 ferror(fp))
297                                 ereport(ERROR,
298                                                 (errcode_for_file_access(),
299                                                  errmsg("could not read file \"%s\": %m", txtfile)));
300
301                         FreeFile(fp);
302                 }
303
304                 REMOVE_TMPFILE(txtfile);
305                 pfree(appfile);
306         }
307         PG_CATCH();
308         {
309                 REMOVE_TMPFILE(txtfile);
310                 PG_RE_THROW();
311         }
312         PG_END_TRY();
313
314         if (return_null)
315                 PG_RETURN_NULL();
316
317         SET_VARSIZE(result, statbuf.st_size + VARHDRSZ);
318
319         PG_RETURN_TEXT_P(result);
320 }
321
322 /*
323  * Clean up remaining textporter temporary files
324  */
325 static void
326 CleanupTextPorterTmpFiles(void)
327 {
328         DIR                             *tpdir;
329         struct dirent   *tpde;
330         char                    path[MAXPGPATH];
331
332         tpdir = AllocateDir(TEXTPORTER_TMPDIR);
333         if (tpdir == NULL)
334                 ereport(ERROR,
335                                 (errcode_for_file_access(),
336                                  errmsg("could not open textporter temporary file directory \"%s\": %m",
337                                                 TEXTPORTER_TMPDIR)));
338
339         while ((tpde = ReadDir(tpdir, TEXTPORTER_TMPDIR)) != NULL)
340         {
341                 if (strlen(tpde->d_name) == 18 &&
342                         strncmp(tpde->d_name, "ludia_funcs_", 12) == 0)
343                 {
344                         snprintf(path, MAXPGPATH, TEXTPORTER_TMPDIR "/%s", tpde->d_name);
345                         REMOVE_TMPFILE(path);
346                 }
347         }
348
349         FreeDir(tpdir);
350 }
351
352 static bool
353 check_textporter_option(char **newval, void **extra, GucSource source)
354 {
355         unsigned long   val;
356         char                    *endptr;
357         unsigned int    *myextra;
358
359         errno = 0;
360         val = strtoul(*newval, &endptr, 0);
361
362         if (*endptr != '\0')
363                 return false;
364
365         if (errno == ERANGE || val != (unsigned long) ((unsigned int) val))
366         {
367                 GUC_check_errhint("Value exceeds unsigned integer range.");
368                 return false;
369         }
370
371         /* Set up the "extra" struct actually used by assign_textporter_option */
372         myextra = (unsigned int *) malloc(sizeof(unsigned int));
373         if (myextra == NULL)
374         {
375                 GUC_check_errcode(ERRCODE_OUT_OF_MEMORY);
376                 GUC_check_errmsg("out of memory");
377                 return false;
378         }
379         *myextra = (unsigned int) val;
380         *extra = (void *) myextra;
381
382         return true;
383 }
384
385 static void
386 assign_textporter_option(const char *newval, void *extra)
387 {
388         textporter_option = *((unsigned int *) extra);
389 }
390
391 #else   /* TEXTPORTER */
392
/*
 * Variant of pgs2textporter1() compiled when TEXTPORTER is not defined:
 * it ignores its argument and always returns NULL.
 */
Datum
pgs2textporter1(PG_FUNCTION_ARGS)
{
	PG_RETURN_NULL();
}
398
399 #endif  /* TEXTPORTER */
400
401 static sen_encoding
402 GetSennaEncoding(void)
403 {
404         static sen_encoding             encoding = sen_enc_default;
405
406         if (encoding == sen_enc_default)
407         {
408                 if (GetDatabaseEncoding() == PG_UTF8)
409                         encoding = sen_enc_utf8;
410                 else
411                         ereport(ERROR,
412                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
413                                          errmsg("does not support database encoding \"%s\"",
414                                                         GetDatabaseEncodingName())));
415         }
416         return encoding;
417 }
418
/*
 * Escape the backslash and double quote characters in the given string.
 *
 * On entry, *s points to the keyword (not necessarily NUL-terminated) and
 * *slen holds its length in bytes.
 *
 * Return false if the given string has no character which needs to be
 * escaped; *s and *slen are left untouched. Otherwise, return true. In
 * this case, *s points to the palloc'd space storing the escaped keyword
 * string and *slen is set to the size of that string. The caller needs to
 * free the palloc'd space.
 *
 * Escaping rules implemented below:
 *   - a backslash is always escaped;
 *   - a leading double quote and a trailing double quote are kept as is;
 *   - any other double quote is escaped unless a Senna operator character
 *     ('+', '-' or ' ') is directly to its right, or it directly follows
 *     operator characters that themselves follow an unescaped double quote.
 */
static bool
EscapeSnippetKeyword(char **s, size_t *slen)
{
	const char	*sp;
	char		*ep;
	char		*escaped;
	int			mblen;
	int			copylen;
	bool		in_doublequote = false;	/* previous char is a pending '"' */
	bool		in_sennaops = false;	/* inside operators after a '"' */
	bool		need_escape = false;

	/*
	 * Skip the heading double quote character because it always doesn't
	 * need to be interpreted as a character itself and be escaped.
	 * Note that we must not skip the heading character if it's not a
	 * double quote.
	 *
	 * The length check keeps us from reading a byte that is not part of
	 * the keyword when the keyword is empty.
	 */
	sp = *s;
	if (*slen > 0 && ISDOUBLEQUOTECHAR(sp))
		sp++;

	/*
	 * Check whether the snippet keyword string has a character which
	 * needs to be escaped.
	 */
	while ((size_t) (sp - *s) < *slen)
	{
		mblen = pg_mblen(sp);

		/*
		 * Backslash in the keyword always needs to be escaped.
		 */
		if (ISBACKSLASHCHAR(sp))
		{
			need_escape = true;
			break;
		}

		if (in_doublequote)
		{
			if (ISSENNAOPSCHAR(sp))
			{
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Double quote in the keyword needs to be escaped if
				 * any Senna search operators are to neither its right
				 * nor left.
				 */
				need_escape = true;
				break;
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp) && !in_sennaops)
				in_doublequote = true;
			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/*
	 * Quick exit if the keyword has no character which needs to be
	 * escaped.
	 */
	if (!need_escape)
		return false;

	/*
	 * Allocate the buffer space to store the escaped snippet keyword string.
	 * The maximum size of escaped string is double the input keyword size.
	 * The size reaches the maximum when every character in the input keyword
	 * needs to be escaped.
	 */
	ep = escaped = (char *) palloc(*slen * 2);

	/*
	 * Copy the characters which have been passed through in the above loop
	 * and don't need to be escaped, into the buffer. If in_doublequote is
	 * true, we don't copy the double quote in the previous position into the
	 * buffer because it might still need to be escaped.
	 */
	copylen = sp - *s - ((in_doublequote) ? 1 : 0);
	memcpy(ep, *s, copylen);
	ep += copylen;

	/*
	 * Construct the escaped snippet keyword string, resuming at the
	 * character where the scan above stopped and with the same state.
	 */
	while ((size_t) (sp - *s) < *slen)
	{
		mblen = pg_mblen(sp);

		if (in_doublequote)
		{
			/*
			 * dqchar indicates the previous character, that is a double
			 * quote. We assume here that a double quote is single-byte
			 * character.
			 */
			char dqchar	= *(sp - 1);

			if (ISSENNAOPSCHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just before Senna
				 * operator.
				 */
				*ep++ = dqchar;
				*ep++ = *sp;
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Escape the double quote if no Senna operator is next to it.
				 */
				*ep++ = '\\';
				*ep++ = dqchar;

				/*
				 * If the current character is itself a double quote, it
				 * becomes the new pending one and is emitted on a later
				 * iteration (or after the loop).
				 */
				if (ISDOUBLEQUOTECHAR(sp))
					in_doublequote = true;
				else
				{
					if (ISBACKSLASHCHAR(sp))
						*ep++ = '\\';
					memcpy(ep, sp, mblen);
					ep += mblen;
					in_doublequote = false;
				}
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just after Senna
				 * operator.
				 */
				if (in_sennaops)
					*ep++ = *sp;
				else
					in_doublequote = true;
			}
			else
			{
				if (ISBACKSLASHCHAR(sp))
					*ep++ = '\\';
				/*
				 * We don't check ISSENNAOPSCHAR() here. We handle Senna
				 * operator character as a character itself instead of
				 * an operator if it doesn't follow a double quote.
				 */
				memcpy(ep, sp, mblen);
				ep += mblen;
			}

			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/* Add the tailing double quote into the buffer */
	if (in_doublequote)
		*ep++ = *(sp - 1);

	*s = escaped;
	*slen = ep - *s;

#ifdef PGS2_DEBUG
	if (pgs2_enable_debug)
	{
		char	*tmp = pnstrdup(*s, *slen);

		elog(LOG, "escaped snippet keyword: %s", tmp);
		pfree(tmp);
	}
#endif

	return true;
}
619
620 static sen_query *
621 GetSennaQuery(char *str, size_t len)
622 {
623         static sen_query        *query_cache = NULL;
624         static char                     *key_cache = NULL;
625         static size_t           len_cache = 0;
626         static bool                     guc_cache = false;
627         sen_query       *query;
628         sen_encoding    encoding;
629         char            *key;
630         size_t          key_len;
631         int                     rest;
632         bool            needfree = false;
633
634         /*
635          * Return the cached Senna query if the same keyword has
636          * been used the last time.
637          */
638         if (key_cache != NULL &&
639                 len == len_cache &&
640                 strncmp(key_cache, str, len) == 0 &&
641                 escape_snippet_keyword == guc_cache)
642         {
643 #ifdef PGS2_DEBUG
644                 if (pgs2_enable_debug)
645                 {
646                         char    *tmp = pnstrdup(str, len);
647
648                         elog(LOG, "GetSennaQuery(): quick exit: %s", tmp);
649                         pfree(tmp);
650                 }
651 #endif
652                 return query_cache;
653         }
654
655         encoding = GetSennaEncoding();
656
657         key = malloc(len);
658         if (key == NULL)
659                 ereport(ERROR,
660                                 (errcode(ERRCODE_OUT_OF_MEMORY),
661                                  errmsg("out of memory")));
662
663         /*
664          * We always cache the unescaped keyword. Which enables us
665          * to check whether we can use the cached Senna query before
666          * escaping the keyword.
667          */
668         memcpy(key, str, len);
669         key_len = len;
670
671         /*
672          * If the keyword has been escaped, 'str' points to the
673          * newly-palloc'd space storing the escaped keyword. This
674          * space needs to be freed later.
675          */
676         if (escape_snippet_keyword)
677                 needfree = EscapeSnippetKeyword(&str, &len);
678
679         query = sen_query_open(str, len, sen_sel_or, SEN_MAX_N_EXPRS,
680                                                    encoding);
681         if (query == NULL)
682         {
683                 free(key);
684                 ereport(ERROR,
685                                 (errmsg("sen_query_open() failed")));
686         }
687
688         if ((rest = sen_query_rest(query, NULL)) != 0)
689                 ereport(WARNING,
690                                 (errmsg("too many expressions (%d)", rest)));
691
692         if (query_cache != NULL)
693         {
694                 sen_query_close(query_cache);
695                 free(key_cache);
696         }
697
698         key_cache = key;
699         len_cache = key_len;
700         query_cache = query;
701         guc_cache = escape_snippet_keyword;
702
703         if (needfree)
704                 pfree(str);
705
706         return query;
707 }
708
709 Datum
710 pgs2snippet1(PG_FUNCTION_ARGS)
711 {
712         int                     flags = PG_GETARG_INT32(0);
713         uint32          width = PG_GETARG_UINT32(1);
714         uint32          max_results = PG_GETARG_UINT32(2);
715         text       *opentags = PG_GETARG_TEXT_P(3);
716         text       *closetags = PG_GETARG_TEXT_P(4);
717         int                     mapping = PG_GETARG_INT32(5);
718         text       *keywords = PG_GETARG_TEXT_P(6);
719         text       *document = PG_GETARG_TEXT_P(7);
720         sen_query  *query;
721         sen_snip   *snip = NULL;
722         const char *opentags_str = VARDATA_ANY(opentags);
723         const char *closetags_str = VARDATA_ANY(closetags);
724         char       *keywords_str = VARDATA_ANY(keywords);
725         char       *document_str = VARDATA_ANY(document);
726         uint32          opentags_len = VARSIZE_ANY_EXHDR(opentags);
727         uint32          closetags_len = VARSIZE_ANY_EXHDR(closetags);
728         uint32          keywords_len = VARSIZE_ANY_EXHDR(keywords);
729         uint32          document_len = VARSIZE_ANY_EXHDR(document);
730         uint32          nresults = 0;
731         uint32          max_tagged_len = 0;
732         sen_rc          rc;
733         text       *result;
734         uint32          result_len = 0;
735         bool            return_null = false;
736
737         query = GetSennaQuery(keywords_str, keywords_len);
738
739         snip = sen_query_snip(query, flags, width, max_results, 1,
740                                                   &opentags_str, &opentags_len,
741                                                   &closetags_str, &closetags_len,
742                                                   mapping == 0 ? NULL : (sen_snip_mapping *)-1);
743         if (snip == NULL)
744                 ereport(ERROR,
745                                 (errmsg("sen_query_snip() failed")));
746
747         PG_TRY();
748         {
749                 rc = sen_snip_exec(snip, document_str, document_len,
750                                                    &nresults, &max_tagged_len);
751                 if (rc != sen_success)
752                         ereport(ERROR,
753                                         (errmsg("sen_snip_exec() failed: %d", rc)));
754
755                 result = (text *) palloc(max_tagged_len + VARHDRSZ);
756
757                 rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
758                 if (rc == sen_invalid_argument)
759                         return_null = true;
760                 else if (rc != sen_success)
761                         ereport(ERROR,
762                                         (errmsg("sen_snip_get_result() failed: %d", rc)));
763         }
764         PG_CATCH();
765         {
766                 sen_snip_close(snip);
767                 PG_RE_THROW();
768         }
769         PG_END_TRY();
770
771         sen_snip_close(snip);
772
773         if (return_null)
774                 PG_RETURN_NULL();
775
776         SET_VARSIZE(result, max_tagged_len + VARHDRSZ);
777
778         PG_RETURN_TEXT_P(result);
779 }
780
781 /*
782  * Make sure there is enough space for 'needed' more bytes.
783  *
784  * Sets **buf to the allocated space which can store the needed bytes if OK,
785  * NULL if failed to enlarge the space because 'needed' is larger than 'maxlen'.
786  */
787 static inline void
788 pgs2malloc(void **buf, long *buflen, long needed, long maxlen)
789 {
790 #ifdef PGS2_DEBUG
791         if (pgs2_enable_debug)
792                 elog(LOG, "pgs2malloc(): buflen %ld, needed %ld, maxlen %ld",
793                          *buflen, needed, maxlen);
794 #endif
795
796         if (*buf != NULL && *buflen >= needed && (*buflen <= maxlen || maxlen == 0))
797                 return;         /* got enough space already */
798
799         /*
800          * Release the already-allocated space since it's too small to
801          * store the needed bytes or larger than the upper limit.
802          */
803         if (*buf != NULL)
804         {
805                 free(*buf);
806                 *buf = NULL;
807                 *buflen = 0;
808         }
809
810         /*
811          * Don't allocate any space if the needed space is larger than
812          * the upper limit.
813          */
814         if (needed > maxlen && maxlen != 0)
815                 return;
816
817         /*
818          * Allocate the space for the needed bytes.
819          *
820          * We don't want to allocate just a little more space with each enlarge;
821          * for efficiency, double the buffer size each time it overflows.
822          * Actually, we might need to more than double it if 'needed' is big...
823          *
824          * We check whether '*buflen' overflows each cycle to avoid infinite loop.
825          */
826         *buflen = 1024L;
827         while (*buflen < needed && *buflen != 0)
828                 *buflen <<= 1;
829
830         /*
831          * Clamp to maxlen in case we went past it.  Note we are assuming
832          * here that maxlen <= LONG_MAX/2, else the above loop could
833          * overflow.  We will still have *buflen >= needed.
834          */
835         if (*buflen > maxlen && maxlen != 0)
836                 *buflen = maxlen;
837
838         /* Guard against out-of-range '*buflen' value */
839         if (*buflen == 0)
840                 ereport(ERROR,
841                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
842                                  errmsg("out of memory"),
843                                  errdetail("Cannot enlarge buffer by %ld more bytes.",
844                                                    needed)));
845
846         *buf = (void *) malloc(*buflen);
847         if (*buf == NULL)
848                 ereport(ERROR,
849                                 (errcode(ERRCODE_OUT_OF_MEMORY),
850                                  errmsg("out of memory")));
851 }
852
853 Datum
854 pgs2norm(PG_FUNCTION_ARGS)
855 {
856         text            *str = PG_GETARG_TEXT_PP(0);
857         char            *s = VARDATA_ANY(str);
858         long            slen = VARSIZE_ANY_EXHDR(str);
859         text            *result = NULL;
860         long            buflen;
861         long            reslen;
862         long            maxlen;
863         long            needed;
864
865         /*
866          * norm_cache is the cache memory storing both input and normalized strings
867          * as the result of pgs2norm(). norm_cache_size is the size of norm_cache
868          * and its upper limit is specified by norm_cache_limit parameter. norm_result
869          * is the pointer to the normalized string with the verlena header (i.e.,
870          * text type) stored in the latter half of the cache. norm_reslen is the size
871          * of norm_result. norm_slen is the size of the input string which is stored
872          * in the first half of the cache.
873          */
874         static char             *norm_cache = NULL;
875         static long             norm_cache_size = 0;
876         static long             norm_slen = 0;
877         static char             *norm_result = NULL;
878         static long             norm_reslen = 0;
879
880         /*
881          * Return the cached normalization result if the same string of
882          * the given one has been normalized the last time.
883          */
884         if (norm_cache != NULL &&
885                 norm_slen == slen &&
886                 strncmp(norm_cache, s, slen) == 0)
887         {
888 #ifdef PGS2_DEBUG
889                 if (pgs2_enable_debug)
890                 {
891                         char    *tmp = text_to_cstring(str);
892
893                         elog(LOG, "pgs2norm(): quick exit: %s", tmp);
894                         pfree(tmp);
895                 }
896 #endif
897
898                 PG_RETURN_TEXT_P(pnstrdup(norm_result, norm_reslen));
899         }
900
901         /* Confirm that database encoding is UTF-8 */
902         GetSennaEncoding();
903
904         /*
905          * Allocate the result buffer to store the normalized string. Since the size of
906          * normalized string can be larger than that of input one, the result buffer needs
907          * extra space. Problem is that, before calling sen_str_normalize, we need to
908          * allocate the result buffer but cannot know how large extra space is required.
909          * So we use RESULT_EXTRA_SIZE as the estimated size of extra space here.
910          */
911 #define RESULT_EXTRA_SIZE       64
912         buflen = slen + RESULT_EXTRA_SIZE;
913
914 retry:
915         result = (text *) palloc(buflen + VARHDRSZ);
916
917 #if defined(FAST_SENNA)
918         reslen = fast_sen_str_normalize(s, slen, VARDATA(result), buflen);
919 #else
920         reslen = sen_str_normalize(s, slen, sen_enc_utf8,
921                                                            SEN_NORMALIZE_FLAGS,
922                                                            VARDATA(result), buflen);
923 #endif
924
925         if (reslen < 0)
926                 ereport(ERROR,
927                                 (errmsg("could not normalize the string")));
928
929         /*
930          * If the result buffer size is too short to store the normalized string,
931          * we enlarge the buffer and retry the string normalization.
932          */
933         if (buflen <= reslen)
934         {
935                 pfree(result);
936                 buflen = reslen + 1;
937                 goto retry;
938         }
939
940         SET_VARSIZE(result, reslen + VARHDRSZ);
941
942         /*
943          * Cache both input and normalized strings to accelerate the subsequent
944          * calls of pgs2norm() with the same input string. But we don't do that
945          * if the maximum allowed size of the cache is too small to store them.
946          */
947         needed = slen + reslen + VARHDRSZ;
948         maxlen = ((norm_cache_limit >= 0) ? norm_cache_limit : work_mem) * 1024L;
949
950         pgs2malloc((void **) &norm_cache, &norm_cache_size, needed, maxlen);
951         if (norm_cache != NULL)
952         {
953                 /* Store the input string into the first half of the cache */
954                 norm_slen = slen;
955                 memcpy(norm_cache, s, slen);
956
957                 /*
958                  * Store the normalized string with the varlena header (i.e., text type)
959                  * into the latter half of the cache.
960                  */
961                 norm_result = norm_cache + slen;
962                 norm_reslen = reslen + VARHDRSZ;
963                 memcpy(norm_result, result, norm_reslen);
964         }
965
966 #ifdef PGS2_DEBUG
967         if (pgs2_enable_debug)
968         {
969                 char    *tmp = text_to_cstring(str);
970
971                 elog(LOG, "pgs2norm(): complete (%s result cache): %s",
972                          (norm_cache == NULL) ? "unset" : "set", tmp);
973                 pfree(tmp);
974         }
975 #endif
976
977         PG_RETURN_TEXT_P(result);
978 }
979
980 /*
981  * Report the version and configure options of Senna which
982  * ludia_funcs depends on.
983  */
984 Datum
985 pgs2seninfo(PG_FUNCTION_ARGS)
986 {
987         char    *version[MAXPGPATH];
988         char    *coptions[MAXPGPATH];
989         Datum   values[2];
990         bool    isnull[2];
991         HeapTuple tuple;
992         TupleDesc tupdesc;
993
994         /*
995          * Get the version and configure options of Senna. Ignore the
996          * return value of sen_info() because it always returns a success.
997          */
998         sen_info((char **)&version, (char **)&coptions, NULL, NULL, NULL, NULL);
999
1000         /*
1001          * Construct a tuple descriptor for the result row. This must
1002          * match this function's ludia_funcs--x.x.sql entry.
1003          */
1004         tupdesc = CreateTemplateTupleDesc(2, false);
1005         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
1006                                            "version", TEXTOID, -1, 0);
1007         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
1008                                            "configure_options", TEXTOID, -1, 0);
1009         tupdesc = BlessTupleDesc(tupdesc);
1010
1011         /* version */
1012         values[0] = CStringGetTextDatum(*version);
1013         isnull[0] = false;
1014
1015         /* configure option */
1016         values[1] = CStringGetTextDatum(*coptions);
1017         isnull[1] = false;
1018
1019         tuple = heap_form_tuple(tupdesc, values, isnull);
1020         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
1021 }