OSDN Git Service

Update PGS2_LAST_UPDATE to "2019.10.04".
[ludiafuncs/ludia_funcs.git] / ludia_funcs.c
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2016-2019, ludia_funcs Development Group
4  * Copyright (c) 2006-2015, NTT DATA Corporation
5  * All rights reserved.
6  *
7  * Changelog:
8  *   2013/01/09
9  *   Update Ludia functions so that they are available with PostgreSQL9.1.
10  *   Author: NTT DATA Corporation
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include <limits.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <unistd.h>
20
21 #include "catalog/pg_type.h"
22 #include "fmgr.h"
23 #include "funcapi.h"
24 #include "ludia_funcs.h"
25 #include "mb/pg_wchar.h"
26 #include "senna.h"
27 #include "storage/fd.h"
28 #include "utils/builtins.h"
29 #include "utils/guc.h"
30 #include "miscadmin.h"
31
32 #if PG_VERSION_NUM >= 90300
33 #include "access/htup_details.h"
34 #endif
35
PG_MODULE_MAGIC;

/* Date string used as the value of the ludia_funcs.last_update GUC */
#define PGS2_LAST_UPDATE	"2019.10.04"

/* Backing storage for the custom GUC variables defined in _PG_init() */
#ifdef PGS2_DEBUG
static bool pgs2_enable_debug = false;
#endif
static char *pgs2_last_update = NULL;
static int	norm_cache_limit = -1;
static bool escape_snippet_keyword = false;

#define SEN_NORMALIZE_FLAGS	0
/* Expression limit handed to sen_query_open() */
#define SEN_MAX_N_EXPRS		32

/*
 * Upper bound for GUC variables that are measured in kilobytes of memory.
 * Various places assume the size in bytes fits in a "long" variable, so
 * the bound is lowered unless both size_t and long are wider than 4 bytes.
 */
#if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4
#define MAX_KILOBYTES	INT_MAX
#else
#define MAX_KILOBYTES	(INT_MAX / 1024)
#endif
59
/*
 * Single-byte character tests. Each macro takes a pointer to the byte to
 * examine; note that ISSENNAOPSCHAR may evaluate its argument several times.
 */
#define ISBACKSLASHCHAR(x)		((x)[0] == '\\')
#define ISDOUBLEQUOTECHAR(x)	((x)[0] == '"')
#define ISSENNAOPSCHAR(x) \
	((x)[0] == '+' || (x)[0] == '-' || (x)[0] == ' ')
63
64 PG_FUNCTION_INFO_V1(pgs2snippet1);
65 PG_FUNCTION_INFO_V1(pgs2norm);
66 PG_FUNCTION_INFO_V1(pgs2textporter1);
67 PG_FUNCTION_INFO_V1(pgs2seninfo);
68
69 /*
70  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
71  * macro since 9.4, and hence the declaration of the function prototypes
72  * here is necessary only for 9.3 or before.
73  */
74 #if PG_VERSION_NUM < 90400
75 Datum   pgs2snippet1(PG_FUNCTION_ARGS);
76 Datum   pgs2norm(PG_FUNCTION_ARGS);
77 Datum   pgs2textporter1(PG_FUNCTION_ARGS);
78 Datum   pgs2seninfo(PG_FUNCTION_ARGS);
79 #endif
80
81 static sen_encoding     GetSennaEncoding(void);
82 static sen_query        *GetSennaQuery(char *str, size_t len);
83 static bool                     EscapeSnippetKeyword(char **s, size_t *slen);
84
#ifdef TEXTPORTER
/* Directory and creation mask used for the temporary text files */
#define TEXTPORTER_TMPDIR			"/tmp"
#define TEXTPORTER_MKSTEMP_UMASK	0177

/* Fixed arguments passed to ExecTextPorter() */
#define TEXTPORTER_GROUPNAME		"UTF-8"
#define TEXTPORTER_DEFLANGNAME		"Japanese"
#define TEXTPORTER_BBIGENDIAN		1
#define TEXTPORTER_OPTION			0x00000020	/* DMC_GETTEXT_OPT_LF */
#define TEXTPORTER_OPTION_STRING	"32"		/* TEXTPORTER_OPTION in decimal text form */
#define TEXTPORTER_OPTION1			0x00010000	/* DMC_GETTEXT_OPT1_TXCONV */
#define TEXTPORTER_SIZE				0
#define TEXTPORTER_CSV_C			0


/* GUC variables for pgs2textporter1 */
static int	textporter_error = ERROR;
static unsigned int textporter_option = TEXTPORTER_OPTION;

/*
 * Dummy variable: it does nothing by itself except, in some cases,
 * supply the value that SHOW displays. The real state lives in
 * textporter_option and is kept in sync by the assign hook.
 */
static char *textporter_option_string;

/* Message levels accepted by ludia_funcs.textporter_error */
static const struct config_enum_entry textporter_error_options[] = {
	{"debug1", DEBUG1, false},
	{"log", LOG, false},
	{"info", INFO, false},
	{"notice", NOTICE, false},
	{"warning", WARNING, false},
	{"error", ERROR, false},
	{NULL, 0, false}
};

static void CleanupTextPorterTmpFiles(void);

/* GUC hooks for ludia_funcs.textporter_option */
static bool check_textporter_option(char **newval, void **extra, GucSource source);
static void assign_textporter_option(const char *newval, void *extra);
#endif	/* TEXTPORTER */
124
125 void    _PG_init(void);
126 void    _PG_fini(void);
127
128 void
129 _PG_init(void)
130 {
131         sen_rc          rc;
132
133 #ifdef PGS2_DEBUG
134         /* Define custom GUC variable for debugging */
135         DefineCustomBoolVariable("ludia_funcs.enable_debug",
136                                                          "Emit ludia_funcs debugging output.",
137                                                          NULL,
138                                                          &pgs2_enable_debug,
139                                                          false,
140                                                          PGC_USERSET,
141                                                          0,
142                                                          NULL,
143                                                          NULL,
144                                                          NULL);
145 #endif
146
147         /* Can't be set in postgresql.conf */
148         DefineCustomStringVariable("ludia_funcs.last_update",
149                                                            "Shows the last update date of ludia_funcs.",
150                                                            NULL,
151                                                            &pgs2_last_update,
152                                                            PGS2_LAST_UPDATE,
153                                                            PGC_INTERNAL,
154                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
155                                                            NULL,
156                                                            NULL,
157                                                            NULL);
158
159 #ifdef TEXTPORTER
160         /* Define custom GUC variables */
161         DefineCustomEnumVariable("ludia_funcs.textporter_error",
162                                                          "Sets the message levels that are emitted "
163                                                          "when textporter fails.",
164                                                          NULL,
165                                                          &textporter_error,
166                                                          ERROR,
167                                                          textporter_error_options,
168                                                          PGC_SUSET,
169                                                          0,
170                                                          NULL,
171                                                          NULL,
172                                                          NULL);
173
174         DefineCustomStringVariable("ludia_funcs.textporter_option",
175                                                            "Sets the option used to get text data "
176                                                            "from TextPorter",
177                                                            NULL,
178                                                            &textporter_option_string,
179                                                            TEXTPORTER_OPTION_STRING,
180                                                            PGC_SUSET,
181                                                            0,
182                                                            check_textporter_option,
183                                                            assign_textporter_option,
184                                                            NULL);
185
186         /* Clean up remaining textporter temporary files */
187         CleanupTextPorterTmpFiles();
188 #endif  /* TEXTPORTER */
189
190         /*
191          * A value of 0 means no limit on the cache size. A value of -1 means
192          * that work_mem is used as the upper size limit of the cache.
193          */
194         DefineCustomIntVariable("ludia_funcs.norm_cache_limit",
195                                                         "Sets the maximum memory to be used for caching "
196                                                         "the result of pgs2norm()",
197                                                         NULL,
198                                                         &norm_cache_limit,
199                                                         -1,
200                                                         -1,
201                                                         MAX_KILOBYTES,
202                                                         PGC_USERSET,
203                                                         GUC_UNIT_KB,
204                                                         NULL,
205                                                         NULL,
206                                                         NULL);
207
208         DefineCustomBoolVariable("ludia_funcs.escape_snippet_keyword",
209                                                          "Escapes snippet keyword string.",
210                                                          NULL,
211                                                          &escape_snippet_keyword,
212                                                          false,
213                                                          PGC_USERSET,
214                                                          0,
215                                                          NULL,
216                                                          NULL,
217                                                          NULL);
218
219         EmitWarningsOnPlaceholders("ludia_funcs");
220
221         /* Initialize Senna */
222         rc = sen_init();
223         if (rc != sen_success)
224                 ereport(ERROR,
225                                 (errmsg("sen_init() failed: %d", rc)));
226 }
227
/*
 * Library unload hook. Intentionally a no-op.
 */
void
_PG_fini(void)
{
	/* nothing to do */
}
232
233 #ifdef TEXTPORTER
/*
 * Unlink the file at 'path'. A failure is reported as a WARNING carrying
 * the system error text; it never raises an error. 'path' is evaluated
 * more than once, so pass a plain variable.
 */
#define REMOVE_TMPFILE(path) \
	do { \
		if (unlink(path) != 0) \
			ereport(WARNING, \
					(errcode_for_file_access(), \
					 errmsg("could not remove temporary file \"%s\": %m", path))); \
	} while(0)
241
242 Datum
243 pgs2textporter1(PG_FUNCTION_ARGS)
244 {
245         char    *appfile = text_to_cstring(PG_GETARG_TEXT_P(0));
246         char    txtfile[] = TEXTPORTER_TMPDIR "/ludia_funcs_XXXXXX";
247         int             tmpfd;
248         int             ret;
249         FILE    *fp;
250         text    *result = NULL;
251         struct stat     statbuf;
252         bool    return_null = false;
253         mode_t  oumask;
254
255         /* Confirm that database encoding is UTF-8 */
256         GetSennaEncoding();
257
258         PG_TRY();
259         {
260                 /*
261                  * Generate a unique temporary filename where text data gotten
262                  * from application file by TextPorter is stored temporarily.
263                  * Set the permission of a temporary file to 0600 to ensure that
264                  * only the owner of PostgreSQL server can read and write the file.
265                  */
266                 oumask = umask(TEXTPORTER_MKSTEMP_UMASK);
267                 tmpfd = mkstemp(txtfile);
268                 umask(oumask);
269
270                 if (tmpfd < 0)
271                         ereport(ERROR,
272                                         (errcode_for_file_access(),
273                                          errmsg("could not generate a unique temporary filename: %m")));
274                 if (close(tmpfd) != 0)
275                         ereport(ERROR,
276                                         (errcode_for_file_access(),
277                                          errmsg("could not close temporary file \"%s\": %m", txtfile)));
278
279                 /*
280                  * Run TextPorter to read text data from application file (appfile)
281                  * to temporary file (txtfile).
282                  */
283                 ret = ExecTextPorter((unsigned char *)appfile,
284                                                          (unsigned char *)txtfile,
285                                                          (unsigned char *)TEXTPORTER_GROUPNAME,
286                                                          (unsigned char *)TEXTPORTER_DEFLANGNAME,
287                                                          TEXTPORTER_BBIGENDIAN, textporter_option,
288                                                          TEXTPORTER_OPTION1, TEXTPORTER_SIZE,
289                                                          TEXTPORTER_CSV_C);
290                 if (ret != 0)
291                 {
292                         ereport(textporter_error,
293                                         (errmsg("could not get text from application file \"%s\"",
294                                                         appfile),
295                                          errdetail("DMC_GetText_V5() failed with errcode %d",
296                                                            ret)));
297
298                         /* Return NULL if textporter_error is set to other than ERROR */
299                         return_null = true;
300                 }
301                 else
302                 {
303                         /* Read text data from temporary file to memory */
304                         if (stat(txtfile, &statbuf))
305                                 ereport(ERROR,
306                                                 (errcode_for_file_access(),
307                                                  errmsg("could not stat file \"%s\": %m", txtfile)));
308                         result = (text *) palloc(statbuf.st_size + VARHDRSZ);
309
310                         fp = AllocateFile(txtfile, "r");
311                         if (fp == NULL)
312                                 ereport(ERROR,
313                                                 (errcode_for_file_access(),
314                                                  errmsg("could not open file \"%s\": %m", txtfile)));
315
316                         if (fread(VARDATA(result), 1, statbuf.st_size, fp) != statbuf.st_size ||
317                                 ferror(fp))
318                                 ereport(ERROR,
319                                                 (errcode_for_file_access(),
320                                                  errmsg("could not read file \"%s\": %m", txtfile)));
321
322                         FreeFile(fp);
323                 }
324
325                 REMOVE_TMPFILE(txtfile);
326                 pfree(appfile);
327         }
328         PG_CATCH();
329         {
330                 REMOVE_TMPFILE(txtfile);
331                 PG_RE_THROW();
332         }
333         PG_END_TRY();
334
335         if (return_null)
336                 PG_RETURN_NULL();
337
338         SET_VARSIZE(result, statbuf.st_size + VARHDRSZ);
339
340         PG_RETURN_TEXT_P(result);
341 }
342
343 /*
344  * Clean up remaining textporter temporary files
345  */
346 static void
347 CleanupTextPorterTmpFiles(void)
348 {
349         DIR                             *tpdir;
350         struct dirent   *tpde;
351         char                    path[MAXPGPATH];
352
353         tpdir = AllocateDir(TEXTPORTER_TMPDIR);
354         if (tpdir == NULL)
355                 ereport(ERROR,
356                                 (errcode_for_file_access(),
357                                  errmsg("could not open textporter temporary file directory \"%s\": %m",
358                                                 TEXTPORTER_TMPDIR)));
359
360         while ((tpde = ReadDir(tpdir, TEXTPORTER_TMPDIR)) != NULL)
361         {
362                 if (strlen(tpde->d_name) == 18 &&
363                         strncmp(tpde->d_name, "ludia_funcs_", 12) == 0)
364                 {
365                         snprintf(path, MAXPGPATH, TEXTPORTER_TMPDIR "/%s", tpde->d_name);
366                         REMOVE_TMPFILE(path);
367                 }
368         }
369
370         FreeDir(tpdir);
371 }
372
373 static bool
374 check_textporter_option(char **newval, void **extra, GucSource source)
375 {
376         unsigned long   val;
377         char                    *endptr;
378         unsigned int    *myextra;
379
380         errno = 0;
381         val = strtoul(*newval, &endptr, 0);
382
383         if (*endptr != '\0')
384                 return false;
385
386         if (errno == ERANGE || val != (unsigned long) ((unsigned int) val))
387         {
388                 GUC_check_errhint("Value exceeds unsigned integer range.");
389                 return false;
390         }
391
392         /* Set up the "extra" struct actually used by assign_textporter_option */
393         myextra = (unsigned int *) malloc(sizeof(unsigned int));
394         if (myextra == NULL)
395         {
396                 GUC_check_errcode(ERRCODE_OUT_OF_MEMORY);
397                 GUC_check_errmsg("out of memory");
398                 return false;
399         }
400         *myextra = (unsigned int) val;
401         *extra = (void *) myextra;
402
403         return true;
404 }
405
406 static void
407 assign_textporter_option(const char *newval, void *extra)
408 {
409         textporter_option = *((unsigned int *) extra);
410 }
411
412 #else   /* TEXTPORTER */
413
414 Datum
415 pgs2textporter1(PG_FUNCTION_ARGS)
416 {
417         PG_RETURN_NULL();
418 }
419
420 #endif  /* TEXTPORTER */
421
422 static sen_encoding
423 GetSennaEncoding(void)
424 {
425         static sen_encoding             encoding = sen_enc_default;
426
427         if (encoding == sen_enc_default)
428         {
429                 if (GetDatabaseEncoding() == PG_UTF8)
430                         encoding = sen_enc_utf8;
431                 else
432                         ereport(ERROR,
433                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
434                                          errmsg("does not support database encoding \"%s\"",
435                                                         GetDatabaseEncodingName())));
436         }
437         return encoding;
438 }
439
/*
 * Escape the backslash and double quote characters in the given string.
 *
 * On entry *s points to the keyword (not necessarily NUL-terminated) and
 * *slen is its length in bytes.
 *
 * Return false if the given string has no character which needs to be
 * escaped; *s and *slen are then left untouched. Otherwise, return true.
 * In this case, *s points to the palloc'd space storing the escaped
 * keyword string and *slen is set to the size of that string. The caller
 * needs to free the palloc'd space.
 *
 * Rules: a backslash is always escaped. A double quote is escaped unless
 * it is the very first character, the very last character, or is
 * directly adjacent to a Senna operator character ('+', '-' or ' ').
 */
static bool
EscapeSnippetKeyword(char **s, size_t *slen)
{
	const char	*sp;
	const char	*end;
	char		*ep;
	char		*escaped;
	int			mblen;
	size_t		copylen;
	bool		in_doublequote = false;
	bool		in_sennaops = false;
	bool		need_escape = false;

	sp = *s;
	end = *s + *slen;

	/*
	 * Skip the heading double quote character because it never needs to
	 * be interpreted as a character itself and be escaped. Note that we
	 * must not skip the heading character if it's not a double quote,
	 * and that we must not look at it at all if the keyword is empty.
	 */
	if (sp < end && ISDOUBLEQUOTECHAR(sp))
		sp++;

	/*
	 * Check whether the snippet keyword string has a character which
	 * needs to be escaped.
	 */
	while (sp < end)
	{
		mblen = pg_mblen(sp);

		/* Never step past the end of the keyword */
		if (mblen > end - sp)
			mblen = (int) (end - sp);

		/*
		 * Backslash in the keyword always needs to be escaped.
		 */
		if (ISBACKSLASHCHAR(sp))
		{
			need_escape = true;
			break;
		}

		if (in_doublequote)
		{
			/* The previous character was a not-yet-classified double quote */
			if (ISSENNAOPSCHAR(sp))
			{
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Double quote in the keyword needs to be escaped if
				 * any Senna search operators are to neither its right
				 * nor left.
				 */
				need_escape = true;
				break;
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp) && !in_sennaops)
				in_doublequote = true;
			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/*
	 * Quick exit if the keyword has no character which needs to be
	 * escaped.
	 */
	if (!need_escape)
		return false;

	/*
	 * Allocate the buffer space to store the escaped snippet keyword string.
	 * The maximum size of escaped string is double the input keyword size.
	 * The size reaches the maximum when every character in the input keyword
	 * needs to be escaped.
	 */
	ep = escaped = (char *) palloc(*slen * 2);

	/*
	 * Copy the characters which have been passed through in the above loop
	 * and don't need to be escaped, into the buffer. If in_doublequote is
	 * true, we don't copy the double quote in the previous position into the
	 * buffer because it might still need to be escaped.
	 */
	copylen = (size_t) (sp - *s) - (in_doublequote ? 1 : 0);
	memcpy(ep, *s, copylen);
	ep += copylen;

	/*
	 * Construct the escaped snippet keyword string, resuming at the
	 * character where the scan above stopped.
	 */
	while (sp < end)
	{
		mblen = pg_mblen(sp);

		/* Never read past the end of the keyword */
		if (mblen > end - sp)
			mblen = (int) (end - sp);

		if (in_doublequote)
		{
			/*
			 * dqchar indicates the previous character, that is a double
			 * quote. We assume here that a double quote is single-byte
			 * character.
			 */
			char		dqchar = *(sp - 1);

			if (ISSENNAOPSCHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just before Senna
				 * operator.
				 */
				*ep++ = dqchar;
				*ep++ = *sp;
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Escape the double quote if no Senna operator is next to it.
				 */
				*ep++ = '\\';
				*ep++ = dqchar;

				if (ISDOUBLEQUOTECHAR(sp))
				{
					/* The current double quote is now the pending one */
					in_doublequote = true;
				}
				else
				{
					if (ISBACKSLASHCHAR(sp))
						*ep++ = '\\';
					memcpy(ep, sp, mblen);
					ep += mblen;
					in_doublequote = false;
				}
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just after Senna
				 * operator. Otherwise defer the decision until the next
				 * character is known.
				 */
				if (in_sennaops)
					*ep++ = *sp;
				else
					in_doublequote = true;
			}
			else
			{
				if (ISBACKSLASHCHAR(sp))
					*ep++ = '\\';

				/*
				 * We don't check ISSENNAOPSCHAR() here. We handle Senna
				 * operator character as a character itself instead of
				 * an operator if it doesn't follow a double quote.
				 */
				memcpy(ep, sp, mblen);
				ep += mblen;
			}

			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/* Add the trailing double quote into the buffer, unescaped */
	if (in_doublequote)
		*ep++ = *(sp - 1);

	*s = escaped;
	*slen = ep - *s;

#ifdef PGS2_DEBUG
	if (pgs2_enable_debug)
	{
		char	*tmp = pnstrdup(*s, *slen);

		elog(LOG, "escaped snippet keyword: %s", tmp);
		pfree(tmp);
	}
#endif

	return true;
}
640
641 static sen_query *
642 GetSennaQuery(char *str, size_t len)
643 {
644         static sen_query        *query_cache = NULL;
645         static char                     *key_cache = NULL;
646         static size_t           len_cache = 0;
647         static bool                     guc_cache = false;
648         sen_query       *query;
649         sen_encoding    encoding;
650         char            *key;
651         size_t          key_len;
652         int                     rest;
653         bool            needfree = false;
654
655         /*
656          * Return the cached Senna query if the same keyword has
657          * been used the last time.
658          */
659         if (key_cache != NULL &&
660                 len == len_cache &&
661                 strncmp(key_cache, str, len) == 0 &&
662                 escape_snippet_keyword == guc_cache)
663         {
664 #ifdef PGS2_DEBUG
665                 if (pgs2_enable_debug)
666                 {
667                         char    *tmp = pnstrdup(str, len);
668
669                         elog(LOG, "GetSennaQuery(): quick exit: %s", tmp);
670                         pfree(tmp);
671                 }
672 #endif
673                 return query_cache;
674         }
675
676         encoding = GetSennaEncoding();
677
678         key = malloc(len);
679         if (key == NULL)
680                 ereport(ERROR,
681                                 (errcode(ERRCODE_OUT_OF_MEMORY),
682                                  errmsg("out of memory")));
683
684         /*
685          * We always cache the unescaped keyword. Which enables us
686          * to check whether we can use the cached Senna query before
687          * escaping the keyword.
688          */
689         memcpy(key, str, len);
690         key_len = len;
691
692         /*
693          * If the keyword has been escaped, 'str' points to the
694          * newly-palloc'd space storing the escaped keyword. This
695          * space needs to be freed later.
696          */
697         if (escape_snippet_keyword)
698                 needfree = EscapeSnippetKeyword(&str, &len);
699
700         query = sen_query_open(str, len, sen_sel_or, SEN_MAX_N_EXPRS,
701                                                    encoding);
702         if (query == NULL)
703         {
704                 free(key);
705                 ereport(ERROR,
706                                 (errmsg("sen_query_open() failed")));
707         }
708
709         if ((rest = sen_query_rest(query, NULL)) != 0)
710                 ereport(WARNING,
711                                 (errmsg("too many expressions (%d)", rest)));
712
713         if (query_cache != NULL)
714         {
715                 sen_query_close(query_cache);
716                 free(key_cache);
717         }
718
719         key_cache = key;
720         len_cache = key_len;
721         query_cache = query;
722         guc_cache = escape_snippet_keyword;
723
724         if (needfree)
725                 pfree(str);
726
727         return query;
728 }
729
730 Datum
731 pgs2snippet1(PG_FUNCTION_ARGS)
732 {
733         int                     flags = PG_GETARG_INT32(0);
734         uint32          width = PG_GETARG_UINT32(1);
735         uint32          max_results = PG_GETARG_UINT32(2);
736         text       *opentags = PG_GETARG_TEXT_P(3);
737         text       *closetags = PG_GETARG_TEXT_P(4);
738         int                     mapping = PG_GETARG_INT32(5);
739         text       *keywords = PG_GETARG_TEXT_P(6);
740         text       *document = PG_GETARG_TEXT_P(7);
741         sen_query  *query;
742         sen_snip   *snip = NULL;
743         const char *opentags_str = VARDATA_ANY(opentags);
744         const char *closetags_str = VARDATA_ANY(closetags);
745         char       *keywords_str = VARDATA_ANY(keywords);
746         char       *document_str = VARDATA_ANY(document);
747         uint32          opentags_len = VARSIZE_ANY_EXHDR(opentags);
748         uint32          closetags_len = VARSIZE_ANY_EXHDR(closetags);
749         uint32          keywords_len = VARSIZE_ANY_EXHDR(keywords);
750         uint32          document_len = VARSIZE_ANY_EXHDR(document);
751         uint32          nresults = 0;
752         uint32          max_tagged_len = 0;
753         sen_rc          rc;
754         text       *result;
755         uint32          result_len = 0;
756         bool            return_null = false;
757
758         query = GetSennaQuery(keywords_str, keywords_len);
759
760         snip = sen_query_snip(query, flags, width, max_results, 1,
761                                                   &opentags_str, &opentags_len,
762                                                   &closetags_str, &closetags_len,
763                                                   mapping == 0 ? NULL : (sen_snip_mapping *)-1);
764         if (snip == NULL)
765                 ereport(ERROR,
766                                 (errmsg("sen_query_snip() failed")));
767
768         PG_TRY();
769         {
770                 rc = sen_snip_exec(snip, document_str, document_len,
771                                                    &nresults, &max_tagged_len);
772                 if (rc != sen_success)
773                         ereport(ERROR,
774                                         (errmsg("sen_snip_exec() failed: %d", rc)));
775
776                 result = (text *) palloc(max_tagged_len + VARHDRSZ);
777
778                 rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
779                 if (rc == sen_invalid_argument)
780                         return_null = true;
781                 else if (rc != sen_success)
782                         ereport(ERROR,
783                                         (errmsg("sen_snip_get_result() failed: %d", rc)));
784         }
785         PG_CATCH();
786         {
787                 sen_snip_close(snip);
788                 PG_RE_THROW();
789         }
790         PG_END_TRY();
791
792         sen_snip_close(snip);
793
794         if (return_null)
795                 PG_RETURN_NULL();
796
797         SET_VARSIZE(result, max_tagged_len + VARHDRSZ);
798
799         PG_RETURN_TEXT_P(result);
800 }
801
802 /*
803  * Make sure there is enough space for 'needed' more bytes.
804  *
805  * Sets **buf to the allocated space which can store the needed bytes if OK,
806  * NULL if failed to enlarge the space because 'needed' is larger than 'maxlen'.
807  */
808 static inline void
809 pgs2malloc(void **buf, long *buflen, long needed, long maxlen)
810 {
811 #ifdef PGS2_DEBUG
812         if (pgs2_enable_debug)
813                 elog(LOG, "pgs2malloc(): buflen %ld, needed %ld, maxlen %ld",
814                          *buflen, needed, maxlen);
815 #endif
816
817         if (*buf != NULL && *buflen >= needed && (*buflen <= maxlen || maxlen == 0))
818                 return;         /* got enough space already */
819
820         /*
821          * Release the already-allocated space since it's too small to
822          * store the needed bytes or larger than the upper limit.
823          */
824         if (*buf != NULL)
825         {
826                 free(*buf);
827                 *buf = NULL;
828                 *buflen = 0;
829         }
830
831         /*
832          * Don't allocate any space if the needed space is larger than
833          * the upper limit.
834          */
835         if (needed > maxlen && maxlen != 0)
836                 return;
837
838         /*
839          * Allocate the space for the needed bytes.
840          *
841          * We don't want to allocate just a little more space with each enlarge;
842          * for efficiency, double the buffer size each time it overflows.
843          * Actually, we might need to more than double it if 'needed' is big...
844          *
845          * We check whether '*buflen' overflows each cycle to avoid infinite loop.
846          */
847         *buflen = 1024L;
848         while (*buflen < needed && *buflen != 0)
849                 *buflen <<= 1;
850
851         /*
852          * Clamp to maxlen in case we went past it.  Note we are assuming
853          * here that maxlen <= LONG_MAX/2, else the above loop could
854          * overflow.  We will still have *buflen >= needed.
855          */
856         if (*buflen > maxlen && maxlen != 0)
857                 *buflen = maxlen;
858
859         /* Guard against out-of-range '*buflen' value */
860         if (*buflen == 0)
861                 ereport(ERROR,
862                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
863                                  errmsg("out of memory"),
864                                  errdetail("Cannot enlarge buffer by %ld more bytes.",
865                                                    needed)));
866
867         *buf = (void *) malloc(*buflen);
868         if (*buf == NULL)
869                 ereport(ERROR,
870                                 (errcode(ERRCODE_OUT_OF_MEMORY),
871                                  errmsg("out of memory")));
872 }
873
874 Datum
875 pgs2norm(PG_FUNCTION_ARGS)
876 {
877         text            *str = PG_GETARG_TEXT_PP(0);
878         char            *s = VARDATA_ANY(str);
879         long            slen = VARSIZE_ANY_EXHDR(str);
880         text            *result = NULL;
881         long            buflen;
882         long            reslen;
883         long            maxlen;
884         long            needed;
885
886         /*
887          * norm_cache is the cache memory storing both input and normalized strings
888          * as the result of pgs2norm(). norm_cache_size is the size of norm_cache
889          * and its upper limit is specified by norm_cache_limit parameter. norm_result
890          * is the pointer to the normalized string with the verlena header (i.e.,
891          * text type) stored in the latter half of the cache. norm_reslen is the size
892          * of norm_result. norm_slen is the size of the input string which is stored
893          * in the first half of the cache.
894          */
895         static char             *norm_cache = NULL;
896         static long             norm_cache_size = 0;
897         static long             norm_slen = 0;
898         static char             *norm_result = NULL;
899         static long             norm_reslen = 0;
900
901         /*
902          * Return the cached normalization result if the same string of
903          * the given one has been normalized the last time.
904          */
905         if (norm_cache != NULL &&
906                 norm_slen == slen &&
907                 strncmp(norm_cache, s, slen) == 0)
908         {
909 #ifdef PGS2_DEBUG
910                 if (pgs2_enable_debug)
911                 {
912                         char    *tmp = text_to_cstring(str);
913
914                         elog(LOG, "pgs2norm(): quick exit: %s", tmp);
915                         pfree(tmp);
916                 }
917 #endif
918
919                 result = (text *) palloc(norm_reslen);
920                 memcpy(result, norm_result, norm_reslen);
921                 PG_RETURN_TEXT_P(result);
922         }
923
924         /* Confirm that database encoding is UTF-8 */
925         GetSennaEncoding();
926
927         /*
928          * Allocate the result buffer to store the normalized string. Since the size of
929          * normalized string can be larger than that of input one, the result buffer needs
930          * extra space. Problem is that, before calling sen_str_normalize, we need to
931          * allocate the result buffer but cannot know how large extra space is required.
932          * So we use RESULT_EXTRA_SIZE as the estimated size of extra space here.
933          */
934 #define RESULT_EXTRA_SIZE       64
935         buflen = slen + RESULT_EXTRA_SIZE;
936
937 retry:
938         result = (text *) palloc(buflen + VARHDRSZ);
939
940 #if defined(FAST_SENNA)
941         reslen = fast_sen_str_normalize(s, slen, VARDATA(result), buflen);
942 #else
943         reslen = sen_str_normalize(s, slen, sen_enc_utf8,
944                                                            SEN_NORMALIZE_FLAGS,
945                                                            VARDATA(result), buflen);
946 #endif
947
948         if (reslen < 0)
949                 ereport(ERROR,
950                                 (errmsg("could not normalize the string")));
951
952         /*
953          * If the result buffer size is too short to store the normalized string,
954          * we enlarge the buffer and retry the string normalization.
955          */
956         if (buflen <= reslen)
957         {
958                 pfree(result);
959                 buflen = reslen + 1;
960                 goto retry;
961         }
962
963         SET_VARSIZE(result, reslen + VARHDRSZ);
964
965         /*
966          * Cache both input and normalized strings to accelerate the subsequent
967          * calls of pgs2norm() with the same input string. But we don't do that
968          * if the maximum allowed size of the cache is too small to store them.
969          */
970         needed = slen + reslen + VARHDRSZ;
971         maxlen = ((norm_cache_limit >= 0) ? norm_cache_limit : work_mem) * 1024L;
972
973         pgs2malloc((void **) &norm_cache, &norm_cache_size, needed, maxlen);
974         if (norm_cache != NULL)
975         {
976                 /* Store the input string into the first half of the cache */
977                 norm_slen = slen;
978                 memcpy(norm_cache, s, slen);
979
980                 /*
981                  * Store the normalized string with the varlena header (i.e., text type)
982                  * into the latter half of the cache.
983                  */
984                 norm_result = norm_cache + slen;
985                 norm_reslen = reslen + VARHDRSZ;
986                 memcpy(norm_result, result, norm_reslen);
987         }
988
989 #ifdef PGS2_DEBUG
990         if (pgs2_enable_debug)
991         {
992                 char    *tmp = text_to_cstring(str);
993
994                 elog(LOG, "pgs2norm(): complete (%s result cache): %s",
995                          (norm_cache == NULL) ? "unset" : "set", tmp);
996                 pfree(tmp);
997         }
998 #endif
999
1000         PG_RETURN_TEXT_P(result);
1001 }
1002
1003 /*
1004  * Report the version and configure options of Senna which
1005  * ludia_funcs depends on.
1006  */
1007 Datum
1008 pgs2seninfo(PG_FUNCTION_ARGS)
1009 {
1010         char    *version[MAXPGPATH];
1011         char    *coptions[MAXPGPATH];
1012         Datum   values[2];
1013         bool    isnull[2];
1014         HeapTuple tuple;
1015         TupleDesc tupdesc;
1016
1017         /*
1018          * Get the version and configure options of Senna. Ignore the
1019          * return value of sen_info() because it always returns a success.
1020          */
1021         sen_info((char **)&version, (char **)&coptions, NULL, NULL, NULL, NULL);
1022
1023         /*
1024          * Construct a tuple descriptor for the result row. This must
1025          * match this function's ludia_funcs--x.x.sql entry.
1026          */
1027 #if PG_VERSION_NUM >= 120000
1028         tupdesc = CreateTemplateTupleDesc(2);
1029 #else
1030         tupdesc = CreateTemplateTupleDesc(2, false);
1031 #endif
1032         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
1033                                            "version", TEXTOID, -1, 0);
1034         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
1035                                            "configure_options", TEXTOID, -1, 0);
1036         tupdesc = BlessTupleDesc(tupdesc);
1037
1038         /* version */
1039         values[0] = CStringGetTextDatum(*version);
1040         isnull[0] = false;
1041
1042         /* configure option */
1043         values[1] = CStringGetTextDatum(*coptions);
1044         isnull[1] = false;
1045
1046         tuple = heap_form_tuple(tupdesc, values, isnull);
1047         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
1048 }