OSDN Git Service

Add textporter_exit_on_segv parameter.
[ludiafuncs/ludia_funcs.git] / ludia_funcs.c
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2016-2020, ludia_funcs Development Group
4  * Copyright (c) 2006-2015, NTT DATA Corporation
5  * All rights reserved.
6  *
7  * Changelog:
8  *   2013/01/09
9  *   Update Ludia functions so that they are available with PostgreSQL9.1.
10  *   Author: NTT DATA Corporation
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include <limits.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <unistd.h>
20
21 #include "catalog/pg_type.h"
22 #include "fmgr.h"
23 #include "funcapi.h"
24 #include "ludia_funcs.h"
25 #include "mb/pg_wchar.h"
26 #include "senna.h"
27 #include "storage/fd.h"
28 #include "utils/builtins.h"
29 #include "utils/guc.h"
30 #include "miscadmin.h"
31
32 #if PG_VERSION_NUM >= 90300
33 #include "access/htup_details.h"
34 #endif
35
36 PG_MODULE_MAGIC;
37
38 /* Last update date of ludia_funcs */
39 #define PGS2_LAST_UPDATE        "2019.10.04"
40
41 /* GUC variables */
42 #ifdef PGS2_DEBUG
/*
 * Verbosity levels selectable through the ludia_funcs.enable_debug setting.
 */
typedef enum pgs2_enable_debug_type
{
	/* emit no debug output */
	PGS2_ENABLE_DEBUG_OFF = 0,
	/* emit terse output, e.g. only the names of functions */
	PGS2_ENABLE_DEBUG_TERSE = 1,
	/* emit detailed information */
	PGS2_ENABLE_DEBUG_ON = 2
} pgs2_enable_debug_type;
50
51 /* We accept all the likely variants of "on" and "off" */
52 static const struct config_enum_entry pgs2_enable_debug_options[] = {
53         {"off", PGS2_ENABLE_DEBUG_OFF, false},
54         {"terse", PGS2_ENABLE_DEBUG_TERSE, false},
55         {"on", PGS2_ENABLE_DEBUG_ON, false},
56         {"true", PGS2_ENABLE_DEBUG_ON, true},
57         {"false", PGS2_ENABLE_DEBUG_OFF, true},
58         {"yes", PGS2_ENABLE_DEBUG_ON, true},
59         {"no", PGS2_ENABLE_DEBUG_OFF, true},
60         {"1", PGS2_ENABLE_DEBUG_ON, true},
61         {"0", PGS2_ENABLE_DEBUG_OFF, true},
62         {NULL, 0, false}
63 };
64
65 static int      pgs2_enable_debug = PGS2_ENABLE_DEBUG_OFF;
66 #endif  /* PGS2_DEBUG */
67
68 static char     *pgs2_last_update = NULL;
69 static int      norm_cache_limit = -1;
70 static bool     escape_snippet_keyword = false;
71
72 #define SEN_NORMALIZE_FLAGS 0
73 #define SEN_MAX_N_EXPRS         32
74
75 /* upper limit for GUC variables measured in kilobytes of memory */
76 /* note that various places assume the byte size fits in a "long" variable */
77 #if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4
78 #define MAX_KILOBYTES   INT_MAX
79 #else
80 #define MAX_KILOBYTES   (INT_MAX / 1024)
81 #endif
82
/*
 * Single-byte character tests.  Each macro looks only at the first byte
 * that x points to; ISSENNAOPSCHAR evaluates x more than once.
 */
#define ISBACKSLASHCHAR(x)		((x)[0] == '\\')
#define ISDOUBLEQUOTECHAR(x)	((x)[0] == '"')
#define ISSENNAOPSCHAR(x)		((x)[0] == '+' || (x)[0] == '-' || (x)[0] == ' ')
86
87 PG_FUNCTION_INFO_V1(pgs2snippet1);
88 PG_FUNCTION_INFO_V1(pgs2norm);
89 PG_FUNCTION_INFO_V1(pgs2textporter1);
90 PG_FUNCTION_INFO_V1(pgs2seninfo);
91
92 /*
93  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
94  * macro since 9.4, and hence the declaration of the function prototypes
95  * here is necessary only for 9.3 or before.
96  */
97 #if PG_VERSION_NUM < 90400
98 Datum   pgs2snippet1(PG_FUNCTION_ARGS);
99 Datum   pgs2norm(PG_FUNCTION_ARGS);
100 Datum   pgs2textporter1(PG_FUNCTION_ARGS);
101 Datum   pgs2seninfo(PG_FUNCTION_ARGS);
102 #endif
103
104 static sen_encoding     GetSennaEncoding(void);
105 static sen_query        *GetSennaQuery(char *str, size_t len);
106 static bool                     EscapeSnippetKeyword(char **s, size_t *slen);
107
108 #ifdef TEXTPORTER
109 #define TEXTPORTER_TMPDIR                       "/tmp"
110 #define TEXTPORTER_MKSTEMP_UMASK                0177
111 #define TEXTPORTER_GROUPNAME            "UTF-8"
112 #define TEXTPORTER_DEFLANGNAME          "Japanese"
113 #define TEXTPORTER_BBIGENDIAN           1
114 #define TEXTPORTER_OPTION                       0x00000020      /* DMC_GETTEXT_OPT_LF */
115 #define TEXTPORTER_OPTION_STRING        "32"
116 #define TEXTPORTER_OPTION1                      0x00010000      /* DMC_GETTEXT_OPT1_TXCONV */
117 #define TEXTPORTER_SIZE                         0
118 #define TEXTPORTER_CSV_C                        0
119
120
121 /* GUC variables for pgs2textpoter1 */
122 static int      textporter_error = ERROR;
123 static unsigned int     textporter_option = TEXTPORTER_OPTION;
124 static bool     textporter_exit_on_segv = false;
125
126 /*
127  * This variable is a dummy that doesn't do anything, except in some
128  * cases provides the value for SHOW to display.  The real state is
129  * elsewhere and is kept in sync by assign_hooks.
130  */
131 static char     *textporter_option_string;
132
133 static const struct config_enum_entry textporter_error_options[] = {
134         {"debug1", DEBUG1, false},
135         {"log", LOG, false},
136         {"info", INFO, false},
137         {"notice", NOTICE, false},
138         {"warning", WARNING, false},
139         {"error", ERROR, false},
140         {NULL, 0, false}
141 };
142
143 static void CleanupTextPorterTmpFiles(void);
144
145 static bool check_textporter_option(char **newval, void **extra, GucSource source);
146 static void assign_textporter_option(const char *newval, void *extra);
147 static void textporter_exit_on_segv_handler(SIGNAL_ARGS);
148 #endif  /* TEXTPORTER */
149
150 void    _PG_init(void);
151 void    _PG_fini(void);
152
153 void
154 _PG_init(void)
155 {
156         sen_rc          rc;
157
158 #ifdef PGS2_DEBUG
159         /* Define custom GUC variable for debugging */
160         DefineCustomEnumVariable("ludia_funcs.enable_debug",
161                                                          "Emit ludia_funcs debugging output.",
162                                                          NULL,
163                                                          &pgs2_enable_debug,
164                                                          PGS2_ENABLE_DEBUG_OFF,
165                                                          pgs2_enable_debug_options,
166                                                          PGC_USERSET,
167                                                          0,
168                                                          NULL,
169                                                          NULL,
170                                                          NULL);
171 #endif
172
173         /* Can't be set in postgresql.conf */
174         DefineCustomStringVariable("ludia_funcs.last_update",
175                                                            "Shows the last update date of ludia_funcs.",
176                                                            NULL,
177                                                            &pgs2_last_update,
178                                                            PGS2_LAST_UPDATE,
179                                                            PGC_INTERNAL,
180                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
181                                                            NULL,
182                                                            NULL,
183                                                            NULL);
184
185 #ifdef TEXTPORTER
186         /* Define custom GUC variables */
187         DefineCustomEnumVariable("ludia_funcs.textporter_error",
188                                                          "Sets the message levels that are emitted "
189                                                          "when textporter fails.",
190                                                          NULL,
191                                                          &textporter_error,
192                                                          ERROR,
193                                                          textporter_error_options,
194                                                          PGC_SUSET,
195                                                          0,
196                                                          NULL,
197                                                          NULL,
198                                                          NULL);
199
200         DefineCustomStringVariable("ludia_funcs.textporter_option",
201                                                            "Sets the option used to get text data "
202                                                            "from TextPorter",
203                                                            NULL,
204                                                            &textporter_option_string,
205                                                            TEXTPORTER_OPTION_STRING,
206                                                            PGC_SUSET,
207                                                            0,
208                                                            check_textporter_option,
209                                                            assign_textporter_option,
210                                                            NULL);
211
212         DefineCustomBoolVariable("ludia_funcs.textporter_exit_on_segv",
213                                                          "Terminate session when textporter causes segmentation fault.",
214                                                          NULL,
215                                                          &textporter_exit_on_segv,
216                                                          false,
217                                                          PGC_USERSET,
218                                                          0,
219                                                          NULL,
220                                                          NULL,
221                                                          NULL);
222
223         /* Clean up remaining textporter temporary files */
224         CleanupTextPorterTmpFiles();
225 #endif  /* TEXTPORTER */
226
227         /*
228          * A value of 0 means no limit on the cache size. A value of -1 means
229          * that work_mem is used as the upper size limit of the cache.
230          */
231         DefineCustomIntVariable("ludia_funcs.norm_cache_limit",
232                                                         "Sets the maximum memory to be used for caching "
233                                                         "the result of pgs2norm()",
234                                                         NULL,
235                                                         &norm_cache_limit,
236                                                         -1,
237                                                         -1,
238                                                         MAX_KILOBYTES,
239                                                         PGC_USERSET,
240                                                         GUC_UNIT_KB,
241                                                         NULL,
242                                                         NULL,
243                                                         NULL);
244
245         DefineCustomBoolVariable("ludia_funcs.escape_snippet_keyword",
246                                                          "Escapes snippet keyword string.",
247                                                          NULL,
248                                                          &escape_snippet_keyword,
249                                                          false,
250                                                          PGC_USERSET,
251                                                          0,
252                                                          NULL,
253                                                          NULL,
254                                                          NULL);
255
256         EmitWarningsOnPlaceholders("ludia_funcs");
257
258         /* Initialize Senna */
259         rc = sen_init();
260         if (rc != sen_success)
261                 ereport(ERROR,
262                                 (errmsg("sen_init() failed: %d", rc)));
263 }
264
/*
 * Module unload hook.  It performs no work.
 */
void
_PG_fini(void)
{
	return;
}
269
270 #ifdef TEXTPORTER
/*
 * Unlink the file named by "path".  A failure is reported as a WARNING
 * (with the file access error code and %m text) and is otherwise ignored.
 * Note that "path" is evaluated a second time when the warning is built.
 */
#define REMOVE_TMPFILE(path) \
	do { \
		if (unlink(path) == 0) \
			break;		/* removed; leaves only this do/while block */ \
		ereport(WARNING, \
				(errcode_for_file_access(), \
				 errmsg("could not remove temporary file \"%s\": %m", path))); \
	} while(0)
278
279 Datum
280 pgs2textporter1(PG_FUNCTION_ARGS)
281 {
282         char    *appfile = text_to_cstring(PG_GETARG_TEXT_P(0));
283         char    txtfile[] = TEXTPORTER_TMPDIR "/ludia_funcs_XXXXXX";
284         int             tmpfd;
285         int             ret;
286         FILE    *fp;
287         text    *result = NULL;
288         struct stat     statbuf;
289         bool    return_null = false;
290         mode_t  oumask;
291
292         /* Confirm that database encoding is UTF-8 */
293         GetSennaEncoding();
294
295         PG_TRY();
296         {
297                 /*
298                  * Generate a unique temporary filename where text data gotten
299                  * from application file by TextPorter is stored temporarily.
300                  * Set the permission of a temporary file to 0600 to ensure that
301                  * only the owner of PostgreSQL server can read and write the file.
302                  */
303                 oumask = umask(TEXTPORTER_MKSTEMP_UMASK);
304                 tmpfd = mkstemp(txtfile);
305                 umask(oumask);
306
307                 if (tmpfd < 0)
308                         ereport(ERROR,
309                                         (errcode_for_file_access(),
310                                          errmsg("could not generate a unique temporary filename: %m")));
311                 if (close(tmpfd) != 0)
312                         ereport(ERROR,
313                                         (errcode_for_file_access(),
314                                          errmsg("could not close temporary file \"%s\": %m", txtfile)));
315
316                 /*
317                  * If textporter_exit_on_segv option is enabled, segmentation fault
318                  * caused by textporter will terminate only this connection and
319                  * not lead to the server crash.
320                  */
321                 if (textporter_exit_on_segv)
322                         pqsignal(SIGSEGV, textporter_exit_on_segv_handler);
323
324                 /*
325                  * Run TextPorter to read text data from application file (appfile)
326                  * to temporary file (txtfile).
327                  */
328                 ret = ExecTextPorter((unsigned char *)appfile,
329                                                          (unsigned char *)txtfile,
330                                                          (unsigned char *)TEXTPORTER_GROUPNAME,
331                                                          (unsigned char *)TEXTPORTER_DEFLANGNAME,
332                                                          TEXTPORTER_BBIGENDIAN, textporter_option,
333                                                          TEXTPORTER_OPTION1, TEXTPORTER_SIZE,
334                                                          TEXTPORTER_CSV_C);
335
336                 if (textporter_exit_on_segv)
337                         pqsignal(SIGSEGV, SIG_DFL);
338
339                 if (ret != 0)
340                 {
341                         ereport(textporter_error,
342                                         (errmsg("could not get text from application file \"%s\"",
343                                                         appfile),
344                                          errdetail("DMC_GetText_V5() failed with errcode %d",
345                                                            ret)));
346
347                         /* Return NULL if textporter_error is set to other than ERROR */
348                         return_null = true;
349                 }
350                 else
351                 {
352                         /* Read text data from temporary file to memory */
353                         if (stat(txtfile, &statbuf))
354                                 ereport(ERROR,
355                                                 (errcode_for_file_access(),
356                                                  errmsg("could not stat file \"%s\": %m", txtfile)));
357                         result = (text *) palloc(statbuf.st_size + VARHDRSZ);
358
359                         fp = AllocateFile(txtfile, "r");
360                         if (fp == NULL)
361                                 ereport(ERROR,
362                                                 (errcode_for_file_access(),
363                                                  errmsg("could not open file \"%s\": %m", txtfile)));
364
365                         if (fread(VARDATA(result), 1, statbuf.st_size, fp) != statbuf.st_size ||
366                                 ferror(fp))
367                                 ereport(ERROR,
368                                                 (errcode_for_file_access(),
369                                                  errmsg("could not read file \"%s\": %m", txtfile)));
370
371                         FreeFile(fp);
372                 }
373
374                 REMOVE_TMPFILE(txtfile);
375                 pfree(appfile);
376         }
377         PG_CATCH();
378         {
379                 REMOVE_TMPFILE(txtfile);
380                 PG_RE_THROW();
381         }
382         PG_END_TRY();
383
384         if (return_null)
385                 PG_RETURN_NULL();
386
387         SET_VARSIZE(result, statbuf.st_size + VARHDRSZ);
388
389         PG_RETURN_TEXT_P(result);
390 }
391
392 /*
393  * Clean up remaining textporter temporary files
394  */
395 static void
396 CleanupTextPorterTmpFiles(void)
397 {
398         DIR                             *tpdir;
399         struct dirent   *tpde;
400         char                    path[MAXPGPATH];
401
402         tpdir = AllocateDir(TEXTPORTER_TMPDIR);
403         if (tpdir == NULL)
404                 ereport(ERROR,
405                                 (errcode_for_file_access(),
406                                  errmsg("could not open textporter temporary file directory \"%s\": %m",
407                                                 TEXTPORTER_TMPDIR)));
408
409         while ((tpde = ReadDir(tpdir, TEXTPORTER_TMPDIR)) != NULL)
410         {
411                 if (strlen(tpde->d_name) == 18 &&
412                         strncmp(tpde->d_name, "ludia_funcs_", 12) == 0)
413                 {
414                         snprintf(path, MAXPGPATH, TEXTPORTER_TMPDIR "/%s", tpde->d_name);
415                         REMOVE_TMPFILE(path);
416                 }
417         }
418
419         FreeDir(tpdir);
420 }
421
422 static bool
423 check_textporter_option(char **newval, void **extra, GucSource source)
424 {
425         unsigned long   val;
426         char                    *endptr;
427         unsigned int    *myextra;
428
429         errno = 0;
430         val = strtoul(*newval, &endptr, 0);
431
432         if (*endptr != '\0')
433                 return false;
434
435         if (errno == ERANGE || val != (unsigned long) ((unsigned int) val))
436         {
437                 GUC_check_errhint("Value exceeds unsigned integer range.");
438                 return false;
439         }
440
441         /* Set up the "extra" struct actually used by assign_textporter_option */
442         myextra = (unsigned int *) malloc(sizeof(unsigned int));
443         if (myextra == NULL)
444         {
445                 GUC_check_errcode(ERRCODE_OUT_OF_MEMORY);
446                 GUC_check_errmsg("out of memory");
447                 return false;
448         }
449         *myextra = (unsigned int) val;
450         *extra = (void *) myextra;
451
452         return true;
453 }
454
455 static void
456 assign_textporter_option(const char *newval, void *extra)
457 {
458         textporter_option = *((unsigned int *) extra);
459 }
460
461 static void
462 textporter_exit_on_segv_handler(SIGNAL_ARGS)
463 {
464         ereport(FATAL,
465                         (errcode(ERRCODE_INTERNAL_ERROR),
466                          errmsg("terminating PostgreSQL server process due to "
467                                         "segmentation fault by textporter")));
468 }
469
470 #else   /* TEXTPORTER */
471
472 Datum
473 pgs2textporter1(PG_FUNCTION_ARGS)
474 {
475         PG_RETURN_NULL();
476 }
477
478 #endif  /* TEXTPORTER */
479
480 static sen_encoding
481 GetSennaEncoding(void)
482 {
483         static sen_encoding             encoding = sen_enc_default;
484
485         if (encoding == sen_enc_default)
486         {
487                 if (GetDatabaseEncoding() == PG_UTF8)
488                         encoding = sen_enc_utf8;
489                 else
490                         ereport(ERROR,
491                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
492                                          errmsg("does not support database encoding \"%s\"",
493                                                         GetDatabaseEncodingName())));
494         }
495         return encoding;
496 }
497
/*
 * Escape the backslash and double quote characters in the given string.
 *
 * On entry *s points to the keyword (it is length-counted, *slen bytes,
 * and is not expected to be NUL-terminated).
 *
 * Return false if the given string has no character which needs to be
 * escaped; *s and *slen are left untouched in that case. Otherwise,
 * return true. In this case, *s points to the palloc'd space storing the
 * escaped keyword string and *slen is set to the size of that string.
 * The caller needs to free the palloc'd space.
 *
 * Escaping rules implemented below:
 *   - a backslash is always escaped;
 *   - the leading double quote is never escaped;
 *   - any other double quote is left alone when a Senna operator
 *     character ('+', '-' or ' ') is directly before or after it, or when
 *     it is the last character; otherwise it is escaped.
 */
static bool
EscapeSnippetKeyword(char **s, size_t *slen)
{
	const char	*sp;
	char		*ep;
	char		*escaped;
	int			mblen;
	int			copylen;
	bool		in_doublequote = false;
	bool		in_sennaops = false;
	bool		need_escape = false;

	/*
	 * An empty keyword has nothing to escape. Bail out before the first
	 * byte is examined, since with a zero length there is no byte that
	 * belongs to the keyword.
	 */
	if (*slen == 0)
		return false;

	/*
	 * Skip the leading double quote character because it never needs to
	 * be interpreted as a character itself and be escaped.
	 * Note that we must not skip the leading character if it's not a
	 * double quote.
	 */
	sp = *s;
	if (ISDOUBLEQUOTECHAR(sp))
		sp++;

	/*
	 * Check whether the snippet keyword string has a character which
	 * needs to be escaped. While in_doublequote is true, the previous
	 * character was a double quote whose fate is not decided yet.
	 */
	while ((size_t) (sp - *s) < *slen)
	{
		mblen = pg_mblen(sp);

		/*
		 * Backslash in the keyword always needs to be escaped.
		 */
		if (ISBACKSLASHCHAR(sp))
		{
			need_escape = true;
			break;
		}

		if (in_doublequote)
		{
			if (ISSENNAOPSCHAR(sp))
			{
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Double quote in the keyword needs to be escaped if
				 * there is no Senna search operator to either its right
				 * or its left.
				 */
				need_escape = true;
				break;
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp) && !in_sennaops)
				in_doublequote = true;
			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/*
	 * Quick exit if the keyword has no character which needs to be
	 * escaped.
	 */
	if (!need_escape)
		return false;

	/*
	 * Allocate the buffer space to store the escaped snippet keyword string.
	 * The maximum size of escaped string is double the input keyword size.
	 * The size reaches the maximum when every character in the input keyword
	 * needs to be escaped.
	 */
	ep = escaped = (char *) palloc(*slen * 2);

	/*
	 * Copy the characters which have been passed through in the above loop
	 * and don't need to be escaped, into the buffer. If in_doublequote is
	 * true, we don't copy the double quote in the previous position into the
	 * buffer because it might still need to be escaped.
	 */
	copylen = sp - *s - ((in_doublequote) ? 1 : 0);
	memcpy(ep, *s, copylen);
	ep += copylen;

	/*
	 * Construct the escaped snippet keyword string, continuing from the
	 * position (and with the state) where the scan above stopped.
	 */
	while ((size_t) (sp - *s) < *slen)
	{
		mblen = pg_mblen(sp);

		if (in_doublequote)
		{
			/*
			 * dqchar indicates the previous character, that is a double
			 * quote. We assume here that a double quote is single-byte
			 * character.
			 */
			char dqchar	= *(sp - 1);

			if (ISSENNAOPSCHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just before Senna
				 * operator.
				 */
				*ep++ = dqchar;
				*ep++ = *sp;
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Escape the double quote if no Senna operator is next to it.
				 */
				*ep++ = '\\';
				*ep++ = dqchar;

				/*
				 * The current character is itself a double quote: leave it
				 * pending, it is emitted on the next iteration (or after
				 * the loop if it is the last character).
				 */
				if (ISDOUBLEQUOTECHAR(sp))
					in_doublequote = true;
				else
				{
					if (ISBACKSLASHCHAR(sp))
						*ep++ = '\\';
					memcpy(ep, sp, mblen);
					ep += mblen;
					in_doublequote = false;
				}
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just after Senna
				 * operator.
				 */
				if (in_sennaops)
					*ep++ = *sp;
				else
					in_doublequote = true;
			}
			else
			{
				if (ISBACKSLASHCHAR(sp))
					*ep++ = '\\';
				/*
				 * We don't check ISSENNAOPSCHAR() here. We handle Senna
				 * operator character as a character itself instead of
				 * an operator if it doesn't follow a double quote.
				 */
				memcpy(ep, sp, mblen);
				ep += mblen;
			}

			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/* Add the trailing double quote into the buffer, unescaped */
	if (in_doublequote)
		*ep++ = *(sp - 1);

	*s = escaped;
	*slen = ep - *s;

#ifdef PGS2_DEBUG
	if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_ON)
	{
		char	*tmp = pnstrdup(*s, *slen);

		elog(LOG, "escaped snippet keyword: %s", tmp);
		pfree(tmp);
	}
	else if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_TERSE)
		elog(LOG, "escaped snippet keyword");
#endif

	return true;
}
700
701 static sen_query *
702 GetSennaQuery(char *str, size_t len)
703 {
704         static sen_query        *query_cache = NULL;
705         static char                     *key_cache = NULL;
706         static size_t           len_cache = 0;
707         static bool                     guc_cache = false;
708         sen_query       *query;
709         sen_encoding    encoding;
710         char            *key;
711         size_t          key_len;
712         int                     rest;
713         bool            needfree = false;
714
715         /*
716          * Return the cached Senna query if the same keyword has
717          * been used the last time.
718          */
719         if (key_cache != NULL &&
720                 len == len_cache &&
721                 strncmp(key_cache, str, len) == 0 &&
722                 escape_snippet_keyword == guc_cache)
723         {
724 #ifdef PGS2_DEBUG
725                 if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_ON)
726                 {
727                         char    *tmp = pnstrdup(str, len);
728
729                         elog(LOG, "GetSennaQuery(): quick exit: %s", tmp);
730                         pfree(tmp);
731                 }
732                 else if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_TERSE)
733                                 elog(LOG, "GetSennaQuery(): quick exit");
734 #endif
735                 return query_cache;
736         }
737
738         encoding = GetSennaEncoding();
739
740         key = malloc(len);
741         if (key == NULL)
742                 ereport(ERROR,
743                                 (errcode(ERRCODE_OUT_OF_MEMORY),
744                                  errmsg("out of memory")));
745
746         /*
747          * We always cache the unescaped keyword. Which enables us
748          * to check whether we can use the cached Senna query before
749          * escaping the keyword.
750          */
751         memcpy(key, str, len);
752         key_len = len;
753
754         /*
755          * If the keyword has been escaped, 'str' points to the
756          * newly-palloc'd space storing the escaped keyword. This
757          * space needs to be freed later.
758          */
759         if (escape_snippet_keyword)
760                 needfree = EscapeSnippetKeyword(&str, &len);
761
762         query = sen_query_open(str, len, sen_sel_or, SEN_MAX_N_EXPRS,
763                                                    encoding);
764         if (query == NULL)
765         {
766                 free(key);
767                 ereport(ERROR,
768                                 (errmsg("sen_query_open() failed")));
769         }
770
771         if ((rest = sen_query_rest(query, NULL)) != 0)
772                 ereport(WARNING,
773                                 (errmsg("too many expressions (%d)", rest)));
774
775         if (query_cache != NULL)
776         {
777                 sen_query_close(query_cache);
778                 free(key_cache);
779         }
780
781         key_cache = key;
782         len_cache = key_len;
783         query_cache = query;
784         guc_cache = escape_snippet_keyword;
785
786         if (needfree)
787                 pfree(str);
788
789         return query;
790 }
791
792 Datum
793 pgs2snippet1(PG_FUNCTION_ARGS)
794 {
795         int                     flags = PG_GETARG_INT32(0);
796         uint32          width = PG_GETARG_UINT32(1);
797         uint32          max_results = PG_GETARG_UINT32(2);
798         text       *opentags = PG_GETARG_TEXT_P(3);
799         text       *closetags = PG_GETARG_TEXT_P(4);
800         int                     mapping = PG_GETARG_INT32(5);
801         text       *keywords = PG_GETARG_TEXT_P(6);
802         text       *document = PG_GETARG_TEXT_P(7);
803         sen_query  *query;
804         sen_snip   *snip = NULL;
805         const char *opentags_str = VARDATA_ANY(opentags);
806         const char *closetags_str = VARDATA_ANY(closetags);
807         char       *keywords_str = VARDATA_ANY(keywords);
808         char       *document_str = VARDATA_ANY(document);
809         uint32          opentags_len = VARSIZE_ANY_EXHDR(opentags);
810         uint32          closetags_len = VARSIZE_ANY_EXHDR(closetags);
811         uint32          keywords_len = VARSIZE_ANY_EXHDR(keywords);
812         uint32          document_len = VARSIZE_ANY_EXHDR(document);
813         uint32          nresults = 0;
814         uint32          max_tagged_len = 0;
815         sen_rc          rc;
816         text       *result;
817         uint32          result_len = 0;
818         bool            return_null = false;
819
820         query = GetSennaQuery(keywords_str, keywords_len);
821
822         snip = sen_query_snip(query, flags, width, max_results, 1,
823                                                   &opentags_str, &opentags_len,
824                                                   &closetags_str, &closetags_len,
825                                                   mapping == 0 ? NULL : (sen_snip_mapping *)-1);
826         if (snip == NULL)
827                 ereport(ERROR,
828                                 (errmsg("sen_query_snip() failed")));
829
830         PG_TRY();
831         {
832                 rc = sen_snip_exec(snip, document_str, document_len,
833                                                    &nresults, &max_tagged_len);
834                 if (rc != sen_success)
835                         ereport(ERROR,
836                                         (errmsg("sen_snip_exec() failed: %d", rc)));
837
838                 result = (text *) palloc(max_tagged_len + VARHDRSZ);
839
840                 rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
841                 if (rc == sen_invalid_argument)
842                         return_null = true;
843                 else if (rc != sen_success)
844                         ereport(ERROR,
845                                         (errmsg("sen_snip_get_result() failed: %d", rc)));
846         }
847         PG_CATCH();
848         {
849                 sen_snip_close(snip);
850                 PG_RE_THROW();
851         }
852         PG_END_TRY();
853
854         sen_snip_close(snip);
855
856         if (return_null)
857                 PG_RETURN_NULL();
858
859         SET_VARSIZE(result, max_tagged_len + VARHDRSZ);
860
861         PG_RETURN_TEXT_P(result);
862 }
863
864 /*
865  * Make sure there is enough space for 'needed' more bytes.
866  *
867  * Sets **buf to the allocated space which can store the needed bytes if OK,
868  * NULL if failed to enlarge the space because 'needed' is larger than 'maxlen'.
869  */
870 static inline void
871 pgs2malloc(void **buf, long *buflen, long needed, long maxlen)
872 {
873 #ifdef PGS2_DEBUG
874         if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_ON)
875                 elog(LOG, "pgs2malloc(): buflen %ld, needed %ld, maxlen %ld",
876                          *buflen, needed, maxlen);
877         else if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_TERSE)
878                 elog(LOG, "pgs2malloc()");
879 #endif
880
881         if (*buf != NULL && *buflen >= needed && (*buflen <= maxlen || maxlen == 0))
882                 return;         /* got enough space already */
883
884         /*
885          * Release the already-allocated space since it's too small to
886          * store the needed bytes or larger than the upper limit.
887          */
888         if (*buf != NULL)
889         {
890                 free(*buf);
891                 *buf = NULL;
892                 *buflen = 0;
893         }
894
895         /*
896          * Don't allocate any space if the needed space is larger than
897          * the upper limit.
898          */
899         if (needed > maxlen && maxlen != 0)
900                 return;
901
902         /*
903          * Allocate the space for the needed bytes.
904          *
905          * We don't want to allocate just a little more space with each enlarge;
906          * for efficiency, double the buffer size each time it overflows.
907          * Actually, we might need to more than double it if 'needed' is big...
908          *
909          * We check whether '*buflen' overflows each cycle to avoid infinite loop.
910          */
911         *buflen = 1024L;
912         while (*buflen < needed && *buflen != 0)
913                 *buflen <<= 1;
914
915         /*
916          * Clamp to maxlen in case we went past it.  Note we are assuming
917          * here that maxlen <= LONG_MAX/2, else the above loop could
918          * overflow.  We will still have *buflen >= needed.
919          */
920         if (*buflen > maxlen && maxlen != 0)
921                 *buflen = maxlen;
922
923         /* Guard against out-of-range '*buflen' value */
924         if (*buflen == 0)
925                 ereport(ERROR,
926                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
927                                  errmsg("out of memory"),
928                                  errdetail("Cannot enlarge buffer by %ld more bytes.",
929                                                    needed)));
930
931         *buf = (void *) malloc(*buflen);
932         if (*buf == NULL)
933                 ereport(ERROR,
934                                 (errcode(ERRCODE_OUT_OF_MEMORY),
935                                  errmsg("out of memory")));
936 }
937
938 Datum
939 pgs2norm(PG_FUNCTION_ARGS)
940 {
941         text            *str = PG_GETARG_TEXT_PP(0);
942         char            *s = VARDATA_ANY(str);
943         long            slen = VARSIZE_ANY_EXHDR(str);
944         text            *result = NULL;
945         long            buflen;
946         long            reslen;
947         long            maxlen;
948         long            needed;
949
950         /*
951          * norm_cache is the cache memory storing both input and normalized strings
952          * as the result of pgs2norm(). norm_cache_size is the size of norm_cache
953          * and its upper limit is specified by norm_cache_limit parameter. norm_result
954          * is the pointer to the normalized string with the verlena header (i.e.,
955          * text type) stored in the latter half of the cache. norm_reslen is the size
956          * of norm_result. norm_slen is the size of the input string which is stored
957          * in the first half of the cache.
958          */
959         static char             *norm_cache = NULL;
960         static long             norm_cache_size = 0;
961         static long             norm_slen = 0;
962         static char             *norm_result = NULL;
963         static long             norm_reslen = 0;
964
965         /*
966          * Return the cached normalization result if the same string of
967          * the given one has been normalized the last time.
968          */
969         if (norm_cache != NULL &&
970                 norm_slen == slen &&
971                 strncmp(norm_cache, s, slen) == 0)
972         {
973 #ifdef PGS2_DEBUG
974                 if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_ON)
975                 {
976                         char    *tmp = text_to_cstring(str);
977
978                         elog(LOG, "pgs2norm(): quick exit: %s", tmp);
979                         pfree(tmp);
980                 }
981                 else if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_TERSE)
982                                 elog(LOG, "pgs2norm(): quick exit");
983 #endif
984
985                 result = (text *) palloc(norm_reslen);
986                 memcpy(result, norm_result, norm_reslen);
987                 PG_RETURN_TEXT_P(result);
988         }
989
990         /* Confirm that database encoding is UTF-8 */
991         GetSennaEncoding();
992
993         /*
994          * Allocate the result buffer to store the normalized string. Since the size of
995          * normalized string can be larger than that of input one, the result buffer needs
996          * extra space. Problem is that, before calling sen_str_normalize, we need to
997          * allocate the result buffer but cannot know how large extra space is required.
998          * So we use RESULT_EXTRA_SIZE as the estimated size of extra space here.
999          */
1000 #define RESULT_EXTRA_SIZE       64
1001         buflen = slen + RESULT_EXTRA_SIZE;
1002
1003 retry:
1004         result = (text *) palloc(buflen + VARHDRSZ);
1005
1006 #if defined(FAST_SENNA)
1007         reslen = fast_sen_str_normalize(s, slen, VARDATA(result), buflen);
1008 #else
1009         reslen = sen_str_normalize(s, slen, sen_enc_utf8,
1010                                                            SEN_NORMALIZE_FLAGS,
1011                                                            VARDATA(result), buflen);
1012 #endif
1013
1014         if (reslen < 0)
1015                 ereport(ERROR,
1016                                 (errmsg("could not normalize the string")));
1017
1018         /*
1019          * If the result buffer size is too short to store the normalized string,
1020          * we enlarge the buffer and retry the string normalization.
1021          */
1022         if (buflen <= reslen)
1023         {
1024                 pfree(result);
1025                 buflen = reslen + 1;
1026                 goto retry;
1027         }
1028
1029         SET_VARSIZE(result, reslen + VARHDRSZ);
1030
1031         /*
1032          * Cache both input and normalized strings to accelerate the subsequent
1033          * calls of pgs2norm() with the same input string. But we don't do that
1034          * if the maximum allowed size of the cache is too small to store them.
1035          */
1036         needed = slen + reslen + VARHDRSZ;
1037         maxlen = ((norm_cache_limit >= 0) ? norm_cache_limit : work_mem) * 1024L;
1038
1039         pgs2malloc((void **) &norm_cache, &norm_cache_size, needed, maxlen);
1040         if (norm_cache != NULL)
1041         {
1042                 /* Store the input string into the first half of the cache */
1043                 norm_slen = slen;
1044                 memcpy(norm_cache, s, slen);
1045
1046                 /*
1047                  * Store the normalized string with the varlena header (i.e., text type)
1048                  * into the latter half of the cache.
1049                  */
1050                 norm_result = norm_cache + slen;
1051                 norm_reslen = reslen + VARHDRSZ;
1052                 memcpy(norm_result, result, norm_reslen);
1053         }
1054
1055 #ifdef PGS2_DEBUG
1056         if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_ON)
1057         {
1058                 char    *tmp = text_to_cstring(str);
1059
1060                 elog(LOG, "pgs2norm(): complete (%s result cache): %s",
1061                          (norm_cache == NULL) ? "unset" : "set", tmp);
1062                 pfree(tmp);
1063         }
1064         else if (pgs2_enable_debug == PGS2_ENABLE_DEBUG_TERSE)
1065                         elog(LOG, "pgs2norm(): complete");
1066 #endif
1067
1068         PG_RETURN_TEXT_P(result);
1069 }
1070
1071 /*
1072  * Report the version and configure options of Senna which
1073  * ludia_funcs depends on.
1074  */
1075 Datum
1076 pgs2seninfo(PG_FUNCTION_ARGS)
1077 {
1078         char    *version[MAXPGPATH];
1079         char    *coptions[MAXPGPATH];
1080         Datum   values[2];
1081         bool    isnull[2];
1082         HeapTuple tuple;
1083         TupleDesc tupdesc;
1084
1085         /*
1086          * Get the version and configure options of Senna. Ignore the
1087          * return value of sen_info() because it always returns a success.
1088          */
1089         sen_info((char **)&version, (char **)&coptions, NULL, NULL, NULL, NULL);
1090
1091         /*
1092          * Construct a tuple descriptor for the result row. This must
1093          * match this function's ludia_funcs--x.x.sql entry.
1094          */
1095 #if PG_VERSION_NUM >= 120000
1096         tupdesc = CreateTemplateTupleDesc(2);
1097 #else
1098         tupdesc = CreateTemplateTupleDesc(2, false);
1099 #endif
1100         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
1101                                            "version", TEXTOID, -1, 0);
1102         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
1103                                            "configure_options", TEXTOID, -1, 0);
1104         tupdesc = BlessTupleDesc(tupdesc);
1105
1106         /* version */
1107         values[0] = CStringGetTextDatum(*version);
1108         isnull[0] = false;
1109
1110         /* configure option */
1111         values[1] = CStringGetTextDatum(*coptions);
1112         isnull[1] = false;
1113
1114         tuple = heap_form_tuple(tupdesc, values, isnull);
1115         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
1116 }