OSDN Git Service

Update PGS2_LAST_UPDATE.
[ludiafuncs/ludia_funcs.git] / ludia_funcs.c
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2006-2015, NTT DATA Corporation
4  * All rights reserved.
5  *
6  * Changelog:
7  *   2013/01/09
8  *   Update Ludia functions so that they are available with PostgreSQL9.1.
9  *   Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <limits.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19
20 #include "catalog/pg_type.h"
21 #include "fmgr.h"
22 #include "funcapi.h"
23 #include "ludia_funcs.h"
24 #include "mb/pg_wchar.h"
25 #include "senna.h"
26 #include "storage/fd.h"
27 #include "utils/builtins.h"
28 #include "utils/guc.h"
29 #include "miscadmin.h"
30
31 #if PG_VERSION_NUM >= 90300
32 #include "access/htup_details.h"
33 #endif
34
35 PG_MODULE_MAGIC;
36
37 /* Last update date of ludia_funcs */
38 #define PGS2_LAST_UPDATE        "2015.09.10"
39
40 /* GUC variables */
41 #ifdef PGS2_DEBUG
42 static bool     pgs2_enable_debug = false;
43 #endif
44 static char     *pgs2_last_update = NULL;
45 static int      norm_cache_limit = -1;
46 static bool     escape_snippet_keyword = false;
47
48 #define SEN_NORMALIZE_FLAGS 0
49 #define SEN_MAX_N_EXPRS         32
50
51 /* upper limit for GUC variables measured in kilobytes of memory */
52 /* note that various places assume the byte size fits in a "long" variable */
53 #if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4
54 #define MAX_KILOBYTES   INT_MAX
55 #else
56 #define MAX_KILOBYTES   (INT_MAX / 1024)
57 #endif
58
/*
 * Single-byte character tests.  Each macro takes a pointer and examines
 * only the byte it points to.
 */
59 #define ISBACKSLASHCHAR(x) (*(x) == '\\')
60 #define ISDOUBLEQUOTECHAR(x) (*(x) == '"')
/* True for '+', '-' and ' ', which EscapeSnippetKeyword() treats as Senna search operators */
61 #define ISSENNAOPSCHAR(x) (*(x) == '+' || *(x) == '-' || *(x) == ' ')
62
63 PG_FUNCTION_INFO_V1(pgs2snippet1);
64 Datum   pgs2snippet1(PG_FUNCTION_ARGS);
65 PG_FUNCTION_INFO_V1(pgs2norm);
66 Datum   pgs2norm(PG_FUNCTION_ARGS);
67 PG_FUNCTION_INFO_V1(pgs2textporter1);
68 Datum   pgs2textporter1(PG_FUNCTION_ARGS);
69 PG_FUNCTION_INFO_V1(pgs2seninfo);
70 Datum   pgs2seninfo(PG_FUNCTION_ARGS);
71
72 static sen_encoding     GetSennaEncoding(void);
73 static sen_query        *GetSennaQuery(char *str, size_t len);
74 static bool                     EscapeSnippetKeyword(char **s, size_t *slen);
75
76 #ifdef TEXTPORTER
77 #define TEXTPORTER_TMPDIR                       "/tmp"
78 #define TEXTPORTER_MKSTEMP_UMASK                0177
79 #define TEXTPORTER_GROUPNAME            "UTF-8"
80 #define TEXTPORTER_DEFLANGNAME          "Japanese"
81 #define TEXTPORTER_BBIGENDIAN           1
82 #define TEXTPORTER_OPTION                       0x00000020      /* DMC_GETTEXT_OPT_LF */
83 #define TEXTPORTER_OPTION_STRING        "32"
84 #define TEXTPORTER_OPTION1                      0x00010000      /* DMC_GETTEXT_OPT1_TXCONV */
85 #define TEXTPORTER_SIZE                         0
86 #define TEXTPORTER_CSV_C                        0
87
88
89 /* GUC variables for pgs2textporter1 */
90 static int      textporter_error = ERROR;
91 static unsigned int     textporter_option = TEXTPORTER_OPTION;
92
93 /*
94  * This variable is a dummy that doesn't do anything, except in some
95  * cases provides the value for SHOW to display.  The real state is
96  * elsewhere and is kept in sync by assign_hooks.
97  */
98 static char     *textporter_option_string;
99
100 static const struct config_enum_entry textporter_error_options[] = {
101         {"debug1", DEBUG1, false},
102         {"log", LOG, false},
103         {"info", INFO, false},
104         {"notice", NOTICE, false},
105         {"warning", WARNING, false},
106         {"error", ERROR, false},
107         {NULL, 0, false}
108 };
109
110 static void CleanupTextPorterTmpFiles(void);
111
112 static bool check_textporter_option(char **newval, void **extra, GucSource source);
113 static void assign_textporter_option(const char *newval, void *extra);
114 #endif  /* TEXTPORTER */
115
116 void    _PG_init(void);
117 void    _PG_fini(void);
118
119 void
120 _PG_init(void)
121 {
122         sen_rc          rc;
123
124 #ifdef PGS2_DEBUG
125         /* Define custom GUC variable for debugging */
126         DefineCustomBoolVariable("ludia_funcs.enable_debug",
127                                                          "Emit ludia_funcs debugging output.",
128                                                          NULL,
129                                                          &pgs2_enable_debug,
130                                                          false,
131                                                          PGC_USERSET,
132                                                          0,
133                                                          NULL,
134                                                          NULL,
135                                                          NULL);
136 #endif
137
138         /* Can't be set in postgresql.conf */
139         DefineCustomStringVariable("ludia_funcs.last_update",
140                                                            "Shows the last update date of ludia_funcs.",
141                                                            NULL,
142                                                            &pgs2_last_update,
143                                                            PGS2_LAST_UPDATE,
144                                                            PGC_INTERNAL,
145                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
146                                                            NULL,
147                                                            NULL,
148                                                            NULL);
149
150 #ifdef TEXTPORTER
151         /* Define custom GUC variables */
152         DefineCustomEnumVariable("ludia_funcs.textporter_error",
153                                                          "Sets the message levels that are emitted "
154                                                          "when textporter fails.",
155                                                          NULL,
156                                                          &textporter_error,
157                                                          ERROR,
158                                                          textporter_error_options,
159                                                          PGC_SUSET,
160                                                          0,
161                                                          NULL,
162                                                          NULL,
163                                                          NULL);
164
165         DefineCustomStringVariable("ludia_funcs.textporter_option",
166                                                            "Sets the option used to get text data "
167                                                            "from TextPorter",
168                                                            NULL,
169                                                            &textporter_option_string,
170                                                            TEXTPORTER_OPTION_STRING,
171                                                            PGC_SUSET,
172                                                            0,
173                                                            check_textporter_option,
174                                                            assign_textporter_option,
175                                                            NULL);
176
177         /* Clean up remaining textporter temporary files */
178         CleanupTextPorterTmpFiles();
179 #endif  /* TEXTPORTER */
180
181         /*
182          * A value of 0 means no limit on the cache size. A value of -1 means
183          * that work_mem is used as the upper size limit of the cache.
184          */
185         DefineCustomIntVariable("ludia_funcs.norm_cache_limit",
186                                                         "Sets the maximum memory to be used for caching "
187                                                         "the result of pgs2norm()",
188                                                         NULL,
189                                                         &norm_cache_limit,
190                                                         -1,
191                                                         -1,
192                                                         MAX_KILOBYTES,
193                                                         PGC_USERSET,
194                                                         GUC_UNIT_KB,
195                                                         NULL,
196                                                         NULL,
197                                                         NULL);
198
199         DefineCustomBoolVariable("ludia_funcs.escape_snippet_keyword",
200                                                          "Escapes snippet keyword string.",
201                                                          NULL,
202                                                          &escape_snippet_keyword,
203                                                          false,
204                                                          PGC_USERSET,
205                                                          0,
206                                                          NULL,
207                                                          NULL,
208                                                          NULL);
209
210         EmitWarningsOnPlaceholders("ludia_funcs");
211
212         /* Initialize Senna */
213         rc = sen_init();
214         if (rc != sen_success)
215                 ereport(ERROR,
216                                 (errmsg("sen_init() failed: %d", rc)));
217 }
218
/*
 * Module unload callback.  Currently a no-op.
 */
void
_PG_fini(void)
{
	/* nothing to do */
}
223
224 #ifdef TEXTPORTER
/*
 * Unlink the file at the given path.  A failure is reported as a WARNING
 * rather than an error.  Note that the argument is evaluated more than
 * once.
 */
#define REMOVE_TMPFILE(tmpfile_path) \
	do { \
		if (unlink(tmpfile_path) != 0) \
			ereport(WARNING, \
					(errcode_for_file_access(), \
					 errmsg("could not remove temporary file \"%s\": %m", \
							tmpfile_path))); \
	} while (0)
232
233 Datum
234 pgs2textporter1(PG_FUNCTION_ARGS)
235 {
236         char    *appfile = text_to_cstring(PG_GETARG_TEXT_P(0));
237         char    txtfile[] = TEXTPORTER_TMPDIR "/ludia_funcs_XXXXXX";
238         int             tmpfd;
239         int             ret;
240         FILE    *fp;
241         text    *result = NULL;
242         struct stat     statbuf;
243         bool    return_null = false;
244         mode_t  oumask;
245
246         /* Confirm that database encoding is UTF-8 */
247         GetSennaEncoding();
248
249         PG_TRY();
250         {
251                 /*
252                  * Generate a unique temporary filename where text data gotten
253                  * from application file by TextPorter is stored temporarily.
254                  * Set the permission of a temporary file to 0600 to ensure that
255                  * only the owner of PostgreSQL server can read and write the file.
256                  */
257                 oumask = umask(TEXTPORTER_MKSTEMP_UMASK);
258                 tmpfd = mkstemp(txtfile);
259                 umask(oumask);
260
261                 if (tmpfd < 0)
262                         ereport(ERROR,
263                                         (errcode_for_file_access(),
264                                          errmsg("could not generate a unique temporary filename: %m")));
265                 if (close(tmpfd) != 0)
266                         ereport(ERROR,
267                                         (errcode_for_file_access(),
268                                          errmsg("could not close temporary file \"%s\": %m", txtfile)));
269
270                 /*
271                  * Run TextPorter to read text data from application file (appfile)
272                  * to temporary file (txtfile).
273                  */
274                 ret = ExecTextPorter((unsigned char *)appfile,
275                                                          (unsigned char *)txtfile,
276                                                          (unsigned char *)TEXTPORTER_GROUPNAME,
277                                                          (unsigned char *)TEXTPORTER_DEFLANGNAME,
278                                                          TEXTPORTER_BBIGENDIAN, textporter_option,
279                                                          TEXTPORTER_OPTION1, TEXTPORTER_SIZE,
280                                                          TEXTPORTER_CSV_C);
281                 if (ret != 0)
282                 {
283                         ereport(textporter_error,
284                                         (errmsg("could not get text from application file \"%s\"",
285                                                         appfile),
286                                          errdetail("DMC_GetText_V5() failed with errcode %d",
287                                                            ret)));
288
289                         /* Return NULL if textporter_error is set to other than ERROR */
290                         return_null = true;
291                 }
292                 else
293                 {
294                         /* Read text data from temporary file to memory */
295                         if (stat(txtfile, &statbuf))
296                                 ereport(ERROR,
297                                                 (errcode_for_file_access(),
298                                                  errmsg("could not stat file \"%s\": %m", txtfile)));
299                         result = (text *) palloc(statbuf.st_size + VARHDRSZ);
300
301                         fp = AllocateFile(txtfile, "r");
302                         if (fp == NULL)
303                                 ereport(ERROR,
304                                                 (errcode_for_file_access(),
305                                                  errmsg("could not open file \"%s\": %m", txtfile)));
306
307                         if (fread(VARDATA(result), 1, statbuf.st_size, fp) != statbuf.st_size ||
308                                 ferror(fp))
309                                 ereport(ERROR,
310                                                 (errcode_for_file_access(),
311                                                  errmsg("could not read file \"%s\": %m", txtfile)));
312
313                         FreeFile(fp);
314                 }
315
316                 REMOVE_TMPFILE(txtfile);
317                 pfree(appfile);
318         }
319         PG_CATCH();
320         {
321                 REMOVE_TMPFILE(txtfile);
322                 PG_RE_THROW();
323         }
324         PG_END_TRY();
325
326         if (return_null)
327                 PG_RETURN_NULL();
328
329         SET_VARSIZE(result, statbuf.st_size + VARHDRSZ);
330
331         PG_RETURN_TEXT_P(result);
332 }
333
334 /*
335  * Clean up remaining textporter temporary files
336  */
337 static void
338 CleanupTextPorterTmpFiles(void)
339 {
340         DIR                             *tpdir;
341         struct dirent   *tpde;
342         char                    path[MAXPGPATH];
343
344         tpdir = AllocateDir(TEXTPORTER_TMPDIR);
345         if (tpdir == NULL)
346                 ereport(ERROR,
347                                 (errcode_for_file_access(),
348                                  errmsg("could not open textporter temporary file directory \"%s\": %m",
349                                                 TEXTPORTER_TMPDIR)));
350
351         while ((tpde = ReadDir(tpdir, TEXTPORTER_TMPDIR)) != NULL)
352         {
353                 if (strlen(tpde->d_name) == 18 &&
354                         strncmp(tpde->d_name, "ludia_funcs_", 12) == 0)
355                 {
356                         snprintf(path, MAXPGPATH, TEXTPORTER_TMPDIR "/%s", tpde->d_name);
357                         REMOVE_TMPFILE(path);
358                 }
359         }
360
361         FreeDir(tpdir);
362 }
363
364 static bool
365 check_textporter_option(char **newval, void **extra, GucSource source)
366 {
367         unsigned long   val;
368         char                    *endptr;
369         unsigned int    *myextra;
370
371         errno = 0;
372         val = strtoul(*newval, &endptr, 0);
373
374         if (*endptr != '\0')
375                 return false;
376
377         if (errno == ERANGE || val != (unsigned long) ((unsigned int) val))
378         {
379                 GUC_check_errhint("Value exceeds unsigned integer range.");
380                 return false;
381         }
382
383         /* Set up the "extra" struct actually used by assign_textporter_option */
384         myextra = (unsigned int *) malloc(sizeof(unsigned int));
385         if (myextra == NULL)
386         {
387                 GUC_check_errcode(ERRCODE_OUT_OF_MEMORY);
388                 GUC_check_errmsg("out of memory");
389                 return false;
390         }
391         *myextra = (unsigned int) val;
392         *extra = (void *) myextra;
393
394         return true;
395 }
396
397 static void
398 assign_textporter_option(const char *newval, void *extra)
399 {
400         textporter_option = *((unsigned int *) extra);
401 }
402
403 #else   /* TEXTPORTER */
404
405 Datum
406 pgs2textporter1(PG_FUNCTION_ARGS)
407 {
408         PG_RETURN_NULL();
409 }
410
411 #endif  /* TEXTPORTER */
412
413 static sen_encoding
414 GetSennaEncoding(void)
415 {
416         static sen_encoding             encoding = sen_enc_default;
417
418         if (encoding == sen_enc_default)
419         {
420                 if (GetDatabaseEncoding() == PG_UTF8)
421                         encoding = sen_enc_utf8;
422                 else
423                         ereport(ERROR,
424                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
425                                          errmsg("does not support database encoding \"%s\"",
426                                                         GetDatabaseEncodingName())));
427         }
428         return encoding;
429 }
430
/*
 * Escape the backslash and double quote characters in the given string.
 *
 * On entry *s points to the keyword string (not necessarily
 * NUL-terminated) and *slen holds its length in bytes.
 *
 * Return false if the given string has no character which needs to be
 * escaped; *s and *slen are left untouched.  Otherwise, return true. In
 * this case, *s points to the palloc'd space storing the escaped keyword
 * string and *slen is set to the size of that string. The caller needs to
 * free the palloc'd space.
 *
 * Escaping rules implemented here:
 *   - a backslash is always escaped;
 *   - a double quote is escaped unless it is the first or last character
 *     of the keyword, or a Senna operator character ('+', '-' or space)
 *     is directly to its left or right.
 */
static bool
EscapeSnippetKeyword(char **s, size_t *slen)
{
	const char *sp;
	char	   *ep;
	char	   *escaped;
	int			chlen;			/* byte length of the current character */
	int			copylen;
	bool		in_doublequote = false;
	bool		in_sennaops = false;
	bool		need_escape = false;

	/*
	 * An empty keyword has nothing to escape.  Bail out before looking at
	 * the first byte, which does not exist in that case.
	 */
	if (*slen == 0)
		return false;

	/*
	 * Skip the heading double quote character because it never needs to be
	 * interpreted as a character itself and be escaped.
	 * Note that we must not skip the heading character if it's not a
	 * double quote.
	 */
	sp = *s;
	if (ISDOUBLEQUOTECHAR(sp))
		sp++;

	/*
	 * Check whether the snippet keyword string has a character which
	 * needs to be escaped.
	 */
	while ((sp - *s) < *slen)
	{
		chlen = pg_mblen(sp);

		/*
		 * Backslash in the keyword always needs to be escaped.
		 */
		if (ISBACKSLASHCHAR(sp))
		{
			need_escape = true;
			break;
		}

		if (in_doublequote)
		{
			if (ISSENNAOPSCHAR(sp))
			{
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Double quote in the keyword needs to be escaped if
				 * there is no Senna search operator to either its right
				 * or its left.
				 */
				need_escape = true;
				break;
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp) && !in_sennaops)
				in_doublequote = true;
			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += chlen;
	}

	/*
	 * Quick exit if the keyword has no character which needs to be
	 * escaped.
	 */
	if (!need_escape)
		return false;

	/*
	 * Allocate the buffer space to store the escaped snippet keyword string.
	 * The maximum size of escaped string is double the input keyword size.
	 * The size reaches the maximum when every character in the input keyword
	 * needs to be escaped.
	 */
	ep = escaped = (char *) palloc(*slen * 2);

	/*
	 * Copy the characters which have been passed through in the above loop
	 * and don't need to be escaped, into the buffer. If in_doublequote is
	 * true, we don't copy the double quote in the previous position into the
	 * buffer because it might still need to be escaped.
	 */
	copylen = sp - *s - ((in_doublequote) ? 1 : 0);
	memcpy(ep, *s, copylen);
	ep += copylen;

	/*
	 * Construct the escaped snippet keyword string, resuming at the
	 * character where the scan above stopped.
	 */
	while ((sp - *s) < *slen)
	{
		chlen = pg_mblen(sp);

		if (in_doublequote)
		{
			/*
			 * dqchar indicates the previous character, that is a double
			 * quote. We assume here that a double quote is single-byte
			 * character.
			 */
			char		dqchar = *(sp - 1);

			if (ISSENNAOPSCHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just before Senna
				 * operator.
				 */
				*ep++ = dqchar;
				*ep++ = *sp;
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Escape the double quote if no Senna operator is next to it.
				 */
				*ep++ = '\\';
				*ep++ = dqchar;

				if (ISDOUBLEQUOTECHAR(sp))
					in_doublequote = true;
				else
				{
					if (ISBACKSLASHCHAR(sp))
						*ep++ = '\\';
					memcpy(ep, sp, chlen);
					ep += chlen;
					in_doublequote = false;
				}
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just after Senna
				 * operator.
				 */
				if (in_sennaops)
					*ep++ = *sp;
				else
					in_doublequote = true;
			}
			else
			{
				if (ISBACKSLASHCHAR(sp))
					*ep++ = '\\';

				/*
				 * We don't check ISSENNAOPSCHAR() here. We handle Senna
				 * operator character as a character itself instead of
				 * an operator if it doesn't follow a double quote.
				 */
				memcpy(ep, sp, chlen);
				ep += chlen;
			}

			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += chlen;
	}

	/* Add the trailing double quote into the buffer, unescaped */
	if (in_doublequote)
		*ep++ = *(sp - 1);

	*s = escaped;
	*slen = ep - *s;

#ifdef PGS2_DEBUG
	if (pgs2_enable_debug)
	{
		char	   *tmp = pnstrdup(*s, *slen);

		elog(LOG, "escaped snippet keyword: %s", tmp);
		pfree(tmp);
	}
#endif

	return true;
}
631
632 static sen_query *
633 GetSennaQuery(char *str, size_t len)
634 {
635         static sen_query        *query_cache = NULL;
636         static char                     *key_cache = NULL;
637         static size_t           len_cache = 0;
638         static bool                     guc_cache = false;
639         sen_query       *query;
640         sen_encoding    encoding;
641         char            *key;
642         size_t          key_len;
643         int                     rest;
644         bool            needfree = false;
645
646         /*
647          * Return the cached Senna query if the same keyword has
648          * been used the last time.
649          */
650         if (key_cache != NULL &&
651                 len == len_cache &&
652                 strncmp(key_cache, str, len) == 0 &&
653                 escape_snippet_keyword == guc_cache)
654         {
655 #ifdef PGS2_DEBUG
656                 if (pgs2_enable_debug)
657                 {
658                         char    *tmp = pnstrdup(str, len);
659
660                         elog(LOG, "GetSennaQuery(): quick exit: %s", tmp);
661                         pfree(tmp);
662                 }
663 #endif
664                 return query_cache;
665         }
666
667         encoding = GetSennaEncoding();
668
669         key = malloc(len);
670         if (key == NULL)
671                 ereport(ERROR,
672                                 (errcode(ERRCODE_OUT_OF_MEMORY),
673                                  errmsg("out of memory")));
674
675         /*
676          * We always cache the unescaped keyword. Which enables us
677          * to check whether we can use the cached Senna query before
678          * escaping the keyword.
679          */
680         memcpy(key, str, len);
681         key_len = len;
682
683         /*
684          * If the keyword has been escaped, 'str' points to the
685          * newly-palloc'd space storing the escaped keyword. This
686          * space needs to be freed later.
687          */
688         if (escape_snippet_keyword)
689                 needfree = EscapeSnippetKeyword(&str, &len);
690
691         query = sen_query_open(str, len, sen_sel_or, SEN_MAX_N_EXPRS,
692                                                    encoding);
693         if (query == NULL)
694         {
695                 free(key);
696                 ereport(ERROR,
697                                 (errmsg("sen_query_open() failed")));
698         }
699
700         if ((rest = sen_query_rest(query, NULL)) != 0)
701                 ereport(WARNING,
702                                 (errmsg("too many expressions (%d)", rest)));
703
704         if (query_cache != NULL)
705         {
706                 sen_query_close(query_cache);
707                 free(key_cache);
708         }
709
710         key_cache = key;
711         len_cache = key_len;
712         query_cache = query;
713         guc_cache = escape_snippet_keyword;
714
715         if (needfree)
716                 pfree(str);
717
718         return query;
719 }
720
721 Datum
722 pgs2snippet1(PG_FUNCTION_ARGS)
723 {
724         int                     flags = PG_GETARG_INT32(0);
725         uint32          width = PG_GETARG_UINT32(1);
726         uint32          max_results = PG_GETARG_UINT32(2);
727         text       *opentags = PG_GETARG_TEXT_P(3);
728         text       *closetags = PG_GETARG_TEXT_P(4);
729         int                     mapping = PG_GETARG_INT32(5);
730         text       *keywords = PG_GETARG_TEXT_P(6);
731         text       *document = PG_GETARG_TEXT_P(7);
732         sen_query  *query;
733         sen_snip   *snip = NULL;
734         const char *opentags_str = VARDATA_ANY(opentags);
735         const char *closetags_str = VARDATA_ANY(closetags);
736         char       *keywords_str = VARDATA_ANY(keywords);
737         char       *document_str = VARDATA_ANY(document);
738         uint32          opentags_len = VARSIZE_ANY_EXHDR(opentags);
739         uint32          closetags_len = VARSIZE_ANY_EXHDR(closetags);
740         uint32          keywords_len = VARSIZE_ANY_EXHDR(keywords);
741         uint32          document_len = VARSIZE_ANY_EXHDR(document);
742         uint32          nresults = 0;
743         uint32          max_tagged_len = 0;
744         sen_rc          rc;
745         text       *result;
746         uint32          result_len = 0;
747         bool            return_null = false;
748
749         query = GetSennaQuery(keywords_str, keywords_len);
750
751         snip = sen_query_snip(query, flags, width, max_results, 1,
752                                                   &opentags_str, &opentags_len,
753                                                   &closetags_str, &closetags_len,
754                                                   mapping == 0 ? NULL : (sen_snip_mapping *)-1);
755         if (snip == NULL)
756                 ereport(ERROR,
757                                 (errmsg("sen_query_snip() failed")));
758
759         PG_TRY();
760         {
761                 rc = sen_snip_exec(snip, document_str, document_len,
762                                                    &nresults, &max_tagged_len);
763                 if (rc != sen_success)
764                         ereport(ERROR,
765                                         (errmsg("sen_snip_exec() failed: %d", rc)));
766
767                 result = (text *) palloc(max_tagged_len + VARHDRSZ);
768
769                 rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
770                 if (rc == sen_invalid_argument)
771                         return_null = true;
772                 else if (rc != sen_success)
773                         ereport(ERROR,
774                                         (errmsg("sen_snip_get_result() failed: %d", rc)));
775         }
776         PG_CATCH();
777         {
778                 sen_snip_close(snip);
779                 PG_RE_THROW();
780         }
781         PG_END_TRY();
782
783         sen_snip_close(snip);
784
785         if (return_null)
786                 PG_RETURN_NULL();
787
788         SET_VARSIZE(result, max_tagged_len + VARHDRSZ);
789
790         PG_RETURN_TEXT_P(result);
791 }
792
793 /*
794  * Make sure there is enough space for 'needed' more bytes.
795  *
796  * Sets **buf to the allocated space which can store the needed bytes if OK,
797  * NULL if failed to enlarge the space because 'needed' is larger than 'maxlen'.
798  */
799 static inline void
800 pgs2malloc(void **buf, long *buflen, long needed, long maxlen)
801 {
802 #ifdef PGS2_DEBUG
803         if (pgs2_enable_debug)
804                 elog(LOG, "pgs2malloc(): buflen %ld, needed %ld, maxlen %ld",
805                          *buflen, needed, maxlen);
806 #endif
807
808         if (*buf != NULL && *buflen >= needed && (*buflen <= maxlen || maxlen == 0))
809                 return;         /* got enough space already */
810
811         /*
812          * Release the already-allocated space since it's too small to
813          * store the needed bytes or larger than the upper limit.
814          */
815         if (*buf != NULL)
816         {
817                 free(*buf);
818                 *buf = NULL;
819                 *buflen = 0;
820         }
821
822         /*
823          * Don't allocate any space if the needed space is larger than
824          * the upper limit.
825          */
826         if (needed > maxlen && maxlen != 0)
827                 return;
828
829         /*
830          * Allocate the space for the needed bytes.
831          *
832          * We don't want to allocate just a little more space with each enlarge;
833          * for efficiency, double the buffer size each time it overflows.
834          * Actually, we might need to more than double it if 'needed' is big...
835          *
836          * We check whether '*buflen' overflows each cycle to avoid infinite loop.
837          */
838         *buflen = 1024L;
839         while (*buflen < needed && *buflen != 0)
840                 *buflen <<= 1;
841
842         /*
843          * Clamp to maxlen in case we went past it.  Note we are assuming
844          * here that maxlen <= LONG_MAX/2, else the above loop could
845          * overflow.  We will still have *buflen >= needed.
846          */
847         if (*buflen > maxlen && maxlen != 0)
848                 *buflen = maxlen;
849
850         /* Guard against out-of-range '*buflen' value */
851         if (*buflen == 0)
852                 ereport(ERROR,
853                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
854                                  errmsg("out of memory"),
855                                  errdetail("Cannot enlarge buffer by %ld more bytes.",
856                                                    needed)));
857
858         *buf = (void *) malloc(*buflen);
859         if (*buf == NULL)
860                 ereport(ERROR,
861                                 (errcode(ERRCODE_OUT_OF_MEMORY),
862                                  errmsg("out of memory")));
863 }
864
865 Datum
866 pgs2norm(PG_FUNCTION_ARGS)
867 {
868         text            *str = PG_GETARG_TEXT_PP(0);
869         char            *s = VARDATA_ANY(str);
870         long            slen = VARSIZE_ANY_EXHDR(str);
871         text            *result = NULL;
872         long            buflen;
873         long            reslen;
874         long            maxlen;
875         long            needed;
876
877         /*
878          * norm_cache is the cache memory storing both input and normalized strings
879          * as the result of pgs2norm(). norm_cache_size is the size of norm_cache
880          * and its upper limit is specified by norm_cache_limit parameter. norm_result
881          * is the pointer to the normalized string with the verlena header (i.e.,
882          * text type) stored in the latter half of the cache. norm_reslen is the size
883          * of norm_result. norm_slen is the size of the input string which is stored
884          * in the first half of the cache.
885          */
886         static char             *norm_cache = NULL;
887         static long             norm_cache_size = 0;
888         static long             norm_slen = 0;
889         static char             *norm_result = NULL;
890         static long             norm_reslen = 0;
891
892         /*
893          * Return the cached normalization result if the same string of
894          * the given one has been normalized the last time.
895          */
896         if (norm_cache != NULL &&
897                 norm_slen == slen &&
898                 strncmp(norm_cache, s, slen) == 0)
899         {
900 #ifdef PGS2_DEBUG
901                 if (pgs2_enable_debug)
902                 {
903                         char    *tmp = text_to_cstring(str);
904
905                         elog(LOG, "pgs2norm(): quick exit: %s", tmp);
906                         pfree(tmp);
907                 }
908 #endif
909
910                 PG_RETURN_TEXT_P(pnstrdup(norm_result, norm_reslen));
911         }
912
913         /* Confirm that database encoding is UTF-8 */
914         GetSennaEncoding();
915
916         /*
917          * Allocate the result buffer to store the normalized string. Since the size of
918          * normalized string can be larger than that of input one, the result buffer needs
919          * extra space. Problem is that, before calling sen_str_normalize, we need to
920          * allocate the result buffer but cannot know how large extra space is required.
921          * So we use RESULT_EXTRA_SIZE as the estimated size of extra space here.
922          */
923 #define RESULT_EXTRA_SIZE       64
924         buflen = slen + RESULT_EXTRA_SIZE;
925
926 retry:
927         result = (text *) palloc(buflen + VARHDRSZ);
928
929 #if defined(FAST_SENNA)
930         reslen = fast_sen_str_normalize(s, slen, VARDATA(result), buflen);
931 #else
932         reslen = sen_str_normalize(s, slen, sen_enc_utf8,
933                                                            SEN_NORMALIZE_FLAGS,
934                                                            VARDATA(result), buflen);
935 #endif
936
937         if (reslen < 0)
938                 ereport(ERROR,
939                                 (errmsg("could not normalize the string")));
940
941         /*
942          * If the result buffer size is too short to store the normalized string,
943          * we enlarge the buffer and retry the string normalization.
944          */
945         if (buflen <= reslen)
946         {
947                 pfree(result);
948                 buflen = reslen + 1;
949                 goto retry;
950         }
951
952         SET_VARSIZE(result, reslen + VARHDRSZ);
953
954         /*
955          * Cache both input and normalized strings to accelerate the subsequent
956          * calls of pgs2norm() with the same input string. But we don't do that
957          * if the maximum allowed size of the cache is too small to store them.
958          */
959         needed = slen + reslen + VARHDRSZ;
960         maxlen = ((norm_cache_limit >= 0) ? norm_cache_limit : work_mem) * 1024L;
961
962         pgs2malloc((void **) &norm_cache, &norm_cache_size, needed, maxlen);
963         if (norm_cache != NULL)
964         {
965                 /* Store the input string into the first half of the cache */
966                 norm_slen = slen;
967                 memcpy(norm_cache, s, slen);
968
969                 /*
970                  * Store the normalized string with the varlena header (i.e., text type)
971                  * into the latter half of the cache.
972                  */
973                 norm_result = norm_cache + slen;
974                 norm_reslen = reslen + VARHDRSZ;
975                 memcpy(norm_result, result, norm_reslen);
976         }
977
978 #ifdef PGS2_DEBUG
979         if (pgs2_enable_debug)
980         {
981                 char    *tmp = text_to_cstring(str);
982
983                 elog(LOG, "pgs2norm(): complete (%s result cache): %s",
984                          (norm_cache == NULL) ? "unset" : "set", tmp);
985                 pfree(tmp);
986         }
987 #endif
988
989         PG_RETURN_TEXT_P(result);
990 }
991
992 /*
993  * Report the version and configure options of Senna which
994  * ludia_funcs depends on.
995  */
996 Datum
997 pgs2seninfo(PG_FUNCTION_ARGS)
998 {
999         char    *version[MAXPGPATH];
1000         char    *coptions[MAXPGPATH];
1001         Datum   values[2];
1002         bool    isnull[2];
1003         HeapTuple tuple;
1004         TupleDesc tupdesc;
1005
1006         /*
1007          * Get the version and configure options of Senna. Ignore the
1008          * return value of sen_info() because it always returns a success.
1009          */
1010         sen_info((char **)&version, (char **)&coptions, NULL, NULL, NULL, NULL);
1011
1012         /*
1013          * Construct a tuple descriptor for the result row. This must
1014          * match this function's ludia_funcs--x.x.sql entry.
1015          */
1016         tupdesc = CreateTemplateTupleDesc(2, false);
1017         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
1018                                            "version", TEXTOID, -1, 0);
1019         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
1020                                            "configure_options", TEXTOID, -1, 0);
1021         tupdesc = BlessTupleDesc(tupdesc);
1022
1023         /* version */
1024         values[0] = CStringGetTextDatum(*version);
1025         isnull[0] = false;
1026
1027         /* configure option */
1028         values[1] = CStringGetTextDatum(*coptions);
1029         isnull[1] = false;
1030
1031         tuple = heap_form_tuple(tupdesc, values, isnull);
1032         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
1033 }