OSDN Git Service

Fix corrupted style of source comment.
[ludiafuncs/ludia_funcs.git] / ludia_funcs.c
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2006-2015, NTT DATA Corporation
4  * All rights reserved.
5  *
6  * Changelog:
7  *   2013/01/09
8  *   Update Ludia functions so that they are available with PostgreSQL9.1.
9  *   Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <limits.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19
20 #include "catalog/pg_type.h"
21 #include "fmgr.h"
22 #include "funcapi.h"
23 #include "ludia_funcs.h"
24 #include "mb/pg_wchar.h"
25 #include "senna.h"
26 #include "storage/fd.h"
27 #include "utils/builtins.h"
28 #include "utils/guc.h"
29 #include "miscadmin.h"
30
31 #if PG_VERSION_NUM >= 90300
32 #include "access/htup_details.h"
33 #endif
34
35 PG_MODULE_MAGIC;
36
37 /* Last update date of ludia_funcs */
38 #define PGS2_LAST_UPDATE        "2015.09.10"
39
40 /* GUC variables */
41 #ifdef PGS2_DEBUG
42 static bool     pgs2_enable_debug = false;
43 #endif
44 static char     *pgs2_last_update = NULL;
45 static int      norm_cache_limit = -1;
46 static bool     escape_snippet_keyword = false;
47
48 #define SEN_NORMALIZE_FLAGS 0
49 #define SEN_MAX_N_EXPRS         32
50
51 /* upper limit for GUC variables measured in kilobytes of memory */
52 /* note that various places assume the byte size fits in a "long" variable */
53 #if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4
54 #define MAX_KILOBYTES   INT_MAX
55 #else
56 #define MAX_KILOBYTES   (INT_MAX / 1024)
57 #endif
58
59 #define ISBACKSLASHCHAR(x) (*(x) == '\\')
60 #define ISDOUBLEQUOTECHAR(x) (*(x) == '"')
61 #define ISSENNAOPSCHAR(x) (*(x) == '+' || *(x) == '-' || *(x) == ' ')
62
63 PG_FUNCTION_INFO_V1(pgs2snippet1);
64 PG_FUNCTION_INFO_V1(pgs2norm);
65 PG_FUNCTION_INFO_V1(pgs2textporter1);
66 PG_FUNCTION_INFO_V1(pgs2seninfo);
67
68 /*
69  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
70  * macro since 9.4, and hence the declaration of the function prototypes
71  * here is necessary only for 9.3 or before.
72  */
73 #if PG_VERSION_NUM < 90400
74 Datum   pgs2snippet1(PG_FUNCTION_ARGS);
75 Datum   pgs2norm(PG_FUNCTION_ARGS);
76 Datum   pgs2textporter1(PG_FUNCTION_ARGS);
77 Datum   pgs2seninfo(PG_FUNCTION_ARGS);
78 #endif
79
80 static sen_encoding     GetSennaEncoding(void);
81 static sen_query        *GetSennaQuery(char *str, size_t len);
82 static bool                     EscapeSnippetKeyword(char **s, size_t *slen);
83
84 #ifdef TEXTPORTER
85 #define TEXTPORTER_TMPDIR                       "/tmp"
86 #define TEXTPORTER_MKSTEMP_UMASK                0177
87 #define TEXTPORTER_GROUPNAME            "UTF-8"
88 #define TEXTPORTER_DEFLANGNAME          "Japanese"
89 #define TEXTPORTER_BBIGENDIAN           1
90 #define TEXTPORTER_OPTION                       0x00000020      /* DMC_GETTEXT_OPT_LF */
91 #define TEXTPORTER_OPTION_STRING        "32"
92 #define TEXTPORTER_OPTION1                      0x00010000      /* DMC_GETTEXT_OPT1_TXCONV */
93 #define TEXTPORTER_SIZE                         0
94 #define TEXTPORTER_CSV_C                        0
95
96
97 /* GUC variables for pgs2textporter1 */
98 static int      textporter_error = ERROR;
99 static unsigned int     textporter_option = TEXTPORTER_OPTION;
100
101 /*
102  * This variable is a dummy that doesn't do anything, except in some
103  * cases provides the value for SHOW to display.  The real state is
104  * elsewhere and is kept in sync by assign_hooks.
105  */
106 static char     *textporter_option_string;
107
108 static const struct config_enum_entry textporter_error_options[] = {
109         {"debug1", DEBUG1, false},
110         {"log", LOG, false},
111         {"info", INFO, false},
112         {"notice", NOTICE, false},
113         {"warning", WARNING, false},
114         {"error", ERROR, false},
115         {NULL, 0, false}
116 };
117
118 static void CleanupTextPorterTmpFiles(void);
119
120 static bool check_textporter_option(char **newval, void **extra, GucSource source);
121 static void assign_textporter_option(const char *newval, void *extra);
122 #endif  /* TEXTPORTER */
123
124 void    _PG_init(void);
125 void    _PG_fini(void);
126
127 void
128 _PG_init(void)
129 {
130         sen_rc          rc;
131
132 #ifdef PGS2_DEBUG
133         /* Define custom GUC variable for debugging */
134         DefineCustomBoolVariable("ludia_funcs.enable_debug",
135                                                          "Emit ludia_funcs debugging output.",
136                                                          NULL,
137                                                          &pgs2_enable_debug,
138                                                          false,
139                                                          PGC_USERSET,
140                                                          0,
141                                                          NULL,
142                                                          NULL,
143                                                          NULL);
144 #endif
145
146         /* Can't be set in postgresql.conf */
147         DefineCustomStringVariable("ludia_funcs.last_update",
148                                                            "Shows the last update date of ludia_funcs.",
149                                                            NULL,
150                                                            &pgs2_last_update,
151                                                            PGS2_LAST_UPDATE,
152                                                            PGC_INTERNAL,
153                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
154                                                            NULL,
155                                                            NULL,
156                                                            NULL);
157
158 #ifdef TEXTPORTER
159         /* Define custom GUC variables */
160         DefineCustomEnumVariable("ludia_funcs.textporter_error",
161                                                          "Sets the message levels that are emitted "
162                                                          "when textporter fails.",
163                                                          NULL,
164                                                          &textporter_error,
165                                                          ERROR,
166                                                          textporter_error_options,
167                                                          PGC_SUSET,
168                                                          0,
169                                                          NULL,
170                                                          NULL,
171                                                          NULL);
172
173         DefineCustomStringVariable("ludia_funcs.textporter_option",
174                                                            "Sets the option used to get text data "
175                                                            "from TextPorter",
176                                                            NULL,
177                                                            &textporter_option_string,
178                                                            TEXTPORTER_OPTION_STRING,
179                                                            PGC_SUSET,
180                                                            0,
181                                                            check_textporter_option,
182                                                            assign_textporter_option,
183                                                            NULL);
184
185         /* Clean up remaining textporter temporary files */
186         CleanupTextPorterTmpFiles();
187 #endif  /* TEXTPORTER */
188
189         /*
190          * A value of 0 means no limit on the cache size. A value of -1 means
191          * that work_mem is used as the upper size limit of the cache.
192          */
193         DefineCustomIntVariable("ludia_funcs.norm_cache_limit",
194                                                         "Sets the maximum memory to be used for caching "
195                                                         "the result of pgs2norm()",
196                                                         NULL,
197                                                         &norm_cache_limit,
198                                                         -1,
199                                                         -1,
200                                                         MAX_KILOBYTES,
201                                                         PGC_USERSET,
202                                                         GUC_UNIT_KB,
203                                                         NULL,
204                                                         NULL,
205                                                         NULL);
206
207         DefineCustomBoolVariable("ludia_funcs.escape_snippet_keyword",
208                                                          "Escapes snippet keyword string.",
209                                                          NULL,
210                                                          &escape_snippet_keyword,
211                                                          false,
212                                                          PGC_USERSET,
213                                                          0,
214                                                          NULL,
215                                                          NULL,
216                                                          NULL);
217
218         EmitWarningsOnPlaceholders("ludia_funcs");
219
220         /* Initialize Senna */
221         rc = sen_init();
222         if (rc != sen_success)
223                 ereport(ERROR,
224                                 (errmsg("sen_init() failed: %d", rc)));
225 }
226
/*
 * Library unload hook.
 *
 * Intentionally empty: this module performs no work at unload time.
 */
void
_PG_fini(void)
{
}
231
232 #ifdef TEXTPORTER
/*
 * Unlink the file 'path'.  A failure is not fatal: it is only reported
 * as a WARNING that includes the file name and the system error message.
 * Wrapped in do { } while(0) so that it behaves like a single statement.
 */
#define REMOVE_TMPFILE(path)											\
	do {																\
		if (unlink(path) != 0)											\
			ereport(WARNING,											\
					(errcode_for_file_access(),							\
					 errmsg("could not remove temporary file \"%s\": %m", path))); \
	} while(0)
240
241 Datum
242 pgs2textporter1(PG_FUNCTION_ARGS)
243 {
244         char    *appfile = text_to_cstring(PG_GETARG_TEXT_P(0));
245         char    txtfile[] = TEXTPORTER_TMPDIR "/ludia_funcs_XXXXXX";
246         int             tmpfd;
247         int             ret;
248         FILE    *fp;
249         text    *result = NULL;
250         struct stat     statbuf;
251         bool    return_null = false;
252         mode_t  oumask;
253
254         /* Confirm that database encoding is UTF-8 */
255         GetSennaEncoding();
256
257         PG_TRY();
258         {
259                 /*
260                  * Generate a unique temporary filename where text data gotten
261                  * from application file by TextPorter is stored temporarily.
262                  * Set the permission of a temporary file to 0600 to ensure that
263                  * only the owner of PostgreSQL server can read and write the file.
264                  */
265                 oumask = umask(TEXTPORTER_MKSTEMP_UMASK);
266                 tmpfd = mkstemp(txtfile);
267                 umask(oumask);
268
269                 if (tmpfd < 0)
270                         ereport(ERROR,
271                                         (errcode_for_file_access(),
272                                          errmsg("could not generate a unique temporary filename: %m")));
273                 if (close(tmpfd) != 0)
274                         ereport(ERROR,
275                                         (errcode_for_file_access(),
276                                          errmsg("could not close temporary file \"%s\": %m", txtfile)));
277
278                 /*
279                  * Run TextPorter to read text data from application file (appfile)
280                  * to temporary file (txtfile).
281                  */
282                 ret = ExecTextPorter((unsigned char *)appfile,
283                                                          (unsigned char *)txtfile,
284                                                          (unsigned char *)TEXTPORTER_GROUPNAME,
285                                                          (unsigned char *)TEXTPORTER_DEFLANGNAME,
286                                                          TEXTPORTER_BBIGENDIAN, textporter_option,
287                                                          TEXTPORTER_OPTION1, TEXTPORTER_SIZE,
288                                                          TEXTPORTER_CSV_C);
289                 if (ret != 0)
290                 {
291                         ereport(textporter_error,
292                                         (errmsg("could not get text from application file \"%s\"",
293                                                         appfile),
294                                          errdetail("DMC_GetText_V5() failed with errcode %d",
295                                                            ret)));
296
297                         /* Return NULL if textporter_error is set to other than ERROR */
298                         return_null = true;
299                 }
300                 else
301                 {
302                         /* Read text data from temporary file to memory */
303                         if (stat(txtfile, &statbuf))
304                                 ereport(ERROR,
305                                                 (errcode_for_file_access(),
306                                                  errmsg("could not stat file \"%s\": %m", txtfile)));
307                         result = (text *) palloc(statbuf.st_size + VARHDRSZ);
308
309                         fp = AllocateFile(txtfile, "r");
310                         if (fp == NULL)
311                                 ereport(ERROR,
312                                                 (errcode_for_file_access(),
313                                                  errmsg("could not open file \"%s\": %m", txtfile)));
314
315                         if (fread(VARDATA(result), 1, statbuf.st_size, fp) != statbuf.st_size ||
316                                 ferror(fp))
317                                 ereport(ERROR,
318                                                 (errcode_for_file_access(),
319                                                  errmsg("could not read file \"%s\": %m", txtfile)));
320
321                         FreeFile(fp);
322                 }
323
324                 REMOVE_TMPFILE(txtfile);
325                 pfree(appfile);
326         }
327         PG_CATCH();
328         {
329                 REMOVE_TMPFILE(txtfile);
330                 PG_RE_THROW();
331         }
332         PG_END_TRY();
333
334         if (return_null)
335                 PG_RETURN_NULL();
336
337         SET_VARSIZE(result, statbuf.st_size + VARHDRSZ);
338
339         PG_RETURN_TEXT_P(result);
340 }
341
342 /*
343  * Clean up remaining textporter temporary files
344  */
345 static void
346 CleanupTextPorterTmpFiles(void)
347 {
348         DIR                             *tpdir;
349         struct dirent   *tpde;
350         char                    path[MAXPGPATH];
351
352         tpdir = AllocateDir(TEXTPORTER_TMPDIR);
353         if (tpdir == NULL)
354                 ereport(ERROR,
355                                 (errcode_for_file_access(),
356                                  errmsg("could not open textporter temporary file directory \"%s\": %m",
357                                                 TEXTPORTER_TMPDIR)));
358
359         while ((tpde = ReadDir(tpdir, TEXTPORTER_TMPDIR)) != NULL)
360         {
361                 if (strlen(tpde->d_name) == 18 &&
362                         strncmp(tpde->d_name, "ludia_funcs_", 12) == 0)
363                 {
364                         snprintf(path, MAXPGPATH, TEXTPORTER_TMPDIR "/%s", tpde->d_name);
365                         REMOVE_TMPFILE(path);
366                 }
367         }
368
369         FreeDir(tpdir);
370 }
371
372 static bool
373 check_textporter_option(char **newval, void **extra, GucSource source)
374 {
375         unsigned long   val;
376         char                    *endptr;
377         unsigned int    *myextra;
378
379         errno = 0;
380         val = strtoul(*newval, &endptr, 0);
381
382         if (*endptr != '\0')
383                 return false;
384
385         if (errno == ERANGE || val != (unsigned long) ((unsigned int) val))
386         {
387                 GUC_check_errhint("Value exceeds unsigned integer range.");
388                 return false;
389         }
390
391         /* Set up the "extra" struct actually used by assign_textporter_option */
392         myextra = (unsigned int *) malloc(sizeof(unsigned int));
393         if (myextra == NULL)
394         {
395                 GUC_check_errcode(ERRCODE_OUT_OF_MEMORY);
396                 GUC_check_errmsg("out of memory");
397                 return false;
398         }
399         *myextra = (unsigned int) val;
400         *extra = (void *) myextra;
401
402         return true;
403 }
404
405 static void
406 assign_textporter_option(const char *newval, void *extra)
407 {
408         textporter_option = *((unsigned int *) extra);
409 }
410
411 #else   /* TEXTPORTER */
412
/*
 * Stub used when the module is built without TEXTPORTER:
 * the function is still defined but always returns NULL.
 */
Datum
pgs2textporter1(PG_FUNCTION_ARGS)
{
	PG_RETURN_NULL();
}
418
419 #endif  /* TEXTPORTER */
420
421 static sen_encoding
422 GetSennaEncoding(void)
423 {
424         static sen_encoding             encoding = sen_enc_default;
425
426         if (encoding == sen_enc_default)
427         {
428                 if (GetDatabaseEncoding() == PG_UTF8)
429                         encoding = sen_enc_utf8;
430                 else
431                         ereport(ERROR,
432                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
433                                          errmsg("does not support database encoding \"%s\"",
434                                                         GetDatabaseEncodingName())));
435         }
436         return encoding;
437 }
438
/*
 * Escape the backslash and double quote characters in the given string.
 *
 * On input, *s points to the keyword (not necessarily NUL-terminated)
 * and *slen is its length in bytes.
 *
 * Return false if the given string has no character which needs to be
 * escaped; *s and *slen are left untouched in that case. Otherwise,
 * return true. In this case, *s points to the palloc'd space storing the
 * escaped keyword string and *slen is set to the size of that string.
 * The caller needs to free the palloc'd space.
 *
 * Escaping rules implemented below:
 *   - a backslash is always escaped;
 *   - a double quote is escaped unless it is the very first character,
 *     the very last character, or directly adjacent to a Senna operator
 *     character ('+', '-' or ' ').
 */
static bool
EscapeSnippetKeyword(char **s, size_t *slen)
{
	const char	*sp;
	char		*ep;
	char		*escaped;
	int			mblen;
	int			copylen;
	bool		in_doublequote = false;
	bool		in_sennaops = false;
	bool		need_escape = false;

	/*
	 * Skip the leading double quote character because it never needs to
	 * be interpreted as a character itself and hence is never escaped.
	 * Note that we must not skip the leading character if it's not a
	 * double quote. The length check keeps us from reading past the end
	 * of an empty keyword.
	 */
	sp = *s;
	if (*slen > 0 && ISDOUBLEQUOTECHAR(sp))
		sp++;

	/*
	 * Check whether the snippet keyword string has a character which
	 * needs to be escaped.
	 */
	while ((size_t) (sp - *s) < *slen)
	{
		mblen = pg_mblen(sp);

		/*
		 * Backslash in the keyword always needs to be escaped.
		 */
		if (ISBACKSLASHCHAR(sp))
		{
			need_escape = true;
			break;
		}

		if (in_doublequote)
		{
			/* The previous character was a double quote */
			if (ISSENNAOPSCHAR(sp))
			{
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Double quote in the keyword needs to be escaped if
				 * there is a Senna search operator neither to its right
				 * nor to its left.
				 */
				need_escape = true;
				break;
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp) && !in_sennaops)
				in_doublequote = true;
			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/*
	 * Quick exit if the keyword has no character which needs to be
	 * escaped.
	 */
	if (!need_escape)
		return false;

	/*
	 * Allocate the buffer space to store the escaped snippet keyword string.
	 * The maximum size of escaped string is double the input keyword size.
	 * The size reaches the maximum when every character in the input keyword
	 * needs to be escaped.
	 */
	ep = escaped = (char *) palloc(*slen * 2);

	/*
	 * Copy the characters which have been passed through in the above loop
	 * and don't need to be escaped, into the buffer. If in_doublequote is
	 * true, we don't copy the double quote in the previous position into the
	 * buffer because it might still need to be escaped.
	 */
	copylen = sp - *s - ((in_doublequote) ? 1 : 0);
	memcpy(ep, *s, copylen);
	ep += copylen;

	/*
	 * Construct the escaped snippet keyword string, continuing from the
	 * position (and the quote/operator state) where the scan above stopped.
	 */
	while ((size_t) (sp - *s) < *slen)
	{
		mblen = pg_mblen(sp);

		if (in_doublequote)
		{
			/*
			 * dqchar indicates the previous character, that is a double
			 * quote. We assume here that a double quote is single-byte
			 * character.
			 */
			char dqchar	= *(sp - 1);

			if (ISSENNAOPSCHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just before Senna
				 * operator.
				 */
				*ep++ = dqchar;
				*ep++ = *sp;
				in_sennaops = true;
				in_doublequote = false;
			}
			else
			{
				/*
				 * Escape the double quote if no Senna operator is next to it.
				 */
				*ep++ = '\\';
				*ep++ = dqchar;

				if (ISDOUBLEQUOTECHAR(sp))
				{
					/* Current quote is pending again; decide next round */
					in_doublequote = true;
				}
				else
				{
					if (ISBACKSLASHCHAR(sp))
						*ep++ = '\\';
					memcpy(ep, sp, mblen);
					ep += mblen;
					in_doublequote = false;
				}
			}
		}
		else
		{
			if (ISDOUBLEQUOTECHAR(sp))
			{
				/*
				 * Don't escape the double quote which is just after Senna
				 * operator. Otherwise defer the decision until the next
				 * character has been seen.
				 */
				if (in_sennaops)
					*ep++ = *sp;
				else
					in_doublequote = true;
			}
			else
			{
				if (ISBACKSLASHCHAR(sp))
					*ep++ = '\\';
				/*
				 * We don't check ISSENNAOPSCHAR() here. We handle Senna
				 * operator character as a character itself instead of
				 * an operator if it doesn't follow a double quote.
				 */
				memcpy(ep, sp, mblen);
				ep += mblen;
			}

			if (!ISSENNAOPSCHAR(sp))
				in_sennaops = false;
		}

		sp += mblen;
	}

	/* Add the trailing (still pending) double quote into the buffer unescaped */
	if (in_doublequote)
		*ep++ = *(sp - 1);

	*s = escaped;
	*slen = ep - *s;

#ifdef PGS2_DEBUG
	if (pgs2_enable_debug)
	{
		char	*tmp = pnstrdup(*s, *slen);

		elog(LOG, "escaped snippet keyword: %s", tmp);
		pfree(tmp);
	}
#endif

	return true;
}
639
640 static sen_query *
641 GetSennaQuery(char *str, size_t len)
642 {
643         static sen_query        *query_cache = NULL;
644         static char                     *key_cache = NULL;
645         static size_t           len_cache = 0;
646         static bool                     guc_cache = false;
647         sen_query       *query;
648         sen_encoding    encoding;
649         char            *key;
650         size_t          key_len;
651         int                     rest;
652         bool            needfree = false;
653
654         /*
655          * Return the cached Senna query if the same keyword has
656          * been used the last time.
657          */
658         if (key_cache != NULL &&
659                 len == len_cache &&
660                 strncmp(key_cache, str, len) == 0 &&
661                 escape_snippet_keyword == guc_cache)
662         {
663 #ifdef PGS2_DEBUG
664                 if (pgs2_enable_debug)
665                 {
666                         char    *tmp = pnstrdup(str, len);
667
668                         elog(LOG, "GetSennaQuery(): quick exit: %s", tmp);
669                         pfree(tmp);
670                 }
671 #endif
672                 return query_cache;
673         }
674
675         encoding = GetSennaEncoding();
676
677         key = malloc(len);
678         if (key == NULL)
679                 ereport(ERROR,
680                                 (errcode(ERRCODE_OUT_OF_MEMORY),
681                                  errmsg("out of memory")));
682
683         /*
684          * We always cache the unescaped keyword. Which enables us
685          * to check whether we can use the cached Senna query before
686          * escaping the keyword.
687          */
688         memcpy(key, str, len);
689         key_len = len;
690
691         /*
692          * If the keyword has been escaped, 'str' points to the
693          * newly-palloc'd space storing the escaped keyword. This
694          * space needs to be freed later.
695          */
696         if (escape_snippet_keyword)
697                 needfree = EscapeSnippetKeyword(&str, &len);
698
699         query = sen_query_open(str, len, sen_sel_or, SEN_MAX_N_EXPRS,
700                                                    encoding);
701         if (query == NULL)
702         {
703                 free(key);
704                 ereport(ERROR,
705                                 (errmsg("sen_query_open() failed")));
706         }
707
708         if ((rest = sen_query_rest(query, NULL)) != 0)
709                 ereport(WARNING,
710                                 (errmsg("too many expressions (%d)", rest)));
711
712         if (query_cache != NULL)
713         {
714                 sen_query_close(query_cache);
715                 free(key_cache);
716         }
717
718         key_cache = key;
719         len_cache = key_len;
720         query_cache = query;
721         guc_cache = escape_snippet_keyword;
722
723         if (needfree)
724                 pfree(str);
725
726         return query;
727 }
728
729 Datum
730 pgs2snippet1(PG_FUNCTION_ARGS)
731 {
732         int                     flags = PG_GETARG_INT32(0);
733         uint32          width = PG_GETARG_UINT32(1);
734         uint32          max_results = PG_GETARG_UINT32(2);
735         text       *opentags = PG_GETARG_TEXT_P(3);
736         text       *closetags = PG_GETARG_TEXT_P(4);
737         int                     mapping = PG_GETARG_INT32(5);
738         text       *keywords = PG_GETARG_TEXT_P(6);
739         text       *document = PG_GETARG_TEXT_P(7);
740         sen_query  *query;
741         sen_snip   *snip = NULL;
742         const char *opentags_str = VARDATA_ANY(opentags);
743         const char *closetags_str = VARDATA_ANY(closetags);
744         char       *keywords_str = VARDATA_ANY(keywords);
745         char       *document_str = VARDATA_ANY(document);
746         uint32          opentags_len = VARSIZE_ANY_EXHDR(opentags);
747         uint32          closetags_len = VARSIZE_ANY_EXHDR(closetags);
748         uint32          keywords_len = VARSIZE_ANY_EXHDR(keywords);
749         uint32          document_len = VARSIZE_ANY_EXHDR(document);
750         uint32          nresults = 0;
751         uint32          max_tagged_len = 0;
752         sen_rc          rc;
753         text       *result;
754         uint32          result_len = 0;
755         bool            return_null = false;
756
757         query = GetSennaQuery(keywords_str, keywords_len);
758
759         snip = sen_query_snip(query, flags, width, max_results, 1,
760                                                   &opentags_str, &opentags_len,
761                                                   &closetags_str, &closetags_len,
762                                                   mapping == 0 ? NULL : (sen_snip_mapping *)-1);
763         if (snip == NULL)
764                 ereport(ERROR,
765                                 (errmsg("sen_query_snip() failed")));
766
767         PG_TRY();
768         {
769                 rc = sen_snip_exec(snip, document_str, document_len,
770                                                    &nresults, &max_tagged_len);
771                 if (rc != sen_success)
772                         ereport(ERROR,
773                                         (errmsg("sen_snip_exec() failed: %d", rc)));
774
775                 result = (text *) palloc(max_tagged_len + VARHDRSZ);
776
777                 rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
778                 if (rc == sen_invalid_argument)
779                         return_null = true;
780                 else if (rc != sen_success)
781                         ereport(ERROR,
782                                         (errmsg("sen_snip_get_result() failed: %d", rc)));
783         }
784         PG_CATCH();
785         {
786                 sen_snip_close(snip);
787                 PG_RE_THROW();
788         }
789         PG_END_TRY();
790
791         sen_snip_close(snip);
792
793         if (return_null)
794                 PG_RETURN_NULL();
795
796         SET_VARSIZE(result, max_tagged_len + VARHDRSZ);
797
798         PG_RETURN_TEXT_P(result);
799 }
800
801 /*
802  * Make sure there is enough space for 'needed' more bytes.
803  *
804  * Sets **buf to the allocated space which can store the needed bytes if OK,
805  * NULL if failed to enlarge the space because 'needed' is larger than 'maxlen'.
806  */
807 static inline void
808 pgs2malloc(void **buf, long *buflen, long needed, long maxlen)
809 {
810 #ifdef PGS2_DEBUG
811         if (pgs2_enable_debug)
812                 elog(LOG, "pgs2malloc(): buflen %ld, needed %ld, maxlen %ld",
813                          *buflen, needed, maxlen);
814 #endif
815
816         if (*buf != NULL && *buflen >= needed && (*buflen <= maxlen || maxlen == 0))
817                 return;         /* got enough space already */
818
819         /*
820          * Release the already-allocated space since it's too small to
821          * store the needed bytes or larger than the upper limit.
822          */
823         if (*buf != NULL)
824         {
825                 free(*buf);
826                 *buf = NULL;
827                 *buflen = 0;
828         }
829
830         /*
831          * Don't allocate any space if the needed space is larger than
832          * the upper limit.
833          */
834         if (needed > maxlen && maxlen != 0)
835                 return;
836
837         /*
838          * Allocate the space for the needed bytes.
839          *
840          * We don't want to allocate just a little more space with each enlarge;
841          * for efficiency, double the buffer size each time it overflows.
842          * Actually, we might need to more than double it if 'needed' is big...
843          *
844          * We check whether '*buflen' overflows each cycle to avoid infinite loop.
845          */
846         *buflen = 1024L;
847         while (*buflen < needed && *buflen != 0)
848                 *buflen <<= 1;
849
850         /*
851          * Clamp to maxlen in case we went past it.  Note we are assuming
852          * here that maxlen <= LONG_MAX/2, else the above loop could
853          * overflow.  We will still have *buflen >= needed.
854          */
855         if (*buflen > maxlen && maxlen != 0)
856                 *buflen = maxlen;
857
858         /* Guard against out-of-range '*buflen' value */
859         if (*buflen == 0)
860                 ereport(ERROR,
861                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
862                                  errmsg("out of memory"),
863                                  errdetail("Cannot enlarge buffer by %ld more bytes.",
864                                                    needed)));
865
866         *buf = (void *) malloc(*buflen);
867         if (*buf == NULL)
868                 ereport(ERROR,
869                                 (errcode(ERRCODE_OUT_OF_MEMORY),
870                                  errmsg("out of memory")));
871 }
872
873 Datum
874 pgs2norm(PG_FUNCTION_ARGS)
875 {
876         text            *str = PG_GETARG_TEXT_PP(0);
877         char            *s = VARDATA_ANY(str);
878         long            slen = VARSIZE_ANY_EXHDR(str);
879         text            *result = NULL;
880         long            buflen;
881         long            reslen;
882         long            maxlen;
883         long            needed;
884
885         /*
886          * norm_cache is the cache memory storing both input and normalized strings
887          * as the result of pgs2norm(). norm_cache_size is the size of norm_cache
888          * and its upper limit is specified by norm_cache_limit parameter. norm_result
889          * is the pointer to the normalized string with the verlena header (i.e.,
890          * text type) stored in the latter half of the cache. norm_reslen is the size
891          * of norm_result. norm_slen is the size of the input string which is stored
892          * in the first half of the cache.
893          */
894         static char             *norm_cache = NULL;
895         static long             norm_cache_size = 0;
896         static long             norm_slen = 0;
897         static char             *norm_result = NULL;
898         static long             norm_reslen = 0;
899
900         /*
901          * Return the cached normalization result if the same string of
902          * the given one has been normalized the last time.
903          */
904         if (norm_cache != NULL &&
905                 norm_slen == slen &&
906                 strncmp(norm_cache, s, slen) == 0)
907         {
908 #ifdef PGS2_DEBUG
909                 if (pgs2_enable_debug)
910                 {
911                         char    *tmp = text_to_cstring(str);
912
913                         elog(LOG, "pgs2norm(): quick exit: %s", tmp);
914                         pfree(tmp);
915                 }
916 #endif
917
918                 PG_RETURN_TEXT_P(pnstrdup(norm_result, norm_reslen));
919         }
920
921         /* Confirm that database encoding is UTF-8 */
922         GetSennaEncoding();
923
924         /*
925          * Allocate the result buffer to store the normalized string. Since the size of
926          * normalized string can be larger than that of input one, the result buffer needs
927          * extra space. Problem is that, before calling sen_str_normalize, we need to
928          * allocate the result buffer but cannot know how large extra space is required.
929          * So we use RESULT_EXTRA_SIZE as the estimated size of extra space here.
930          */
931 #define RESULT_EXTRA_SIZE       64
932         buflen = slen + RESULT_EXTRA_SIZE;
933
934 retry:
935         result = (text *) palloc(buflen + VARHDRSZ);
936
937 #if defined(FAST_SENNA)
938         reslen = fast_sen_str_normalize(s, slen, VARDATA(result), buflen);
939 #else
940         reslen = sen_str_normalize(s, slen, sen_enc_utf8,
941                                                            SEN_NORMALIZE_FLAGS,
942                                                            VARDATA(result), buflen);
943 #endif
944
945         if (reslen < 0)
946                 ereport(ERROR,
947                                 (errmsg("could not normalize the string")));
948
949         /*
950          * If the result buffer size is too short to store the normalized string,
951          * we enlarge the buffer and retry the string normalization.
952          */
953         if (buflen <= reslen)
954         {
955                 pfree(result);
956                 buflen = reslen + 1;
957                 goto retry;
958         }
959
960         SET_VARSIZE(result, reslen + VARHDRSZ);
961
962         /*
963          * Cache both input and normalized strings to accelerate the subsequent
964          * calls of pgs2norm() with the same input string. But we don't do that
965          * if the maximum allowed size of the cache is too small to store them.
966          */
967         needed = slen + reslen + VARHDRSZ;
968         maxlen = ((norm_cache_limit >= 0) ? norm_cache_limit : work_mem) * 1024L;
969
970         pgs2malloc((void **) &norm_cache, &norm_cache_size, needed, maxlen);
971         if (norm_cache != NULL)
972         {
973                 /* Store the input string into the first half of the cache */
974                 norm_slen = slen;
975                 memcpy(norm_cache, s, slen);
976
977                 /*
978                  * Store the normalized string with the varlena header (i.e., text type)
979                  * into the latter half of the cache.
980                  */
981                 norm_result = norm_cache + slen;
982                 norm_reslen = reslen + VARHDRSZ;
983                 memcpy(norm_result, result, norm_reslen);
984         }
985
986 #ifdef PGS2_DEBUG
987         if (pgs2_enable_debug)
988         {
989                 char    *tmp = text_to_cstring(str);
990
991                 elog(LOG, "pgs2norm(): complete (%s result cache): %s",
992                          (norm_cache == NULL) ? "unset" : "set", tmp);
993                 pfree(tmp);
994         }
995 #endif
996
997         PG_RETURN_TEXT_P(result);
998 }
999
1000 /*
1001  * Report the version and configure options of Senna which
1002  * ludia_funcs depends on.
1003  */
1004 Datum
1005 pgs2seninfo(PG_FUNCTION_ARGS)
1006 {
1007         char    *version[MAXPGPATH];
1008         char    *coptions[MAXPGPATH];
1009         Datum   values[2];
1010         bool    isnull[2];
1011         HeapTuple tuple;
1012         TupleDesc tupdesc;
1013
1014         /*
1015          * Get the version and configure options of Senna. Ignore the
1016          * return value of sen_info() because it always returns a success.
1017          */
1018         sen_info((char **)&version, (char **)&coptions, NULL, NULL, NULL, NULL);
1019
1020         /*
1021          * Construct a tuple descriptor for the result row. This must
1022          * match this function's ludia_funcs--x.x.sql entry.
1023          */
1024         tupdesc = CreateTemplateTupleDesc(2, false);
1025         TupleDescInitEntry(tupdesc, (AttrNumber) 1,
1026                                            "version", TEXTOID, -1, 0);
1027         TupleDescInitEntry(tupdesc, (AttrNumber) 2,
1028                                            "configure_options", TEXTOID, -1, 0);
1029         tupdesc = BlessTupleDesc(tupdesc);
1030
1031         /* version */
1032         values[0] = CStringGetTextDatum(*version);
1033         isnull[0] = false;
1034
1035         /* configure option */
1036         values[1] = CStringGetTextDatum(*coptions);
1037         isnull[1] = false;
1038
1039         tuple = heap_form_tuple(tupdesc, values, isnull);
1040         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
1041 }