1 /*-------------------------------------------------------------------------
3 * Copyright (c) 2006-2013, NTT DATA Corporation
8 * Update Ludia functions so that they are available with PostgreSQL9.1.
9 * Author: NTT DATA Corporation
11 *-------------------------------------------------------------------------
16 #include <sys/types.h>
20 #include "catalog/pg_type.h"
23 #include "ludia_funcs.h"
24 #include "mb/pg_wchar.h"
26 #include "storage/fd.h"
27 #include "utils/builtins.h"
28 #include "utils/guc.h"
29 #include "miscadmin.h"
31 #if PG_VERSION_NUM >= 90300
32 #include "access/htup_details.h"
37 /* Last update date of ludia_funcs */
38 #define PGS2_LAST_UPDATE "2013.04.05"
42 static bool pgs2_enable_debug = false;
44 static char *pgs2_last_update = NULL;
45 static int norm_cache_limit = -1;
46 static bool escape_snippet_keyword = false;
48 #define SEN_NORMALIZE_FLAGS 0
49 #define SEN_MAX_N_EXPRS 32
51 /* upper limit for GUC variables measured in kilobytes of memory */
52 /* note that various places assume the byte size fits in a "long" variable */
53 #if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4
54 #define MAX_KILOBYTES INT_MAX
56 #define MAX_KILOBYTES (INT_MAX / 1024)
59 #define ISBACKSLASHCHAR(x) (*(x) == '\\')
60 #define ISDOUBLEQUOTECHAR(x) (*(x) == '"')
61 #define ISSENNAOPSCHAR(x) (*(x) == '+' || *(x) == '-' || *(x) == ' ')
63 PG_FUNCTION_INFO_V1(pgs2snippet1);
64 Datum pgs2snippet1(PG_FUNCTION_ARGS);
65 PG_FUNCTION_INFO_V1(pgs2norm);
66 Datum pgs2norm(PG_FUNCTION_ARGS);
67 PG_FUNCTION_INFO_V1(pgs2textporter1);
68 Datum pgs2textporter1(PG_FUNCTION_ARGS);
69 PG_FUNCTION_INFO_V1(pgs2seninfo);
70 Datum pgs2seninfo(PG_FUNCTION_ARGS);
72 static sen_encoding GetSennaEncoding(void);
73 static sen_query *GetSennaQuery(char *str, size_t len);
74 static bool EscapeSnippetKeyword(char **s, size_t *slen);
77 #define TEXTPORTER_TMPDIR "/tmp"
78 #define TEXTPORTER_GROUPNAME "UTF-8"
79 #define TEXTPORTER_DEFLANGNAME "Japanese"
80 #define TEXTPORTER_BBIGENDIAN 1
81 #define TEXTPORTER_OPTION 0x00000020 /* DMC_GETTEXT_OPT_LF */
82 #define TEXTPORTER_OPTION_STRING "32"
83 #define TEXTPORTER_OPTION1 0x00010000 /* DMC_GETTEXT_OPT1_TXCONV */
84 #define TEXTPORTER_SIZE 0
85 #define TEXTPORTER_CSV_C 0
87 /* GUC variables for pgs2textporter1 */
88 static int textporter_error = ERROR;
89 static unsigned int textporter_option = TEXTPORTER_OPTION;
92 * This variable is a dummy that doesn't do anything, except in some
93 * cases provides the value for SHOW to display. The real state is
94 * elsewhere and is kept in sync by assign_hooks.
96 static char *textporter_option_string;
98 static const struct config_enum_entry textporter_error_options[] = {
99 {"debug1", DEBUG1, false},
101 {"info", INFO, false},
102 {"notice", NOTICE, false},
103 {"warning", WARNING, false},
104 {"error", ERROR, false},
108 static void CleanupTextPorterTmpFiles(void);
110 static bool check_textporter_option(char **newval, void **extra, GucSource source);
111 static void assign_textporter_option(const char *newval, void *extra);
112 #endif /* TEXTPORTER */
123 /* Define custom GUC variable for debugging */
124 DefineCustomBoolVariable("ludia_funcs.enable_debug",
125 "Emit ludia_funcs debugging output.",
136 /* Can't be set in postgresql.conf */
137 DefineCustomStringVariable("ludia_funcs.last_update",
138 "Shows the last update date of ludia_funcs.",
143 GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
149 /* Define custom GUC variables */
150 DefineCustomEnumVariable("ludia_funcs.textporter_error",
151 "Sets the message levels that are emitted "
152 "when textporter fails.",
156 textporter_error_options,
163 DefineCustomStringVariable("ludia_funcs.textporter_option",
164 "Sets the option used to get text data "
167 &textporter_option_string,
168 TEXTPORTER_OPTION_STRING,
171 check_textporter_option,
172 assign_textporter_option,
175 /* Clean up remaining textporter temporary files */
176 CleanupTextPorterTmpFiles();
177 #endif /* TEXTPORTER */
180 * A value of 0 means no limit on the cache size. A value of -1 means
181 * that work_mem is used as the upper size limit of the cache.
183 DefineCustomIntVariable("ludia_funcs.norm_cache_limit",
184 "Sets the maximum memory to be used for caching "
185 "the result of pgs2norm()",
197 DefineCustomBoolVariable("ludia_funcs.escape_snippet_keyword",
198 "Escapes snippet keyword string.",
200 &escape_snippet_keyword,
208 EmitWarningsOnPlaceholders("ludia_funcs");
210 /* Initialize Senna */
212 if (rc != sen_success)
214 (errmsg("sen_init() failed: %d", rc)));
223 #define REMOVE_TMPFILE(path) \
225 if (unlink(path) != 0) \
227 (errcode_for_file_access(), \
228 errmsg("could not remove temporary file \"%s\": %m", path))); \
232 pgs2textporter1(PG_FUNCTION_ARGS)
234 char *appfile = text_to_cstring(PG_GETARG_TEXT_P(0));
235 char txtfile[] = TEXTPORTER_TMPDIR "/ludia_funcs_XXXXXX";
241 bool return_null = false;
243 /* Confirm that database encoding is UTF-8 */
249 * Generate a unique temporary file in which the text data that
250 * TextPorter extracts from the application file is stored temporarily.
252 tmpfd = mkstemp(txtfile);
255 (errcode_for_file_access(),
256 errmsg("could not generate a unique temporary filename: %m")));
257 if (close(tmpfd) != 0)
259 (errcode_for_file_access(),
260 errmsg("could not close temporary file \"%s\": %m", txtfile)));
263 * Run TextPorter to read text data from application file (appfile)
264 * to temporary file (txtfile).
266 ret = ExecTextPorter((unsigned char *)appfile,
267 (unsigned char *)txtfile,
268 (unsigned char *)TEXTPORTER_GROUPNAME,
269 (unsigned char *)TEXTPORTER_DEFLANGNAME,
270 TEXTPORTER_BBIGENDIAN, textporter_option,
271 TEXTPORTER_OPTION1, TEXTPORTER_SIZE,
275 ereport(textporter_error,
276 (errmsg("could not get text from application file \"%s\"",
278 errdetail("DMC_GetText_V5() failed with errcode %d",
281 /* Return NULL if textporter_error is set to a level other than ERROR */
286 /* Read text data from temporary file to memory */
287 if (stat(txtfile, &statbuf))
289 (errcode_for_file_access(),
290 errmsg("could not stat file \"%s\": %m", txtfile)));
291 result = (text *) palloc(statbuf.st_size + VARHDRSZ);
293 fp = AllocateFile(txtfile, "r");
296 (errcode_for_file_access(),
297 errmsg("could not open file \"%s\": %m", txtfile)));
299 if (fread(VARDATA(result), 1, statbuf.st_size, fp) != statbuf.st_size ||
302 (errcode_for_file_access(),
303 errmsg("could not read file \"%s\": %m", txtfile)));
308 REMOVE_TMPFILE(txtfile);
313 REMOVE_TMPFILE(txtfile);
321 SET_VARSIZE(result, statbuf.st_size + VARHDRSZ);
323 PG_RETURN_TEXT_P(result);
327 * Clean up remaining textporter temporary files
330 CleanupTextPorterTmpFiles(void)
334 char path[MAXPGPATH];
336 tpdir = AllocateDir(TEXTPORTER_TMPDIR);
339 (errcode_for_file_access(),
340 errmsg("could not open textporter temporary file directory \"%s\": %m",
341 TEXTPORTER_TMPDIR)));
343 while ((tpde = ReadDir(tpdir, TEXTPORTER_TMPDIR)) != NULL)
345 if (strlen(tpde->d_name) == 18 &&
346 strncmp(tpde->d_name, "ludia_funcs_", 12) == 0)
348 snprintf(path, MAXPGPATH, TEXTPORTER_TMPDIR "/%s", tpde->d_name);
349 REMOVE_TMPFILE(path);
357 check_textporter_option(char **newval, void **extra, GucSource source)
361 unsigned int *myextra;
364 val = strtoul(*newval, &endptr, 0);
369 if (errno == ERANGE || val != (unsigned long) ((unsigned int) val))
371 GUC_check_errhint("Value exceeds unsigned integer range.");
375 /* Set up the "extra" struct actually used by assign_textporter_option */
376 myextra = (unsigned int *) malloc(sizeof(unsigned int));
379 GUC_check_errcode(ERRCODE_OUT_OF_MEMORY);
380 GUC_check_errmsg("out of memory");
383 *myextra = (unsigned int) val;
384 *extra = (void *) myextra;
390 assign_textporter_option(const char *newval, void *extra)
392 textporter_option = *((unsigned int *) extra);
395 #else /* TEXTPORTER */
398 pgs2textporter1(PG_FUNCTION_ARGS)
403 #endif /* TEXTPORTER */
406 GetSennaEncoding(void)
408 static sen_encoding encoding = sen_enc_default;
410 if (encoding == sen_enc_default)
412 if (GetDatabaseEncoding() == PG_UTF8)
413 encoding = sen_enc_utf8;
416 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
417 errmsg("does not support database encoding \"%s\"",
418 GetDatabaseEncodingName())));
424 * Escape the backslash and double quote characters in the given string.
426 * Return false if the given string has no character which needs to be
427 * escaped. Otherwise, return true. In this case, *s points to the palloc'd
428 * space storing the escaped keyword string and *slen is set to the size
429 * of that string. The caller needs to free the palloc'd space.
432 EscapeSnippetKeyword(char **s, size_t *slen)
439 bool in_doublequote = false;
440 bool in_sennaops = false;
441 bool need_escape = false;
444 * Skip the leading double quote character because it never needs to be
445 * interpreted as a literal character or escaped.
446 * Note that we must not skip the leading character if it's not a
450 if (ISDOUBLEQUOTECHAR(sp))
454 * Check whether the snippet keyword string has a character which
455 * needs to be escaped.
457 while ((sp - *s) < *slen)
459 mblen = pg_mblen(sp);
462 * Backslash in the keyword always needs to be escaped.
464 if (ISBACKSLASHCHAR(sp))
472 if (ISSENNAOPSCHAR(sp))
475 in_doublequote = false;
480 * Double quote in the keyword needs to be escaped if
481 * any Senna search operators are to neither its right
490 if (ISDOUBLEQUOTECHAR(sp) && !in_sennaops)
491 in_doublequote = true;
492 if (!ISSENNAOPSCHAR(sp))
500 * Quick exit if the keyword has no character which needs to be
507 * Allocate the buffer space to store the escaped snippet keyword string.
508 * The maximum size of escaped string is double the input keyword size.
509 * The size reaches the maximum when every character in the input keyword
510 * needs to be escaped.
512 ep = escaped = (char *) palloc(*slen * 2);
515 * Copy the characters which have been passed through in the above loop
516 * and don't need to be escaped, into the buffer. If in_doublequote is
517 * true, we don't copy the double quote in the previous position into the
518 * buffer because it might still need to be escaped.
520 copylen = sp - *s - ((in_doublequote) ? 1 : 0);
521 memcpy(ep, *s, copylen);
525 * Construct the escaped snippet keyword string.
527 while ((sp - *s) < *slen)
529 mblen = pg_mblen(sp);
534 * dqchar indicates the previous character, that is a double
535 * quote. We assume here that a double quote is single-byte
538 char dqchar = *(sp - 1);
540 if (ISSENNAOPSCHAR(sp))
543 * Don't escape the double quote which is just before Senna
549 in_doublequote = false;
554 * Escape the double quote if no Senna operator is next to it.
559 if (ISDOUBLEQUOTECHAR(sp))
560 in_doublequote = true;
563 if (ISBACKSLASHCHAR(sp))
565 memcpy(ep, sp, mblen);
567 in_doublequote = false;
573 if (ISDOUBLEQUOTECHAR(sp))
576 * Don't escape the double quote which is just after Senna
582 in_doublequote = true;
586 if (ISBACKSLASHCHAR(sp))
589 * We don't check ISSENNAOPSCHAR() here. We handle Senna
590 * operator character as a character itself instead of
591 * an operator if it doesn't follow a double quote.
593 memcpy(ep, sp, mblen);
597 if (!ISSENNAOPSCHAR(sp))
604 /* Add the trailing double quote into the buffer */
612 if (pgs2_enable_debug)
614 char *tmp = pnstrdup(*s, *slen);
616 elog(LOG, "escaped snippet keyword: %s", tmp);
625 GetSennaQuery(char *str, size_t len)
627 static sen_query *query_cache = NULL;
628 static char *key_cache = NULL;
629 static size_t len_cache = 0;
630 static bool guc_cache = false;
632 sen_encoding encoding;
636 bool needfree = false;
639 * Return the cached Senna query if the same keyword has
640 * been used the last time.
642 if (key_cache != NULL &&
644 strncmp(key_cache, str, len) == 0 &&
645 escape_snippet_keyword == guc_cache)
648 if (pgs2_enable_debug)
650 char *tmp = pnstrdup(str, len);
652 elog(LOG, "GetSennaQuery(): quick exit: %s", tmp);
659 encoding = GetSennaEncoding();
664 (errcode(ERRCODE_OUT_OF_MEMORY),
665 errmsg("out of memory")));
668 * We always cache the unescaped keyword, which enables us
669 * to check whether we can use the cached Senna query before
670 * escaping the keyword.
672 memcpy(key, str, len);
676 * If the keyword has been escaped, 'str' points to the
677 * newly-palloc'd space storing the escaped keyword. This
678 * space needs to be freed later.
680 if (escape_snippet_keyword)
681 needfree = EscapeSnippetKeyword(&str, &len);
683 query = sen_query_open(str, len, sen_sel_or, SEN_MAX_N_EXPRS,
689 (errmsg("sen_query_open() failed")));
692 if ((rest = sen_query_rest(query, NULL)) != 0)
694 (errmsg("too many expressions (%d)", rest)));
696 if (query_cache != NULL)
698 sen_query_close(query_cache);
705 guc_cache = escape_snippet_keyword;
714 pgs2snippet1(PG_FUNCTION_ARGS)
716 int flags = PG_GETARG_INT32(0);
717 uint32 width = PG_GETARG_UINT32(1);
718 uint32 max_results = PG_GETARG_UINT32(2);
719 text *opentags = PG_GETARG_TEXT_P(3);
720 text *closetags = PG_GETARG_TEXT_P(4);
721 int mapping = PG_GETARG_INT32(5);
722 text *keywords = PG_GETARG_TEXT_P(6);
723 text *document = PG_GETARG_TEXT_P(7);
725 sen_snip *snip = NULL;
726 const char *opentags_str = VARDATA_ANY(opentags);
727 const char *closetags_str = VARDATA_ANY(closetags);
728 char *keywords_str = VARDATA_ANY(keywords);
729 char *document_str = VARDATA_ANY(document);
730 uint32 opentags_len = VARSIZE_ANY_EXHDR(opentags);
731 uint32 closetags_len = VARSIZE_ANY_EXHDR(closetags);
732 uint32 keywords_len = VARSIZE_ANY_EXHDR(keywords);
733 uint32 document_len = VARSIZE_ANY_EXHDR(document);
735 uint32 max_tagged_len = 0;
738 uint32 result_len = 0;
739 bool return_null = false;
741 query = GetSennaQuery(keywords_str, keywords_len);
743 snip = sen_query_snip(query, flags, width, max_results, 1,
744 &opentags_str, &opentags_len,
745 &closetags_str, &closetags_len,
746 mapping == 0 ? NULL : (sen_snip_mapping *)-1);
749 (errmsg("sen_query_snip() failed")));
753 rc = sen_snip_exec(snip, document_str, document_len,
754 &nresults, &max_tagged_len);
755 if (rc != sen_success)
757 (errmsg("sen_snip_exec() failed: %d", rc)));
759 result = (text *) palloc(max_tagged_len + VARHDRSZ);
761 rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
762 if (rc == sen_invalid_argument)
764 else if (rc != sen_success)
766 (errmsg("sen_snip_get_result() failed: %d", rc)));
770 sen_snip_close(snip);
775 sen_snip_close(snip);
780 SET_VARSIZE(result, max_tagged_len + VARHDRSZ);
782 PG_RETURN_TEXT_P(result);
786 * Make sure there is enough space for 'needed' more bytes.
788 * Sets *buf to the allocated space which can store the needed bytes if OK,
789 * or to NULL if the space could not be enlarged because 'needed' is larger than 'maxlen'.
792 pgs2malloc(void **buf, long *buflen, long needed, long maxlen)
795 if (pgs2_enable_debug)
796 elog(LOG, "pgs2malloc(): buflen %ld, needed %ld, maxlen %ld",
797 *buflen, needed, maxlen);
800 if (*buf != NULL && *buflen >= needed && (*buflen <= maxlen || maxlen == 0))
801 return; /* got enough space already */
804 * Release the already-allocated space since it's too small to
805 * store the needed bytes or larger than the upper limit.
815 * Don't allocate any space if the needed space is larger than
818 if (needed > maxlen && maxlen != 0)
822 * Allocate the space for the needed bytes.
824 * We don't want to allocate just a little more space with each enlarge;
825 * for efficiency, double the buffer size each time it overflows.
826 * Actually, we might need to more than double it if 'needed' is big...
828 * We check whether '*buflen' overflows each cycle to avoid infinite loop.
831 while (*buflen < needed && *buflen != 0)
835 * Clamp to maxlen in case we went past it. Note we are assuming
836 * here that maxlen <= LONG_MAX/2, else the above loop could
837 * overflow. We will still have *buflen >= needed.
839 if (*buflen > maxlen && maxlen != 0)
842 /* Guard against out-of-range '*buflen' value */
845 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
846 errmsg("out of memory"),
847 errdetail("Cannot enlarge buffer by %ld more bytes.",
850 *buf = (void *) malloc(*buflen);
853 (errcode(ERRCODE_OUT_OF_MEMORY),
854 errmsg("out of memory")));
858 pgs2norm(PG_FUNCTION_ARGS)
860 text *str = PG_GETARG_TEXT_PP(0);
861 char *s = VARDATA_ANY(str);
862 long slen = VARSIZE_ANY_EXHDR(str);
870 * norm_cache is the cache memory storing both input and normalized strings
871 * as the result of pgs2norm(). norm_cache_size is the size of norm_cache
872 * and its upper limit is specified by norm_cache_limit parameter. norm_result
873 * is the pointer to the normalized string with the varlena header (i.e.,
874 * text type) stored in the latter half of the cache. norm_reslen is the size
875 * of norm_result. norm_slen is the size of the input string which is stored
876 * in the first half of the cache.
878 static char *norm_cache = NULL;
879 static long norm_cache_size = 0;
880 static long norm_slen = 0;
881 static char *norm_result = NULL;
882 static long norm_reslen = 0;
885 * Return the cached normalization result if the given string is the same
886 * as the one that was normalized the last time.
888 if (norm_cache != NULL &&
890 strncmp(norm_cache, s, slen) == 0)
893 if (pgs2_enable_debug)
895 char *tmp = text_to_cstring(str);
897 elog(LOG, "pgs2norm(): quick exit: %s", tmp);
902 PG_RETURN_TEXT_P(pnstrdup(norm_result, norm_reslen));
905 /* Confirm that database encoding is UTF-8 */
909 * Allocate the result buffer to store the normalized string. Since the size of
910 * normalized string can be larger than that of input one, the result buffer needs
911 * extra space. The problem is that, before calling sen_str_normalize, we need to
912 * allocate the result buffer but cannot know how much extra space is required.
913 * So we use RESULT_EXTRA_SIZE as the estimated size of extra space here.
915 #define RESULT_EXTRA_SIZE 64
916 buflen = slen + RESULT_EXTRA_SIZE;
919 result = (text *) palloc(buflen + VARHDRSZ);
921 #if defined(FAST_SENNA)
922 reslen = fast_sen_str_normalize(s, slen, VARDATA(result), buflen);
924 reslen = sen_str_normalize(s, slen, sen_enc_utf8,
926 VARDATA(result), buflen);
931 (errmsg("could not normalize the string")));
934 * If the result buffer size is too short to store the normalized string,
935 * we enlarge the buffer and retry the string normalization.
937 if (buflen <= reslen)
944 SET_VARSIZE(result, reslen + VARHDRSZ);
947 * Cache both input and normalized strings to accelerate the subsequent
948 * calls of pgs2norm() with the same input string. But we don't do that
949 * if the maximum allowed size of the cache is too small to store them.
951 needed = slen + reslen + VARHDRSZ;
952 maxlen = ((norm_cache_limit >= 0) ? norm_cache_limit : work_mem) * 1024L;
954 pgs2malloc((void **) &norm_cache, &norm_cache_size, needed, maxlen);
955 if (norm_cache != NULL)
957 /* Store the input string into the first half of the cache */
959 memcpy(norm_cache, s, slen);
962 * Store the normalized string with the varlena header (i.e., text type)
963 * into the latter half of the cache.
965 norm_result = norm_cache + slen;
966 norm_reslen = reslen + VARHDRSZ;
967 memcpy(norm_result, result, norm_reslen);
971 if (pgs2_enable_debug)
973 char *tmp = text_to_cstring(str);
975 elog(LOG, "pgs2norm(): complete (%s result cache): %s",
976 (norm_cache == NULL) ? "unset" : "set", tmp);
981 PG_RETURN_TEXT_P(result);
985 * Report the version and configure options of Senna which
986 * ludia_funcs depends on.
989 pgs2seninfo(PG_FUNCTION_ARGS)
991 char *version[MAXPGPATH];
992 char *coptions[MAXPGPATH];
999 * Get the version and configure options of Senna. Ignore the
1000 * return value of sen_info() because it always returns a success.
1002 sen_info((char **)&version, (char **)&coptions, NULL, NULL, NULL, NULL);
1005 * Construct a tuple descriptor for the result row. This must
1006 * match this function's ludia_funcs--x.x.sql entry.
1008 tupdesc = CreateTemplateTupleDesc(2, false);
1009 TupleDescInitEntry(tupdesc, (AttrNumber) 1,
1010 "version", TEXTOID, -1, 0);
1011 TupleDescInitEntry(tupdesc, (AttrNumber) 2,
1012 "configure_options", TEXTOID, -1, 0);
1013 tupdesc = BlessTupleDesc(tupdesc);
1016 values[0] = CStringGetTextDatum(*version);
1019 /* configure option */
1020 values[1] = CStringGetTextDatum(*coptions);
1023 tuple = heap_form_tuple(tupdesc, values, isnull);
1024 PG_RETURN_DATUM(HeapTupleGetDatum(tuple));