OSDN Git Service

Add thesaurus dictionary which can replace N>0 lexemes by M>0 lexemes.
authorTeodor Sigaev <teodor@sigaev.ru>
Wed, 31 May 2006 14:05:31 +0000 (14:05 +0000)
committerTeodor Sigaev <teodor@sigaev.ru>
Wed, 31 May 2006 14:05:31 +0000 (14:05 +0000)
It required some changes in lexize algorithm, but interface with
dictionaries stays compatible with old dictionaries.

Funded by Georgia Public Library Service and LibLime, Inc.

13 files changed:
contrib/tsearch2/Makefile
contrib/tsearch2/common.c
contrib/tsearch2/common.h
contrib/tsearch2/dict.c
contrib/tsearch2/dict.h
contrib/tsearch2/dict_thesaurus.c [new file with mode: 0644]
contrib/tsearch2/expected/tsearch2.out
contrib/tsearch2/stopword.c
contrib/tsearch2/thesaurus [new file with mode: 0644]
contrib/tsearch2/ts_cfg.c
contrib/tsearch2/ts_lexize.c [new file with mode: 0644]
contrib/tsearch2/tsearch.sql.in
contrib/tsearch2/untsearch.sql.in

index 3e322bb..393e3fa 100644 (file)
@@ -1,13 +1,13 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $
 
 MODULE_big = tsearch2
 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
-       dict_snowball.o dict_ispell.o dict_syn.o \
+       dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
        wparser.o wparser_def.o \
        ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
        tsvector_op.o rank.o ts_stat.o \
        query_util.o query_support.o query_rewrite.o query_gist.o \
-       ts_locale.o ginidx.o
+       ts_locale.o ts_lexize.o ginidx.o
 
 SUBDIRS     := snowball ispell wordparser
 SUBDIROBJS  := $(SUBDIRS:%=%/SUBSYS.o)
@@ -16,7 +16,7 @@ OBJS  += $(SUBDIROBJS)
 
 PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
 
-DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
+DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
 DATA_built = tsearch2.sql untsearch2.sql
 DOCS = README.tsearch2
 REGRESS = tsearch2
index 4984c3d..c7b9cd3 100644 (file)
@@ -5,6 +5,7 @@
 #include "catalog/pg_proc.h"
 #include "catalog/pg_namespace.h"
 #include "utils/syscache.h"
+#include "miscadmin.h"
 
 #include "ts_cfg.h"
 #include "dict.h"
@@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)
 
        return nspoid;
 }
+
+    /* if path is relative, take it as relative to share dir */
+char *
+to_absfilename(char *filename) {
+       if (!is_absolute_path(filename)) {
+               char        sharepath[MAXPGPATH];
+               char       *absfn;
+#ifdef  WIN32
+               char    delim = '\\';
+#else
+               char    delim = '/';
+#endif
+               get_share_path(my_exec_path, sharepath);
+               absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
+               sprintf(absfn, "%s%c%s", sharepath, delim, filename);
+               filename = absfn;
+       }
+
+       return filename;
+}
index c84e841..d2f4cd6 100644 (file)
@@ -16,6 +16,8 @@ text     *mtextdup(text *in);
 
 int                    text_cmp(text *a, text *b);
 
+char * to_absfilename(char *filename);
+
 #define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
 #define ARRNELEMS(x)  ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
 
index 9d91235..2c37a26 100644 (file)
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */
 
 /*
  * interface functions to dictionary
@@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
                Datum           opt;
                Oid                     oid = InvalidOid;
 
+               /* setup dictlexize method */
+               oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
+               if (isnull || oid == InvalidOid)
+                       ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
+               fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
+
+               /* setup and call dictinit method, optinally */
                oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
                if (!(isnull || oid == InvalidOid))
                {
                        opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
                        dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
                }
-               oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
-               if (isnull || oid == InvalidOid)
-                       ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
-               fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
                dict->dict_id = id;
        }
        else
@@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
        return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
 }
 
+static void
+insertdict(Oid id) {
+       DictInfo        newdict;
+
+       if (DList.len == DList.reallen)
+       {
+               DictInfo   *tmp;
+               int                     reallen = (DList.reallen) ? 2 * DList.reallen : 16;
+
+               tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
+               if (!tmp)
+                       ts_error(ERROR, "No memory");
+               DList.reallen = reallen;
+               DList.list = tmp;
+       }
+       init_dict(id, &newdict);
+
+       DList.list[DList.len] = newdict;
+       DList.len++;
+
+       qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
+}
+
 DictInfo *
 finddict(Oid id)
 {
@@ -117,23 +143,8 @@ finddict(Oid id)
                        return DList.last_dict;
        }
 
-       /* last chance */
-       if (DList.len == DList.reallen)
-       {
-               DictInfo   *tmp;
-               int                     reallen = (DList.reallen) ? 2 * DList.reallen : 16;
-
-               tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
-               if (!tmp)
-                       ts_error(ERROR, "No memory");
-               DList.reallen = reallen;
-               DList.list = tmp;
-       }
-       DList.last_dict = &(DList.list[DList.len]);
-       init_dict(id, DList.last_dict);
-
-       DList.len++;
-       qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
+       /* insert new dictionary */ 
+       insertdict(id);
        return finddict(id); /* qsort changed order!! */ ;
 }
 
@@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
                           *ptr;
        Datum      *da;
        ArrayType  *a;
+       DictSubState    dstate = { false, false, NULL };
 
        SET_FUNCOID();
        dict = finddict(PG_GETARG_OID(0));
 
        ptr = res = (TSLexeme *) DatumGetPointer(
-                                                                                 FunctionCall3(&(dict->lexize_info),
+                                                                               FunctionCall4(&(dict->lexize_info),
+                                                                               PointerGetDatum(dict->dictionary),
+                                                                               PointerGetDatum(VARDATA(in)),
+                                                                               Int32GetDatum(VARSIZE(in) - VARHDRSZ),
+                                                                               PointerGetDatum(&dstate)
+                                                                                                               )
+               );
+
+       if (dstate.getnext)  {
+               dstate.isend = true;    
+               ptr = res = (TSLexeme *) DatumGetPointer(
+                                                                               FunctionCall4(&(dict->lexize_info),
                                                                                   PointerGetDatum(dict->dictionary),
                                                                                                PointerGetDatum(VARDATA(in)),
-                                                                               Int32GetDatum(VARSIZE(in) - VARHDRSZ)
+                                                                               Int32GetDatum(VARSIZE(in) - VARHDRSZ),
+                                                                               PointerGetDatum(&dstate)
                                                                                                                )
                );
+       }
+
        PG_FREE_IF_COPY(in, 1);
        if (!res)
        {
index 7a6153c..a0e9fe6 100644 (file)
@@ -1,9 +1,10 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */
 
 #ifndef __DICT_H__
 #define __DICT_H__
 #include "postgres.h"
 #include "fmgr.h"
+#include "ts_cfg.h"
 
 typedef struct
 {
@@ -29,6 +30,11 @@ DictInfo   *finddict(Oid id);
 Oid                    name2id_dict(text *name);
 void           reset_dict(void);
 
+typedef struct {
+       bool isend; /* in: marks for lexize_info about text end is reached */
+       bool getnext; /* out: dict wants next lexeme */
+       void    *private;  /* internal dict state between calls with getnext == true */
+} DictSubState;
 
 /* simple parser of cfg string */
 typedef struct
@@ -45,17 +51,61 @@ typedef struct
        /*
         * number of variant of split word , for example Word 'fotballklubber'
         * (norwegian) has two varian to split: ( fotball, klubb ) and ( fot,
-        * ball, klubb ). So, dictionary should return: nvariant        lexeme 1
-        * fotball 1       klubb 2               fot 2           ball 2           klubb
-        *
+        * ball, klubb ). So, dictionary should return: 
+        * nvariant     lexeme 
+        *   1          fotball 
+        *   1          klubb 
+        *       2              fot 
+        *       2              ball 
+        *   2          klubb
         */
        uint16          nvariant;
 
-       /* currently unused */
        uint16          flags;
 
        /* C-string */
        char       *lexeme;
 }      TSLexeme;
 
+#define TSL_ADDPOS             0x01
+
+
+/*
+ * Lexize subsystem
+ */
+
+typedef struct ParsedLex {
+    int        type;
+    char       *lemm;
+    int        lenlemm;
+       bool            resfollow;
+    struct ParsedLex *next;
+} ParsedLex;
+
+typedef struct ListParsedLex {
+       ParsedLex       *head;
+       ParsedLex       *tail;
+} ListParsedLex;
+
+typedef struct {
+    TSCfgInfo       *cfg;
+    Oid             curDictId;
+    int             posDict;
+    DictSubState    dictState;
+    ParsedLex       *curSub;
+       ListParsedLex   towork;   /* current list to work */
+       ListParsedLex   waste;    /* list of lexemes that already lexized */
+
+       /* fields to store last variant to lexize (basically, thesaurus 
+          or similar to, which wants  several lexemes */       
+          
+       ParsedLex               *lastRes;
+       TSLexeme                *tmpRes;
+} LexizeData;
+
+
+void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
+void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
+TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);
+
 #endif
diff --git a/contrib/tsearch2/dict_thesaurus.c b/contrib/tsearch2/dict_thesaurus.c
new file mode 100644 (file)
index 0000000..8e543a4
--- /dev/null
@@ -0,0 +1,743 @@
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.1 2006/05/31 14:05:31 teodor Exp $ */
+
+/*
+ * thesaurus
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+#include "executor/spi.h"
+
+#include <ctype.h>
+
+#include "dict.h"
+#include "common.h"
+#include "ts_locale.h"
+
+typedef struct LexemeInfo {
+       uint16  idsubst; /* entry's number in DictThesaurus->subst */
+       uint16  posinsubst; /* pos info in entry */
+       uint16  tnvariant;  /* total num lexemes in one variant */
+       struct LexemeInfo *nextentry;
+       struct LexemeInfo *nextvariant;
+} LexemeInfo;
+
+typedef struct {
+       char            *lexeme;
+       LexemeInfo      *entries;
+} TheLexeme; 
+
+typedef struct {
+       uint16  lastlexeme; /* number lexemes to substitute */
+       uint16  reslen;
+       TSLexeme        *res;   /* prepared substituted result */ 
+} TheSubstitute;
+
+typedef struct
+{
+       /* subdictionary to normalize lexemes */        
+       DictInfo        subdict;
+
+       /* Array to search lexeme by exact match */
+       TheLexeme       *wrds;
+       int                     nwrds;
+       int                     ntwrds;
+
+       /* Storage of substituted result, n-th element is for
+          n-th expression */
+       TheSubstitute   *subst;
+       int                             nsubst;
+}      DictThesaurus;
+
+PG_FUNCTION_INFO_V1(thesaurus_init);
+Datum          thesaurus_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(thesaurus_lexize);
+Datum          thesaurus_lexize(PG_FUNCTION_ARGS);
+
+static void
+freeDictThesaurus(DictThesaurus * d)
+{
+       free(d);
+}
+
+static void
+newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst ) {
+       TheLexeme       *ptr;
+
+       if ( d->nwrds >= d->ntwrds ) {
+               if ( d->ntwrds == 0 ) {
+                       d->ntwrds = 16;
+                       d->wrds = (TheLexeme*)malloc(sizeof(TheLexeme) * d->ntwrds);
+               } else {
+                       d->ntwrds *= 2;
+                       d->wrds = (TheLexeme*)realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
+               }
+               if (!d->wrds)
+                       elog(ERROR,"Out of memory");
+       }
+
+       ptr = d->wrds + d->nwrds;
+       d->nwrds++;
+
+       if  ( (ptr->lexeme = malloc(e-b+1)) == NULL )
+               elog(ERROR,"Out of memory");
+
+       memcpy(ptr->lexeme, b, e-b);
+       ptr->lexeme[e-b] = '\0';
+
+       if  ( (ptr->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) ))==NULL )
+               elog(ERROR,"Out of memory");
+
+       ptr->entries->nextentry=NULL;
+       ptr->entries->idsubst = idsubst;
+       ptr->entries->posinsubst = posinsubst;
+}
+
+static void
+addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
+       static  int nres=0;
+       static  int ntres = 0;
+       TheSubstitute   *ptr;
+
+       if ( nwrd == 0 ) {
+               nres = ntres = 0;
+
+               if ( idsubst <= d->nsubst ) {
+                       if ( d->nsubst == 0 ) {
+                               d->nsubst = 16;
+                               d->subst = (TheSubstitute*)malloc(sizeof(TheSubstitute) * d->nsubst);
+                       } else {
+                               d->nsubst *= 2;
+                               d->subst = (TheSubstitute*)realloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
+                       }
+                       if (!d->subst)
+                               elog(ERROR,"Out of memory");
+               }
+       }
+
+       ptr = d->subst + idsubst;
+
+       ptr->lastlexeme = posinsubst-1;
+
+       if ( nres+1 >= ntres ) {
+               if ( ntres == 0 ) {
+                       ntres = 2;
+                       ptr->res = (TSLexeme*)malloc( sizeof(TSLexeme) * ntres );
+               } else {
+                       ntres *= 2;
+                       ptr->res = (TSLexeme*)realloc( ptr->res, sizeof(TSLexeme) * ntres );
+               }
+
+               if ( !ptr->res ) 
+                               elog(ERROR,"Out of memory");
+       }
+
+       if ( (ptr->res[ nres ].lexeme = malloc(e-b+1))==0 ) 
+               elog(ERROR,"Out of memory");
+       memcpy(ptr->res[ nres ].lexeme, b, e-b);
+       ptr->res[ nres ].lexeme[e-b] = '\0';
+
+       ptr->res[ nres ].nvariant = nwrd;
+       ptr->res[ nres ].flags = TSL_ADDPOS;
+
+       ptr->res[ ++nres ].lexeme = NULL;
+}
+
+#define TR_WAITLEX     1
+#define TR_INLEX       2
+#define        TR_WAITSUBS     3
+#define TR_INSUBS      4
+
+static void
+thesaurusRead( char *filename, DictThesaurus *d ) {
+       FILE *fh;
+       char str[BUFSIZ];
+       int lineno=0;
+       uint16  idsubst = 0;
+
+       fh = fopen(to_absfilename(filename), "r");
+       if (!fh)
+               elog(ERROR,"Thesaurus: can't open '%s' file", filename);
+
+       while( fgets(str, sizeof(str), fh)) {
+               char *ptr = str;
+               int state = TR_WAITLEX;
+               char    *beginwrd = NULL;
+               uint16  posinsubst=0;
+               uint16  nwrd=0;
+
+               lineno++;
+
+               /* is it comment ? */
+               while( t_isspace(ptr) )
+                       ptr += pg_mblen(ptr);
+               if ( t_iseq(str, '#') || *str=='\0' || t_iseq(str, '\n') || t_iseq(str, '\r') )
+                       continue;
+
+               pg_verifymbstr(ptr, strlen(ptr), false);
+               while(*ptr) {
+                       if ( state == TR_WAITLEX ) {
+                               if ( t_iseq(ptr, ':' ) ) {
+                                       if ( posinsubst == 0 ) {
+                                               fclose(fh);
+                                               elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
+                                       }
+                                       state = TR_WAITSUBS;
+                               } else if ( !t_isspace(ptr) ) {
+                                       beginwrd = ptr;
+                                       state = TR_INLEX;
+                               }
+                       } else if ( state == TR_INLEX ) {
+                               if ( t_iseq(ptr, ':') ) {
+                                       newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
+                                       state = TR_WAITSUBS;
+                               } else if ( t_isspace(ptr) ) {
+                                       newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
+                                       state = TR_WAITLEX;
+                               }
+                       } else if ( state == TR_WAITSUBS ) {
+                               if ( !t_isspace(ptr) ) { 
+                                       beginwrd = ptr;
+                                       state = TR_INSUBS;
+                               }
+                       } else if ( state == TR_INSUBS ) {
+                               if ( t_isspace(ptr) ) { 
+                                       addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
+                                       state = TR_WAITSUBS;
+                               }
+                       } else
+                               elog(ERROR,"Thesaurus: Unknown state: %d", state);
+                               
+                       ptr += pg_mblen(ptr);
+               }
+
+               if ( state == TR_INSUBS )
+                       addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
+
+               idsubst++;
+
+               if ( !(nwrd && posinsubst) ) {
+                       fclose(fh);
+                       elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
+               }
+                       
+       }
+
+       d->nsubst = idsubst;
+
+       fclose(fh);
+}
+
+static TheLexeme*
+addCompiledLexeme(TheLexeme   *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo* src, uint16 tnvariant) {
+
+       if ( *nnw >= *tnm ) {
+               *tnm *= 2;
+               newwrds = (TheLexeme*)realloc( newwrds, sizeof(TheLexeme) * *tnm);
+               if (!newwrds)
+                       elog(ERROR,"Out of memory");
+       }
+
+       newwrds[ *nnw ].entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) );
+       if (!newwrds[ *nnw ].entries)
+               elog(ERROR,"Out of memory");
+
+       if ( lexeme && lexeme->lexeme ) {
+               newwrds[ *nnw ].lexeme = strdup( lexeme->lexeme );
+               if ( !newwrds[ *nnw ].lexeme )
+                       elog(ERROR,"Out of memory");
+
+               newwrds[ *nnw ].entries->tnvariant = tnvariant;
+       } else {
+               newwrds[ *nnw ].lexeme = NULL;
+               newwrds[ *nnw ].entries->tnvariant = 1;
+       }
+
+       newwrds[ *nnw ].entries->idsubst = src->idsubst;
+       newwrds[ *nnw ].entries->posinsubst = src->posinsubst;
+
+       newwrds[ *nnw ].entries->nextentry = NULL;
+
+       (*nnw)++;
+       return newwrds;
+}
+
+static int
+cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) {
+       if ( a==NULL || b==NULL )
+               return 0;
+
+       if ( a->idsubst == b->idsubst ) {
+               if ( a->posinsubst == b->posinsubst ) {
+                       if ( a->tnvariant == b->tnvariant ) 
+                                       return 0;
+
+                       return ( a->tnvariant > b->tnvariant ) ? 1 : -1;
+               }
+
+               return ( a->posinsubst > b->posinsubst ) ? 1 : -1;
+       }
+
+       return ( a->idsubst > b->idsubst ) ? 1 : -1;
+}
+
+static int
+cmpLexeme(TheLexeme *a, TheLexeme* b) {
+       if ( a->lexeme == NULL ) {
+               if ( b->lexeme == NULL )
+                       return 0;
+               else
+                       return 1;
+       } else if ( b->lexeme == NULL )
+               return -1;
+
+       return strcmp( a->lexeme, b->lexeme );
+}
+
+static int
+cmpLexemeQ(const void *a, const void *b) {
+       return cmpLexeme( (TheLexeme*)a, (TheLexeme*)b ); 
+}
+
+static int cmpTheLexeme(const void *a, const void *b) {
+       TheLexeme *la  = (TheLexeme*)a;
+       TheLexeme *lb  = (TheLexeme*)b;
+       int res;
+
+       if ( (res=cmpLexeme(la, lb)) != 0 )
+               return res;
+
+       return -cmpLexemeInfo(la->entries, lb->entries);
+}
+
+static void
+compileTheLexeme(DictThesaurus *d) {
+       int                     i,nnw=0, tnm=16;
+       TheLexeme       *newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds;
+
+       if (!newwrds) 
+               elog(ERROR,"Out of memory");
+
+       for(i=0;i<d->nwrds;i++) {
+               TSLexeme *ptr = (TSLexeme*) DatumGetPointer( 
+                               FunctionCall4(
+                                       &(d->subdict.lexize_info),
+                                       PointerGetDatum(d->subdict.dictionary),
+                                       PointerGetDatum(d->wrds[i].lexeme),
+                                       Int32GetDatum(strlen(d->wrds[i].lexeme)),
+                                       PointerGetDatum(NULL)
+                               )
+                       );
+
+               if ( !(ptr && ptr->lexeme) ) {
+                       newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
+                       elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, assign any non-recognized word", d->wrds[i].lexeme);
+               } else {
+                       while( ptr->lexeme ) {
+                               TSLexeme        *remptr = ptr+1;
+                               int tnvar = 1;
+                               int     curvar = ptr->nvariant;
+
+                               /* compute n words in one variant */
+                               while( remptr->lexeme ) {
+                                       if ( remptr->nvariant != (remptr-1)->nvariant )
+                                               break;
+                                       tnvar++;
+                                       remptr++;
+                               }
+
+                               remptr = ptr;
+                               while( remptr->lexeme && remptr->nvariant == curvar ) {
+                                       newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); 
+                                       remptr++;
+                               }
+
+                               ptr = remptr;
+                       }
+               }
+
+               free( d->wrds[i].lexeme );
+               free( d->wrds[i].entries );
+       }
+
+       free( d->wrds );
+       d->wrds = newwrds;
+       d->nwrds = nnw;
+       d->ntwrds = tnm;
+
+       if ( d->nwrds > 1 ) {
+               qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme ); 
+
+               /* uniq */
+               newwrds = d->wrds;
+               ptrwrds = d->wrds + 1;
+               while( ptrwrds - d->wrds < d->nwrds ) {
+                       if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) {
+                               if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) {
+                                       ptrwrds->entries->nextentry = newwrds->entries;
+                                       newwrds->entries = ptrwrds->entries;
+                               } else
+                                       free( ptrwrds->entries );
+
+                               if ( ptrwrds->lexeme )
+                                       free( ptrwrds->lexeme );
+                       } else {
+                               newwrds++;
+                               *newwrds = *ptrwrds;
+                       }
+
+                       ptrwrds++;
+               }
+
+               d->nwrds = newwrds - d->wrds + 1;
+               d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds );
+       }
+}
+
+static void
+compileTheSubstitute(DictThesaurus *d) {
+       int i;
+
+       for(i=0;i<d->nsubst;i++) {
+               TSLexeme        *rem = d->subst[i].res, *outptr, *inptr;
+               int                     n=2;
+
+               outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n );
+               if ( d->subst[i].res == NULL )
+                       elog(ERROR,"Out of Memory");
+               outptr->lexeme = NULL;
+               inptr = rem;
+
+               while( inptr && inptr->lexeme ) { 
+                       TSLexeme        *reml, *lexized = (TSLexeme*) DatumGetPointer( 
+                               FunctionCall4(
+                                       &(d->subdict.lexize_info),
+                                       PointerGetDatum(d->subdict.dictionary),
+                                       PointerGetDatum(inptr->lexeme),
+                                       Int32GetDatum(strlen(inptr->lexeme)),
+                                       PointerGetDatum(NULL)
+                               )
+                       );
+
+                       reml = lexized;
+                       if ( lexized ) {
+                               int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res)  : -1;
+
+                               while( lexized->lexeme ) {
+                                       if ( outptr - d->subst[i].res + 1 >= n ) {
+                                               int diff = outptr - d->subst[i].res;
+                                               n *= 2;
+                                               d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n );
+                                               if ( d->subst[i].res == NULL )
+                                                       elog(ERROR,"Out of Memory");
+                                               outptr = d->subst[i].res + diff;
+                                       }
+
+                                       *outptr = *lexized;
+                                       if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL )
+                                               elog(ERROR,"Out of Memory");
+
+                                       outptr++;
+                                       lexized++;
+                               }
+
+                               if ( toset > 0)
+                                       d->subst[i].res[toset].flags |= TSL_ADDPOS;
+                       }
+
+                       if ( inptr->lexeme )
+                               free( inptr->lexeme );
+                       inptr++;
+               }
+
+               d->subst[i].reslen = outptr - d->subst[i].res;
+
+               free(rem);
+       }
+}
+
+Datum
+thesaurus_init(PG_FUNCTION_ARGS)
+{
+       DictThesaurus *d;
+       Map                *cfg,
+                          *pcfg;
+       text       *in, *subdictname=NULL;
+       bool            fileloaded = false;
+
+       if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                                errmsg("Thesaurus confguration error")));
+
+       d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
+       if (!d)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
+       memset(d, 0, sizeof(DictThesaurus));
+
+       in = PG_GETARG_TEXT_P(0);
+       parse_cfgdict(in, &cfg);
+       PG_FREE_IF_COPY(in, 0);
+       pcfg = cfg;
+       while (pcfg->key)
+       {
+               if (pg_strcasecmp("DictFile", pcfg->key) == 0)
+               {
+                       if (fileloaded)
+                       {
+                               freeDictThesaurus(d);
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                                errmsg("Thesaurus file is already loaded")));
+                       }
+                       fileloaded = true;
+                       thesaurusRead( pcfg->value, d );
+               }
+               else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
+               {
+                       if (subdictname)
+                       {
+                               freeDictThesaurus(d);
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                                errmsg("Thesaurus: SubDictionary is already defined")));
+                       }
+                       subdictname = char2text( pcfg->value );
+               }
+               else
+               {
+                       freeDictThesaurus(d);
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_SYNTAX_ERROR),
+                                        errmsg("unrecognized option: %s => %s",
+                                                       pcfg->key, pcfg->value)));
+               }
+               pfree(pcfg->key);
+               pfree(pcfg->value);
+               pcfg++;
+       }
+       pfree(cfg);
+
+       if (!fileloaded)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("Thesaurus file  isn't defined")));
+
+       if ( subdictname ) {
+               DictInfo        *subdictptr;
+               /* 
+                * we already in SPI, but name2id_dict()/finddict()
+                * invoke SPI_connect()
+                */
+               SPI_push(); 
+
+               subdictptr = finddict( name2id_dict( subdictname ) );
+
+               SPI_pop();
+
+               d->subdict = *subdictptr;
+       } else 
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("Thesaurus: SubDictionary isn't defined")));
+
+       compileTheLexeme( d );
+       compileTheSubstitute(d);
+
+       PG_RETURN_POINTER(d);
+}
+
+static LexemeInfo*
+findTheLexeme(DictThesaurus *d, char * lexeme) {
+       TheLexeme key = { lexeme, NULL }, *res;
+
+       if ( d->nwrds == 0 )
+               return NULL;
+
+       res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
+
+       if ( res == NULL )
+               return NULL;
+       return res->entries;
+}
+
+static bool
+matchIdSubst(LexemeInfo *stored, uint16 idsubst) {
+       bool res = true;
+
+       if (stored) {
+               res = false;
+
+               for(; stored; stored=stored->nextvariant) 
+                       if ( stored->idsubst == idsubst ) {
+                               res = true;
+                               break;
+                       }
+       }
+
+       return res;
+}
+
+static LexemeInfo*
+findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) {
+       for(;;) {
+               int i;
+               LexemeInfo *ptr = newin[0];
+
+               for(i=0; i<newn; i++) {
+                       while(newin[i] && newin[i]->idsubst < ptr->idsubst) 
+                               newin[i] = newin[i]->nextentry;
+
+                       if ( newin[i] == NULL )
+                               return in;
+
+                       if ( newin[i]->idsubst > ptr->idsubst ) {
+                               ptr = newin[i];
+                               i=-1;
+                               continue;
+                       }
+
+                       while(newin[i]->idsubst == ptr->idsubst) {
+                               if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) {
+                                       ptr = newin[i];
+                                       break;
+                               }
+
+                               newin[i] = newin[i]->nextentry;
+                               if ( newin[i] == NULL )
+                                       return in;
+                       }
+
+                       if ( newin[i]->idsubst != ptr->idsubst ) {
+                               ptr = newin[i];
+                               i=-1;
+                               continue;
+                       }
+               }
+
+               if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */
+
+                       ptr->nextvariant = in;
+                       in = ptr;
+               }
+
+               /* step forward */
+               for(i=0; i<newn; i++)
+                       newin[i] = newin[i]->nextentry;
+       }
+
+       return NULL;
+}
+
+static TSLexeme*
+copyTSLexeme( TheSubstitute *ts ) {
+       TSLexeme        *res;
+       uint16 i;
+
+       res = (TSLexeme*)palloc( sizeof(TSLexeme) * (ts->reslen+1) );
+       for(i=0;i<ts->reslen;i++) {     
+               res[i] = ts->res[i];
+               res[i].lexeme = pstrdup( ts->res[i].lexeme );
+       }
+
+       res[ts->reslen].lexeme = NULL;
+
+       return res;
+}
+
+static TSLexeme*
+checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) {
+       *moreres = false;
+       while(info) {
+               Assert( info->idsubst < d->nsubst );
+               if ( info->nextvariant )
+                       *moreres = true;
+               if ( d->subst[ info->idsubst ].lastlexeme == curpos ) 
+                       return copyTSLexeme( d->subst + info->idsubst );
+               info = info->nextvariant;
+       }
+
+       return NULL;
+}
+
+Datum
+thesaurus_lexize(PG_FUNCTION_ARGS)
+{
+       DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
+       DictSubState    *dstate = (DictSubState*)PG_GETARG_POINTER(3);
+       TSLexeme        *res=NULL;
+       LexemeInfo *stored, *info = NULL;
+       uint16  curpos = 0;
+       bool    moreres = false;
+
+       if ( dstate == NULL || PG_NARGS() < 4 )
+               elog(ERROR,"Forbidden call of thesaurus or nested call");
+
+       if ( dstate->isend ) 
+               PG_RETURN_POINTER(NULL);
+       stored = (LexemeInfo*) dstate->private;
+
+       if (stored) 
+               curpos = stored->posinsubst+1;
+
+       res =(TSLexeme*) DatumGetPointer (
+               FunctionCall4(
+                       &(d->subdict.lexize_info),
+                       PointerGetDatum(d->subdict.dictionary),
+                       PG_GETARG_DATUM(1),
+                       PG_GETARG_INT32(2),
+                       PointerGetDatum(NULL)
+               )
+       );
+
+       if ( res && res->lexeme ) {
+               TSLexeme        *ptr = res , *basevar;
+
+               while( ptr->lexeme ) {
+                       uint16          nv = ptr->nvariant;
+                       uint16          i,nlex = 0;
+                       LexemeInfo      **infos;
+
+                       basevar = ptr;
+                       while( ptr->lexeme && nv == ptr->nvariant ) {
+                               nlex++;
+                               ptr++;
+                       }
+
+                       infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
+                       for(i=0;i<nlex;i++) 
+                               if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
+                                       break;
+
+                       if ( i<nlex ) { 
+                               /* no chance to find */
+                               pfree( infos );
+                               continue;
+                       }
+
+                       info = findVariant( info, stored, curpos, infos, nlex);
+               }
+
+       } else {
+               LexemeInfo      *infos = findTheLexeme(d, NULL);
+               info = findVariant( NULL, stored, curpos, &infos, 1);
+       }
+
+       dstate->private = (void*)info;
+
+       if ( !info ) {
+               dstate->getnext = false;
+               PG_RETURN_POINTER(NULL);
+       }
+                       
+       if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) {
+               dstate->getnext = moreres;
+               PG_RETURN_POINTER(res);
+       }
+
+       dstate->getnext = true;
+
+       PG_RETURN_POINTER(NULL);        
+}
index 39a95b2..35c97c9 100644 (file)
@@ -4,21 +4,21 @@
 --
 \set ECHO none
 psql:tsearch2.sql:13: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
-psql:tsearch2.sql:158: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
-psql:tsearch2.sql:257: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
-psql:tsearch2.sql:264: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
-psql:tsearch2.sql:370: NOTICE:  type "tsvector" is not yet defined
+psql:tsearch2.sql:177: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
+psql:tsearch2.sql:276: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
+psql:tsearch2.sql:283: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
+psql:tsearch2.sql:389: NOTICE:  type "tsvector" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:375: NOTICE:  argument type tsvector is only a shell
-psql:tsearch2.sql:429: NOTICE:  type "tsquery" is not yet defined
+psql:tsearch2.sql:394: NOTICE:  argument type tsvector is only a shell
+psql:tsearch2.sql:448: NOTICE:  type "tsquery" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:434: NOTICE:  argument type tsquery is only a shell
-psql:tsearch2.sql:592: NOTICE:  type "gtsvector" is not yet defined
+psql:tsearch2.sql:453: NOTICE:  argument type tsquery is only a shell
+psql:tsearch2.sql:611: NOTICE:  type "gtsvector" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:597: NOTICE:  argument type gtsvector is only a shell
-psql:tsearch2.sql:1087: NOTICE:  type "gtsq" is not yet defined
+psql:tsearch2.sql:616: NOTICE:  argument type gtsvector is only a shell
+psql:tsearch2.sql:1106: NOTICE:  type "gtsq" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:1092: NOTICE:  argument type gtsq is only a shell
+psql:tsearch2.sql:1111: NOTICE:  argument type gtsq is only a shell
 --tsvector
 SELECT '1'::tsvector;
  tsvector 
index f389471..e6141f8 100644 (file)
@@ -4,8 +4,6 @@
  */
 #include "postgres.h"
 
-#include "miscadmin.h"
-
 #include "common.h"
 #include "dict.h"
 #include "ts_locale.h"
@@ -36,30 +34,11 @@ readstoplist(text *in, StopList * s)
        s->len = 0;
        if (in && VARSIZE(in) - VARHDRSZ > 0)
        {
-               char       *filename = text2char(in);
+               char       *filename = to_absfilename(text2char(in));
                FILE       *hin;
                char            buf[STOPBUFLEN];
                int                     reallen = 0;
 
-               /* if path is relative, take it as relative to share dir */
-               if (!is_absolute_path(filename))
-               {
-                       char            sharepath[MAXPGPATH];
-                       char       *absfn;
-#ifdef WIN32
-                       char    delim = '\\';
-#else
-                       char    delim = '/';
-#endif
-
-                       get_share_path(my_exec_path, sharepath);
-                       absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
-                       sprintf(absfn, "%s%c%s", sharepath, delim, filename);
-
-                       pfree(filename);
-                       filename = absfn;
-               }
-
                if ((hin = fopen(filename, "r")) == NULL)
                        ereport(ERROR,
                                        (errcode(ERRCODE_CONFIG_FILE_ERROR),
diff --git a/contrib/tsearch2/thesaurus b/contrib/tsearch2/thesaurus
new file mode 100644 (file)
index 0000000..5591646
--- /dev/null
@@ -0,0 +1,19 @@
+#
+# Theasurus config file. Character ':' splits
+# string to part: 
+#     to be substituted string
+#     substituting string
+#
+
+#one two three : 123
+#one two : 12
+#one : 1
+#two : 2
+
+#foo bar : blah blah
+#f   bar : fbar
+#e   bar : ebar
+#g   bar bar : gbarbar
+#asd:sdffff
+#qwerty:qwer wert erty
+
index a71cf97..5a662b7 100644 (file)
@@ -281,15 +281,15 @@ name2id_cfg(text *name)
        return id;
 }
 
-
 void
 parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 {
        int                     type,
-                               lenlemm,
-                               i;
+                               lenlemm;
        char       *lemm = NULL;
        WParserInfo *prsobj = findprs(cfg->prs_id);
+       LexizeData      ldata;
+       TSLexeme   *norms;
 
        prsobj->prs = (void *) DatumGetPointer(
                                                                                   FunctionCall2(
@@ -299,14 +299,16 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
                                                                                                                 )
                );
 
-       while ((type = DatumGetInt32(FunctionCall3(
+       LexizeInit(&ldata, cfg);
+
+       do {
+               type = DatumGetInt32(FunctionCall3(
                                                                                           &(prsobj->getlexeme_info),
                                                                                           PointerGetDatum(prsobj->prs),
                                                                                           PointerGetDatum(&lemm),
-                                                                                  PointerGetDatum(&lenlemm)))) != 0)
-       {
+                                                                                  PointerGetDatum(&lenlemm)));
 
-               if (lenlemm >= MAXSTRLEN)
+               if (type>0 && lenlemm >= MAXSTRLEN)
                {
 #ifdef IGNORE_LONGLEXEME
                        ereport(NOTICE,
@@ -320,25 +322,11 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 #endif
                }
 
-               if (type >= cfg->len)   /* skip this type of lexeme */
-                       continue;
+               LexizeAddLemm(&ldata, type, lemm, lenlemm);
 
-               for (i = 0; i < cfg->map[type].len; i++)
+               while(  (norms = LexizeExec(&ldata, NULL)) != NULL )
                {
-                       DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-                       TSLexeme   *norms,
-                                          *ptr;
-
-                       norms = ptr = (TSLexeme *) DatumGetPointer(
-                                                                                                          FunctionCall3(
-                                                                                                               &(dict->lexize_info),
-                                                                                  PointerGetDatum(dict->dictionary),
-                                                                                                          PointerGetDatum(lemm),
-                                                                                                        PointerGetDatum(lenlemm)
-                                                                                                                                        )
-                               );
-                       if (!norms)                     /* dictionary doesn't know this lexeme */
-                               continue;
+                       TSLexeme *ptr = norms;
 
                        prs->pos++;                     /* set pos */
 
@@ -350,6 +338,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
                                        prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
                                }
 
+                               if ( ptr->flags & TSL_ADDPOS )
+                                       prs->pos++;
                                prs->words[prs->curwords].len = strlen(ptr->lexeme);
                                prs->words[prs->curwords].word = ptr->lexeme;
                                prs->words[prs->curwords].nvariant = ptr->nvariant;
@@ -359,9 +349,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
                                prs->curwords++;
                        }
                        pfree(norms);
-                       break;                          /* lexeme already normalized or is stop word */
-               }
        }
+       } while(type>0);
 
        FunctionCall1(
                                  &(prsobj->end_info),
@@ -417,14 +406,47 @@ hlfinditem(HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int buflen)
        }
 }
 
+static void
+addHLParsedLex(HLPRSTEXT *prs, QUERYTYPE * query, ParsedLex *lexs, TSLexeme *norms) {
+       ParsedLex       *tmplexs;
+       TSLexeme *ptr;
+
+       while( lexs ) {
+               
+               if ( lexs->type > 0 ) 
+                       hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+               ptr = norms;
+               while( ptr && ptr->lexeme ) {
+                       hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
+                       ptr++;
+               }
+
+               tmplexs = lexs->next;
+               pfree( lexs );
+               lexs = tmplexs;
+       }
+
+       if ( norms ) {
+               ptr = norms;
+               while( ptr->lexeme ) {
+                       pfree( ptr->lexeme );
+                       ptr++;
+               }
+               pfree(norms);
+       }
+}
+
 void
 hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
 {
        int                     type,
-                               lenlemm,
-                               i;
+                               lenlemm;
        char       *lemm = NULL;
        WParserInfo *prsobj = findprs(cfg->prs_id);
+       LexizeData      ldata;
+       TSLexeme        *norms;
+       ParsedLex       *lexs;
 
        prsobj->prs = (void *) DatumGetPointer(
                                                                                   FunctionCall2(
@@ -434,14 +456,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
                                                                                                                 )
                );
 
-       while ((type = DatumGetInt32(FunctionCall3(
+       LexizeInit(&ldata, cfg);
+
+       do {
+               type = DatumGetInt32(FunctionCall3(
                                                                                           &(prsobj->getlexeme_info),
                                                                                           PointerGetDatum(prsobj->prs),
                                                                                           PointerGetDatum(&lemm),
-                                                                                  PointerGetDatum(&lenlemm)))) != 0)
-       {
+                                                                       PointerGetDatum(&lenlemm)));
 
-               if (lenlemm >= MAXSTRLEN)
+               if (type>0 && lenlemm >= MAXSTRLEN)
                {
 #ifdef IGNORE_LONGLEXEME
                        ereport(NOTICE,
@@ -455,38 +479,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
 #endif
                }
 
-               hladdword(prs, lemm, lenlemm, type);
+               LexizeAddLemm(&ldata, type, lemm, lenlemm);
 
-               if (type >= cfg->len)
-                       continue;
+               do {
+                       if ( (norms = LexizeExec(&ldata,&lexs)) != NULL ) 
+                               addHLParsedLex(prs, query, lexs, norms);
+                       else 
+                               addHLParsedLex(prs, query, lexs, NULL);
+               } while( norms );
 
-               for (i = 0; i < cfg->map[type].len; i++)
-               {
-                       DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-                       TSLexeme   *norms,
-                                          *ptr;
-
-                       norms = ptr = (TSLexeme *) DatumGetPointer(
-                                                                                                          FunctionCall3(
-                                                                                                               &(dict->lexize_info),
-                                                                                  PointerGetDatum(dict->dictionary),
-                                                                                                          PointerGetDatum(lemm),
-                                                                                                        PointerGetDatum(lenlemm)
-                                                                                                                                        )
-                               );
-                       if (!norms)                     /* dictionary doesn't know this lexeme */
-                               continue;
-
-                       while (ptr->lexeme)
-                       {
-                               hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
-                               pfree(ptr->lexeme);
-                               ptr++;
-                       }
-                       pfree(norms);
-                       break;                          /* lexeme already normalized or is stop word */
-               }
-       }
+       } while( type>0 );
 
        FunctionCall1(
                                  &(prsobj->end_info),
diff --git a/contrib/tsearch2/ts_lexize.c b/contrib/tsearch2/ts_lexize.c
new file mode 100644 (file)
index 0000000..c90848c
--- /dev/null
@@ -0,0 +1,261 @@
+/*
+ * lexize stream of lexemes 
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+#include <locale.h>
+
+#include "ts_cfg.h"
+#include "dict.h"
+
+void
+LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
+       ld->cfg = cfg;
+       ld->curDictId = InvalidOid;
+       ld->posDict = 0;
+       ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+       ld->waste.head = ld->waste.tail = NULL;
+       ld->lastRes=NULL;
+       ld->tmpRes=NULL;
+}
+
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
+       if ( list->tail ) {
+               list->tail->next = newpl;
+               list->tail = newpl;
+       } else
+               list->head = list->tail = newpl;
+       newpl->next = NULL;
+}
+
+static ParsedLex*
+LPLRemoveHead(ListParsedLex *list) {
+       ParsedLex *res = list->head;
+
+       if ( list->head ) 
+               list->head = list->head->next;
+
+       if ( list->head == NULL )
+               list->tail = NULL;
+
+       return res;
+}
+
+
+void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
+       ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
+
+       newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
+       newpl->type = type;
+       newpl->lemm = lemm;
+       newpl->lenlemm = lenlemm;
+       LPLAddTail(&ld->towork, newpl);
+       ld->curSub = ld->towork.tail;
+}
+
+static void
+RemoveHead(LexizeData *ld) {
+       LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+
+       ld->posDict = 0;
+}
+
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
+       if ( correspondLexem ) {
+               *correspondLexem = ld->waste.head;
+       } else {
+               ParsedLex       *tmp, *ptr = ld->waste.head;
+
+               while(ptr) {
+                       tmp = ptr->next;
+                       pfree(ptr);
+                       ptr = tmp;
+               }
+       }
+       ld->waste.head = ld->waste.tail = NULL;
+}
+
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop) {
+       bool    go = true;
+
+       while( ld->towork.head && go) {
+               if (ld->towork.head == stop) {
+                       ld->curSub = stop->next;
+                       go = false;
+               }
+               RemoveHead(ld);
+       }
+}
+
+static void
+setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
+       if ( ld->tmpRes ) {
+               TSLexeme        *ptr;
+               for( ptr=ld->tmpRes; ptr->lexeme; ptr++ ) 
+                       pfree( ptr->lexeme );
+               pfree( ld->tmpRes );
+       }
+       ld->tmpRes = res;
+       ld->lastRes = lex;
+}
+
+TSLexeme*
+LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
+       int i;
+       ListDictionary  *map;
+       DictInfo *dict;
+       TSLexeme        *res;
+
+       if ( ld->curDictId == InvalidOid ) {
+               /* 
+                * usial mode: dictionary wants only one word,
+                * but we should keep in mind that we should go through
+                * all stack
+                */
+
+               while( ld->towork.head ) {
+                       ParsedLex       *curVal = ld->towork.head;
+
+                       map = ld->cfg->map + curVal->type;
+
+                       if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {      
+                               /* skip this type of lexeme */
+                               RemoveHead(ld);
+                               continue;
+                       }
+
+                       for (i = ld->posDict; i < map->len; i++) {
+                               dict = finddict(DatumGetObjectId(map->dict_id[i]));
+
+                               ld->dictState.isend = ld->dictState.getnext = false;
+                               ld->dictState.private = NULL;
+                               res = (TSLexeme *) DatumGetPointer( FunctionCall4(
+                                                                                                       &(dict->lexize_info),
+                                                                                                       PointerGetDatum(dict->dictionary),
+                                                                                                       PointerGetDatum(curVal->lemm),
+                                                                                                       Int32GetDatum(curVal->lenlemm),
+                                                                                                       PointerGetDatum(&ld->dictState)
+                                                                                ));
+
+                               if ( ld->dictState.getnext ) {
+                                       /* 
+                                        * dictinary wants next word, so setup and store
+                                        * current position and go to multiword  mode
+                                        */
+                                        
+                                       ld->curDictId = DatumGetObjectId(map->dict_id[i]);
+                                       ld->posDict = i+1;
+                                       ld->curSub = curVal->next;
+                                       if ( res )
+                                               setNewTmpRes(ld, curVal, res);
+                                       return LexizeExec(ld, correspondLexem);
+                               }
+
+                               if (!res)                       /* dictionary doesn't know this lexeme */
+                                       continue;
+                               
+                               RemoveHead(ld);
+                               setCorrLex(ld, correspondLexem);
+                               return res;
+                       }
+
+                       RemoveHead(ld);
+               } 
+       } else { /* curDictId is valid */
+               dict = finddict(ld->curDictId);
+               
+               /*
+                * Dictionary ld->curDictId asks  us about following words
+                */
+
+               while( ld->curSub ) {
+                       ParsedLex       *curVal = ld->curSub;
+
+                       map = ld->cfg->map + curVal->type;
+
+                       if (curVal->type != 0) {
+                               bool dictExists = false;
+
+                               if (curVal->type >= ld->cfg->len || map->len == 0 ) {   
+                                       /* skip this type of lexeme */
+                                       ld->curSub = curVal->next;
+                                       continue;
+                               }
+
+                               /*
+                                * We should be sure that current type of lexeme is recognized by
+                                * our dictinonary: we just check is it exist in 
+                                * list of dictionaries ?
+                                */
+                               for(i=0;i < map->len && !dictExists; i++) 
+                                       if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
+                                               dictExists = true;
+
+                               if ( !dictExists ) {
+                                       /*
+                                        * Dictionary can't work with current tpe of lexeme,
+                                        * return to basic mode and redo all stored lexemes
+                                        */
+                                       ld->curDictId = InvalidOid;
+                                       return LexizeExec(ld, correspondLexem);
+                               }
+                       } 
+       
+                       ld->dictState.isend = (curVal->type==0) ? true : false;
+                       ld->dictState.getnext = false;
+
+                       res = (TSLexeme *) DatumGetPointer( FunctionCall4(
+                                                                                               &(dict->lexize_info),
+                                                                                               PointerGetDatum(dict->dictionary),
+                                                                                               PointerGetDatum(curVal->lemm),
+                                                                                               Int32GetDatum(curVal->lenlemm),
+                                                                                               PointerGetDatum(&ld->dictState)
+                                                                                ));
+
+                       if ( ld->dictState.getnext ) {
+                               /* Dictionary wants one more */
+                               ld->curSub = curVal->next;
+                               if ( res )
+                                       setNewTmpRes(ld, curVal, res);
+                               continue;
+                       }
+
+                       if ( res || ld->tmpRes ) {
+                               /*
+                                * Dictionary normalizes lexemes,
+                                * so we remove from stack all used lexemes ,
+                                * return to basic mode and redo end of stack (if it exists)
+                                */
+                               if ( res ) {
+                                       moveToWaste( ld, ld->curSub );
+                               } else {
+                                       res = ld->tmpRes;
+                                       moveToWaste( ld, ld->lastRes );
+                               }
+
+                               /* reset to initial state */
+                               ld->curDictId = InvalidOid;
+                               ld->posDict = 0;
+                               ld->lastRes = NULL;
+                               ld->tmpRes = NULL;
+                               setCorrLex(ld, correspondLexem);
+                               return res;
+                       }
+
+                       /* Dict don't want next lexem and didn't recognize anything,
+                          redo from ld->towork.head */
+                       ld->curDictId = InvalidOid;
+                       return LexizeExec(ld, correspondLexem);
+               }       
+       }
+
+       setCorrLex(ld, correspondLexem);
+       return NULL;
+}
+
index 76b4c5b..39c6bf9 100644 (file)
@@ -146,6 +146,25 @@ insert into pg_ts_dict select
        'Example of synonym dictionary'
 ;
 
+CREATE FUNCTION thesaurus_init(internal)
+       RETURNS internal
+       as 'MODULE_PATHNAME' 
+       LANGUAGE C;
+
+CREATE FUNCTION thesaurus_lexize(internal,internal,int4,internal)
+       RETURNS internal
+       as 'MODULE_PATHNAME'
+       LANGUAGE C
+       RETURNS NULL ON NULL INPUT;
+
+insert into pg_ts_dict select 
+       'thesaurus_template', 
+       'thesaurus_init(internal)',
+       null,
+       'thesaurus_lexize(internal,internal,int4,internal)',
+       'Thesaurus template, must be pointed Dictionary and DictFile'
+;
+
 --dict conf
 CREATE TABLE pg_ts_parser (
        prs_name        text not null primary key,
@@ -1193,7 +1212,11 @@ AS
 
 --example of ISpell dictionary
 --update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
+
 --example of synonym dict
---update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
+--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_name='synonym';
 
+--example of thesaurus dict
+--update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template';
+--update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}';
 END;
index 2a658df..f344f86 100644 (file)
@@ -41,6 +41,8 @@ DROP FUNCTION snb_lexize(internal,internal,int4);
 DROP FUNCTION snb_ru_init(internal);
 DROP FUNCTION spell_init(internal);
 DROP FUNCTION spell_lexize(internal,internal,int4);
+DROP FUNCTION thesaurus_init(internal);
+DROP FUNCTION thesaurus_lexize(internal,internal,int4);
 DROP FUNCTION syn_init(internal);
 DROP FUNCTION syn_lexize(internal,internal,int4);
 DROP FUNCTION set_curprs(int);