3 * New dictionaries are included in dict.h. For languages which
4 * use the Latin charset it may be necessary to modify the mapdict table.
5 * Teodor Sigaev <teodor@stack.net>
9 #include "utils/elog.h"
10 #include "utils/palloc.h"
11 #include "utils/builtins.h"
12 #include "catalog/pg_control.h"
13 #include "utils/pg_locale.h"
19 * Struct for calling dictionaries
20 * All of these methods are optional, but
21 * if all methods are NULL, then the dictionary does nothing :)
22 * Return value of lemmatize must be palloc'ed or the same pointer as the input word.
23 * Return value of init must be malloc'ed; otherwise
24 * it will be freed at the end of the transaction!
28 char localename[LOCALE_NAME_BUFLEN];
31 /* close dictionary */
32 void (*close) (void *);
33 /* find in dictionary */
34 char *(*lemmatize) (void *, char *, int *);
35 int (*is_stoplemm) (void *, char *, int);
36 int (*is_stemstoplemm) (void *, char *, int);
39 /* insert all dictionaries */
44 /* fill dictionary's structure */
48 "C", NULL, NULL, NULL, NULL, NULL /* fake dictionary */
55 /* array for storing dictionaries' objects (if needed) */
/* A MAPDICT is one row of dictionary ids: MAXNDICT slots of type int2. */
65 typedef int2 MAPDICT[MAXNDICT];
/*
 * GETDICT(x, i) casts x to a pointer to int2 and dereferences the i-th
 * slot.  The result is an lvalue: this file both compares it and assigns
 * through it.
 * NOTE(review): the expansion has no outer parentheses, so a postfix
 * operator applied to the macro result would bind before the dereference.
 * The uses visible in this file (==, !=, >=, =, and as an array index)
 * are not affected.
 */
67 #define GETDICT(x,i) *( ((int2*)(x)) + (i) )
69 /*
 * Map dictionaries to each lexeme type.
 *
 * The table is indexed by lexeme type (see the per-row comments); both
 * lemmatize() and is_stoptype() look up a row with
 * GETDICT(&mapdict[type], i).  Each row holds the dictionary ids for that
 * type, read slot by slot from index 0.
 *
 * Row 0 is not used, and the initialization loop over this table starts
 * at index 1.  A BYLOCALE entry is a placeholder that initialization
 * replaces with the dictionary selected for the current locale.
 * STOPLEXEM in slot 0 is what is_stoptype() tests for.
 *
 * NOTE(review): the table is not const because initialization appears to
 * rewrite its rows in place through GETDICT -- confirm against the full
 * initialization routine.
 */
70 static MAPDICT mapdict[] = {
71 {NODICT, NODICT}, /* not used */
72 {DEFAULTDICT, NODICT}, /* LATWORD */
73 {BYLOCALE, NODICT}, /* NONLATINWORD */
74 {BYLOCALE, DEFAULTDICT}, /* UWORD */
75 {NODICT, NODICT}, /* EMAIL */
76 {NODICT, NODICT}, /* FURL */
77 {NODICT, NODICT}, /* HOST */
78 {NODICT, NODICT}, /* SCIENTIFIC */
79 {NODICT, NODICT}, /* VERSIONNUMBER */
80 {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
81 {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
82 {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
83 {STOPLEXEM, NODICT}, /* SPACE */
84 {STOPLEXEM, NODICT}, /* TAG */
85 {STOPLEXEM, NODICT}, /* HTTP */
86 {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
87 {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
88 {BYLOCALE, NODICT}, /* CYRHYPHENWORD */
89 {NODICT, NODICT}, /* URI */
90 {NODICT, NODICT}, /* FILEPATH */
91 {NODICT, NODICT}, /* DECIMAL */
92 {NODICT, NODICT}, /* SIGNEDINT */
93 {NODICT, NODICT}, /* UNSIGNEDINT */
94 {STOPLEXEM, NODICT} /* HTMLENTITY */
97 static bool inited = false;
106 bool needinit[lengthof(dicts)];
107 PG_LocaleCategories lc;
109 int bylocaledict = NODICT;
113 for (i = 1; i < lengthof(dicts); i++)
118 for (i = 1; i < lengthof(dicts); i++)
119 if (strcmp(dicts[i].localename, lc.lc_ctype) == 0)
124 PGLC_free_categories(&lc);
126 for (i = 1; i < lengthof(mapdict); i++)
130 for (j = 0; j < MAXNDICT; j++)
132 GETDICT(md, k) = GETDICT(md, j);
133 if (GETDICT(md, k) == NODICT)
135 else if (GETDICT(md, k) == BYLOCALE)
137 if (bylocaledict == NODICT)
139 GETDICT(md, k) = bylocaledict;
141 if (GETDICT(md, k) >= (int2) lengthof(dicts))
143 needinit[GETDICT(md, k)] = true;
146 for (; k < MAXNDICT; k++)
147 if (GETDICT(md, k) != STOPLEXEM)
148 GETDICT(md, k) = NODICT;
151 for (i = 1; i < lengthof(dicts); i++)
152 if (needinit[i] && dicts[i].init)
153 dictobjs[i] = (*(dicts[i].init)) ();
160 lemmatize(char *word, int *len, int type)
166 for (i = 0; i < MAXNDICT; i++)
168 nd = GETDICT(&mapdict[type], i);
171 /* there is no dictionary */
174 else if (nd == STOPLEXEM)
176 /* word is stopword */
182 if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
187 char *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
189 /* word is recognized by dictionary */
190 if (newword != word || *len != oldlen)
192 if (dict->is_stemstoplemm &&
193 (*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
195 if (newword != word && newword)
209 is_stoptype(int type)
211 return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;