2 * lexize stream of lexemes
3 * Teodor Sigaev <teodor@sigaev.ru>
14 LexizeInit(LexizeData * ld, TSCfgInfo * cfg)
17 ld->curDictId = InvalidOid;
19 ld->towork.head = ld->towork.tail = ld->curSub = NULL;
20 ld->waste.head = ld->waste.tail = NULL;
26 LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
30 list->tail->next = newpl;
34 list->head = list->tail = newpl;
39 LPLRemoveHead(ListParsedLex * list)
41 ParsedLex *res = list->head;
44 list->head = list->head->next;
46 if (list->head == NULL)
54 LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
56 ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
58 newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
61 newpl->lenlemm = lenlemm;
62 LPLAddTail(&ld->towork, newpl);
63 ld->curSub = ld->towork.tail;
67 RemoveHead(LexizeData * ld)
69 LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
75 setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
79 *correspondLexem = ld->waste.head;
84 *ptr = ld->waste.head;
93 ld->waste.head = ld->waste.tail = NULL;
97 moveToWaste(LexizeData * ld, ParsedLex * stop)
101 while (ld->towork.head && go)
103 if (ld->towork.head == stop)
105 ld->curSub = stop->next;
113 setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
119 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
128 LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
135 if (ld->curDictId == InvalidOid)
138 * usual mode: dictionary wants only one word, but we should keep in
139 * mind that we should go through the whole stack
142 while (ld->towork.head)
144 ParsedLex *curVal = ld->towork.head;
146 map = ld->cfg->map + curVal->type;
148 if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0)
150 /* skip this type of lexeme */
155 for (i = ld->posDict; i < map->len; i++)
157 dict = finddict(DatumGetObjectId(map->dict_id[i]));
159 ld->dictState.isend = ld->dictState.getnext = false;
160 ld->dictState.private = NULL;
161 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
162 &(dict->lexize_info),
163 PointerGetDatum(dict->dictionary),
164 PointerGetDatum(curVal->lemm),
165 Int32GetDatum(curVal->lenlemm),
166 PointerGetDatum(&ld->dictState)
169 if (ld->dictState.getnext)
172 * dictionary wants the next word, so set up and store the current
173 * position and go to multiword mode
176 ld->curDictId = DatumGetObjectId(map->dict_id[i]);
178 ld->curSub = curVal->next;
180 setNewTmpRes(ld, curVal, res);
181 return LexizeExec(ld, correspondLexem);
184 if (!res) /* dictionary doesn't know this lexeme */
188 setCorrLex(ld, correspondLexem);
196 { /* curDictId is valid */
197 dict = finddict(ld->curDictId);
200 * Dictionary ld->curDictId asks us about following words
205 ParsedLex *curVal = ld->curSub;
207 map = ld->cfg->map + curVal->type;
209 if (curVal->type != 0)
211 bool dictExists = false;
213 if (curVal->type >= ld->cfg->len || map->len == 0)
215 /* skip this type of lexeme */
216 ld->curSub = curVal->next;
221 * We should be sure that the current type of lexeme is recognized
222 * by our dictionary: we just check whether it exists in the list of
225 for (i = 0; i < map->len && !dictExists; i++)
226 if (ld->curDictId == DatumGetObjectId(map->dict_id[i]))
232 * Dictionary can't work with the current type of lexeme,
233 * return to basic mode and redo all stored lexemes
235 ld->curDictId = InvalidOid;
236 return LexizeExec(ld, correspondLexem);
240 ld->dictState.isend = (curVal->type == 0) ? true : false;
241 ld->dictState.getnext = false;
243 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
244 &(dict->lexize_info),
245 PointerGetDatum(dict->dictionary),
246 PointerGetDatum(curVal->lemm),
247 Int32GetDatum(curVal->lenlemm),
248 PointerGetDatum(&ld->dictState)
251 if (ld->dictState.getnext)
253 /* Dictionary wants one more */
254 ld->curSub = curVal->next;
256 setNewTmpRes(ld, curVal, res);
260 if (res || ld->tmpRes)
263 * Dictionary normalizes lexemes, so we remove from the stack all
264 * used lexemes, return to basic mode and redo the end of the stack
269 moveToWaste(ld, ld->curSub);
274 moveToWaste(ld, ld->lastRes);
277 /* reset to initial state */
278 ld->curDictId = InvalidOid;
282 setCorrLex(ld, correspondLexem);
287 * Dictionary doesn't want the next lexeme and didn't recognize anything,
288 * redo from ld->towork.head
290 ld->curDictId = InvalidOid;
291 return LexizeExec(ld, correspondLexem);
295 setCorrLex(ld, correspondLexem);