1 /* Copyright(C) 2004 Brazil
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 Add fast string-normalization function into Senna.
20 Author: NTT DATA Corporation
29 /* Declare that this is the fast version of Senna */
34 /* Flag bits accepted in the flags argument of sen_index_create(). */
35 #define SEN_INDEX_NORMALIZE 0x0001
36 #define SEN_INDEX_SPLIT_ALPHA 0x0002
37 #define SEN_INDEX_SPLIT_DIGIT 0x0004
38 #define SEN_INDEX_SPLIT_SYMBOL 0x0008
/* Tokenizer selectors: these values lie inside SEN_INDEX_TOKENIZER_MASK.
 * SEN_INDEX_MORPH_ANALYSE is 0, i.e. the value obtained when none of the
 * tokenizer bits is set. */
39 #define SEN_INDEX_MORPH_ANALYSE 0x0000
40 #define SEN_INDEX_NGRAM 0x0010
41 #define SEN_INDEX_DELIMITED 0x0020
42 #define SEN_INDEX_ENABLE_SUFFIX_SEARCH 0x0100
43 #define SEN_INDEX_DISABLE_SUFFIX_SEARCH 0x0200
44 #define SEN_INDEX_WITH_VGRAM 0x1000
45 #define SEN_INDEX_SHARED_LEXICON 0x2000
46 #define SEN_INDEX_WITH_VACUUM 0x8000
48 /* Bits 4-7 select the tokenizer, so at most 16 tokenizers can be registered. */
49 #define SEN_INDEX_TOKENIZER_MASK 0x00f0
/* Upper bound on the size of a sen_sym key.
 * NOTE(review): presumably in bytes -- confirm against the sym implementation. */
51 #define SEN_SYM_MAX_KEY_SIZE 8192
/* Flag occupying bit 31.
 * NOTE(review): presumably passed in the flags argument of sen_sym_create() -- confirm. */
53 #define SEN_SYM_WITH_SIS 0x80000000
/* Snippet flag bits (one bit each, so they can be OR-ed together).
 * SEN_QUERY_SCAN_NORMALIZE is an alias for SEN_SNIP_NORMALIZE. */
55 #define SEN_SNIP_NORMALIZE 0x0001
56 #define SEN_SNIP_COPY_TAG 0x0002
57 #define SEN_SNIP_SKIP_LEADING_SPACES 0x0004
58 #define SEN_QUERY_SCAN_NORMALIZE SEN_SNIP_NORMALIZE
/* NOTE(review): presumably the n of the n-gram tokenizer (2 = bigram) -- confirm. */
60 #define SEN_LEX_NGRAM_UNIT_SIZE 2
/* Default value of SEN_STACK_SIZE (0x10000000 = 2^28). A definition made
 * before this point (e.g. with -D on the compiler command line) wins.
 * NOTE(review): the unit and the consumer of this limit are not visible
 * in this header -- confirm before changing it. */
62 #ifndef SEN_STACK_SIZE
63 #define SEN_STACK_SIZE 0x10000000
64 #endif /* SEN_STACK_SIZE */
/* Characters that act as operators in the query syntax. Each definition
 * sits inside its own preprocessor guard, so a value defined before this
 * point is left untouched.
 * NOTE(review): the exact meaning of each operator is decided by the query
 * parser, which is not part of this header. */
67 #define SEN_QUERY_AND '+'
68 #endif /* SEN_QUERY_AND */
70 #define SEN_QUERY_BUT '-'
71 #endif /* SEN_QUERY_BUT */
72 #ifndef SEN_QUERY_ADJ_INC
73 #define SEN_QUERY_ADJ_INC '>'
74 #endif /* SEN_QUERY_ADJ_INC */
75 #ifndef SEN_QUERY_ADJ_DEC
76 #define SEN_QUERY_ADJ_DEC '<'
77 #endif /* SEN_QUERY_ADJ_DEC */
78 #ifndef SEN_QUERY_ADJ_NEG
79 #define SEN_QUERY_ADJ_NEG '~'
80 #endif /* SEN_QUERY_ADJ_NEG */
81 #ifndef SEN_QUERY_PREFIX
82 #define SEN_QUERY_PREFIX '*'
83 #endif /* SEN_QUERY_PREFIX */
84 #ifndef SEN_QUERY_PARENL
85 #define SEN_QUERY_PARENL '('
86 #endif /* SEN_QUERY_PARENL */
87 #ifndef SEN_QUERY_PARENR
88 #define SEN_QUERY_PARENR ')'
89 #endif /* SEN_QUERY_PARENR */
/* By default the opening and the closing quote are the same character. */
90 #ifndef SEN_QUERY_QUOTEL
91 #define SEN_QUERY_QUOTEL '"'
92 #endif /* SEN_QUERY_QUOTEL */
93 #ifndef SEN_QUERY_QUOTER
94 #define SEN_QUERY_QUOTER '"'
95 #endif /* SEN_QUERY_QUOTER */
/* The escape character is a single backslash. */
96 #ifndef SEN_QUERY_ESCAPE
97 #define SEN_QUERY_ESCAPE '\\'
98 #endif /* SEN_QUERY_ESCAPE */
/* sen_id value standing for "no entry": the comment on sen_sym_at() names
 * it as the result when no match is found. */
100 #define SEN_SYM_NIL 0
104 sen_memory_exhausted,
106 sen_file_operation_error,
107 sen_invalid_argument,
125 sen_rec_document = 0,
146 sen_sel_term_extract,
152 sen_sort_descending = 0,
/* Short type names for the structs used by the API. The prototypes in this
 * header pass these types by pointer. */
169 typedef struct _sen_db sen_db;
170 typedef struct _sen_ctx sen_ctx;
171 typedef struct _sen_ctx_info sen_ctx_info;
172 typedef struct _sen_set sen_set;
173 typedef struct _sen_sym sen_sym;
174 typedef struct _sen_inv sen_inv;
175 typedef struct _sen_index sen_index;
176 typedef struct _sen_records sen_records;
177 typedef struct _sen_set_cursor sen_set_cursor;
/* Unlike its neighbours, sen_set_eh is itself a pointer type, so a
 * `sen_set_eh *` is a pointer to a pointer. */
178 typedef struct _sen_set_element *sen_set_eh;
179 typedef struct _sen_value sen_value;
180 typedef struct _sen_values sen_values;
181 typedef struct _sen_select_optarg sen_select_optarg;
182 typedef struct _sen_group_optarg sen_group_optarg;
183 typedef struct _sen_sort_optarg sen_sort_optarg;
184 typedef struct _sen_set_sort_optarg sen_set_sort_optarg;
185 typedef struct _sen_snip sen_snip;
186 typedef struct _sen_query sen_query;
187 typedef struct _sen_logger_info sen_logger_info;
188 typedef struct _sen_snip_mapping sen_snip_mapping;
189 typedef struct _sen_records_heap sen_records_heap;
190 typedef struct _sen_vgram sen_vgram;
191 typedef struct _sen_vgram_buf sen_vgram_buf;
192 typedef struct _sen_sym_scan_hit sen_sym_scan_hit;
/* Identifier type used by the sen_sym functions. */
193 typedef unsigned sen_id;
/* Maximum sen_id value: 0x3fffffff = 2^30 - 1. */
195 #define SEN_ID_MAX 0x3fffffff
/* A record handle is the very same type as a set element handle. */
197 typedef sen_set_eh sen_recordh;
199 struct _sen_ctx_info {
201 unsigned int com_status;
202 unsigned int com_info;
203 struct _sen_rbuf *outbuf;
215 struct _sen_records {
216 sen_rec_unit record_unit;
217 sen_rec_unit subrec_unit;
218 unsigned int max_n_subrecs;
219 unsigned int record_size;
220 unsigned int subrec_size;
223 sen_set_cursor *cursor;
224 sen_recordh *curr_rec;
227 int ignore_deleted_records;
234 unsigned int str_len;
243 struct _sen_select_optarg {
245 int similarity_threshold;
249 int (*func)(sen_records *, const void *, int, void *);
253 struct _sen_group_optarg {
255 int (*func)(sen_records *, const sen_recordh *, void *, void *);
260 struct _sen_sort_optarg {
262 int (*compar)(sen_records *, const sen_recordh *, sen_records *, const sen_recordh *, void *);
266 struct _sen_set_sort_optarg {
268 int (*compar)(sen_set *, sen_set_eh *, sen_set *, sen_set_eh *, void *);
270 sen_set *compar_arg0;
273 struct _sen_snip_mapping {
/* One bit per part of a log entry (time, title, message, location), so the
 * values can be OR-ed together.
 * NOTE(review): presumably combined in a flags member of sen_logger_info;
 * that member is not visible here -- confirm. */
277 #define SEN_LOG_TIME 1
278 #define SEN_LOG_TITLE 2
279 #define SEN_LOG_MESSAGE 4
280 #define SEN_LOG_LOCATION 8
282 struct _sen_logger_info {
283 sen_log_level max_level;
285 void (*func)(int, const char *, const char *, const char *, const char *, void *);
289 struct _sen_sym_scan_hit {
/* Callback type accepted by sen_query_term(): takes
 * (const char *, unsigned int, void *) and returns int.
 * NOTE(review): presumably the term, its length and the caller-supplied
 * func_arg; the meaning of the return value is not visible here -- confirm. */
295 typedef int (*query_term_callback)(const char *, unsigned int, void *);
298 /******** query language API ********/
/* A sen_db is created or opened from a path; sen_db_close() takes the
 * handle returned by either call. */
300 sen_db *sen_db_create(const char *path, int flags, sen_encoding encoding);
301 sen_db *sen_db_open(const char *path);
302 sen_rc sen_db_close(sen_db *s);
/* Single-bit values (0x01 .. 0x10), so they can be OR-ed together.
 * NOTE(review): presumably the flags carried by sen_ctx_send() and
 * sen_ctx_recv() -- confirm against the implementation. */
304 #define SEN_CTX_MORE 0x01
305 #define SEN_CTX_TAIL 0x02
306 #define SEN_CTX_HEAD 0x04
307 #define SEN_CTX_QUIET 0x08
308 #define SEN_CTX_QUIT 0x10
/* NOTE(review): presumably the flags for sen_ctx_open() and
 * sen_ctx_connect() -- confirm against the implementation. */
310 #define SEN_CTX_USEQL 1
311 #define SEN_CTX_BATCHMODE 2
/* A context is obtained either from a local sen_db (sen_ctx_open) or from
 * a host/port pair (sen_ctx_connect). */
313 sen_ctx *sen_ctx_open(sen_db *db, int flags);
314 sen_ctx *sen_ctx_connect(const char *host, int port, int flags);
315 sen_rc sen_ctx_load(sen_ctx *c, const char *path);
/* sen_ctx_send() takes a buffer, its length and flags; sen_ctx_recv()
 * hands the same three items back through pointers.
 * NOTE(review): who owns the buffer stored in *str is not visible here. */
316 sen_rc sen_ctx_send(sen_ctx *c, char *str, unsigned int str_len, int flags);
317 sen_rc sen_ctx_recv(sen_ctx *c, char **str, unsigned int *str_len, int *flags);
318 sen_rc sen_ctx_close(sen_ctx *c);
/* Fills the caller-supplied sen_ctx_info. */
319 sen_rc sen_ctx_info_get(sen_ctx *c, sen_ctx_info *info);
321 /******** basic API ********/
/* sen_init() and sen_fin() take no arguments and report a sen_rc.
 * NOTE(review): presumably library start-up and shutdown, bracketing all
 * other API use -- confirm with the implementation. */
323 sen_rc sen_init(void);
324 sen_rc sen_fin(void);
/* Every parameter is a pointer.
 * NOTE(review): presumably each one receives the corresponding library
 * setting; whether NULL may be passed is not visible here. */
325 sen_rc sen_info(char **version,
326 char **configure_options,
328 sen_encoding *default_encoding,
329 unsigned int *initial_n_segments,
330 unsigned int *partial_match_threshold);
/* Index lifecycle, addressed by path. The flags argument of
 * sen_index_create() takes the SEN_INDEX_* bits. */
332 sen_index *sen_index_create(const char *path, int key_size, int flags,
333 int initial_n_segments, sen_encoding encoding);
334 sen_index *sen_index_open(const char *path);
335 sen_rc sen_index_close(sen_index *i);
336 sen_rc sen_index_remove(const char *path);
337 sen_rc sen_index_rename(const char *old_name, const char *new_name);
/* Update entry point: takes the old and the new value for `key`, each as a
 * pointer plus an explicit length. */
338 sen_rc sen_index_upd(sen_index *i, const void *key,
339 const char *oldvalue, unsigned int oldvalue_len,
340 const char *newvalue, unsigned int newvalue_len);
/* Search entry point: takes a string with an explicit length and returns a
 * sen_records handle, which the sen_records_* functions below operate on. */
341 sen_records *sen_index_sel(sen_index *i,
342 const char *string, unsigned int string_len);
/* keybuf/buf_size describe a caller-supplied buffer.
 * NOTE(review): the meaning of the int return values in this group (sizes,
 * counts, booleans) is not visible here -- confirm before relying on it. */
343 int sen_records_next(sen_records *r, void *keybuf, int buf_size, int *score);
344 sen_rc sen_records_rewind(sen_records *r);
345 int sen_records_curr_score(sen_records *r);
346 int sen_records_curr_key(sen_records *r, void *keybuf, int buf_size);
347 int sen_records_nhits(sen_records *r);
348 int sen_records_find(sen_records *r, const void *key);
349 sen_rc sen_records_close(sen_records *r);
351 /******** advanced API ********/
/* A sen_values collects strings (pointer + length), each with a weight.
 * sen_index_update() takes such collections as its old and new values. */
353 sen_values *sen_values_open(void);
354 sen_rc sen_values_close(sen_values *v);
355 sen_rc sen_values_add(sen_values *v,
356 const char *str, unsigned int str_len,
357 unsigned int weight);
/* Creates a sen_records directly; the three arguments carry the same names
 * as the record_unit, subrec_unit and max_n_subrecs members of the struct. */
359 sen_records *sen_records_open(sen_rec_unit record_unit,
360 sen_rec_unit subrec_unit,
361 unsigned int max_n_subrecs);
/* Set operations on two record sets.
 * NOTE(review): whether a and/or b are consumed (and which handle is
 * returned) is not visible in this header -- confirm before reusing them. */
362 sen_records *sen_records_union(sen_records *a, sen_records *b);
363 sen_records *sen_records_subtract(sen_records *a, sen_records *b);
364 sen_records *sen_records_intersect(sen_records *a, sen_records *b);
365 int sen_records_difference(sen_records *a, sen_records *b);
/* Sorting and grouping take a limit and an optarg struct; the optarg
 * structs carry a caller-supplied comparison / grouping function. */
366 sen_rc sen_records_sort(sen_records *r, int limit, sen_sort_optarg *optarg);
367 sen_rc sen_records_group(sen_records *r, int limit, sen_group_optarg *optarg);
/* Record handles: obtained for the current record or by key/section/pos,
 * then inspected with sen_record_info() / sen_record_subrec_info(), which
 * report through out-pointers and copy the key into keybuf. */
368 const sen_recordh *sen_records_curr_rec(sen_records *r);
369 const sen_recordh *sen_records_at(sen_records *r, const void *key,
370 unsigned section, unsigned pos,
371 int *score, int *n_subrecs);
372 sen_rc sen_record_info(sen_records *r, const sen_recordh *rh,
373 void *keybuf, int buf_size, int *key_size,
374 int *section, int *pos, int *score, int *n_subrecs);
375 sen_rc sen_record_subrec_info(sen_records *r, const sen_recordh *rh,
376 int index, void *keybuf, int buf_size,
377 int *key_size, int *section, int *pos, int *score);
/* Variants of index create/open that take a caller-supplied sen_sym for the
 * keys (and, in the *_lexicon forms, for the lexicon as well). */
378 sen_index *sen_index_create_with_keys(const char *path, sen_sym *keys, int flags,
379 int initial_n_segments, sen_encoding encoding);
380 sen_index *sen_index_open_with_keys(const char *path, sen_sym *keys);
381 sen_index *sen_index_create_with_keys_lexicon(const char *path, sen_sym *keys,
382 sen_sym *lexicon, int initial_n_segments);
383 sen_index *sen_index_open_with_keys_lexicon(const char *path, sen_sym *keys,
385 sen_rc sen_index_update(sen_index *i, const void *key, unsigned int section,
386 sen_values *oldvalues, sen_values *newvalues);
/* Search variant that takes an operator and an optional-argument struct. */
387 sen_rc sen_index_select(sen_index *i,
388 const char *string, unsigned int string_len,
390 sen_sel_operator op, sen_select_optarg *optarg);
/* Reports index properties through out-pointers; the two inv_* sizes are
 * unsigned long long, the others int / unsigned / sen_encoding. */
391 sen_rc sen_index_info(sen_index *i, int *key_size, int *flags,
392 int *initial_n_segments, sen_encoding *encoding,
393 unsigned *nrecords_keys, unsigned *file_size_keys,
394 unsigned *nrecords_lexicon, unsigned *file_size_lexicon,
395 unsigned long long *inv_seg_size,
396 unsigned long long *inv_chunk_size);
/* Copies into the caller-supplied pathbuf of buf_size.
 * NOTE(review): the meaning of the int return value is not visible here. */
397 int sen_index_path(sen_index *i, char *pathbuf, int buf_size);
399 sen_set *sen_index_related_terms(sen_index *index, const char *string,
400 const char *(*fetcher)(void *, void *),
/* Builds a sen_query from a query string (pointer + length), a default
 * operator, a limit on the number of expressions and an encoding. */
404 sen_query *sen_query_open(const char *str, unsigned int str_len,
405 sen_sel_operator default_op,
406 int max_exprs, sen_encoding encoding);
/* NOTE(review): presumably reports the unparsed remainder of the query
 * string through *rest and returns its length -- confirm. */
407 unsigned int sen_query_rest(sen_query *q, const char ** const rest);
408 sen_rc sen_query_close(sen_query *q);
/* Runs the query q against index i, combining into r with operator op. */
409 sen_rc sen_query_exec(sen_index *i, sen_query *q, sen_records *r, sen_sel_operator op);
/* Takes a query_term_callback and a func_arg pointer for it. */
410 void sen_query_term(sen_query *q, query_term_callback func, void *func_arg);
/* Takes nstrs strings as parallel arrays (strs / str_lens) and reports
 * through *found and *score. */
411 sen_rc sen_query_scan(sen_query *q, const char **strs, unsigned int *str_lens,
412 unsigned int nstrs, int flags, int *found, int *score);
/* Creates a sen_snip from a query; tags are passed as parallel arrays of
 * strings and lengths. */
413 sen_snip *sen_query_snip(sen_query *query, int flags,
414 unsigned int width, unsigned int max_results,
416 const char **opentags, unsigned int *opentag_lens,
417 const char **closetags, unsigned int *closetag_lens,
418 sen_snip_mapping *mapping);
420 sen_rc sen_index_del(sen_index *i, const void *key);
422 /******** low level API ********/
/* sen_set: a keyed collection whose key size and value size are fixed when
 * it is opened. */
424 sen_set *sen_set_open(unsigned key_size, unsigned value_size, unsigned init_size);
425 sen_rc sen_set_close(sen_set *set);
426 sen_rc sen_set_info(sen_set *set, unsigned *key_size,
427 unsigned *value_size, unsigned *n_entries);
/* Both take a key and hand back an element handle plus, through *value, a
 * pointer to the value.
 * NOTE(review): by analogy with sen_sym_get()/sen_sym_at(), presumably
 * `get` adds a missing key while `at` only looks it up -- confirm. */
428 sen_set_eh *sen_set_get(sen_set *set, const void *key, void **value);
429 sen_set_eh *sen_set_at(sen_set *set, const void *key, void **value);
430 sen_rc sen_set_del(sen_set *set, sen_set_eh *e);
/* Cursor-based traversal: each sen_set_cursor_next() call reports one
 * element's key and value through the out-pointers. */
431 sen_set_cursor *sen_set_cursor_open(sen_set *set);
432 sen_set_eh *sen_set_cursor_next(sen_set_cursor *cursor, void **key, void **value);
433 sen_rc sen_set_cursor_close(sen_set_cursor *cursor);
434 sen_rc sen_set_element_info(sen_set *set, const sen_set_eh *e,
435 void **key, void **value);
/* Set operations on two sets.
 * NOTE(review): whether a and/or b are consumed is not visible here. */
436 sen_set *sen_set_union(sen_set *a, sen_set *b);
437 sen_set *sen_set_subtract(sen_set *a, sen_set *b);
438 sen_set *sen_set_intersect(sen_set *a, sen_set *b);
439 int sen_set_difference(sen_set *a, sen_set *b);
/* Returns element handles rather than a sen_rc.
 * NOTE(review): ownership of the returned array is not visible here. */
440 sen_set_eh *sen_set_sort(sen_set *set, int limit, sen_set_sort_optarg *optarg);
/* sen_sym: a symbol table addressed by path, mapping keys to sen_id values. */
442 sen_sym *sen_sym_create(const char *path, unsigned key_size,
443 unsigned flags, sen_encoding encoding);
444 sen_sym *sen_sym_open(const char *path);
445 sen_rc sen_sym_info(sen_sym *sym, int *key_size, unsigned *flags,
446 sen_encoding *encoding, unsigned *nrecords, unsigned *file_size);
447 sen_rc sen_sym_close(sen_sym *sym);
448 sen_rc sen_sym_remove(const char *path);
450 /* Look up the sym table and find the id of the corresponding entry.
451 * If no match is found, create a new entry and return its id.
453 sen_id sen_sym_get(sen_sym *sym, const void *key);
455 /* Look up the sym table and find the id of the corresponding entry.
456 * If no match is found, return SEN_SYM_NIL.
458 sen_id sen_sym_at(sen_sym *sym, const void *key);
/* Remaining sen_sym operations. Keys are passed as const void *; entries
 * are otherwise addressed by sen_id. */
459 sen_rc sen_sym_del(sen_sym *sym, const void *key);
460 unsigned int sen_sym_size(sen_sym *sym);
/* Copies into the caller-supplied keybuf of buf_size.
 * NOTE(review): the meaning of the int return value is not visible here. */
461 int sen_sym_key(sen_sym *sym, sen_id id, void *keybuf, int buf_size);
/* Prefix and suffix search return a sen_set; common-prefix search returns
 * a single sen_id. */
462 sen_set *sen_sym_prefix_search(sen_sym *sym, const void *key);
463 sen_set *sen_sym_suffix_search(sen_sym *sym, const void *key);
464 sen_id sen_sym_common_prefix_search(sen_sym *sym, const void *key);
/* Note the asymmetry: the getter returns int, the setter takes unsigned int. */
465 int sen_sym_pocket_get(sen_sym *sym, sen_id id);
466 sen_rc sen_sym_pocket_set(sen_sym *sym, sen_id id, unsigned int value);
467 sen_id sen_sym_next(sen_sym *sym, sen_id id);
/* Takes a string (pointer + length) and a caller-supplied array sh with
 * sh_size slots; *rest receives a pointer into the input.
 * NOTE(review): presumably returns the number of hits written and sets
 * *rest to the unscanned remainder -- confirm. */
468 int sen_sym_scan(sen_sym *sym, const char *str, unsigned int str_len,
469 sen_sym_scan_hit *sh, unsigned int sh_size, const char **rest);
471 /******** utility API ********/
/* Snippet workflow as laid out by the signatures: open a sen_snip with
 * default open/close tags, add keyword conditions (each with its own tag
 * pair), execute it against a string, then fetch results by index. */
472 sen_snip *sen_snip_open(sen_encoding encoding, int flags, unsigned int width,
473 unsigned int max_results,
474 const char *defaultopentag, unsigned int defaultopentag_len,
475 const char *defaultclosetag, unsigned int defaultclosetag_len,
476 sen_snip_mapping *mapping);
477 sen_rc sen_snip_close(sen_snip *snip);
478 sen_rc sen_snip_add_cond(sen_snip *snip,
479 const char *keyword, unsigned int keyword_len,
480 const char *opentag, unsigned int opentag_len,
481 const char *closetag, unsigned int closetag_len);
/* Reports the number of results and the maximum tagged length through the
 * two out-pointers.
 * NOTE(review): presumably max_tagged_len is the buffer size needed by
 * sen_snip_get_result() -- confirm. */
482 sen_rc sen_snip_exec(sen_snip *snip,
483 const char *string, unsigned int string_len,
484 unsigned int *nresults, unsigned int *max_tagged_len);
/* Writes into the caller-supplied result buffer; no buffer size is passed. */
485 sen_rc sen_snip_get_result(sen_snip *snip, const unsigned int index,
486 char *result, unsigned int *result_len);
/* sen_records_heap: a container of sen_records handles, opened with the
 * same sen_sort_optarg type that sen_records_sort() takes. */
488 sen_records_heap *sen_records_heap_open(int size, int limit, sen_sort_optarg *optarg);
489 sen_rc sen_records_heap_add(sen_records_heap *h, sen_records *r);
490 int sen_records_heap_next(sen_records_heap *h);
491 sen_records *sen_records_heap_head(sen_records_heap *h);
492 sen_rc sen_records_heap_close(sen_records_heap *h);
/* Reports details of one inverted-index entry through twelve unsigned
 * out-pointers.
 * NOTE(review): the meaning of the int return value is not visible here. */
494 int sen_inv_entry_info(sen_inv *inv, sen_id key, unsigned *a, unsigned *pocket,
495 unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
496 unsigned *nterms, unsigned *nterms_void, unsigned *tid,
497 unsigned *size_in_chunk, unsigned *pos_in_chunk,
498 unsigned *size_in_buffer, unsigned *pos_in_buffer);
500 /* Flag bits for the flags argument of sen_str_normalize(). */
501 #define SEN_STR_REMOVEBLANK 1
502 #define SEN_STR_WITH_CTYPES 2
503 #define SEN_STR_WITH_CHECKS 4
/* Normalises str (str_len long) into the caller-supplied nstrbuf of
 * buf_size.
 * NOTE(review): the meaning of the int return value is not visible here. */
504 int sen_str_normalize(const char *str, unsigned int str_len,
505 sen_encoding encoding, int flags,
506 char *nstrbuf, int buf_size);
/* Fast variant: same buffers, but takes neither an encoding nor flags. */
507 int fast_sen_str_normalize(const char *str, unsigned int str_len,
508 char *nstrbuf, int buf_size);
/* No length is passed for str.
 * NOTE(review): presumably the byte length of the first character of str in
 * the given encoding -- confirm. */
509 unsigned int sen_str_charlen(const char *str, sen_encoding encoding);
/* Installs the logger configuration described by *info. */
513 sen_rc sen_logger_info_set(const sen_logger_info *info);
/* Emits one log entry: level, call-site file/line/function, then a format
 * string with variadic arguments. Normally reached through the SEN_LOG
 * macro.
 * NOTE(review): fmt is declared `char *` although file and func are
 * `const char *`; making it const would have to be done together with the
 * definition. */
515 void sen_logger_put(sen_log_level level,
516 const char *file, int line, const char *func, char *fmt, ...);
/* Used by SEN_LOG as the guard: the message is only formatted and emitted
 * when this returns non-zero for the level. */
518 int sen_logger_pass(sen_log_level level);
/* Logs a message at `level`: when sen_logger_pass(level) is non-zero, calls
 * sen_logger_put() with the call site's __FILE__, __LINE__ and __FUNCTION__
 * followed by the remaining arguments (format string first).
 * `level` appears twice in the expansion, so it must be free of side effects.
 * NOTE(review): the expansion starts with a bare `if`, not
 * do { ... } while (0); an `else` written after a use of this macro would
 * pair with the macro's `if`. Brace the surrounding if/else when using it. */
520 #define SEN_LOG(level,...) \
521 if (sen_logger_pass(level)) {\
522 sen_logger_put((level), __FILE__, __LINE__, __FUNCTION__, __VA_ARGS__);\
/* Level used by the sen_log() convenience macro; defaults to sen_log_notice
 * unless SEN_LOG_DEFAULT_LEVEL was defined before this point. */
525 #ifndef SEN_LOG_DEFAULT_LEVEL
526 #define SEN_LOG_DEFAULT_LEVEL sen_log_notice
527 #endif /* SEN_LOG_DEFAULT_LEVEL */
529 #define sen_log(...) \
530 if (sen_logger_pass(SEN_LOG_DEFAULT_LEVEL)) {\
531 sen_logger_put(SEN_LOG_DEFAULT_LEVEL, __FILE__, __LINE__, __FUNCTION__, __VA_ARGS__);\