1 /* Copyright(C) 2006-2007 Brazil
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 /* query string parser and executor */
/* Weight given to a term when the query does not adjust it (initial value in get_token). */
28 #define DEFAULT_WEIGHT 5
/* Initial value of escalation_decaystep, assigned in sen_query_open. */
29 #define DEFAULT_DECAYSTEP 2
/* Fallback operator option in get_op and default q->opt.max_interval. */
30 #define DEFAULT_MAX_INTERVAL 10
/* Fallback option for sen_sel_similar and default q->opt.similarity_threshold. */
31 #define DEFAULT_SIMILARITY_THRESHOLD 10
/* Fallback option for sen_sel_term_extract when no number follows the operator. */
32 #define DEFAULT_TERM_EXTRACT_POLICY 0
/* Number of int slots allocated for q->opt.weight_vector in get_weight_vector. */
33 #define DEFAULT_WEIGHT_VECTOR_SIZE 4096
/* Fields of the query object; the struct header and several members are not
   visible in this excerpt. */
/* Operator applied between terms that carry no explicit operator. */
41 sen_sel_operator default_op;
/* Option block handed to sen_index_select for each term. */
42 sen_select_optarg opt;
/* Select mode used when an operator cell carries mode -1. */
43 sen_sel_mode default_mode;
/* Controls which search stages sen_query_exec runs (see the bit tests there). */
44 int escalation_threshold;
/* Amount subtracted from weight_offset at each escalation stage. */
45 int escalation_decaystep;
/* Character encoding of the query string. */
48 sen_encoding encoding;
/* Array of snip conditions, allocated lazily by alloc_snip_conds. */
54 snip_cond *snip_conds;
/* Placeholder for the cell pool; sen_query_open allocates room for
   max_cells cells after the struct and indexes them through this member. */
55 cell cell_pool[1]; /* dummy */
/* Hand out the next unused cell of q->cell_pool and advance q->cur_cell.
   NOTE(review): the guard uses <=, so index q->max_cells is accepted, while
   sen_query_open places q->str at &q->cell_pool[max_cells]. This looks like an
   off-by-one that lets the last cell overlap the query string copy; confirm
   and consider '<'. */
59 cell_new(sen_query *q)
61 if (q->cur_cell <= q->max_cells) {
62 cell *c = &q->cell_pool[q->cur_cell++];
/* Step q->cur_cell back by one so the most recently handed-out cell can be
   reused; does nothing when no cell is in use. */
69 cell_del(sen_query *q)
71 if (q->cur_cell > 0) { q->cur_cell--; }
/* Allocate a cell through cell_new and mark it as a list node (sen_ql_list).
   The stores of car and cdr are not visible in this excerpt; presumably they
   are saved in c->u.l. */
75 cons(sen_query *q, cell *car, cell *cdr)
78 if ((c = cell_new(q))) {
79 c->type = sen_ql_list;
/* Build a bulk cell describing the byte range [start, end).
   Returns NIL for an empty or inverted range. The cell keeps a pointer to
   start rather than a copy, so the referenced buffer must outlive the cell. */
89 token_new(sen_query *q, const char *start, const char *end)
92 if (start >= end) { return NIL; }
93 if ((c = cell_new(q))) {
/* length of the range in bytes; its store into the cell is not visible here */
94 unsigned int len = end - start;
95 c->type = sen_ql_bulk;
/* const is cast away; the cell aliases the caller's buffer */
96 c->u.b.value = (char *)start;
/* Allocate a cell through cell_new and fill in its operator payload.
   Only the weight and option stores are visible in this excerpt; op and mode
   are presumably stored alongside them in c->u.op. */
106 op_new(sen_query *q, int8_t op, int16_t weight, int8_t mode, int32_t option)
109 if ((c = cell_new(q))) {
112 c->u.op.weight = weight;
114 c->u.op.option = option;
/* Advance over whitespace starting at q->cur, never reading at or past
   q->str_end. A character length of zero from sen_str_charlen_nonnull is
   handled by a branch whose body is not visible in this excerpt. */
122 skip_space(sen_query *q)
125 while (q->cur < q->str_end && sen_isspace(q->cur, q->encoding)) {
126 /* null check and length check */
127 if (!(len = sen_str_charlen_nonnull(q->cur, q->str_end, q->encoding))) {
/* Read a quoted phrase beginning at q->cur and return it as a bulk token.
   Characters are copied in place from the read cursor s to the write cursor d
   inside the query buffer, so the token spans [start, d). */
136 get_phrase(sen_query *q)
/* read and write cursors start at the same position */
139 start = s = d = q->cur;
142 if (s >= q->str_end) {
146 len = sen_str_charlen_nonnull(s, q->str_end, q->encoding);
/* SEN_QUERY_QUOTER presumably ends the phrase; the branch body is not visible */
148 if (*s == SEN_QUERY_QUOTER) {
/* an escape is honoured only when another byte follows it */
151 } else if (*s == SEN_QUERY_ESCAPE && s + 1 < q->str_end) {
153 len = sen_str_charlen_nonnull(s, q->str_end, q->encoding);
/* copy one character of len bytes towards the front of the buffer */
156 while (len--) { *d++ = *s++; }
158 return token_new(q, start, d);
/* Read an unquoted word beginning at q->cur and return it as a bulk token
   covering [start, end). The scan moves one character at a time and reacts to
   a zero character length, whitespace, SEN_QUERY_PARENR and SEN_QUERY_PREFIX.
   The branch bodies, including how *prefixp is set, are not visible here. */
162 get_word(sen_query *q, int *prefixp)
164 char *start = q->cur, *end;
166 for (end = q->cur;; ) {
167 /* null check and length check */
168 if (!(len = sen_str_charlen_nonnull(end, q->str_end, q->encoding))) {
172 if (sen_isspace(end, q->encoding) ||
173 *end == SEN_QUERY_PARENR) {
177 if (*end == SEN_QUERY_PREFIX) {
184 return token_new(q, start, end);
/* Parse a select-mode specifier with an optional numeric option and return an
   operator cell built by op_new from op, weight, mode and option.
   Each visible branch reads the number with sen_atoi and falls back to a
   DEFAULT_* value when no digits were consumed (start == end). The characters
   that select each branch are not visible in this excerpt. */
188 get_op(sen_query *q, sen_sel_operator op, int weight)
190 char *start, *end = q->cur;
194 mode = sen_sel_similar;
196 option = sen_atoi(start, q->str_end, (const char **)&end);
197 if (start == end) { option = DEFAULT_SIMILARITY_THRESHOLD; }
/* the mode assignment for this branch is not visible in this excerpt */
203 option = sen_atoi(start, q->str_end, (const char **)&end);
204 if (start == end) { option = DEFAULT_MAX_INTERVAL; }
208 mode = sen_sel_near2;
210 option = sen_atoi(start, q->str_end, (const char **)&end);
211 if (start == end) { option = DEFAULT_MAX_INTERVAL; }
215 mode = sen_sel_term_extract;
217 option = sen_atoi(start, q->str_end, (const char **)&end);
218 if (start == end) { option = DEFAULT_TERM_EXTRACT_POLICY; }
224 return op_new(q, op, weight, mode, option);
/* Forward declaration: get_expr is defined after get_token. Presumably
   get_token calls it for parenthesised groups; that call is not visible in
   this excerpt. */
227 static cell *get_expr(sen_query *q);
/* Read the next element of the query and return it wrapped in a one-element
   list via cons(q, token, NIL).
   Returns NIL when the expression or cell budget is used up or the input is
   exhausted. The switch dispatches on a query character; several case labels
   and statements are not visible in this excerpt. */
230 get_token(sen_query *q)
233 sen_sel_operator op = q->default_op;
/* mode -1 means no explicit mode; exec_query substitutes q->default_mode */
235 int weight = DEFAULT_WEIGHT, prefixp = 0, mode = -1, option = 0;
237 if (q->cur_expr >= q->max_exprs ||
238 q->cur_cell >= q->max_cells ||
239 q->cur >= q->str_end) { return NIL; }
243 case SEN_QUERY_PARENR :
246 case SEN_QUERY_QUOTEL :
248 token = get_phrase(q);
250 case SEN_QUERY_PREFIX :
252 token = get_op(q, op, weight);
256 token = op_new(q, sen_sel_and, weight, mode, option);
260 token = op_new(q, sen_sel_but, weight, mode, option);
262 case SEN_QUERY_ADJ_INC :
/* the adjusted weight is kept within [-128, 127] */
264 if (weight < 127) { weight++; }
265 token = op_new(q, sen_sel_adjust, weight, mode, option);
267 case SEN_QUERY_ADJ_DEC :
269 if (weight > -128) { weight--; }
270 token = op_new(q, sen_sel_adjust, weight, mode, option);
272 case SEN_QUERY_ADJ_NEG :
/* this variant passes a fixed weight of -1 */
274 token = op_new(q, sen_sel_adjust, -1, mode, option);
276 case SEN_QUERY_PARENL :
/* a bare word consisting of exactly the two bytes "OR" becomes an OR operator.
   NOTE(review): value[1] is read before the size test; sen_query_open
   NUL-terminates the buffer, so the read presumably stays in bounds. */
281 if ((token = get_word(q, &prefixp)) &&
282 token->u.b.value[0] == 'O' &&
283 token->u.b.value[1] == 'R' &&
284 token->u.b.size == 2) {
287 token = op_new(q, sen_sel_or, weight, mode, option);
292 return cons(q, token, NIL);
/* Build the expression list by calling get_token repeatedly and linking each
   result into the cdr of the previous node, until get_token returns NIL.
   r keeps the head of the list. */
296 get_expr(sen_query *q)
299 for (c = r = get_token(q); c != NIL; c = c_) {
300 c_ = c->u.l.cdr = get_token(q);
/* Parse a comma-separated list of section weights starting at source.
   Weights go into query->opt.weight_vector while every key is below
   DEFAULT_WEIGHT_VECTOR_SIZE. On the first larger key the vector is freed,
   query->weight_set is opened and the list is parsed again from source.
   The separator between key and value and the return statement are not
   visible in this excerpt; get_pragma uses the result as its new cursor. */
306 get_weight_vector(sen_query *query, const char *source)
308 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
/* allocate the zero-filled vector only when neither store exists yet */
311 if (!query->opt.weight_vector &&
312 !query->weight_set &&
313 !(query->opt.weight_vector = SEN_CALLOC(sizeof(int) * DEFAULT_WEIGHT_VECTOR_SIZE))) {
314 SEN_LOG(sen_log_alert, "get_weight_vector malloc fail");
317 for (p = source; p < query->str_end; ) {
321 /* key, key is not zero */
322 key = sen_atoui(p, query->str_end, &p);
323 if (!key || key > SEN_ID_MAX) { break; }
328 value = sen_atoi(p, query->str_end, &p);
333 if (query->weight_set) {
335 if (sen_set_get(query->weight_set, &key, (void **)&pval)) {
/* keys are 1-based, so key - 1 is the slot.
   NOTE(review): with '<', key == DEFAULT_WEIGHT_VECTOR_SIZE goes to the set
   path although slot key - 1 would still fit; confirm this is intended. */
338 } else if (key < DEFAULT_WEIGHT_VECTOR_SIZE) {
339 query->opt.weight_vector[key - 1] = value;
341 SEN_FREE(query->opt.weight_vector);
342 query->opt.weight_vector = NULL;
343 if (!(query->weight_set = sen_set_open(sizeof(unsigned int), sizeof(int), 0))) {
346 p = source; /* reparse */
/* anything other than ',' ends the list */
349 if (*p != ',') { break; }
/* Parse pragma directives at q->cur; each is introduced by SEN_QUERY_PREFIX.
   The visible directives set escalation_threshold, escalation_decaystep and
   default_op, and load section weights through get_weight_vector. The
   characters that select each directive are not visible in this excerpt. */
356 get_pragma(sen_query *q)
358 char *start, *end = q->cur;
359 while (end < q->str_end && *end == SEN_QUERY_PREFIX) {
360 if (++end >= q->str_end) { break; }
364 q->escalation_threshold = sen_atoi(start, q->str_end, (const char **)&end);
/* NOTE(review): isdigit receives a plain char; a negative value is undefined
   behaviour per the C standard. Consider casting to unsigned char. */
365 while (end < q->str_end && (isdigit(*end) || *end == '-')) { end++; }
368 q->escalation_decaystep = sen_atoi(start, q->str_end, (const char **)&end);
/* move to the next prefix character or whitespace */
374 while (end < q->str_end && *end != SEN_QUERY_PREFIX && !sen_isspace(end, q->encoding)) {
380 q->default_op = sen_sel_or;
383 q->default_op = sen_sel_and;
386 q->default_op = sen_sel_but;
388 case SEN_QUERY_ADJ_INC :
389 q->default_op = sen_sel_adjust;
/* the weight list parser returns the position where it stopped */
397 end = (char *)get_weight_vector(q, start);
/* Callback installed as q->opt.func by sen_query_open when a weight set
   exists. arg is treated as a sen_set and sid is looked up with sen_set_at.
   How the found value w is used or returned is not visible in this excerpt. */
405 section_weight_cb(sen_records *r, const void *rid, int sid, void *arg)
408 sen_set *s = (sen_set *)arg;
409 if (s && sen_set_at(s, &sid, (void **)&w)) {
/* Create a query object from the first str_len bytes of str.
   One allocation holds the sen_query struct, a pool of max_exprs * 4 cells
   and a NUL-terminated private copy of the string. The copy is then parsed
   into q->expr with get_expr.
   The failure return and the final return are not visible in this excerpt. */
417 sen_query_open(const char *str, unsigned int str_len,
418 sen_sel_operator default_op, int max_exprs, sen_encoding encoding)
420 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
/* NOTE(review): no visible check that max_exprs is non-negative or that the
   size arithmetic below cannot overflow; confirm callers bound it. */
422 int max_cells = max_exprs * 4;
423 if (!(q = SEN_MALLOC(sizeof(sen_query) + max_cells * sizeof(cell) + str_len + 1))) {
424 SEN_LOG(sen_log_alert, "sen_query_open malloc fail");
/* the string copy lives immediately after the last pool cell */
427 q->str = (char *)&q->cell_pool[max_cells];
428 memcpy(q->str, str, str_len);
429 q->str[str_len] = '\0';
431 q->str_end = q->str + str_len;
432 q->default_op = default_op;
433 q->encoding = encoding;
434 q->max_exprs = max_exprs;
435 q->max_cells = max_cells;
438 q->escalation_threshold = SENNA_DEFAULT_QUERY_ESCALATION_THRESHOLD;
439 q->escalation_decaystep = DEFAULT_DECAYSTEP;
440 q->weight_offset = 0;
441 q->opt.weight_vector = NULL;
442 q->weight_set = NULL;
444 q->expr = get_expr(q);
445 q->opt.vector_size = DEFAULT_WEIGHT_VECTOR_SIZE;
/* the per-section callback is installed only when a weight set exists */
446 q->opt.func = q->weight_set ? section_weight_cb : NULL;
447 q->opt.func_arg = q->weight_set;
/* snip conditions are created on demand by alloc_snip_conds */
448 q->snip_conds = NULL;
/* Return the number of bytes between the parse cursor q->cur and q->str_end.
   Returns 0 when q is NULL. The store through rest is not visible in this
   excerpt; presumably it receives q->cur. */
453 sen_query_rest(sen_query *q, const char ** const rest)
455 if (!q) { return 0; }
459 return (unsigned int)(q->str_end - q->cur);
/* Release what the query owns: the weight vector, the weight set, and the
   snip condition array with each of its q->cur_expr entries.
   Returns sen_invalid_argument when q is NULL. The release of q itself and
   the success return are not visible in this excerpt. */
463 sen_query_close(sen_query *q)
465 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
466 if (!q) { return sen_invalid_argument; }
467 if (q->opt.weight_vector) {
468 SEN_FREE(q->opt.weight_vector);
471 sen_set_close(q->weight_set);
/* close each condition before freeing the array that holds them */
475 for (sc = q->snip_conds; sc < q->snip_conds + q->cur_expr; sc++) {
476 sen_snip_cond_close(sc);
478 SEN_FREE(q->snip_conds);
/* Evaluate the expression list c against index i and merge the hits into r
   according to op.
   If r already has hits, the sub-expression is collected in a fresh record
   set s and merged afterwards; otherwise it is evaluated directly into r.
   Bulk cells are searched with sen_index_select and nested lists recurse.
   The loop header and most case labels are not visible in this excerpt. */
485 exec_query(sen_index *i, sen_query *q, cell *c, sen_records *r, sen_sel_operator op)
489 int n = sen_records_nhits(r);
490 sen_sel_operator op0 = sen_sel_or, *opp = &op0, op1 = q->default_op;
/* with an empty r, every operator except OR returns without searching */
491 if (!n && op != sen_sel_or) { return; }
492 s = n ? sen_records_open(r->record_unit, r->subrec_unit, 0) : r;
497 if (opp == &op0 && e->u.op.op == sen_sel_but) {
/* mode -1 on the operator cell selects the query-wide default mode */
506 q->opt.mode = ope->u.op.mode == -1 ? q->default_mode : ope->u.op.mode;
507 q->opt.max_interval = q->opt.similarity_threshold = ope->u.op.option;
/* without a weight vector, vector_size carries the scalar term weight */
508 if (!q->opt.weight_vector) {
509 q->opt.vector_size = ope->u.op.weight + q->weight_offset;
/* NOTE(review): stores the default mode in max_interval for similar
   searches; the intent is not evident from this excerpt. */
511 if (ope->u.op.mode == sen_sel_similar) {
512 q->opt.max_interval = q->default_mode;
/* no operator cell applies: use the defaults */
515 q->opt.mode = q->default_mode;
516 q->opt.max_interval = DEFAULT_MAX_INTERVAL;
517 q->opt.similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;
518 if (!q->opt.weight_vector) {
519 q->opt.vector_size = DEFAULT_WEIGHT + q->weight_offset;
522 if (sen_index_select(i, e->u.b.value, e->u.b.size, s, *opp, &q->opt)) {
523 SEN_LOG(sen_log_error, "sen_index_select on exec_query failed !");
/* nested list: evaluate it into s with the current operator */
528 exec_query(i, q, e, s, *opp);
531 SEN_LOG(sen_log_notice, "invalid object assigned in query (%d)", e->type);
/* s is closed here only when the merge call returns zero */
541 if (!sen_records_union(r, s)) { sen_records_close(s); }
544 if (!sen_records_intersect(r, s)) { sen_records_close(s); }
547 if (!sen_records_subtract(r, s)) { sen_records_close(s); }
550 case sen_sel_adjust :
554 sen_records_close(s);
/* Run query q against index i, accumulating hits in r with operator op.
   Returns sen_invalid_argument when an argument is NULL or q->expr is not a
   pair. The search may run in up to three stages: exact, unsplit, partial.
   With escalation_threshold p >= 0, exact always runs and a later stage runs
   only while the hit count is at most p. With p < 0, bits 1, 2 and 4 of -p
   select the stages individually. Each later stage first lowers
   q->weight_offset by q->escalation_decaystep. */
561 sen_query_exec(sen_index *i, sen_query *q, sen_records *r, sen_sel_operator op)
564 if (!i || !q || !r || !PAIRP(q->expr)) { return sen_invalid_argument; }
565 p = q->escalation_threshold;
566 // dump_query(q, q->expr, 0);
567 // sen_log("escalation_threshold=%d", p);
568 if (p >= 0 || (-p & 1)) {
569 q->default_mode = sen_sel_exact;
570 exec_query(i, q, q->expr, r, op);
571 SEN_LOG(sen_log_info, "hits(exact)=%d", sen_records_nhits(r));
573 if ((p >= 0) ? (p >= sen_records_nhits(r)) : (-p & 2)) {
574 q->weight_offset -= q->escalation_decaystep;
575 q->default_mode = sen_sel_unsplit;
576 exec_query(i, q, q->expr, r, op);
577 SEN_LOG(sen_log_info, "hits(unsplit)=%d", sen_records_nhits(r));
579 if ((p >= 0) ? (p >= sen_records_nhits(r)) : (-p & 4)) {
580 q->weight_offset -= q->escalation_decaystep;
581 q->default_mode = sen_sel_partial;
582 exec_query(i, q, q->expr, r, op);
583 SEN_LOG(sen_log_info, "hits(partial)=%d", sen_records_nhits(r));
/* Walk the expression tree under c and call func on term values.
   One branch passes the cell's value and size to func and returns its result.
   The list branch recurses into each CAR and returns 0 as soon as a recursive
   call returns 0, which stops the walk. Returns 1 to continue. The tests that
   choose between the branches are not visible in this excerpt. */
589 query_term_rec(sen_query* q, cell* c, query_term_callback func, void *func_arg)
593 return func(c->u.b.value, c->u.b.size, func_arg);
595 for (token = c; PAIRP(token); token = CDR(token)) {
596 if (!query_term_rec(q, CAR(token), func, func_arg)) {
597 return 0; /* abort */
600 return 1; /* continue */
/* Public entry point: visit the terms of q->expr with func via
   query_term_rec. The result of query_term_rec is not used on the visible
   line. */
604 sen_query_term(sen_query *q, query_term_callback func, void *func_arg)
606 query_term_rec(q, q->expr, func, func_arg);
609 /* FIXME: for test */
/* Expose the query's private string copy: *str receives q->str and *len its
   length in bytes. Either output pointer may be NULL to skip it. */
611 sen_query_str(sen_query *q, const char **str, unsigned int *len)
613 if (str) { *str = q->str; }
614 if (len) { *len = q->str_end - q->str; }
/* Count occurrences of the keyword held in sc within str and fold the result
   into *found and *score according to op.
   Only the counting loop and the weight selection are visible in this
   excerpt; the score arithmetic and most of the operator switch are not. */
619 scan_keyword(snip_cond *sc, sen_nstr *str, sen_id section,
620 sen_sel_operator op, sen_select_optarg *optarg,
621 int *found, int *score)
/* tf counts the iterations completed before the matcher reports SNIPCOND_STOP */
625 for (tf = 0; ; tf++) {
626 sen_bm_tunedbm(sc, str, 0);
627 if (sc->stopflag == SNIPCOND_STOP) { break; }
629 if (optarg->vector_size) {
/* without a weight vector, vector_size itself is the weight */
630 if (!optarg->weight_vector) {
631 w = optarg->vector_size;
632 } else if (section) {
/* sections are 1-based; a section beyond the vector gets weight 0 */
633 w = (section <= optarg->vector_size ?
634 optarg->weight_vector[section - 1] : 0);
656 case sen_sel_adjust :
661 /* TODO: delete overlapping logic with exec_query */
/* Evaluate the expression list c against the in-memory string nstr, using
   the same option setup as exec_query. Bulk cells are matched with
   scan_keyword and nested lists recurse. Local results _found and _score are
   merged into *found and *score by op in code not visible in this excerpt.
   *sc points at the snip_cond used for the current term. */
663 scan_query(sen_query *q, sen_nstr *nstr, sen_id section, cell *c, snip_cond **sc,
664 sen_sel_operator op, int flags, int *found, int *score)
666 int _found = 0, _score = 0;
668 sen_sel_operator op0 = sen_sel_or, *opp = &op0, op1 = q->default_op;
673 if (opp == &op0 && e->u.op.op == sen_sel_but) {
/* mode -1 on the operator cell selects the query-wide default mode */
682 q->opt.mode = ope->u.op.mode == -1 ? q->default_mode : ope->u.op.mode;
683 q->opt.max_interval = q->opt.similarity_threshold = ope->u.op.option;
684 if (!q->opt.weight_vector) {
685 q->opt.vector_size = ope->u.op.weight + q->weight_offset;
688 q->opt.mode = q->default_mode;
689 q->opt.max_interval = DEFAULT_MAX_INTERVAL;
690 q->opt.similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;
691 if (!q->opt.weight_vector) {
692 q->opt.vector_size = DEFAULT_WEIGHT + q->weight_offset;
/* with ALLOCCONDS set, the condition is initialised from the term text;
   otherwise an existing condition is reset with sen_snip_cond_reinit */
695 if ((flags & SEN_QUERY_SCAN_ALLOCCONDS)) {
697 /* NOTE: SEN_SNIP_NORMALIZE = SEN_QUERY_SCAN_NORMALIZE */
698 if ((rc = sen_snip_cond_init(*sc, e->u.b.value, e->u.b.size,
699 q->encoding, flags & SEN_SNIP_NORMALIZE))) {
703 sen_snip_cond_reinit(*sc);
705 scan_keyword(*sc, nstr, section, *opp, &q->opt, &_found, &_score);
709 scan_query(q, nstr, section, e, sc, *opp, flags, &_found, &_score);
712 SEN_LOG(sen_log_notice, "invalid object assigned in query! (%d)", e->type);
731 case sen_sel_adjust :
/* Allocate a zero-filled array of q->cur_expr snip_cond entries into
   q->snip_conds. On allocation failure, logs an alert and returns
   sen_memory_exhausted. The success return is not visible in this excerpt. */
741 alloc_snip_conds(sen_query *q)
743 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
744 if (!(q->snip_conds = SEN_CALLOC(sizeof(snip_cond) * q->cur_expr))) {
745 SEN_LOG(sen_log_alert, "snip_cond allocation failed");
746 return sen_memory_exhausted;
/* Evaluate query q against nstrs in-memory strings; string i is scanned as
   section i + 1. Results accumulate in *found and *score.
   Returns sen_invalid_argument when q, strs or nstrs is missing, or when the
   caller passes SEN_QUERY_SCAN_ALLOCCONDS although conditions already exist.
   Returns sen_memory_exhausted when a string wrapper cannot be opened.
   NOTE(review): str_lens is dereferenced without a NULL check.
   NOTE(review): the release of n is not visible in this excerpt; confirm it
   is closed on the error paths. */
752 sen_query_scan(sen_query *q, const char **strs, unsigned int *str_lens, unsigned int nstrs,
753 int flags, int *found, int *score)
757 if (!q || !strs || !nstrs) { return sen_invalid_argument; }
/* first scan: create the condition array and ask scan_query to initialise it */
759 if (!q->snip_conds) {
760 if ((rc = alloc_snip_conds(q))) { return rc; }
761 flags |= SEN_QUERY_SCAN_ALLOCCONDS;
762 } else if (flags & SEN_QUERY_SCAN_ALLOCCONDS) {
763 SEN_LOG(sen_log_warning, "invalid flags specified on sen_query_scan")
764 return sen_invalid_argument;
766 for (i = 0; i < nstrs; i++) {
/* each string starts again from the first condition */
768 snip_cond *sc = q->snip_conds;
769 if (flags & SEN_QUERY_SCAN_NORMALIZE) {
770 n = sen_nstr_open(*(strs + i), *(str_lens + i), q->encoding,
771 SEN_STR_WITH_CHECKS | SEN_STR_REMOVEBLANK);
773 n = sen_fakenstr_open(*(strs + i), *(str_lens + i), q->encoding,
774 SEN_STR_WITH_CHECKS | SEN_STR_REMOVEBLANK);
776 if (!n) { return sen_memory_exhausted; }
777 if ((rc = scan_query(q, n, i + 1, q->expr, &sc, sen_sel_or, flags, found, score))) {
/* after one pass the ALLOCCONDS request is cleared for the remaining strings */
781 flags &= ~SEN_QUERY_SCAN_ALLOCCONDS;
787 /* TODO: delete overlapping logic with exec_query */
/* Walk the expression list c and register terms as conditions on snip.
   c_but is toggled on each descent through a sen_sel_but operator. A term is
   added only when c_but agrees with whether the current operator is
   sen_sel_but. Tag pairs are chosen round-robin by snip->cond_len % n_tags. */
789 snip_query(sen_query *q, sen_snip *snip, cell *c, sen_sel_operator op,
790 unsigned int n_tags, int c_but,
791 const char **opentags, unsigned int *opentag_lens,
792 const char **closetags, unsigned int *closetag_lens)
795 sen_sel_operator op0 = sen_sel_or, *opp = &op0, op1 = q->default_op;
805 q->opt.mode = ope->u.op.mode == -1 ? q->default_mode : ope->u.op.mode;
807 q->opt.mode = q->default_mode;
/* true when c_but and (*opp == sen_sel_but) have the same truth value */
809 if (!(c_but ^ (*opp == sen_sel_but))) {
/* NOTE(review): n_tags == 0 would divide by zero here; confirm the callers
   guarantee at least one tag pair. */
811 unsigned int i = snip->cond_len % n_tags;
812 if ((rc = sen_snip_add_cond(snip, e->u.b.value, e->u.b.size,
813 opentags[i], opentag_lens[i],
814 closetags[i], closetag_lens[i]))) {
/* descending through a BUT operator flips c_but */
820 snip_query(q, snip, e, *opp, n_tags, (*opp == sen_sel_but) ? c_but ^ 1 : c_but,
821 opentags, opentag_lens, closetags, closetag_lens);
824 SEN_LOG(sen_log_notice, "invalid object assigned in query!! (%d)", e->type);
/* Create a snippet object for query and fill it with the query's terms.
   The snip is opened with no default tags (NULL, 0); snip_query then adds
   each term with tags taken from opentags and closetags.
   The n_tags parameter line, the failure handling and the return are not
   visible in this excerpt.
   NOTE(review): query is dereferenced without a visible NULL check. */
835 sen_query_snip(sen_query *query, int flags,
836 unsigned int width, unsigned int max_results,
838 const char **opentags, unsigned int *opentag_lens,
839 const char **closetags, unsigned int *closetag_lens,
840 sen_snip_mapping *mapping)
843 if (!(res = sen_snip_open(query->encoding, flags, width, max_results,
844 NULL, 0, NULL, 0, mapping))) {
/* start with OR as the operator and c_but = 0 */
847 if (snip_query(query, res, query->expr, sen_sel_or, n_tags, 0,
848 opentags, opentag_lens, closetags, closetag_lens)) {
/* Debug helper: print the expression tree under c to stdout, indenting each
   level by one space per depth.
   NOTE(review): this uses c->weight, c->op and cell_token, while the rest of
   the file uses c->u.op.* and sen_ql_bulk. Presumably this is disabled or
   stale code; confirm before enabling. */
858 dump_query(sen_query *q, cell *c, int level)
860 { int i; for (i = level; i; i--) { putchar(' '); }}
861 printf("%d:%d ", c->weight, c->op);
862 if (c->type == cell_token) {
863 { int i; for (i = level; i; i--) { putchar(' '); }}
/* the token text is written by explicit size with fwrite */
864 fwrite(c->u.b.value, 1, c->u.b.size, stdout);
/* recurse into each element of a list cell */
868 for (token = c->u.l.car; token; token = token->u.l.cdr) {
869 dump_query(q, token, level + 1);