1 /* Copyright(C) 2004-2005 Brazil
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/* Function-like min/max helpers. Each argument appears twice in the
   expansion, so an argument with side effects (for example i++) can be
   evaluated more than once. */
25 #define MAX(a, b) ((a) > (b) ? (a) : (b))
29 #define MIN(a, b) ((a) < (b) ? (a) : (b))
/* Walks backward from x[y - 1] over the trailing run of bytes >= 0x80.
   Returns 1 when that run has even length (zero included) and 0 when it is
   odd: after the loop, x + y - p equals the run length plus one.
   NOTE(review): when every byte is >= 0x80, or y == 0, p ends up one element
   before x; forming that pointer is outside the array. Confirm the callers
   or restructure around an index. */
33 sen_bm_check_euc(const unsigned char *x, const size_t y)
35 const unsigned char *p;
36 for (p = x + y - 1; p >= x && *p >= 0x80U; p--);
37 return (int) ((x + y - p) & 1);
/* Walks backward from x[y - 1]. The if tests for a byte outside the ranges
   0x81..0x9f and 0xe0..0xfc (presumably the Shift_JIS lead-byte ranges).
   The statement controlled by the if is not visible in this view; presumably
   it stops the scan -- TODO confirm.
   Returns the low bit of the distance from p to x + y. */
41 sen_bm_check_sjis(const unsigned char *x, const size_t y)
43 const unsigned char *p;
44 for (p = x + y - 1; p >= x; p--)
45 if ((*p < 0x81U) || (*p > 0x9fU && *p < 0xe0U) || (*p > 0xfcU))
47 return (int) ((x + y - p) & 1);
/* Fills suff[] for the m-byte pattern x. Only fragments of the body are
   visible in this view; they resemble the textbook Boyer-Moore suffixes
   table construction -- TODO confirm against the complete function. */
52 sen_bm_suffixes(const unsigned char *x, size_t m, size_t *suff)
/* i has to be a signed type for the i >= 0 test to end the loop (its
   declaration is not visible here). */
59 for (i = m - 2; i >= 0; --i) {
/* Reuse an already computed entry when it stays inside the window between
   g and f. */
60 if (i > (intptr_t) g && suff[i + m - 1 - f] < i - g)
61 suff[i] = suff[i + m - 1 - f];
/* Extend the match leftwards while the bytes agree; stops once g reaches 0. */
66 while (g > 0 && x[g] == x[g + m - 1 - f])
/* Builds the per-byte skip table bmBc for the m-byte pattern x.
   The first loop visits all ASIZE entries (its body is not visible in this
   view; presumably it stores a default skip).
   The second loop records, for every pattern byte except the last one, its
   distance to the end of the pattern; a later occurrence of the same byte
   overwrites an earlier one. */
75 sen_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc)
78 for (i = 0; i < ASIZE; ++i) {
/* NOTE(review): m is a size_t, so m - 1 wraps when m == 0 -- confirm that
   callers never pass an empty pattern. */
81 for (i = 0; i < m - 1; ++i) {
82 bmBc[(unsigned int) x[i]] = m - (i + 1);
/* Statement macro expanded at a match position. It relies on these names
   from the expansion site: object, cond, found, i, m, shift and flags.
   When object->checks[found] is non-zero it:
   - adds every positive checks[] entry between cond->last_found and found
     to the running offset, remembering the last such index in
     found_alpha_head;
   - if checks[found] is negative, backs the offset up by
     checks[found_alpha_head] and restarts from that index, otherwise
     restarts from found;
   - optionally skips leading spaces (SEN_SNIP_SKIP_LEADING_SPACES);
   - stores the result in cond->start_offset, then keeps accumulating up to
     found + m to obtain cond->end_offset;
   - advances cond->found to found + shift and saves found_alpha_head.
   Several lines of the macro are not visible in this view. */
86 #define SEN_BM_COMPARE \
87 if (object->checks[found]) { \
88 size_t offset = cond->start_offset, found_alpha_head = cond->found_alpha_head; \
89 /* calc real offset */\
90 for (i = cond->last_found; i < found; i++) { \
91 if (object->checks[i] > 0) { \
92 found_alpha_head = i; \
93 offset += object->checks[i]; \
96 /* if real offset is in a character, move it to the head of the character */ \
97 if (object->checks[found] < 0) { \
98 offset -= object->checks[found_alpha_head]; \
99 cond->last_found = found_alpha_head; \
101 cond->last_found = found; \
103 if (flags & SEN_SNIP_SKIP_LEADING_SPACES) { \
104 while (offset < object->orig_blen && \
105 (i = sen_isspace(object->orig + offset, object->encoding))) { offset += i; } \
107 cond->start_offset = offset; \
108 for (i = cond->last_found; i < found + m; i++) { \
109 if (object->checks[i] > 0) { \
110 offset += object->checks[i]; \
113 cond->end_offset = offset; \
114 cond->found = found + shift; \
115 cond->found_alpha_head = found_alpha_head; \
116 /* printf("bm: cond:%p found:%zd last_found:%zd st_off:%zd ed_off:%zd\n", cond, cond->found,cond->last_found,cond->start_offset,cond->end_offset); */ \
/* Statement macro that compares the text behind p with the pattern behind
   cp, relying on p, cp, i and m from the expansion site. The visible loop
   starts at distance 3 and walks back up to distance m while the bytes
   agree; the handling of distances 1 and 2 and the action taken on a full
   match are not visible in this view. */
120 #define SEN_BM_BM_COMPARE \
123 for (i = 3; i <= m && p[-(intptr_t)i] == cp[-(intptr_t)i]; ++i) { \
/* Looks for the next occurrence of cond->keyword->norm inside object->norm,
   resuming at byte offset cond->found.
   Visible pieces of the body:
   - a memchr() scan for the first keyword byte (presumably the path for a
     one-byte keyword; the guarding condition is not visible here);
   - a skip loop driven by the cond bmBc table, where a zero table entry for
     the byte just before p selects the compare branch;
   - cond->stopflag is set to SNIPCOND_STOP at two points, presumably when
     no further occurrence exists -- TODO confirm.
   flags is consumed by the compare macro (SEN_SNIP_SKIP_LEADING_SPACES). */
133 sen_bm_tunedbm(snip_cond *cond, sen_nstr *object, int flags)
135 register unsigned char *limit, ck;
136 register const unsigned char *p, *cp;
137 register size_t *bmBc, delta1, i;
139 const unsigned char *x;
/* n is the byte length of the normalized text, m the byte length of the
   normalized keyword. */
143 const size_t n = object->norm_blen, m = cond->keyword->norm_blen;
145 y = (unsigned char *) object->norm;
147 if (n > cond->found) {
149 p = memchr(y + cond->found, cond->keyword->norm[0], n - cond->found);
155 cond->stopflag = SNIPCOND_STOP;
159 x = (unsigned char *) cond->keyword->norm;
/* p points just past the first candidate window, so p[-1] is the window
   byte aligned with the last keyword byte. */
164 p = y + m + cond->found;
168 /* 12 means 1(initial offset) + 10 (in loop) + 1 (shift) */
169 if (n - cond->found > 12 * m) {
170 limit = y + n - 11 * m;
/* Repeated table lookups; a zero skip means the byte matches the last
   keyword byte. */
173 if(!(delta1 = bmBc[p[-1]])) {
179 if(!(delta1 = bmBc[p[-1]])) {
185 if(!(delta1 = bmBc[p[-1]])) {
197 /* limit check + search */
200 if (!(delta1 = bmBc[p[-1]])) {
206 cond->stopflag = SNIPCOND_STOP;
/* Walks the bytes in [str, end) and accumulates a length in dl. The visible
   cases add 4 for a less-than or greater-than sign and 6 for a double quote
   (presumably the lengths of the escaped forms; other cases and the return
   statement are not visible in this view). */
210 count_mapped_chars(const char *str, const char *end)
216 for (p = str; p != end; p++) {
220 dl += 4; /* < or > */
226 dl += 6; /* " */
/* Releases the keyword string owned by cond through sen_nstr_close().
   Returns sen_invalid_argument on the guard path; the guard condition is
   not visible in this view (presumably a NULL cond). */
237 sen_snip_cond_close(snip_cond *cond)
240 return sen_invalid_argument;
243 sen_nstr_close(cond->keyword);
/* Initializes the condition sc for one keyword.
   - Clears the whole structure.
   - Opens the keyword as a normalized string when SEN_SNIP_NORMALIZE is set
     in flags; the sen_fakenstr_open() call is presumably the else branch.
     Either call uses SEN_STR_REMOVEBLANK. A failed open is logged and
     reported as sen_memory_exhausted.
   - Closes sc again and returns sen_invalid_argument on a guard whose
     condition is not visible in this view (presumably an empty keyword).
   - For keywords that are not exactly one byte long, builds the bmBc skip
     table and derives sc->shift from it. */
249 sen_snip_cond_init(snip_cond *sc, const char *keyword, unsigned int keyword_len,
250 sen_encoding enc, int flags)
253 memset(sc, 0, sizeof(snip_cond));
254 if (flags & SEN_SNIP_NORMALIZE) {
255 if (!(sc->keyword = sen_nstr_open(keyword, keyword_len,
256 enc, SEN_STR_REMOVEBLANK))) {
257 SEN_LOG(sen_log_alert, "sen_nstr_open on snip_cond_init failed !");
258 return sen_memory_exhausted;
261 if (!(sc->keyword = sen_fakenstr_open(keyword, keyword_len,
262 enc, SEN_STR_REMOVEBLANK))) {
263 SEN_LOG(sen_log_alert, "sen_fakenstr_open on snip_cond_init failed !");
264 return sen_memory_exhausted;
267 norm_blen = sc->keyword->norm_blen; /* byte length, not cond->keyword->length */
269 sen_snip_cond_close(sc);
270 return sen_invalid_argument;
272 if (norm_blen != 1) {
273 sen_bm_preBmBc((unsigned char *)sc->keyword->norm, norm_blen, sc->bmBc);
/* Keep the skip of the last keyword byte in sc->shift, then zero its table
   entry; the search loop treats a zero lookup as a candidate match. */
274 sc->shift = sc->bmBc[(unsigned char)sc->keyword->norm[norm_blen - 1]];
275 sc->bmBc[(unsigned char)sc->keyword->norm[norm_blen - 1]] = 0;
/* Resets the per-search state of cond so that it can be run against a new
   text: the offsets go back to 0 and the stop flag to SNIPCOND_NONSTOP.
   Further resets may sit on lines that are not visible in this view. */
281 sen_snip_cond_reinit(snip_cond *cond)
284 cond->last_found = 0;
285 cond->start_offset = 0;
286 cond->end_offset = 0;
289 cond->stopflag = SNIPCOND_NONSTOP;
/* Appends one keyword condition to snip.
   - Returns sen_invalid_argument for a NULL snip or keyword, a zero
     keyword_len, or a full condition table
     (cond_len >= MAX_SNIP_COND_COUNT).
   - Initializes the next free slot through sen_snip_cond_init() with the
     encoding and flags stored on snip.
   - Closes the slot and returns sen_invalid_argument when the normalized
     keyword is longer in bytes than snip->width.
   - Stores the open and close tags: private NUL-terminated copies when
     SEN_SNIP_COPY_TAG is set, otherwise the caller pointers. The snip
     defaults are used on the alternative branch (its condition is not
     visible in this view, presumably a NULL tag). */
293 sen_snip_add_cond(sen_snip *snip,
294 const char *keyword, unsigned int keyword_len,
295 const char *opentag, unsigned int opentag_len,
296 const char *closetag, unsigned int closetag_len)
300 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
302 if (!snip || !keyword || !keyword_len || snip->cond_len >= MAX_SNIP_COND_COUNT) {
303 return sen_invalid_argument;
305 cond = snip->cond + snip->cond_len;
306 if ((rc = sen_snip_cond_init(cond, keyword, keyword_len,
307 snip->encoding, snip->flags))) {
310 if (cond->keyword->norm_blen > snip->width) {
311 sen_snip_cond_close(cond);
312 return sen_invalid_argument;
315 if (snip->flags & SEN_SNIP_COPY_TAG) {
316 char *t = SEN_MALLOC(opentag_len + 1);
318 sen_snip_cond_close(cond);
319 return sen_memory_exhausted;
321 memcpy(t, opentag, opentag_len);
322 t[opentag_len]= '\0'; /* not required, but for ql use */
325 cond->opentag = opentag;
327 cond->opentag_len = opentag_len;
329 cond->opentag = snip->defaultopentag;
330 cond->opentag_len = snip->defaultopentag_len;
333 if (snip->flags & SEN_SNIP_COPY_TAG) {
334 char *t = SEN_MALLOC(closetag_len + 1);
/* NOTE(review): unlike the opentag failure path above, this return does not
   call sen_snip_cond_close(cond), and no release of the opentag copy is
   visible. This looks like a leak on allocation failure -- confirm. */
335 if (!t) { return sen_memory_exhausted; }
336 memcpy(t, closetag, closetag_len);
337 t[closetag_len]= '\0'; /* not required, but for ql use */
340 cond->closetag = closetag;
342 cond->closetag_len = closetag_len;
344 cond->closetag = snip->defaultclosetag;
345 cond->closetag_len = snip->defaultclosetag_len;
/* Adjusts a byte offset into string so that it lands on a character
   boundary for the given encoding. The rest of the parameter list and the
   statements that move the offset are not visible in this view; callers
   pass 1 or -1 as the fourth argument, presumably the step direction.
   Visible tests: sen_bm_check_euc() in a loop, sen_bm_check_sjis() once, and
   a byte comparison loop (presumably the UTF-8 case). */
352 sen_snip_find_firstbyte(const char *string, sen_encoding encoding, size_t offset,
357 while (!(sen_bm_check_euc((unsigned char *) string, offset)))
361 if (!(sen_bm_check_sjis((unsigned char *) string, offset)))
/* NOTE(review): (char)0xc0 depends on the signedness of plain char. With a
   signed char it is negative, so the test holds for bytes 0x80..0xc0; with
   an unsigned char it would also hold for every byte below 0x80. */
365 while (string[offset] <= (char)0xc0)
/* Allocates and fills a new sen_snip.
   - Logs an alert when the allocation fails.
   - Logs a warning when max_results is 0 or above MAX_SNIP_RESULT_COUNT
     (the statements that follow on that path are not visible in this view).
   - With SEN_SNIP_COPY_TAG set in flags, stores private NUL-terminated
     copies of both default tags and frees the open tag copy again when the
     close tag allocation fails; the two plain pointer assignments are
     presumably the else branch, where the caller buffers are borrowed.
   - Records the tag lengths and the mapping pointer as given. */
375 sen_snip_open(sen_encoding encoding, int flags, unsigned int width,
376 unsigned int max_results,
377 const char *defaultopentag, unsigned int defaultopentag_len,
378 const char *defaultclosetag, unsigned int defaultclosetag_len,
379 sen_snip_mapping *mapping)
381 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
382 sen_snip *ret = NULL;
383 if (!(ret = SEN_MALLOC(sizeof(sen_snip)))) {
384 SEN_LOG(sen_log_alert, "sen_snip allocation failed on sen_snip_open");
/* NOTE(review): this validation runs after ret has been allocated; make
   sure the rejected path releases ret (not visible here). */
387 if (max_results > MAX_SNIP_RESULT_COUNT || max_results == 0) {
388 SEN_LOG(sen_log_warning, "max_results is invalid on sen_snip_open");
391 ret->encoding = encoding;
394 ret->max_results = max_results;
395 if (flags & SEN_SNIP_COPY_TAG) {
397 t = SEN_MALLOC(defaultopentag_len + 1);
402 memcpy(t, defaultopentag, defaultopentag_len);
403 t[defaultopentag_len]= '\0'; /* not required, but for ql use */
404 ret->defaultopentag = t;
406 t = SEN_MALLOC(defaultclosetag_len + 1);
/* Undo the open tag copy before bailing out. */
408 SEN_FREE((void *)ret->defaultopentag);
412 memcpy(t, defaultclosetag, defaultclosetag_len);
413 t[defaultclosetag_len]= '\0'; /* not required, but for ql use */
414 ret->defaultclosetag = t;
416 ret->defaultopentag = defaultopentag;
417 ret->defaultclosetag = defaultclosetag;
419 ret->defaultopentag_len = defaultopentag_len;
420 ret->defaultclosetag_len = defaultclosetag_len;
422 ret->mapping = mapping;
/* Clears the state left on snip by a previous run: closes the normalized
   text, resets snip_count to 0 and re-initializes every registered
   condition. Lines between these statements are not visible in this view. */
431 exec_clean(sen_snip *snip)
433 snip_cond *cond, *cond_end;
435 sen_nstr_close(snip->nstr);
439 snip->snip_count = 0;
440 for (cond = snip->cond, cond_end = cond + snip->cond_len;
441 cond < cond_end; cond++) {
442 sen_snip_cond_reinit(cond);
/* Releases what snip owns.
   - Returns sen_invalid_argument for a NULL snip.
   - With SEN_SNIP_COPY_TAG set, frees each per-condition tag that is not
     pointer-equal to the corresponding default tag, then frees the default
     tags themselves.
   - Closes the normalized text and every condition.
   The release of snip itself and the return value are not visible in this
   view. */
448 sen_snip_close(sen_snip *snip)
450 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
451 snip_cond *cond, *cond_end;
452 if (!snip) { return sen_invalid_argument; }
453 if (snip->flags & SEN_SNIP_COPY_TAG) {
456 const char *dot = snip->defaultopentag, *dct = snip->defaultclosetag;
457 for (i = snip->cond_len, sc = snip->cond; i; i--, sc++) {
/* Conditions that fell back to the defaults share those pointers, so they
   are skipped here and the defaults are freed once below. */
458 if (sc->opentag != dot) { SEN_FREE((void *)sc->opentag); }
459 if (sc->closetag != dct) { SEN_FREE((void *)sc->closetag); }
461 if (dot) { SEN_FREE((void *)dot); }
462 if (dct) { SEN_FREE((void *)dct); }
465 sen_nstr_close(snip->nstr);
467 for (cond = snip->cond, cond_end = cond + snip->cond_len;
468 cond < cond_end; cond++) {
469 sen_snip_cond_close(cond);
/* Runs all registered conditions over string and records the resulting
   snippets and tag positions on snip.
   - Returns sen_invalid_argument for a NULL snip or string.
   - Opens string as a normalized string when SEN_SNIP_NORMALIZE is set
     (the sen_fakenstr_open() call is presumably the else branch), both with
     SEN_STR_WITH_CHECKS | SEN_STR_REMOVEBLANK; failure is logged and
     reported as sen_memory_exhausted.
   - Primes every condition with one search, then repeatedly picks the
     earliest pending match and groups matches into snippets of at most
     snip->width bytes.
   - Writes the snippet count through nresults and the largest tagged
     snippet length through max_tagged_len; both are also stored on snip.
   NOTE(review): nresults and max_tagged_len are dereferenced below and no
   NULL check for them is visible in this view.
   Many lines of the body are not visible in this view. */
476 sen_snip_exec(sen_snip *snip, const char *string, unsigned int string_len,
477 unsigned int *nresults, unsigned int *max_tagged_len)
480 if (!snip || !string) {
481 return sen_invalid_argument;
485 if (snip->flags & SEN_SNIP_NORMALIZE) {
487 sen_nstr_open(string, string_len, snip->encoding,
488 SEN_STR_WITH_CHECKS | SEN_STR_REMOVEBLANK);
491 sen_fakenstr_open(string, string_len, snip->encoding,
492 SEN_STR_WITH_CHECKS | SEN_STR_REMOVEBLANK);
496 SEN_LOG(sen_log_alert, "sen_nstr_open on sen_snip_exec failed !");
497 return sen_memory_exhausted;
/* Find the first occurrence of every keyword. */
499 for (i = 0; i < snip->cond_len; i++) {
500 sen_bm_tunedbm(snip->cond + i, snip->nstr, snip->flags);
504 _snip_tag_result *tag_result = snip->tag_result;
505 _snip_result *snip_result = snip->snip_result;
506 size_t last_end_offset = 0, last_last_end_offset = 0;
507 unsigned int unfound_cond_count = snip->cond_len;
511 size_t tagged_len = 0, last_tag_end = 0;
512 int_least8_t all_stop = 1, found_cond = 0;
513 snip_result->tag_count = 0;
516 size_t min_start_offset = (size_t) -1;
517 size_t max_end_offset = 0;
518 snip_cond *cond = NULL;
520 /* pick the non-stopped condition with the smallest start offset; */
/* on a tie the one with the larger end offset wins */
521 for (i = 0; i < snip->cond_len; i++) {
522 if (snip->cond[i].stopflag == SNIPCOND_NONSTOP &&
523 (min_start_offset > snip->cond[i].start_offset ||
524 (min_start_offset == snip->cond[i].start_offset &&
525 max_end_offset < snip->cond[i].end_offset))) {
526 min_start_offset = snip->cond[i].start_offset;
527 max_end_offset = snip->cond[i].end_offset;
528 cond = &snip->cond[i];
534 /* check whether condition is the first condition in snippet */
535 if (snip_result->tag_count == 0) {
536 /* skip condition if the number of rest snippet field is smaller than */
537 /* the number of unfound keywords. */
538 if (snip->max_results - *nresults <= unfound_cond_count && cond->count > 0) {
539 int_least8_t exclude_other_cond = 1;
540 for (i = 0; i < snip->cond_len; i++) {
541 if ((snip->cond + i) != cond
542 && snip->cond[i].end_offset <= cond->start_offset + snip->width
543 && snip->cond[i].count == 0) {
544 exclude_other_cond = 0;
547 if (exclude_other_cond) {
548 sen_bm_tunedbm(cond, snip->nstr, snip->flags);
552 snip_result->start_offset = cond->start_offset;
553 snip_result->first_tag_result_idx = snip->tag_count;
/* The match starts beyond the current snippet window. */
555 if (cond->start_offset >= snip_result->start_offset + snip->width) {
558 /* check nesting to make valid HTML */
559 /* ToDo: allow <test><te>te</te><st>st</st></test> */
560 if (cond->start_offset < last_tag_end) {
561 sen_bm_tunedbm(cond, snip->nstr, snip->flags);
565 if (cond->end_offset > snip_result->start_offset + snip->width) {
566 /* If a keyword straddles the snippet boundary, */
567 /* it is skipped and never tagged. */
568 cond->stopflag = SNIPCOND_ACROSS;
569 sen_bm_tunedbm(cond, snip->nstr, snip->flags);
572 if (cond->count == 0) {
573 unfound_cond_count--;
576 last_end_offset = cond->end_offset;
/* Record this match as a tag inside the current snippet. */
578 tag_result->cond = cond;
579 tag_result->start_offset = cond->start_offset;
580 tag_result->end_offset = last_tag_end = cond->end_offset;
582 snip_result->tag_count++;
584 tagged_len += cond->opentag_len + cond->closetag_len;
585 if (++snip->tag_count >= MAX_SNIP_TAG_COUNT) {
588 sen_bm_tunedbm(cond, snip->nstr, snip->flags);
/* Choose the snippet window around the tags collected so far. */
594 if (snip_result->start_offset + last_end_offset < snip->width) {
595 snip_result->start_offset = 0;
597 snip_result->start_offset =
599 ((snip_result->start_offset + last_end_offset - snip->width) / 2,
600 string_len - snip->width), last_last_end_offset);
602 snip_result->start_offset =
603 sen_snip_find_firstbyte(string, snip->encoding, snip_result->start_offset, 1);
605 snip_result->end_offset = snip_result->start_offset + snip->width;
606 if (snip_result->end_offset < string_len) {
607 snip_result->end_offset =
608 sen_snip_find_firstbyte(string, snip->encoding, snip_result->end_offset, -1);
610 snip_result->end_offset = string_len;
612 last_last_end_offset = snip_result->end_offset;
615 count_mapped_chars(&string[snip_result->start_offset],
616 &string[snip_result->end_offset]) + 1;
618 *max_tagged_len = MAX(*max_tagged_len, tagged_len);
620 snip_result->last_tag_result_idx = snip->tag_count - 1;
/* Stop once either the snippet table or the tag table is full. */
624 if (*nresults == snip->max_results || snip->tag_count == MAX_SNIP_TAG_COUNT) {
/* Re-arm every condition that is not finally stopped. */
627 for (i = 0; i < snip->cond_len; i++) {
628 if (snip->cond[i].stopflag != SNIPCOND_STOP) {
630 snip->cond[i].stopflag = SNIPCOND_NONSTOP;
638 snip->snip_count = *nresults;
/* The caller pointer is stored without a copy and read again by
   sen_snip_get_result(), so string has to outlive those calls. */
639 snip->string = string;
641 snip->max_tagged_len = *max_tagged_len;
/* Writes snippet number index into result, with the open and close tags of
   every match inserted, and stores the number of bytes written through
   result_len when that pointer is non-NULL.
   - Returns sen_invalid_argument when index is not below snip->snip_count
     or when no normalized text is attached to snip.
   - When snip->mapping equals (sen_snip_mapping *) -1 each byte goes through
     a switch (its cases are not visible in this view); the plain copy is
     presumably the path for any other mapping.
   - No buffer size is passed in; the final assert implies that result has
     to hold at least snip->max_tagged_len bytes -- TODO confirm.
   The function may continue past the last line visible in this view. */
647 sen_snip_get_result(sen_snip *snip, const unsigned int index, char *result, unsigned int *result_len)
/* NOTE(review): snip is dereferenced and the element address is formed
   before index is validated below. */
651 _snip_result *sres = &snip->snip_result[index];
653 if (snip->snip_count <= index || !snip->nstr) {
654 return sen_invalid_argument;
657 assert(snip->snip_count != 0 && snip->tag_count != 0);
659 j = sres->first_tag_result_idx;
660 for (p = result, i = sres->start_offset; i < sres->end_offset; i++) {
/* Emit the open tag of every match that starts at byte i. */
661 for (; j <= sres->last_tag_result_idx && snip->tag_result[j].start_offset == i; j++) {
662 if (snip->tag_result[j].end_offset > sres->end_offset) {
665 memcpy(p, snip->tag_result[j].cond->opentag, snip->tag_result[j].cond->opentag_len);
666 p += snip->tag_result[j].cond->opentag_len;
669 if (snip->mapping == (sen_snip_mapping *) -1) {
670 switch (snip->string[i]) {
699 *p++ = snip->string[i];
703 *p++ = snip->string[i];
/* Emit the close tag of every match that ends right after byte i, scanning
   the tags of this snippet from the last one backwards. */
706 for (k = sres->last_tag_result_idx;
707 snip->tag_result[k].end_offset <= sres->end_offset; k--) {
708 /* TODO: avoid all loop */
709 if (snip->tag_result[k].end_offset == i + 1) {
710 memcpy(p, snip->tag_result[k].cond->closetag,
711 snip->tag_result[k].cond->closetag_len);
712 p += snip->tag_result[k].cond->closetag_len;
714 if (k <= sres->first_tag_result_idx) {
721 if(result_len) { *result_len = (unsigned int)(p - result); }
722 assert((unsigned int)(p - result) <= snip->max_tagged_len);