1 /* Copyright(C) 2004 Brazil
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 Add fast string-normalization function into Senna.
20 Author: NTT DATA Corporation
32 static sen_set *prefix = NULL;
33 static sen_set *suffix = NULL;
38 #define PREFIX_PATH SENNA_HOME PATH_SEPARATOR "prefix"
39 #define SUFFIX_PATH SENNA_HOME PATH_SEPARATOR "suffix"
47 prefix = sen_set_open(2, sizeof(int), 0);
48 if (!prefix) { SEN_LOG(sen_log_alert, "sen_set_open on prefix_init failed !"); return; }
49 if ((fp = fopen(PREFIX_PATH, "r"))) {
50 for (i = 0; i < N_PREFIX; i++) {
51 if (!fgets(buffer, 4, fp)) { break; }
52 sen_set_get(prefix, buffer, (void **)&ip);
65 suffix = sen_set_open(2, 0, 0);
66 if (!suffix) { SEN_LOG(sen_log_alert, "sen_set_open on suffix_init failed !"); return; }
67 if ((fp = fopen(SUFFIX_PATH, "r"))) {
68 for (i = N_SUFFIX; i; i--) {
69 if (!fgets(buffer, 4, fp)) { break; }
70 sen_set_get(suffix, buffer, NULL);
77 sen_str_charlen_utf8(const unsigned char *str, const unsigned char *end)
79 /* MEMO: This function allows non-null-terminated string as str. */
80 /* But requires the end of string. */
/* Visible behavior: examines the UTF-8 sequence that starts at str, bounded
 * by end, and returns 0 for an empty range or a NUL lead byte. Malformed
 * input is reported through SEN_LOG at warning level. The statements that
 * return the computed size are outside this excerpt. */
81 const unsigned char *p = str;
/* NOTE(review): *p is evaluated before p >= end, so a call with str == end
 * still reads the byte at end. Testing the bound first would avoid that. */
82 if (!*p || p >= end) { return 0; }
/* Starting at bit 0x40, count the consecutive set bits of the lead byte.
 * w ends up as the number of continuation bytes the loop below expects. */
86 for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
88 SEN_LOG(sen_log_warning, "invalid utf8 string(1) on sen_str_charlen_utf8");
/* size starts at 1 for the lead byte and grows by one per continuation. */
91 for (size = 1; w--; size++) {
/* Each continuation byte must lie before end, be non-NUL and match
 * the bit pattern 10xxxxxx. */
92 if (++p >= end || !*p || (*p & 0xc0) != 0x80) {
93 SEN_LOG(sen_log_warning, "invalid utf8 string(2) on sen_str_charlen_utf8");
105 fast_sen_str_charlen_utf8(const unsigned char *s, const unsigned char *e)
112 if ((*s & 0x80) == 0)
114 else if ((*s & 0xe0) == 0xc0)
116 else if ((*s & 0xf0) == 0xe0)
118 else if ((*s & 0xf8) == 0xf0)
120 else if ((*s & 0xfc) == 0xf8)
122 else if ((*s & 0xfe) == 0xfc)
130 sen_str_charlen(const char *str, sen_encoding encoding)
132 /* MEMO: This function requires null-terminated string as str.*/
133 unsigned char *p = (unsigned char *) str;
134 if (!*p) { return 0; }
136 case sen_enc_euc_jp :
141 /* This is invalid character */
142 SEN_LOG(sen_log_warning, "invalid euc-jp string end on sen_str_charlen");
152 for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
154 SEN_LOG(sen_log_warning, "invalid utf8 string(1) on sen_str_charlen");
157 for (size = 1; w--; size++) {
158 if (!*++p || (*p & 0xc0) != 0x80) {
159 SEN_LOG(sen_log_warning, "invalid utf8 string(2) on sen_str_charlen");
170 /* We treat 0xa0 as JIS X 0201 kana, to match the behavior of other tools. */
171 if (0xa0 <= *p && *p <= 0xdf) {
174 } else if (!(*(p + 1))) {
175 /* This is invalid character */
176 SEN_LOG(sen_log_warning, "invalid sjis string end on sen_str_charlen");
193 sen_str_charlen_nonnull(const char *str, const char *end, sen_encoding encoding)
195 /* MEMO: This function allows non-null-terminated string as str. */
196 /* But requires the end of string. */
197 unsigned char *p = (unsigned char *) str;
198 if (p >= (unsigned char *)end) { return 0; }
200 case sen_enc_euc_jp :
202 if ((p + 1) < (unsigned char *)end) {
205 /* This is invalid character */
206 SEN_LOG(sen_log_warning, "invalid euc-jp string end on sen_str_charlen_nonnull");
213 return sen_str_charlen_utf8(p, (unsigned char *)end);
217 /* We treat 0xa0 as JIS X 0201 kana, to match the behavior of other tools. */
218 if (0xa0 <= *p && *p <= 0xdf) {
221 } else if (++p >= (unsigned char *)end) {
222 /* This is invalid character */
223 SEN_LOG(sen_log_warning, "invalid sjis string end on sen_str_charlen_nonnull");
242 if (prefix) { sen_set_close(prefix); }
243 if (suffix) { sen_set_close(suffix); }
248 sen_str_get_prefix_order(const char *str)
251 if (!str) { return -1; }
252 if (!prefix) { prefix_init(); }
253 if (prefix && sen_set_at(prefix, str, (void **)&ip)) {
260 static unsigned char symbol[] = {
261 ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
262 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
263 '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
264 '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
265 '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
269 normalize_euc(sen_nstr *nstr)
271 static uint16_t hankana[] = {
272 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
273 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
274 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
275 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
276 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
277 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
278 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
281 static unsigned char dakuten[] = {
282 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
283 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
284 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
287 static unsigned char handaku[] = {
288 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
291 sen_ctx *ctx = nstr->ctx;
292 const unsigned char *s, *s_, *e;
293 unsigned char *d, *d0, *d_, b;
294 uint_least8_t *cp, *ctypes, ctype;
295 size_t size = nstr->orig_blen, length = 0;
296 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
297 if (!(nstr->norm = SEN_MALLOC(size * 2 + 1))) {
298 return sen_memory_exhausted;
300 d0 = (unsigned char *) nstr->norm;
301 if (nstr->flags & SEN_STR_WITH_CHECKS) {
302 if (!(nstr->checks = SEN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
303 SEN_FREE(nstr->norm);
305 return sen_memory_exhausted;
309 if (nstr->flags & SEN_STR_WITH_CTYPES) {
310 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
311 SEN_FREE(nstr->checks);
312 SEN_FREE(nstr->norm);
315 return sen_memory_exhausted;
318 cp = ctypes = nstr->ctypes;
319 e = (unsigned char *)nstr->orig + size;
320 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
322 if (((s + 1) < e) && (*(s + 1) & 0x80)) {
323 unsigned char c1 = *s++, c2 = *s, c3 = 0;
326 if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
327 uint16_t c = hankana[c2 - 0xa0];
330 if (d > d0 + 1 && d[-2] == 0xa5
331 && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
333 if (ch) { ch[-1] += 2; s_ += 2; }
336 *d++ = c >> 8; *d = c & 0xff;
340 if (d > d0 + 1 && d[-2] == 0xa5
341 && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
343 if (ch) { ch[-1] += 2; s_ += 2; }
346 *d++ = c >> 8; *d = c & 0xff;
350 *d++ = c >> 8; *d = c & 0xff;
353 ctype = sen_str_katakana;
356 ctype = sen_str_others;
361 ctype = sen_str_others;
369 ctype = sen_str_katakana;
373 ctype = sen_str_kanji;
377 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
381 ctype = SEN_NSTR_BLANK|sen_str_symbol;
385 if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
387 ctype = sen_str_symbol;
390 ctype = sen_str_others;
397 ctype = sen_str_symbol;
401 if ('a' <= c3 && c3 <= 'z') {
402 ctype = sen_str_alpha;
404 } else if ('A' <= c3 && c3 <= 'Z') {
405 ctype = sen_str_alpha;
407 } else if ('0' <= c3 && c3 <= '9') {
408 ctype = sen_str_digit;
411 ctype = sen_str_others;
417 ctype = sen_str_hiragana;
421 ctype = sen_str_katakana;
427 ctype = sen_str_symbol;
431 ctype = sen_str_others;
437 ctype = sen_str_kanji;
441 /* skip invalid character */
445 unsigned char c = *s;
449 /* skip unprintable ascii */
450 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
455 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
459 ctype = SEN_NSTR_BLANK|sen_str_symbol;
463 ctype = sen_str_symbol;
468 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
471 *d = ('A' <= c) ? c + 0x20 : c;
472 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
475 *d = (c <= 'Z') ? c + 0x20 : c;
476 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
480 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
484 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
488 ctype = sen_str_others;
494 if (cp) { *cp++ = ctype; }
496 *ch++ = (int16_t)(s + 1 - s_);
498 while (++d_ < d) { *ch++ = 0; }
501 if (cp) { *cp = sen_str_null; }
503 nstr->length = length;
504 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
/* Forward declarations of the NFKC helpers used by the UTF-8 normalizers in
 * this file. Their definitions do not appear in this excerpt. */
/* normalize_utf8 stores the result for each emitted character in the ctypes
 * array. */
509 uint_least8_t sen_nfkc_ctype(const unsigned char *str);
/* Callers test the result for NULL and, when it is non-NULL, measure it with
 * strlen, so it is handled as a NUL-terminated replacement string. */
510 const char *sen_nfkc_map1(const unsigned char *str);
/* Called with the previously emitted character (d_) as prefix and the current
 * character (p) as suffix. A non-NULL result is again measured with strlen.
 * NOTE(review): presumably a composition lookup - confirm at the definition. */
511 const char *sen_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
514 * All changes to normalize_utf8() were backported from Senna 1.1.5,
515 * because normalize_utf8() in Senna 1.1.2 has a bug: a buffer
516 * overflow can occur when U+FDFA or U+3316 is given.
519 normalize_utf8(sen_nstr *nstr)
522 sen_ctx *ctx = nstr->ctx;
523 const unsigned char *s, *s_, *s__, *p, *p2, *pe, *e;
524 unsigned char *d, *d_, *de;
526 size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
527 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
528 if (!(nstr->norm = SEN_MALLOC(ds + 1))) {
529 return sen_memory_exhausted;
531 if (nstr->flags & SEN_STR_WITH_CHECKS) {
532 if (!(nstr->checks = SEN_MALLOC(ds * sizeof(int16_t) + 1))) {
533 SEN_FREE(nstr->norm);
535 return sen_memory_exhausted;
539 if (nstr->flags & SEN_STR_WITH_CTYPES) {
540 if (!(nstr->ctypes = SEN_MALLOC(ds + 1))) {
542 SEN_FREE(nstr->checks); nstr->checks = NULL;
544 SEN_FREE(nstr->norm); nstr->norm = NULL;
545 return sen_memory_exhausted;
549 d = (unsigned char *)nstr->norm;
552 e = (unsigned char *)nstr->orig + size;
553 for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
554 if (!(ls = sen_str_charlen_utf8(s, e))) {
557 if ((p = (unsigned char *)sen_nfkc_map1(s))) {
558 pe = p + strlen((char *)p);
563 if (d_ && (p2 = (unsigned char *)sen_nfkc_map2(d_, p))) {
565 pe = p + strlen((char *)p);
575 if (!(lp = sen_str_charlen_utf8(p, pe))) {
578 if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
579 if (cp > nstr->ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
583 ds += (ds >> 1) + lp;
584 if (!(norm = SEN_REALLOC(nstr->norm, ds + 1))) {
585 if (nstr->ctypes) { SEN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
586 if (nstr->checks) { SEN_FREE(nstr->checks); nstr->checks = NULL; }
587 SEN_FREE(nstr->norm); nstr->norm = NULL;
588 return sen_memory_exhausted;
591 d = norm + (d - (unsigned char *)nstr->norm);
595 if (!(checks = SEN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
596 if (nstr->ctypes) { SEN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
597 SEN_FREE(nstr->checks); nstr->checks = NULL;
598 SEN_FREE(nstr->norm); nstr->norm = NULL;
599 return sen_memory_exhausted;
601 ch = checks + (ch - nstr->checks);
602 nstr->checks = checks;
605 uint_least8_t *ctypes;
606 if (!(ctypes = SEN_REALLOC(nstr->ctypes, ds + 1))) {
607 SEN_FREE(nstr->ctypes); nstr->ctypes = NULL;
608 if (nstr->checks) { SEN_FREE(nstr->checks); nstr->checks = NULL; }
609 SEN_FREE(nstr->norm); nstr->norm = NULL;
610 return sen_memory_exhausted;
612 cp = ctypes + (cp - nstr->ctypes);
613 nstr->ctypes = ctypes;
621 if (cp) { *cp++ = sen_nfkc_ctype(p); }
627 *ch++ = (int16_t)(s + ls - s_);
631 for (i = lp; i > 1; i--) { *ch++ = 0; }
636 if (cp) { *cp = sen_str_null; }
638 nstr->length = length;
639 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
643 /* Assume that nstr->flags is always zero */
645 fast_normalize_utf8(sen_nstr *nstr)
647 sen_ctx *ctx = nstr->ctx;
648 const unsigned char *s, *s_, *p, *p2, *pe, *e;
649 unsigned char *d, *d_, *de;
650 size_t ls, lp, size = nstr->orig_blen;
655 nstr->norm = SEN_MALLOC(ds + 1);
656 if (nstr->norm == NULL)
657 return sen_memory_exhausted;
659 d = (unsigned char *)nstr->norm;
662 e = (unsigned char *)nstr->orig + size;
664 for (s = s_ = (unsigned char *)nstr->orig; ; s += ls)
666 if (!(ls = fast_sen_str_charlen_utf8(s, e)))
669 if ((p = (unsigned char *)sen_nfkc_map1(s)))
671 pe = p + strlen((char *)p);
680 if (d_ && (p2 = (unsigned char *)sen_nfkc_map2(d_, p)))
683 pe = p + strlen((char *)p);
688 /* Skip unprintable ascii */
699 if (!(lp = fast_sen_str_charlen_utf8(p, pe)))
706 ds += (ds >> 1) + lp;
707 if (!(norm = SEN_REALLOC(nstr->norm, ds + 1)))
709 SEN_FREE(nstr->norm); nstr->norm = NULL;
710 return sen_memory_exhausted;
713 d = norm + (d - (unsigned char *)nstr->norm);
730 ds += (ds >> 1) + ls;
731 if (!(norm = SEN_REALLOC(nstr->norm, ds + 1)))
733 SEN_FREE(nstr->norm); nstr->norm = NULL;
734 return sen_memory_exhausted;
737 d = norm + (d - (unsigned char *)nstr->norm);
747 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
753 normalize_sjis(sen_nstr *nstr)
755 static uint16_t hankana[] = {
756 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
757 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
758 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
759 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
760 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
761 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
762 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
765 static unsigned char dakuten[] = {
766 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
767 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
768 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
771 static unsigned char handaku[] = {
772 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
775 sen_ctx *ctx = nstr->ctx;
776 const unsigned char *s, *s_;
777 unsigned char *d, *d0, *d_, b, *e;
778 uint_least8_t *cp, *ctypes, ctype;
779 size_t size = nstr->orig_blen, length = 0;
780 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
781 if (!(nstr->norm = SEN_MALLOC(size * 2 + 1))) {
782 return sen_memory_exhausted;
784 d0 = (unsigned char *) nstr->norm;
785 if (nstr->flags & SEN_STR_WITH_CHECKS) {
786 if (!(nstr->checks = SEN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
787 SEN_FREE(nstr->norm);
789 return sen_memory_exhausted;
793 if (nstr->flags & SEN_STR_WITH_CTYPES) {
794 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
795 SEN_FREE(nstr->checks);
796 SEN_FREE(nstr->norm);
799 return sen_memory_exhausted;
802 cp = ctypes = nstr->ctypes;
803 e = (unsigned char *)nstr->orig + size;
804 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
806 if (0xa0 <= *s && *s <= 0xdf) {
807 uint16_t c = hankana[*s - 0xa0];
810 if (d > d0 + 1 && d[-2] == 0x83
811 && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
813 if (ch) { ch[-1]++; s_++; }
816 *d++ = c >> 8; *d = c & 0xff;
820 if (d > d0 + 1 && d[-2] == 0x83
821 && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
823 if (ch) { ch[-1]++; s_++; }
826 *d++ = c >> 8; *d = c & 0xff;
830 *d++ = c >> 8; *d = c & 0xff;
833 ctype = sen_str_katakana;
835 if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
836 unsigned char c1 = *s++, c2 = *s, c3 = 0;
837 if (0x81 <= c1 && c1 <= 0x87) {
843 ctype = sen_str_katakana;
847 ctype = sen_str_kanji;
851 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
855 ctype = SEN_NSTR_BLANK|sen_str_symbol;
859 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
861 ctype = sen_str_symbol;
862 } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
864 ctype = sen_str_symbol;
867 ctype = sen_str_others;
874 if (0x4f <= c2 && c2 <= 0x58) {
875 ctype = sen_str_digit;
877 } else if (0x60 <= c2 && c2 <= 0x79) {
878 ctype = sen_str_alpha;
880 } else if (0x81 <= c2 && c2 <= 0x9a) {
881 ctype = sen_str_alpha;
883 } else if (0x9f <= c2 && c2 <= 0xf1) {
885 ctype = sen_str_hiragana;
888 ctype = sen_str_others;
892 if (0x40 <= c2 && c2 <= 0x96) {
894 ctype = sen_str_katakana;
897 ctype = sen_str_symbol;
903 ctype = sen_str_symbol;
907 ctype = sen_str_others;
912 ctype = sen_str_kanji;
915 /* skip invalid character */
920 unsigned char c = *s;
924 /* skip unprintable ascii */
925 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
930 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
934 ctype = SEN_NSTR_BLANK|sen_str_symbol;
938 ctype = sen_str_symbol;
943 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
946 *d = ('A' <= c) ? c + 0x20 : c;
947 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
950 *d = (c <= 'Z') ? c + 0x20 : c;
951 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
955 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
959 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
963 ctype = sen_str_others;
969 if (cp) { *cp++ = ctype; }
971 *ch++ = (int16_t)(s + 1 - s_);
973 while (++d_ < d) { *ch++ = 0; }
976 if (cp) { *cp = sen_str_null; }
978 nstr->length = length;
979 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
984 normalize_none(sen_nstr *nstr)
987 sen_ctx *ctx = nstr->ctx;
988 const unsigned char *s, *s_, *e;
989 unsigned char *d, *d0, *d_;
990 uint_least8_t *cp, *ctypes, ctype;
991 size_t size = nstr->orig_blen, length = 0;
992 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
993 if (!(nstr->norm = SEN_MALLOC(size + 1))) {
994 return sen_memory_exhausted;
996 d0 = (unsigned char *) nstr->norm;
997 if (nstr->flags & SEN_STR_WITH_CHECKS) {
998 if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
999 SEN_FREE(nstr->norm);
1001 return sen_memory_exhausted;
1005 if (nstr->flags & SEN_STR_WITH_CTYPES) {
1006 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
1007 SEN_FREE(nstr->checks);
1008 SEN_FREE(nstr->norm);
1009 nstr->checks = NULL;
1011 return sen_memory_exhausted;
1014 cp = ctypes = nstr->ctypes;
1015 e = (unsigned char *)nstr->orig + size;
1016 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
1017 unsigned char c = *s;
1021 /* skip unprintable ascii */
1022 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1027 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1031 ctype = SEN_NSTR_BLANK|sen_str_symbol;
1035 ctype = sen_str_symbol;
1040 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
1043 *d = ('A' <= c) ? c + 0x20 : c;
1044 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1047 *d = (c <= 'Z') ? c + 0x20 : c;
1048 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1052 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1056 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
1060 ctype = sen_str_others;
1065 if (cp) { *cp++ = ctype; }
1067 *ch++ = (int16_t)(s + 1 - s_);
1069 while (++d_ < d) { *ch++ = 0; }
1072 if (cp) { *cp = sen_str_null; }
1074 nstr->length = length;
1075 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
1079 /* use cp1252 as latin1 */
/* Byte-wise normalizer for single-byte latin1 text. It fills nstr->norm and,
 * depending on nstr->flags, nstr->checks and nstr->ctypes, then records
 * nstr->length and nstr->norm_blen. Returns sen_memory_exhausted when an
 * allocation fails. The switch that selects among the per-range branches
 * below is outside this excerpt. */
1080 inline static sen_rc
1081 normalize_latin1(sen_nstr *nstr)
1084 sen_ctx *ctx = nstr->ctx;
1085 const unsigned char *s, *s_, *e;
1086 unsigned char *d, *d0, *d_;
1087 uint_least8_t *cp, *ctypes, ctype;
/* NOTE(review): the input length is taken with strlen here, whereas
 * normalize_euc, normalize_sjis and normalize_none use nstr->orig_blen.
 * Input that is not NUL-terminated, or that contains an embedded NUL, is
 * therefore handled differently by this normalizer. */
1088 size_t size = strlen(nstr->orig), length = 0;
1089 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
/* One output byte per input byte plus one extra byte. */
1090 if (!(nstr->norm = SEN_MALLOC(size + 1))) {
1091 return sen_memory_exhausted;
1093 d0 = (unsigned char *) nstr->norm;
1094 if (nstr->flags & SEN_STR_WITH_CHECKS) {
1095 if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
1096 SEN_FREE(nstr->norm);
1098 return sen_memory_exhausted;
1102 if (nstr->flags & SEN_STR_WITH_CTYPES) {
1103 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
/* NOTE(review): checks is released without a NULL test, although it was only
 * allocated when SEN_STR_WITH_CHECKS is set. normalize_utf8 guards the same
 * call with if (nstr->checks). Confirm that SEN_FREE accepts NULL. */
1104 SEN_FREE(nstr->checks);
1105 SEN_FREE(nstr->norm);
1106 nstr->checks = NULL;
1108 return sen_memory_exhausted;
1111 cp = ctypes = nstr->ctypes;
1112 e = (unsigned char *)nstr->orig + size;
/* s walks the input, d the output. s_ and d_ mark positions used further
 * down when the checks entries are written. */
1113 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
1114 unsigned char c = *s;
1118 /* skip unprintable ascii */
/* A skipped byte marks the previously emitted character as followed by a
 * blank, provided at least one ctype has been written. */
1119 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1124 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1128 ctype = SEN_NSTR_BLANK|sen_str_symbol;
1132 ctype = sen_str_symbol;
/* Bytes up to 0x39 in this range are the ASCII digits. */
1137 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
/* ASCII upper-case letters are folded to lower case by adding 0x20. */
1140 *d = ('A' <= c) ? c + 0x20 : c;
1141 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1144 *d = (c <= 'Z') ? c + 0x20 : c;
1145 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1149 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1153 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
/* 0x8a, 0x8c and 0x8e are classed as letters. Other bytes reaching this
 * branch are classed as symbols. */
1156 if (c == 0x8a || c == 0x8c || c == 0x8e) {
1158 ctype = sen_str_alpha;
1161 ctype = sen_str_symbol;
1165 if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
/* 0x9f is mapped to 0xff (0x9f + 0x60). The other three bytes are copied
 * unchanged. */
1166 *d = (c == 0x9f) ? c + 0x60 : c;
1167 ctype = sen_str_alpha;
1170 ctype = sen_str_symbol;
1175 ctype = sen_str_alpha;
/* 0xd7 and 0xdf are copied unchanged. Every other byte handled here is
 * shifted up by 0x20. 0xd7 is classed as a symbol, the rest as letters. */
1178 *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
1179 ctype = (c == 0xd7) ? sen_str_symbol : sen_str_alpha;
1183 ctype = sen_str_alpha;
/* 0xf7 is the only byte classed as a symbol in this branch. */
1187 ctype = (c == 0xf7) ? sen_str_symbol : sen_str_alpha;
1191 ctype = sen_str_others;
1196 if (cp) { *cp++ = ctype; }
/* First checks entry of a character: the source distance from s_ to just
 * past s. Any further output bytes of the same character get 0. */
1198 *ch++ = (int16_t)(s + 1 - s_);
1200 while (++d_ < d) { *ch++ = 0; }
/* Terminate the ctypes array when one was requested. */
1203 if (cp) { *cp = sen_str_null; }
1205 nstr->length = length;
1206 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
/* Byte-wise normalizer for KOI8-R text. It fills nstr->norm and, depending
 * on nstr->flags, nstr->checks and nstr->ctypes, then records nstr->length
 * and nstr->norm_blen. Returns sen_memory_exhausted when an allocation
 * fails. The switch that selects among the per-range branches below is
 * outside this excerpt. */
1210 inline static sen_rc
1211 normalize_koi8r(sen_nstr *nstr)
1214 sen_ctx *ctx = nstr->ctx;
1215 const unsigned char *s, *s_, *e;
1216 unsigned char *d, *d0, *d_;
1217 uint_least8_t *cp, *ctypes, ctype;
/* NOTE(review): the input length is taken with strlen here, whereas
 * normalize_euc, normalize_sjis and normalize_none use nstr->orig_blen.
 * Input that is not NUL-terminated, or that contains an embedded NUL, is
 * therefore handled differently by this normalizer. */
1218 size_t size = strlen(nstr->orig), length = 0;
1219 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
/* One output byte per input byte plus one extra byte. */
1220 if (!(nstr->norm = SEN_MALLOC(size + 1))) {
1221 return sen_memory_exhausted;
1223 d0 = (unsigned char *) nstr->norm;
1224 if (nstr->flags & SEN_STR_WITH_CHECKS) {
1225 if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
1226 SEN_FREE(nstr->norm);
1228 return sen_memory_exhausted;
1232 if (nstr->flags & SEN_STR_WITH_CTYPES) {
1233 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
/* NOTE(review): checks is released without a NULL test, although it was only
 * allocated when SEN_STR_WITH_CHECKS is set. normalize_utf8 guards the same
 * call with if (nstr->checks). Confirm that SEN_FREE accepts NULL. */
1234 SEN_FREE(nstr->checks);
1235 SEN_FREE(nstr->norm);
1236 nstr->checks = NULL;
1238 return sen_memory_exhausted;
1241 cp = ctypes = nstr->ctypes;
1242 e = (unsigned char *)nstr->orig + size;
/* s walks the input, d the output. s_ and d_ mark positions used further
 * down when the checks entries are written. */
1243 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
1244 unsigned char c = *s;
1248 /* skip unprintable ascii */
/* A skipped byte marks the previously emitted character as followed by a
 * blank, provided at least one ctype has been written. */
1249 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1254 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1258 ctype = SEN_NSTR_BLANK|sen_str_symbol;
1262 ctype = sen_str_symbol;
/* Bytes up to 0x39 in this range are the ASCII digits. */
1267 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
/* ASCII upper-case letters are folded to lower case by adding 0x20. */
1270 *d = ('A' <= c) ? c + 0x20 : c;
1271 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1274 *d = (c <= 'Z') ? c + 0x20 : c;
1275 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1279 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1283 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
/* In this branch only 0xa3 is classed as a letter. */
1287 ctype = (c == 0xa3) ? sen_str_alpha : sen_str_others;
1292 ctype = sen_str_alpha;
1295 ctype = sen_str_others;
1301 ctype = sen_str_alpha;
1306 ctype = sen_str_alpha;
1310 ctype = sen_str_others;
1315 if (cp) { *cp++ = ctype; }
/* First checks entry of a character: the source distance from s_ to just
 * past s. Any further output bytes of the same character get 0. */
1317 *ch++ = (int16_t)(s + 1 - s_);
1319 while (++d_ < d) { *ch++ = 0; }
/* Terminate the ctypes array when one was requested. */
1322 if (cp) { *cp = sen_str_null; }
1324 nstr->length = length;
1325 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
/* Creates a sen_nstr for the str_len bytes at str, records the length,
 * encoding and flags, and runs the normalizer that matches the encoding.
 * Returns NULL when str is NULL. The success return and the condition under
 * which sen_nstr_close is called are outside this excerpt. */
1330 sen_nstr_open(const char *str, size_t str_len, sen_encoding encoding, int flags)
1333 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1335 if (!str) { return NULL; }
1336 if (!(nstr = SEN_MALLOC(sizeof(sen_nstr)))) {
/* NOTE(review): the alert text names sen_fakenstr_open although this is
 * sen_nstr_open. It looks like a copy-paste leftover that will mislead
 * anyone reading the log. */
1337 SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1341 nstr->orig_blen = str_len;
/* Output fields start empty. The selected normalizer allocates them. */
1343 nstr->norm_blen = 0;
1344 nstr->checks = NULL;
1345 nstr->ctypes = NULL;
1346 nstr->encoding = encoding;
1347 nstr->flags = flags;
/* Dispatch on the encoding. The switch header and several case labels are
 * not part of this excerpt. */
1350 case sen_enc_euc_jp :
1351 rc = normalize_euc(nstr);
/* The next two calls sit on either side of a NO_NFKC conditional whose
 * opening directives are not shown here. */
1355 rc = normalize_none(nstr);
1357 rc = normalize_utf8(nstr);
1358 #endif /* NO_NFKC */
1361 rc = normalize_sjis(nstr);
1363 case sen_enc_latin1 :
1364 rc = normalize_latin1(nstr);
1366 case sen_enc_koi8r :
1367 rc = normalize_koi8r(nstr);
1370 rc = normalize_none(nstr);
/* NOTE(review): presumably reached only when rc reports a failure - confirm
 * against the full source. */
1374 sen_nstr_close(nstr);
1380 /* Assume that current encoding is UTF8 */
/* UTF-8-only counterpart of sen_nstr_open. It creates a sen_nstr for the
 * str_len bytes at str and normalizes it with fast_normalize_utf8. The
 * return statements are outside this excerpt. */
1382 fast_sen_nstr_open(const char *str, size_t str_len)
1384 sen_ctx *ctx = &sen_gctx;
1385 sen_nstr *nstr = NULL;
1390 nstr = SEN_MALLOC(sizeof(sen_nstr));
/* NOTE(review): the alert text names sen_fakenstr_open although this is
 * fast_sen_nstr_open. It looks like a copy-paste leftover. */
1393 SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1398 nstr->orig_blen = str_len;
1400 nstr->norm_blen = 0;
/* No checks or ctypes arrays are set up on this path. */
1401 nstr->checks = NULL;
1402 nstr->ctypes = NULL;
1403 nstr->encoding = sen_enc_utf8;
/* A non-zero result from the normalizer is treated as failure and the
 * partially built object is closed. */
1407 if (fast_normalize_utf8(nstr))
1409 sen_nstr_close(nstr);
/* Builds a sen_nstr without normalizing. norm becomes a NUL-terminated copy
 * of the str_len bytes at str, and norm_blen equals str_len. When
 * SEN_STR_WITH_CHECKS is set a checks array with one entry per byte is
 * filled according to the encoding. Otherwise checks is NULL. ctypes is
 * always NULL. The return statements are outside this excerpt. */
1417 sen_fakenstr_open(const char *str, size_t str_len, sen_encoding encoding, int flags)
1419 /* TODO: support SEN_STR_REMOVEBLANK flag and ctypes */
1421 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1423 if (!(nstr = SEN_MALLOC(sizeof(sen_nstr)))) {
1424 SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1427 if (!(nstr->norm = SEN_MALLOC(str_len + 1))) {
/* NOTE(review): this alert text names sen_snip_add_cond and speaks of a
 * keyword, although the failing allocation is the norm buffer of
 * sen_fakenstr_open. It looks like a copy-paste leftover. */
1428 SEN_LOG(sen_log_alert, "memory allocation for keyword on sen_snip_add_cond failed !");
1433 nstr->orig_blen = str_len;
/* Verbatim copy of the input followed by a terminator. */
1434 memcpy(nstr->norm, str, str_len);
1435 nstr->norm[str_len] = '\0';
1436 nstr->norm_blen = str_len;
1437 nstr->ctypes = NULL;
1438 nstr->flags = flags;
1441 if (flags & SEN_STR_WITH_CHECKS) {
/* One int16_t per input byte. */
1445 if (!(nstr->checks = (int16_t *) SEN_MALLOC(sizeof(int16_t) * str_len))) {
1446 SEN_FREE(nstr->norm);
/* In the three multi-byte loops below an entry receives either f, the byte
 * length derived from a lead byte, or 0. The lines that choose between the
 * two assignments are not part of this excerpt. */
1451 case sen_enc_euc_jp:
1452 for (i = 0; i < str_len; i++) {
1454 c = (unsigned char) str[i];
/* EUC-JP: a lead byte in 0xa1..0xfe or equal to 0x8e gives 2, 0x8f gives 3,
 * anything else 1. */
1455 f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
1457 nstr->checks[i] = f;
1459 nstr->checks[i] = 0;
1465 for (i = 0; i < str_len; i++) {
1467 c = (unsigned char) str[i];
/* Lead bytes 0x81..0x9f and 0xe0..0xfc give 2, anything else 1. */
1468 f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
1469 nstr->checks[i] = f;
1471 nstr->checks[i] = 0;
1477 for (i = 0; i < str_len; i++) {
1479 c = (unsigned char) str[i];
/* Length is chosen from the high bits of the lead byte. The rest of this
 * expression is not shown. */
1480 f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
1483 nstr->checks[i] = f;
1485 nstr->checks[i] = 0;
/* Remaining encodings: every byte is its own character. */
1491 for (i = 0; i < str_len; i++) {
1492 nstr->checks[i] = 1;
1498 nstr->checks = NULL;
1504 sen_nstr_close(sen_nstr *nstr)
1507 sen_ctx *ctx = nstr->ctx;
1508 if (nstr->norm) { SEN_FREE(nstr->norm); }
1509 if (nstr->ctypes) { SEN_FREE(nstr->ctypes); }
1510 if (nstr->checks) { SEN_FREE(nstr->checks); }
1514 return sen_invalid_argument;
1518 static const char *sen_enc_string[] = {
1529 sen_enctostr(sen_encoding enc)
1531 if (enc < (sizeof(sen_enc_string) / sizeof(char *))) {
1532 return sen_enc_string[enc];
1539 sen_strtoenc(const char *str)
1541 sen_encoding e = sen_enc_euc_jp;
1542 int i = sizeof(sen_enc_string) / sizeof(sen_enc_string[0]);
1544 if (!strcmp(str, sen_enc_string[i])) {
1545 e = (sen_encoding)i;
1552 sen_str_len(const char *str, sen_encoding encoding, const char **last)
1555 const char *p = NULL;
1556 for (len = 0; ; len++) {
1558 if (!(tlen = sen_str_charlen(str, encoding))) {
1563 if (last) { *last = p; }
1568 sen_isspace(const char *str, sen_encoding encoding)
1570 const unsigned char *s = (const unsigned char *) str;
1571 if (!s) { return 0; }
1581 if (encoding == sen_enc_sjis && s[1] == 0x40) { return 2; }
1584 if (encoding == sen_enc_euc_jp && s[1] == 0xA1) { return 2; }
1587 if (encoding == sen_enc_utf8 && s[1] == 0x80 && s[2] == 0x80) { return 3; }
/* Parses an optional minus sign followed by decimal digits from the range
 * [nptr, end). When rest is non-NULL it receives either nptr or the position
 * after the last digit consumed, depending on o. The lines that set n and o
 * and the return statement are outside this excerpt. */
1596 sen_atoi(const char *nptr, const char *end, const char **rest)
1598 /* FIXME: INT_MIN is not supported */
1599 const char *p = nptr;
1600 int v = 0, t, n = 0, o = 0;
1601 if (p < end && *p == '-') {
1606 while (p < end && *p >= '0' && *p <= '9') {
/* NOTE(review): v and t are plain int, so an overflowing v * 10 + digit is
 * undefined behavior in C. The t < v test below relies on wraparound and may
 * be removed by an optimizing compiler. Checking against INT_MAX before the
 * multiplication would be well defined. */
1607 t = v * 10 + (*p - '0');
/* On detected overflow the result is reset to 0 and parsing stops. */
1608 if (t < v) { v =0; break; }
1613 if (rest) { *rest = o ? nptr : p; }
/* Parses decimal digits from the range [nptr, end) into an unsigned int.
 * When rest is non-NULL it receives the final value of nptr. The lines that
 * advance nptr and return the result are outside this excerpt. */
1618 sen_atoui(const char *nptr, const char *end, const char **rest)
1620 unsigned int v = 0, t;
1621 while (nptr < end && *nptr >= '0' && *nptr <= '9') {
1622 t = v * 10 + (*nptr - '0');
/* On detected wraparound the result is reset to 0 and parsing stops.
 * NOTE(review): t < v does not catch every wrap. With a 32-bit unsigned int,
 * v = 536870912 followed by the digit 0 wraps to 1073741824, which is larger
 * than v, so the input 5368709120 is accepted with a wrong value. */
1623 if (t < v) { v = 0; break; }
1627 if (rest) { *rest = nptr; }
1632 sen_atoll(const char *nptr, const char *end, const char **rest)
1634 /* FIXME: INT_MIN is not supported */
1635 const char *p = nptr;
1638 if (p < end && *p == '-') {
1643 while (p < end && *p >= '0' && *p <= '9') {
1644 t = v * 10 + (*p - '0');
1645 if (t < v) { v = 0; break; }
1650 if (rest) { *rest = o ? nptr : p; }
1655 sen_htoui(const char *nptr, const char *end, const char **rest)
1657 unsigned int v = 0, t;
1658 while (nptr < end) {
1670 t = v * 16 + (*nptr++ - '0');
1678 t = v * 16 + (*nptr++ - 'a') + 10;
1686 t = v * 16 + (*nptr++ - 'A') + 10;
1691 if (t < v) { v = 0; goto exit; }
1695 if (rest) { *rest = nptr; }
1700 sen_str_itoh(unsigned int i, char *p, unsigned int len)
1702 static const char *hex = "0123456789ABCDEF";
1706 *p-- = hex[i & 0xf];
1712 sen_str_itoa(int i, char *p, char *end, char **rest)
1714 /* FIXME: INT_MIN is not supported */
1716 if (p >= end) { return sen_invalid_argument; }
1723 if (p >= end) { return sen_invalid_argument; }
1724 *p++ = i % 10 + '0';
1725 } while ((i /= 10) > 0);
1726 if (rest) { *rest = p; }
1727 for (p--; q < p; q++, p--) {
1736 sen_str_lltoa(int64_t i, char *p, char *end, char **rest)
1738 /* FIXME: INT_MIN is not supported */
1740 if (p >= end) { return sen_invalid_argument; }
1747 if (p >= end) { return sen_invalid_argument; }
1748 *p++ = i % 10 + '0';
1749 } while ((i /= 10) > 0);
1750 if (rest) { *rest = p; }
1751 for (p--; q < p; q++, p--) {
1760 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(i) & 0x3f])
1763 (((b) < '+' || 'z' < (b)) ? 0xff : "\x3e\xff\xff\xff\x3f\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\xff\xff\xff\xff\xff\xff\xff\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\xff\xff\xff\xff\xff\xff\x1a\x1b\x1c\x1d\x1e\x1f\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33"[(b) - '+'])
1765 #define MASK 0x34d34d34
1768 sen_str_itob(sen_id id, char *p)
1771 *p++ = I2B(id >> 24);
1772 *p++ = I2B(id >> 18);
1773 *p++ = I2B(id >> 12);
1774 *p++ = I2B(id >> 6);
1780 sen_str_btoi(char *b)
1787 if ((i = B2I(c)) == 0xff) { return 0; }
1794 sen_str_tok(char *str, size_t str_len, char delim, char **tokbuf, int buf_size, char **rest)
1796 char **tok = tokbuf, **tok_end = tokbuf + buf_size;
1798 char *str_end = str + str_len;
1800 if (str == str_end) {
1804 if (delim == *str) {
1807 if (tok == tok_end) { break; }
1811 if (rest) { *rest = str; }
1812 return tok - tokbuf;
1816 op_getopt_flag(int *flags, const sen_str_getopt_opt *o,
1817 int argc, char * const argv[], int *i)
1820 case getopt_op_none:
1828 case getopt_op_update:
1835 if (++(*i) < argc) {
1844 sen_str_getopt(int argc, char * const argv[], const sen_str_getopt_opt *opts,
1848 for (i = 1; i < argc; i++) {
1849 const char * v = argv[i];
1851 const sen_str_getopt_opt *o;
1855 for (o = opts; o->opt != '\0' || o->longopt != NULL; o++) {
1856 if (o->longopt && !strcmp(v, o->longopt)) {
1857 op_getopt_flag(flags, o, argc, argv, &i);
1862 if (!found) { goto exit; }
1865 for (p = v; *p; p++) {
1867 for (o = opts; o->opt != '\0' || o->longopt != NULL; o++) {
1868 if (o->opt && *p == o->opt) {
1869 op_getopt_flag(flags, o, argc, argv, &i);
1874 if (!found) { goto exit; }
1883 fprintf(stderr, "cannot recognize option '%s'.\n", argv[i]);
/* Allocation granularity for sen_rbuf storage: 1 << 12 = 4096 bytes.
 * sen_rbuf_resize rounds every request up to a multiple of UNIT_SIZE, and
 * sen_rbuf_itoa / sen_rbuf_lltoa grow the buffer by UNIT_SIZE per retry. */
1887 #define UNIT_SIZE (1 << 12)
/* Low-bit mask used by sen_rbuf_resize for the round-up. */
1888 #define UNIT_MASK (UNIT_SIZE - 1)
/* Number of bytes sen_rbuf_resize keeps in front of buf->head inside each
 * allocation. Zero by default. */
1890 int sen_rbuf_margin_size = 0;
1893 sen_rbuf_init(sen_rbuf *buf, size_t size)
1898 return size ? sen_rbuf_resize(buf, size) : sen_success;
/* Reallocates the storage of buf so that it spans newsize bytes plus the
 * margin and one extra byte, rounded up to a multiple of UNIT_SIZE. Returns
 * sen_memory_exhausted when SEN_REALLOC fails. In that case none of the
 * visible lines has modified buf. The success return is outside this
 * excerpt. */
1902 sen_rbuf_resize(sen_rbuf *buf, size_t newsize)
1905 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1906 newsize += sen_rbuf_margin_size + 1;
/* Round up to the next multiple of UNIT_SIZE. */
1907 newsize = (newsize + (UNIT_MASK)) & ~UNIT_MASK;
/* buf->head points sen_rbuf_margin_size bytes into the allocation, so step
 * back to its start. A NULL head (nothing allocated yet) is kept as is. */
1908 head = buf->head - (buf->head ? sen_rbuf_margin_size : 0);
1909 if (!(head = SEN_REALLOC(head, newsize))) { return sen_memory_exhausted; }
/* curr is rebuilt from the new base plus the amount already written. It is
 * assigned before buf->head is replaced.
 * NOTE(review): SEN_RBUF_VSIZE is not visible here. Presumably it reads the
 * old buf->curr and buf->head, which would make this order significant. */
1910 buf->curr = head + sen_rbuf_margin_size + SEN_RBUF_VSIZE(buf);
1911 buf->head = head + sen_rbuf_margin_size;
/* tail is one past the end of the whole allocation. */
1912 buf->tail = head + newsize;
/* Rewinds buf with SEN_RBUF_REWIND, then resizes it to size through
   sen_rbuf_resize and returns that call's result. */
1917 sen_rbuf_reinit(sen_rbuf *buf, size_t size)
1919 SEN_RBUF_REWIND(buf);
1920 return sen_rbuf_resize(buf, size);
/* Copies len bytes from str to buf->curr. If SEN_RBUF_REST(buf) is smaller
   than len, the buffer is first grown to SEN_RBUF_VSIZE(buf) + len; a
   failing resize is returned to the caller and nothing is copied.
   str and the buffer storage must not overlap (memcpy is used). */
1924 sen_rbuf_write(sen_rbuf *buf, const char *str, size_t len)
1926 sen_rc rc = sen_success;
1927 if (SEN_RBUF_REST(buf) < len) {
1928 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1930 memcpy(buf->curr, str, len);
/* Makes sure buf can take len more bytes: when SEN_RBUF_REST(buf) is
   smaller than len, the buffer is grown to SEN_RBUF_VSIZE(buf) + len and a
   failing resize is returned to the caller. */
1936 sen_rbuf_reserve(sen_rbuf *buf, size_t len)
1938 sen_rc rc = sen_success;
1939 if (SEN_RBUF_REST(buf) < len) {
1940 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
/* Reserves len bytes with sen_rbuf_reserve and, if that succeeded, advances
   buf->curr by len. The skipped bytes are not written by this function. */
1946 sen_rbuf_space(sen_rbuf *buf, size_t len)
1948 sen_rc rc = sen_rbuf_reserve(buf, len);
1949 if (!rc) { buf->curr += len; }
/* Writes the text form of i produced by sen_str_itoa into buf.
   sen_str_itoa is given the window [buf->curr, buf->tail) and updates
   buf->curr through its last argument. While it returns non-zero the buffer
   is enlarged by UNIT_SIZE and the conversion is retried; a failing resize
   is returned to the caller.
   NOTE(review): a non-zero result presumably means "did not fit" -- confirm
   against sen_str_itoa. */
1954 sen_rbuf_itoa(sen_rbuf *buf, int i)
1956 sen_rc rc = sen_success;
1957 while (sen_str_itoa(i, buf->curr, buf->tail, &buf->curr)) {
1958 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_WSIZE(buf) + UNIT_SIZE))) { return rc; }
/* 64-bit counterpart of sen_rbuf_itoa: writes the text form of i produced
   by sen_str_lltoa into the window [buf->curr, buf->tail). While
   sen_str_lltoa returns non-zero the buffer is enlarged by UNIT_SIZE and the
   conversion is retried; a failing resize is returned to the caller. */
1964 sen_rbuf_lltoa(sen_rbuf *buf, int64_t i)
1966 sen_rc rc = sen_success;
1967 while (sen_str_lltoa(i, buf->curr, buf->tail, &buf->curr)) {
1968 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_WSIZE(buf) + UNIT_SIZE))) { return rc; }
/* Writes a textual form of the double d into buf.
   Room for len bytes is ensured up front; a failing resize is returned.
   The value is then classified with fpclassify():
     - one class emits the literal "#<nan>";
     - one class emits "#i1/0" when d > 0 and "#i-1/0" otherwise;
     - the remaining path formats d with "%#.15g" and trims the result.
   NOTE(review): the first two are presumably FP_NAN and FP_INFINITE.
   NOTE(review): sprintf is unbounded; the reserved len must cover the
   longest "%#.15g" output -- confirm. */
1974 sen_rbuf_ftoa(sen_rbuf *buf, double d)
1977 sen_rc rc = sen_success;
1978 if (SEN_RBUF_REST(buf) < len) {
1979 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1981 switch (fpclassify(d)) {
1983 SEN_RBUF_PUTS(buf, "#<nan>");
1986 SEN_RBUF_PUTS(buf, d > 0 ? "#i1/0" : "#i-1/0");
/* The '#' flag forces a decimal point and keeps trailing zeros, so the
   output always contains a '.' for the trimming code below to anchor on. */
1989 len = sprintf(buf->curr, "%#.15g", d);
/* Output ending in '.' (no fraction digits) gets a '0' appended. */
1990 if (buf->curr[len - 1] == '.') {
1992 SEN_RBUF_PUTC(buf, '0');
1995 buf->curr[len] = '\0';
/* Exponent form: drop trailing zeros of the mantissa, keeping one digit
   after the '.', then slide the exponent part down over the gap. */
1996 if ((p = strchr(buf->curr, 'e'))) {
1997 for (q = p; *(q - 2) != '.' && *(q - 1) == '0'; q--) { len--; }
1998 memmove(q, p, buf->curr + len - q);
/* Plain form: drop trailing zeros from the end, keeping one digit after
   the '.'. */
2000 for (q = buf->curr + len; *(q - 2) != '.' && *(q - 1) == '0'; q--) { len--; }
/* Writes i into buf through sen_str_itoh as a field of len characters at
   buf->curr. Room for len bytes is ensured first; a failing resize is
   returned to the caller.
   NOTE(review): presumably a fixed-width hexadecimal rendering -- confirm
   against sen_str_itoh and the declaration of len. */
2010 sen_rbuf_itoh(sen_rbuf *buf, int i)
2013 sen_rc rc = sen_success;
2014 if (SEN_RBUF_REST(buf) < len) {
2015 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
2017 sen_str_itoh(i, buf->curr, len);
/* Writes the sen_str_itob encoding of id at buf->curr. Room for len bytes
   is ensured first; a failing resize is returned to the caller.
   sen_str_itob itself performs no bounds check, so len must cover every
   byte it writes. */
2023 sen_rbuf_itob(sen_rbuf *buf, sen_id id)
2026 sen_rc rc = sen_success;
2027 if (SEN_RBUF_REST(buf) < len) {
2028 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
2030 sen_str_itob(id, buf->curr);
/* Writes s into buf as a double-quoted, backslash-escaped string.
   s        : input text.
   len      : byte length of s; a negative value means "use strlen(s)".
   encoding : passed to sen_str_charlen_nonnull to find character widths.
   The input is walked one character at a time; a zero character length
   ends the walk early. Two-byte escapes are emitted for newline, double
   quote and backslash. Other characters are copied through, either as a
   single byte or as l bytes.
   NOTE(review): the results of the sen_rbuf_write calls below are not
   checked in the visible code. */
2036 sen_rbuf_str_esc(sen_rbuf *buf, const char *s, int len, sen_encoding encoding)
2040 if (len < 0) { len = strlen(s); }
2041 SEN_RBUF_PUTC(buf, '"');
2042 for (e = s + len; s < e; s += l) {
2043 if (!(l = sen_str_charlen_nonnull(s, e, encoding))) { break; }
2047 sen_rbuf_write(buf, "\\n", 2);
2050 sen_rbuf_write(buf, "\\\"", 2);
2053 sen_rbuf_write(buf, "\\\\", 2);
2056 SEN_RBUF_PUTC(buf, *s);
2059 sen_rbuf_write(buf, s, l);
2062 SEN_RBUF_PUTC(buf, '"');
/* Releases the storage of buf by reallocating it to size 0. The pointer
   passed is buf->head minus sen_rbuf_margin_size, i.e. the start of the
   underlying allocation as laid out by sen_rbuf_resize. */
2066 sen_rbuf_fin(sen_rbuf *buf)
2068 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
2070 SEN_REALLOC(buf->head - sen_rbuf_margin_size, 0);
/* One node of the singly linked list behind sen_lbuf. sen_lbuf_add sizes
   each node as the offset of its val member plus the requested payload. */
2076 struct _sen_lbuf_node {
/* Following node in the list. */
2077 sen_lbuf_node *next;
/* Prepares an empty list: tail is made to point at the head link itself, so
   the first node appended through tail becomes the head. */
2083 sen_lbuf_init(sen_lbuf *buf)
2086 buf->tail = &buf->head;
/* Allocates a new list node with room for size payload bytes and moves
   buf->tail to the new node's next link. Returns NULL when the allocation
   fails. */
2091 sen_lbuf_add(sen_lbuf *buf, size_t size)
2093 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
/* Allocation size = payload + byte offset of the val member (hand-written
   equivalent of offsetof(sen_lbuf_node, val)). */
2094 sen_lbuf_node *node = SEN_MALLOC(size + (size_t)(&((sen_lbuf_node *)0)->val));
2095 if (!node) { return NULL; }
2099 buf->tail = &node->next;
/* Walks every node of the list starting at buf->head. The successor is kept
   in a separate variable (next) so the walk can continue after the current
   node has been dealt with.
   NOTE(review): presumably each node is freed inside the loop -- confirm. */
2104 sen_lbuf_fin(sen_lbuf *buf)
2106 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
2107 sen_lbuf_node *cur, *next;
2108 for (cur = buf->head; cur; cur = next) {
/* Narrows the range [*str, *str_end) by character position.
   start, end : character (not byte) indexes, counted with
                sen_str_charlen_nonnull for the given encoding.
   When the running character index reaches start, *str is moved to that
   position. A character of length zero makes the function return
   sen_invalid_argument.
   NOTE(review): *str_end is presumably updated the same way when the index
   reaches end -- confirm. */
2116 sen_substring(char **str, char **str_end, int start, int end, sen_encoding encoding)
2120 char *s = *str, *e = *str_end;
2121 for (i = 0; s < e; i++, s += l) {
2122 if (i == start) { *str = s; }
2123 if (!(l = sen_str_charlen_nonnull(s, e, encoding))) {
2124 return sen_invalid_argument;
/* Normalizes str_len bytes of str (via sen_nstr_open with the given
   encoding and flags) and copies the result into nstrbuf.
   nstrbuf / buf_size : destination and its capacity in bytes.
   Copy rules, with len being the normalized byte length:
     buf_size >  len : len + 1 bytes are copied, i.e. the terminator too;
     buf_size == len : exactly len bytes are copied, no terminator;
     buf_size <  len : nothing is copied.
   The temporary sen_nstr object is closed before returning. */
2135 sen_str_normalize(const char *str, unsigned int str_len,
2136 sen_encoding encoding, int flags,
2137 char *nstrbuf, int buf_size)
2141 if (!(nstr = sen_nstr_open(str, str_len, encoding, flags))) {
2144 /* If the buffer is too small to hold the normalized string,
2145 the required size is returned
2146 (to inform the caller to call me again). */
2147 len = (int)nstr->norm_blen;
2148 if (buf_size > len) {
2149 memcpy(nstrbuf, nstr->norm, len + 1);
2150 } else if (buf_size == len) {
2151 /* NB: not NUL-terminated */
2152 memcpy(nstrbuf, nstr->norm, len);
2154 sen_nstr_close(nstr);
/* Variant of sen_str_normalize built on fast_sen_nstr_open. It takes no
   encoding or flags argument; the input is assumed to be UTF-8.
   Copy rules, with len being the normalized byte length:
     buf_size == len : exactly len bytes are copied, no terminator;
     the other copy moves len + 1 bytes, i.e. the terminator too.
   The temporary object is closed with sen_nstr_close before returning. */
2158 /* Assumes that the current encoding is UTF-8 */
2160 fast_sen_str_normalize(const char *str, unsigned int str_len,
2161 char *nstrbuf, int buf_size)
2166 if (!(nstr = fast_sen_nstr_open(str, str_len)))
2170 * If the buffer size is short to store for the normalized string,
2171 * the required size is returned (to inform the caller to cast me again).
2173 len = (int)nstr->norm_blen;
2176 memcpy(nstrbuf, nstr->norm, len + 1);
2177 else if (buf_size == len)
2178 /* NB: not NUL-terminated */
2179 memcpy(nstrbuf, nstr->norm, len);
2180 sen_nstr_close(nstr);