2 * Copyright (c) 2003 Nara Institute of Science and Technology
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * $Id: tokenizer.c,v 1.3 2007/03/25 16:25:46 kazuma-t Exp $
38 #include "tokenizer.h"
/* Nonzero when c is a space or a tab character.  The argument may be
 * evaluated twice, so avoid passing expressions with side effects. */
40 #define is_space(c) (((c) == ' ') || ((c) == '\t'))
/* Character classes compared against `state` in ja_char_type_parse().
 * NOTE(review): presumably enumerators of enum ja_char_type; the enum
 * header is not adjacent to these lines -- confirm. */
45 PROLONGED, /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
46 KATAKANA, /* KATAKANA LETTER (SMALL) [A-KE] */
47 SMALL_KATAKANA, /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA */
48 FULL_LATIN, /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */
49 HALF_LATIN, /* LATIN (CAPITAL|SMALL) LETTER [A-Z] */
/* Per-encoding length helpers: each takes a byte pointer and the number of
 * bytes available, and is installed in tok->mblen by cha_tok_new(). */
60 static int euc_mblen(unsigned char*, int);
61 static int sjis_mblen(unsigned char*, int);
62 static int iso8859_mblen(unsigned char*, int);
63 static int utf8_mblen(unsigned char*, int);
/* Per-language state handlers, installed in tok->char_type_parse.  They are
 * called with the current character class, a pointer to the previous state
 * and the byte cursor. */
65 static int ja_char_type_parse(chasen_tok_t*,int,int*,int);
66 static int en_char_type_parse(chasen_tok_t*,int,int*,int);
/* Character classifiers, one per Japanese encoding. */
68 static enum ja_char_type
69 ja_euc_char_type(chasen_tok_t*, unsigned char *, int);
70 static enum ja_char_type
71 ja_sjis_char_type(chasen_tok_t*, unsigned char *, int);
72 static enum ja_char_type
73 ja_utf8_char_type(chasen_tok_t*, unsigned char *, int);
/* Character classifier used for English input. */
75 static enum en_char_type
76 en_char_type(chasen_tok_t*, unsigned char *, int);
/* Function-pointer types used to cast the classifiers before storing them in
 * tok->get_char_type.
 * NOTE(review): these types return int while the classifiers are declared to
 * return an enum; calling through the cast pointer relies on the enum being
 * int-compatible on the target compiler -- confirm. */
78 typedef int (*ja_char_type_get)(chasen_tok_t*,unsigned char*,int);
79 typedef int (*en_char_type_get)(chasen_tok_t*,unsigned char*,int);
/* Annotation matchers: is_anno() tests for an opening string (str1),
 * is_anno2() for the matching closing string (str2). */
81 static int is_anno(chasen_tok_t*, unsigned char*, int);
82 static int is_anno2(anno_info*, unsigned char*, int);
85 * This function constructs a tokenizer object.
86 * If an error occurs, it terminates the process.
/* Allocates a tokenizer and selects its three callbacks (mblen,
 * char_type_parse, get_char_type) from the language/encoding pair.
 *   lang   - CHASEN_LANG_JA or CHASEN_LANG_EN are handled explicitly.
 *   encode - one of the CHASEN_ENCODE_* constants tested below. */
89 cha_tok_new(int lang, int encode)
/* NOTE(review): the result of cha_malloc() is used without a visible NULL
 * check; presumably cha_malloc() does not return on failure -- confirm. */
93 tok = cha_malloc(sizeof(chasen_tok_t));
/* Japanese: every encoding shares ja_char_type_parse; only the length helper
 * and the classifier differ. */
99 if (lang == CHASEN_LANG_JA) {
100 if (encode == CHASEN_ENCODE_EUCJP) {
101 tok->mblen = euc_mblen;
102 tok->char_type_parse = ja_char_type_parse;
103 tok->get_char_type = (ja_char_type_get)ja_euc_char_type;
104 } else if (encode == CHASEN_ENCODE_SJIS) {
105 tok->mblen = sjis_mblen;
106 tok->char_type_parse = ja_char_type_parse;
107 tok->get_char_type = (ja_char_type_get)ja_sjis_char_type;
108 } else if (encode == CHASEN_ENCODE_UTF8) {
109 tok->mblen = utf8_mblen;
110 tok->char_type_parse = ja_char_type_parse;
111 tok->get_char_type = (ja_char_type_get)ja_utf8_char_type;
/* English: the same parser and classifier for both encodings; only the
 * length helper changes. */
113 } else if (lang == CHASEN_LANG_EN) {
114 if (encode == CHASEN_ENCODE_ISO8859) {
115 tok->mblen = iso8859_mblen;
116 tok->char_type_parse = en_char_type_parse;
117 tok->get_char_type = (en_char_type_get)en_char_type;
118 } else if (encode == CHASEN_ENCODE_UTF8) {
119 tok->mblen = utf8_mblen;
120 tok->char_type_parse = en_char_type_parse;
121 tok->get_char_type = (en_char_type_get)en_char_type;
/* Same triple as the EN/ISO8859 case.
 * NOTE(review): presumably the fallback for any other lang value -- confirm
 * against the enclosing else. */
124 tok->mblen = iso8859_mblen;
125 tok->char_type_parse = en_char_type_parse;
126 tok->get_char_type = (en_char_type_get)en_char_type;
133 * This function destroys the tokenizer object.
136 cha_tok_delete(chasen_tok_t *tok)
/* Walks the len bytes of str one character at a time (stepping by
 * tok->mblen) and writes segment lengths into type[]. */
142 cha_tok_parse(chasen_tok_t *tok, unsigned char *str, char *type, int len,
147 anno_info *anno = NULL;
/* Annotation handling is only attempted when the caller passed anno_no.
 * A negative is_anno() result means a match; its negation indexes tok->anno. */
150 if (anno_no != NULL && (no = is_anno(tok, str, len)) < 0) {
151 anno = &(tok->anno[-no]);
/* Start just past the opening string (len1 bytes) and advance one character
 * at a time, testing for the closing string at each cursor position. */
153 for (cursor = anno->len1;
155 cursor += tok->mblen(str + cursor, len - cursor)) {
156 if (is_anno2(anno, str, cursor))
/* Ordinary text: reset both the current and the previous state. */
162 state0 = state = 0; /* NOSTATE */
163 for (cursor = head = 0; cursor < len;
164 cursor += tok->mblen(str + cursor, len - cursor)) {
/* An annotation opener found at the cursor (negative is_anno result). */
165 if (anno_no != NULL &&
166 is_anno(tok, str + cursor, len - cursor) < 0) {
/* Classify the character at the cursor, then let the language-specific
 * handler combine it with the previous state (state0). */
169 state = tok->get_char_type(tok, str + cursor, len - cursor);
170 state = tok->char_type_parse(tok, state, &state0, cursor);
/* State change: type[head] receives the byte length of the segment that
 * began at head.
 * NOTE(review): type is char*, so a length above CHAR_MAX would not fit --
 * confirm callers bound the segment size. */
173 if (state != state0) {
174 type[head] = cursor - head;
/* Length of the segment [head, cursor) stored again here; presumably the
 * final segment after the loop -- confirm. */
179 type[head] = cursor - head;
185 * This function returns the length in bytes of the multibyte character
186 * at the start of str, where str holds len bytes.
188 * If the character is `\0', it returns 1.
/* Thin wrapper: forwards str and len to the encoding-specific length helper
 * stored in tok->mblen and returns its result unchanged. */
191 cha_tok_mblen(chasen_tok_t *tok, unsigned char *str, int len)
193 return tok->mblen(str, len);
197 * This function sets information of annotation anno in tokenizer tok.
200 cha_tok_set_annotation(chasen_tok_t *tok, anno_info *anno)
/* Length helper installed for CHASEN_ENCODE_EUCJP.
 *   str - pointer to the first byte of the character.
 *   len - number of bytes available at str; each branch checks it before
 *         reading trailing bytes. */
209 euc_mblen(unsigned char *str, int len)
/* Lead byte 0x8f followed by two bytes with the high bit set; presumably the
 * three-byte form -- confirm the returned length. */
212 str[0] == 0x8f && (str[1] & 0x80) && (str[2] & 0x80)) {
/* Two consecutive bytes with the high bit set; presumably the two-byte
 * form -- confirm the returned length. */
214 } else if (len >= 2 && (str[0] & 0x80) && (str[1] & 0x80)) {
/* Length helper installed for CHASEN_ENCODE_SJIS.
 *   str - pointer to the first byte of the character.
 *   len - number of bytes available at str. */
222 sjis_mblen(unsigned char *str, int len)
/* Bytes 0xa0..0xdf are tested first, before the generic high-bit test;
 * presumably treated as single-byte characters -- confirm. */
224 if (str[0] >= 0xa0 && str[0] <= 0xdf) {
/* Any other byte with the high bit set, when at least two bytes remain;
 * presumably the lead byte of a two-byte character.  The trail byte itself
 * is not inspected. */
226 } else if (len >= 2 && (str[0] & 0x80)) {
234 iso8859_mblen(unsigned char *str, int len)
/* Length helper installed for CHASEN_ENCODE_UTF8.  Branches are tested from
 * the longest form (4 bytes) to the shortest (2 bytes); the order matters
 * because each lead-byte mask also matches the longer forms before it.
 *   str - pointer to the first byte of the character.
 *   len - number of bytes available at str; checked before trailing bytes
 *         are read.
 * NOTE(review): trailing bytes are only required to have the high bit set
 * (& 0x80), not to match the 10xxxxxx continuation pattern, and the 0xf0
 * mask also accepts lead bytes 0xf8..0xff -- confirm this leniency is
 * intended. */
240 utf8_mblen(unsigned char *str, int len)
242 if (len >= 4 && (str[0] & 0xf0) == 0xf0 &&
243 (str[1] & 0x80) && (str[2] & 0x80) && (str[3] & 0x80)) {
245 } else if (len >= 3 && (str[0] & 0xe0) == 0xe0 &&
246 (str[1] & 0x80) && (str[2] & 0x80)) {
248 } else if (len >= 2 && (str[0] & 0xc0) == 0xc0 && (str[1] & 0x80)) {
/* State handler for Japanese text, installed in tok->char_type_parse.
 *   state  - class of the current character, as returned by the classifier.
 *   state0 - in/out: state carried over from the previous character.
 *   cursor - byte offset of the current character; only referenced by the
 *            commented-out line below. */
256 ja_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
258 if (state == JA_SPACE) {
259 /* tok->anno_type[cursor] = 0; */ /* XXX */
/* Half-width and full-width Latin letters share one branch. */
260 } else if ((state == HALF_LATIN) ||
261 (state == FULL_LATIN)) {
/* A prolonged-sound mark or small katakana takes this branch only when the
 * previous state is KATAKANA; a plain KATAKANA character always takes it. */
263 } else if (((*state0 == KATAKANA) &&
264 ((state == PROLONGED) ||
265 (state == SMALL_KATAKANA))) ||
266 (state == KATAKANA)) {
/* Clears the carried-over state. */
270 *state0 = JA_NOSTATE;
/* State handler for English text, installed in tok->char_type_parse.
 *   state  - class of the current character, as returned by en_char_type().
 *   state0 - in/out: state carried over from the previous character.
 *   cursor - byte offset of the current character; only referenced by the
 *            commented-out line below. */
277 en_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
279 if (state == EN_SPACE) {
280 /* tok->anno_type[cursor] = 0; */ /* XXX */
/* EN_OTHER clears the carried-over state. */
281 } else if (state == EN_OTHER) {
282 *state0 = EN_NOSTATE;
/* Classifies the character at str for EUC-JP input.  Single-byte tests
 * (isalpha, is_space) come first; the two-byte tests run only when
 * tok->mblen reports a length of 2. */
288 static enum ja_char_type
289 ja_euc_char_type(chasen_tok_t *tok, unsigned char *str, int len)
291 int mblen = tok->mblen(str, len);
294 if (isalpha(str[0])) {
296 } else if (is_space(str[0])) {
299 } else if (mblen == 2) {
/* Byte pair a1 bc: presumably the prolonged sound mark -- confirm. */
300 if ((str[0] == 0xa1) && (str[1] == 0xbc)) {
/* Lead byte a5: the listed trail bytes are classed as SMALL_KATAKANA. */
302 } else if (str[0] == 0xa5) {
303 if ((str[1] == 0xa1) || (str[1] == 0xa3) ||
304 (str[1] == 0xa5) || (str[1] == 0xa7) ||
305 (str[1] == 0xa9) || (str[1] == 0xc3) ||
306 (str[1] == 0xe3) || (str[1] == 0xe5) ||
307 (str[1] == 0xe7) || (str[1] == 0xee)) {
308 return SMALL_KATAKANA;
/* Lead byte a3 with trail byte >= c1: presumably full-width Latin letters.
 * NOTE(review): no upper bound on the trail byte is visible on this line --
 * confirm. */
312 } else if ((str[0] == 0xa3) && (str[1] >= 0xc1)) {
/* Classifies the character at str for Shift_JIS input.  Single-byte tests
 * (isalpha, is_space) come first; the two-byte tests run only when
 * tok->mblen reports a length of 2. */
320 static enum ja_char_type
321 ja_sjis_char_type(chasen_tok_t *tok, unsigned char *str, int len)
323 int mblen = tok->mblen(str, len);
326 if (isalpha(str[0])) {
328 } else if (is_space(str[0])) {
331 } else if (mblen == 2) {
/* Byte pair 81 5b: presumably the prolonged sound mark -- confirm. */
332 if ((str[0] == 0x81) && (str[1] == 0x5b)) {
/* Lead byte 83: the listed trail bytes are classed as SMALL_KATAKANA. */
334 } else if (str[0] == 0x83) {
335 if ((str[1] == 0x40) || (str[1] == 0x42) ||
336 (str[1] == 0x44) || (str[1] == 0x46) ||
337 (str[1] == 0x48) || (str[1] == 0x62) ||
338 (str[1] == 0x83) || (str[1] == 0x85) ||
339 (str[1] == 0x87) || (str[1] == 0x8e)) {
340 return SMALL_KATAKANA;
/* Lead byte 82 with trail byte 60..9a: presumably full-width Latin
 * letters -- confirm. */
344 } else if ((str[0] == 0x82) &&
345 (str[1] >= 0x60) && (str[1] <= 0x9a)) {
/* Classifies the character at str for UTF-8 input.  Single-byte tests
 * (isalpha, is_space) come first; the multibyte tests run only when
 * tok->mblen reports a length of 3. */
353 static enum ja_char_type
354 ja_utf8_char_type(chasen_tok_t *tok, unsigned char *str, int len)
356 int mblen = tok->mblen(str, len);
359 if (isalpha(str[0])) {
361 } else if (is_space(str[0])) {
364 } else if (mblen == 3) {
/* e3 83 bc decodes to U+30FC, the prolonged sound mark. */
365 if ((str[0] == 0xe3) && (str[1] == 0x83) && (str[2] == 0xbc)) {
367 } else if (str[0] == 0xe3) {
/* Small katakana: explicit third-byte lists, one per second byte. */
368 if (((str[1] == 0x82) &&
369 ((str[2] == 0xa1) || (str[2] == 0xa3) ||
370 (str[2] == 0xa5) || (str[2] == 0xa7) ||
371 (str[2] == 0xa9))) ||
373 ((str[2] == 0x83) || (str[2] == 0xa3) ||
374 (str[2] == 0xa5) || (str[2] == 0xa7) ||
375 (str[2] == 0xae)))) {
376 return SMALL_KATAKANA;
/* Remaining katakana ranges; reached only after the small-katakana test
 * above has failed. */
377 } else if (((str[1] == 0x82) &&
378 (str[2] >= 0xa1) && (str[2] <= 0xbf)) ||
380 (str[2] >= 0x80) && (str[2] <= 0xBA))) {
/* ef bc a1..ba decodes to U+FF21..U+FF3A (full-width capital letters); the
 * second range presumably covers the full-width small letters -- confirm. */
383 } else if ((str[0] == 0xef) &&
384 (((str[1] == 0xbc) &&
385 (str[2] >= 0xa1) && (str[2] <= 0xba)) ||
387 (str[2] >= 0x81) && (str[2] <= 0x9a)))) {
/* Classifier used for English input; the visible tests look only at the
 * first byte of str. */
395 static enum en_char_type
396 en_char_type(chasen_tok_t *tok, unsigned char *str, int len)
/* Held as unsigned char, so it is a valid argument for isalpha(). */
398 unsigned char c = str[0];
402 } else if (isalpha(c)) { /* for English only */
/* Tests whether string (len bytes available) begins with the opening string
 * of one of the annotations registered in tok->anno.  Callers in this file
 * treat a negative return value as "matched" and negate it to obtain the
 * index into tok->anno. */
411 is_anno(chasen_tok_t *tok, unsigned char *string, int len)
414 anno_info *anno = tok->anno;
/* The scan starts at index 1 and stops at the first entry whose str1 is
 * NULL. */
419 for (i = 1; (anno[i].str1 != NULL); i++) {
/* Not enough input left to hold this opening string. */
420 if (len < anno[i].len1) {
/* memcmp() == 0: the first len1 bytes of string equal str1. */
423 if (!memcmp(string, anno[i].str1, anno[i].len1)) {
/* Tests whether the closing string of anno (str2, len2 bytes) ends exactly
 * at offset cursor within bos. */
431 is_anno2(anno_info *anno, unsigned char *bos, int cursor)
433 int len2 = anno->len2;
/* Compares the len2 bytes immediately before bos + cursor with str2.
 * NOTE(review): bos + cursor - len2 would point before bos if
 * cursor < len2 -- confirm this is ruled out before this line is reached. */
439 return (memcmp(bos + cursor - len2, anno->str2, len2) == 0);