2 * Copyright (c) 2003 Nara Institute of Science and Technology
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * $Id: tokenizer.c,v 1.1 2007/03/13 07:40:10 masayu-a Exp $
38 #include "tokenizer.h"
/* True when c is a space or a horizontal tab. The argument appears twice in the expansion, so avoid arguments with side effects. */
40 #define is_space(c) (((c) == ' ') || ((c) == '\t'))
45 PROLONGED, /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
46 KATAKANA, /* KATAKANA LETTER (SMALL) [A-KE] */
47 SMALL_KATAKANA, /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA */
48 FULL_LATIN, /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */
49 HALF_LATIN, /* LATIN (CAPITAL|SMALL) LETTER [A-Z] */
/* Per-encoding length helpers; each takes a byte pointer and a byte count. cha_tok_new installs one of them as tok->mblen. */
60 static int euc_mblen(unsigned char*, int);
61 static int sjis_mblen(unsigned char*, int);
62 static int iso8859_mblen(unsigned char*, int);
63 static int utf8_mblen(unsigned char*, int);
/* Per-language state handlers; cha_tok_new installs one of them as tok->char_type_parse. */
65 static int ja_char_type_parse(chasen_tok_t*,int,int*,int);
66 static int en_char_type_parse(chasen_tok_t*,int,int*,int);
/* Character classifiers for Japanese text, one per supported encoding. */
68 static enum ja_char_type
69 ja_euc_char_type(chasen_tok_t*, unsigned char *, int);
70 static enum ja_char_type
71 ja_sjis_char_type(chasen_tok_t*, unsigned char *, int);
72 static enum ja_char_type
73 ja_utf8_char_type(chasen_tok_t*, unsigned char *, int);
/* Character classifier for English text. */
75 static enum en_char_type
76 en_char_type(chasen_tok_t*, unsigned char *, int);
/*
 * Function-pointer types used when storing a classifier in tok->get_char_type.
 * NOTE(review): these are declared as returning int, while the classifiers
 * above are declared as returning enum types; cha_tok_new bridges the two
 * with a cast. Confirm this is acceptable on every target compiler.
 */
78 typedef int (*ja_char_type_get)(chasen_tok_t*,unsigned char*,int);
79 typedef int (*en_char_type_get)(chasen_tok_t*,unsigned char*,int);
/* Annotation matchers used by cha_tok_parse. */
81 static int is_anno(chasen_tok_t*, unsigned char*, int);
82 static int is_anno2(anno_info*, unsigned char*, int);
85 * This function constructs a tokenizer object.
86 * If an error occurs, it terminates the process.
/*
 * cha_tok_new: allocate a tokenizer and bind its three hooks (mblen,
 * char_type_parse, get_char_type) according to lang and encode.
 * NOTE(review): several physical lines of this function (return type,
 * declarations, braces, the return statement) are not present in this
 * listing; the comments below describe only the lines that are shown.
 */
89 cha_tok_new(int lang, int encode)
/* Allocation goes through cha_malloc, which is defined elsewhere; its failure behavior is not visible here -- TODO confirm. */
93 tok = cha_malloc(sizeof(chasen_tok_t));
/* Japanese: the state handler is always ja_char_type_parse; only the length helper and the classifier depend on the encoding. */
99 if (lang == CHASEN_LANG_JA) {
100 if (encode == CHASEN_ENCODE_EUCJP) {
101 tok->mblen = euc_mblen;
102 tok->char_type_parse = ja_char_type_parse;
103 tok->get_char_type = (ja_char_type_get)ja_euc_char_type;
104 } else if (encode == CHASEN_ENCODE_SJIS) {
105 tok->mblen = sjis_mblen;
106 tok->char_type_parse = ja_char_type_parse;
107 tok->get_char_type = (ja_char_type_get)ja_sjis_char_type;
108 } else if (encode == CHASEN_ENCODE_UTF8) {
109 tok->mblen = utf8_mblen;
110 tok->char_type_parse = ja_char_type_parse;
111 tok->get_char_type = (ja_char_type_get)ja_utf8_char_type;
/* English: the state handler and the classifier are the same for both encodings; only the length helper differs. */
113 } else if (lang == CHASEN_LANG_EN) {
114 if (encode == CHASEN_ENCODE_ISO8859) {
115 tok->mblen = iso8859_mblen;
116 tok->char_type_parse = en_char_type_parse;
117 tok->get_char_type = (en_char_type_get)en_char_type;
118 } else if (encode == CHASEN_ENCODE_UTF8) {
119 tok->mblen = utf8_mblen;
120 tok->char_type_parse = en_char_type_parse;
121 tok->get_char_type = (en_char_type_get)en_char_type;
/* Remaining case (its controlling branch is on a line not shown -- presumably a fallback): ISO-8859 length helper with the English handler and classifier. */
124 tok->mblen = iso8859_mblen;
125 tok->char_type_parse = en_char_type_parse;
126 tok->get_char_type = (en_char_type_get)en_char_type;
133 * This function destroys the tokenizer object.
136 cha_tok_delete(chasen_tok_t *tok)
/*
 * cha_tok_parse: walk the len bytes at str one character at a time (using
 * tok->mblen) and record segment lengths in the type array.
 * The parameter list continues on a line not shown; anno_no, used below, is
 * presumably declared there -- TODO confirm.
 * NOTE(review): the declarations, braces and return statements of this
 * function are missing from this listing; only the visible lines are
 * described.
 */
142 cha_tok_parse(chasen_tok_t *tok, unsigned char *str, char *type, int len,
147 anno_info *anno = NULL;
/* Only when the caller passed a non-NULL anno_no: a negative result from is_anno marks a match, and its negation indexes tok->anno. */
150 if (anno_no != NULL && (no = is_anno(tok, str, len)) < 0) {
151 anno = &(tok->anno[-no]);
/* Advance character by character until is_anno2 reports that the bytes ending at cursor equal the entry's str2. */
153 for (cursor = 0; cursor < len;
154 cursor += tok->mblen(str + cursor, len - cursor)) {
155 if (is_anno2(anno, str, cursor))
/* Both the current and the previous state start at 0. */
161 state0 = state = 0; /* NOSTATE */
/* Main scan: head is the start offset of the current segment, cursor the offset of the character being examined. */
162 for (cursor = head = 0; cursor < len;
163 cursor += tok->mblen(str + cursor, len - cursor)) {
/* Annotation check at the current position; skipped entirely when anno_no is NULL. */
164 if (anno_no != NULL &&
165 is_anno(tok, str + cursor, len - cursor) < 0) {
/* Classify the character, then let the language-specific handler adjust the state (it may also rewrite state0). */
168 state = tok->get_char_type(tok, str + cursor, len - cursor);
169 state = tok->char_type_parse(tok, state, &state0, cursor);
/*
 * A state change closes the current segment: its byte length is stored at
 * the segment's start offset.
 * NOTE(review): type is a char array, so a segment longer than CHAR_MAX
 * bytes cannot be represented -- confirm callers bound segment length.
 */
172 if (state != state0) {
173 type[head] = cursor - head;
/* Same length store for a segment closed at another point (the surrounding lines are not shown). */
178 type[head] = cursor - head;
184 * This function returns the length in bytes of the multibyte character
185 * at the start of str, where str holds len bytes.
187 * If the character is `\0', it returns 1.
/* cha_tok_mblen: public wrapper that forwards str and len to the length helper installed in tok->mblen and returns its result. */
190 cha_tok_mblen(chasen_tok_t *tok, unsigned char *str, int len)
192 return tok->mblen(str, len);
196 * This function sets information of annotation anno in tokenizer tok.
199 cha_tok_set_annotation(chasen_tok_t *tok, anno_info *anno)
/*
 * euc_mblen: length helper installed as tok->mblen for EUC-JP input.
 * NOTE(review): the return statements and the start of the first condition
 * are on lines missing from this listing.
 */
208 euc_mblen(unsigned char *str, int len)
/* Three-byte form: lead byte 0x8f followed by two bytes that have the high bit set. */
211 str[0] == 0x8f && (str[1] & 0x80) && (str[2] & 0x80)) {
/* Two-byte form: at least two bytes remain and both have the high bit set. */
213 } else if (len >= 2 && (str[0] & 0x80) && (str[1] & 0x80)) {
/*
 * sjis_mblen: length helper installed as tok->mblen for Shift_JIS input.
 * NOTE(review): the return statements are on lines missing from this listing.
 */
221 sjis_mblen(unsigned char *str, int len)
/* Bytes 0xa0..0xdf are tested first, before the high-bit test below -- presumably the single-byte half-width katakana range; TODO confirm. */
223 if (str[0] >= 0xa0 && str[0] <= 0xdf) {
/* Any other byte with the high bit set is handled here when at least two bytes remain; the second byte is not inspected. */
225 } else if (len >= 2 && (str[0] & 0x80)) {
233 iso8859_mblen(unsigned char *str, int len)
/*
 * utf8_mblen: length helper installed as tok->mblen for UTF-8 input.
 * The lead-byte tests run from the widest mask to the narrowest; the order
 * matters because a byte matching 0xf0 also matches the 0xe0 and 0xc0 masks.
 * NOTE(review): the return statements are on lines missing from this listing.
 * NOTE(review): the lead-byte tests accept any byte whose top bits are set
 * (for example 0xf8..0xff pass the 0xf0 test), and the following bytes are
 * only checked for the high bit, not for the 10xxxxxx continuation pattern.
 * Confirm whether stricter validation is wanted.
 */
239 utf8_mblen(unsigned char *str, int len)
/* Four bytes available, lead byte has its top four bits set, next three bytes have the high bit set. */
241 if (len >= 4 && (str[0] & 0xf0) == 0xf0 &&
242 (str[1] & 0x80) && (str[2] & 0x80) && (str[3] & 0x80)) {
/* Three bytes available, lead byte has its top three bits set. */
244 } else if (len >= 3 && (str[0] & 0xe0) == 0xe0 &&
245 (str[1] & 0x80) && (str[2] & 0x80)) {
/* Two bytes available, lead byte has its top two bits set. */
247 } else if (len >= 2 && (str[0] & 0xc0) == 0xc0 && (str[1] & 0x80)) {
/*
 * ja_char_type_parse: state handler installed as tok->char_type_parse for
 * Japanese. Receives the class of the current character (state) and a
 * pointer to the previous state (state0), which it may overwrite.
 * NOTE(review): the bodies of most branches and the return statement are on
 * lines missing from this listing.
 */
255 ja_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
/* Space/tab branch; the only visible statement is already commented out. */
257 if (state == JA_SPACE) {
258 /* tok->anno_type[cursor] = 0; */ /* XXX */
/* Half-width and full-width Latin letters share one branch. */
259 } else if ((state == HALF_LATIN) ||
260 (state == FULL_LATIN)) {
/* Katakana branch: KATAKANA always qualifies; PROLONGED and SMALL_KATAKANA qualify only when the previous state was KATAKANA. */
262 } else if (((*state0 == KATAKANA) &&
263 ((state == PROLONGED) ||
264 (state == SMALL_KATAKANA))) ||
265 (state == KATAKANA)) {
/* Previous state is reset to JA_NOSTATE (the branch this belongs to is not shown -- presumably the final else). */
269 *state0 = JA_NOSTATE;
/*
 * en_char_type_parse: state handler installed as tok->char_type_parse for
 * English. Receives the class of the current character (state) and a pointer
 * to the previous state (state0), which it may overwrite.
 * NOTE(review): the return statement is on a line missing from this listing.
 */
276 en_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
/* Space/tab branch; the only visible statement is already commented out. */
278 if (state == EN_SPACE) {
279 /* tok->anno_type[cursor] = 0; */ /* XXX */
/* For EN_OTHER the previous state is reset to EN_NOSTATE. */
280 } else if (state == EN_OTHER) {
281 *state0 = EN_NOSTATE;
/*
 * ja_euc_char_type: classify the character at str for EUC-JP input.
 * NOTE(review): most return statements are on lines missing from this
 * listing; only SMALL_KATAKANA is visible as a result.
 */
287 static enum ja_char_type
288 ja_euc_char_type(chasen_tok_t *tok, unsigned char *str, int len)
/* Byte length of the current character, as reported by the installed length helper. */
290 int mblen = tok->mblen(str, len);
/* Single-byte tests come first: alphabetic (per isalpha, so locale-dependent), then space/tab. */
293 if (isalpha(str[0])) {
295 } else if (is_space(str[0])) {
/* Two-byte characters are told apart by their byte values. */
298 } else if (mblen == 2) {
/* Byte pair 0xa1 0xbc -- presumably the prolonged sound mark; TODO confirm against the code table. */
299 if ((str[0] == 0xa1) && (str[1] == 0xbc)) {
/* Lead byte 0xa5: the second bytes listed below are classified as small katakana. */
301 } else if (str[0] == 0xa5) {
302 if ((str[1] == 0xa1) || (str[1] == 0xa3) ||
303 (str[1] == 0xa5) || (str[1] == 0xa7) ||
304 (str[1] == 0xa9) || (str[1] == 0xc3) ||
305 (str[1] == 0xe3) || (str[1] == 0xe5) ||
306 (str[1] == 0xe7) || (str[1] == 0xee)) {
307 return SMALL_KATAKANA;
/* Lead byte 0xa3 with second byte >= 0xc1 -- presumably full-width Latin. NOTE(review): there is no upper bound and no gap between the upper- and lower-case ranges; confirm intended. */
311 } else if ((str[0] == 0xa3) && (str[1] >= 0xc1)) {
/*
 * ja_sjis_char_type: classify the character at str for Shift_JIS input.
 * NOTE(review): most return statements are on lines missing from this
 * listing; only SMALL_KATAKANA is visible as a result.
 */
319 static enum ja_char_type
320 ja_sjis_char_type(chasen_tok_t *tok, unsigned char *str, int len)
/* Byte length of the current character, as reported by the installed length helper. */
322 int mblen = tok->mblen(str, len);
/* Single-byte tests come first: alphabetic (per isalpha, so locale-dependent), then space/tab. */
325 if (isalpha(str[0])) {
327 } else if (is_space(str[0])) {
/* Two-byte characters are told apart by their byte values. */
330 } else if (mblen == 2) {
/* Byte pair 0x81 0x5b -- presumably the prolonged sound mark; TODO confirm against the code table. */
331 if ((str[0] == 0x81) && (str[1] == 0x5b)) {
/* Lead byte 0x83: the second bytes listed below are classified as small katakana. */
333 } else if (str[0] == 0x83) {
334 if ((str[1] == 0x40) || (str[1] == 0x42) ||
335 (str[1] == 0x44) || (str[1] == 0x46) ||
336 (str[1] == 0x48) || (str[1] == 0x62) ||
337 (str[1] == 0x83) || (str[1] == 0x85) ||
338 (str[1] == 0x87) || (str[1] == 0x8e)) {
339 return SMALL_KATAKANA;
/* Lead byte 0x82 with second byte in 0x60..0x9a -- presumably full-width Latin. NOTE(review): the range is contiguous, so any code points between the upper- and lower-case letters are included; confirm intended. */
343 } else if ((str[0] == 0x82) &&
344 (str[1] >= 0x60) && (str[1] <= 0x9a)) {
/*
 * ja_utf8_char_type: classify the character at str for UTF-8 input.
 * NOTE(review): most return statements and a few condition lines are
 * missing from this listing; only SMALL_KATAKANA is visible as a result.
 */
352 static enum ja_char_type
353 ja_utf8_char_type(chasen_tok_t *tok, unsigned char *str, int len)
/* Byte length of the current character, as reported by the installed length helper. */
355 int mblen = tok->mblen(str, len);
/* Single-byte tests come first: alphabetic (per isalpha, so locale-dependent), then space/tab. */
358 if (isalpha(str[0])) {
360 } else if (is_space(str[0])) {
/* Only three-byte sequences are examined further. */
363 } else if (mblen == 3) {
/* Bytes e3 83 bc (U+30FC) -- presumably classified as the prolonged sound mark; the return is not shown. */
364 if ((str[0] == 0xe3) && (str[1] == 0x83) && (str[2] == 0xbc)) {
366 } else if (str[0] == 0xe3) {
/* Small katakana, first group: second byte 0x82 with the third bytes listed. */
367 if (((str[1] == 0x82) &&
368 ((str[2] == 0xa1) || (str[2] == 0xa3) ||
369 (str[2] == 0xa5) || (str[2] == 0xa7) ||
370 (str[2] == 0xa9))) ||
/* Small katakana, second group: the test on the second byte is on a line not shown. */
372 ((str[2] == 0x83) || (str[2] == 0xa3) ||
373 (str[2] == 0xa5) || (str[2] == 0xa7) ||
374 (str[2] == 0xae)))) {
375 return SMALL_KATAKANA;
/* Second byte 0x82 with third byte in 0xa1..0xbf, or a second range whose second-byte test is not shown -- presumably ordinary katakana. */
376 } else if (((str[1] == 0x82) &&
377 (str[2] >= 0xa1) && (str[2] <= 0xbf)) ||
379 (str[2] >= 0x80) && (str[2] <= 0xBA))) {
/* Lead byte 0xef: second byte 0xbc with third byte in 0xa1..0xba, or a second range whose second-byte test is not shown -- presumably full-width Latin. */
382 } else if ((str[0] == 0xef) &&
383 (((str[1] == 0xbc) &&
384 (str[2] >= 0xa1) && (str[2] <= 0xba)) ||
386 (str[2] >= 0x81) && (str[2] <= 0x9a)))) {
/*
 * en_char_type: classify the character at str for English input.
 * Only the first byte is examined.
 * NOTE(review): the first branch and all return statements are on lines
 * missing from this listing.
 */
394 static enum en_char_type
395 en_char_type(chasen_tok_t *tok, unsigned char *str, int len)
397 unsigned char c = str[0];
/* Alphabetic test through isalpha, whose result depends on the current locale. */
401 } else if (isalpha(c)) { /* for English only */
/*
 * is_anno: compare the start of string against the str1 field of each
 * entry in tok->anno.
 * cha_tok_parse treats a negative result as a match and uses its negation
 * as an index into tok->anno.
 * NOTE(review): the return statements and branch bodies are on lines
 * missing from this listing.
 */
410 is_anno(chasen_tok_t *tok, unsigned char *string, int len)
413 anno_info *anno = tok->anno;
/* The scan starts at index 1 (entry 0 is not examined here) and stops at the first entry whose str1 is NULL. */
418 for (i = 1; (anno[i].str1 != NULL); i++) {
/* This entry needs more bytes than remain in the input. */
419 if (len < anno[i].len1) {
/* Match when the first len1 bytes of string equal str1. */
422 if (!memcmp(string, anno[i].str1, anno[i].len1)) {
/*
 * is_anno2: report whether the len2 bytes that end at offset cursor in bos
 * equal anno->str2.
 * NOTE(review): the lines between the declaration and the return are
 * missing from this listing; a guard for cursor < len2 (which would make
 * the memcmp start before bos) is presumably there -- TODO confirm.
 */
430 is_anno2(anno_info *anno, unsigned char *bos, int cursor)
432 int len2 = anno->len2;
/* Nonzero when the bytes immediately before cursor match str2. */
438 return (memcmp(bos + cursor - len2, anno->str2, len2) == 0);