lib/tokenizer.c

   1 /*
   2  * Copyright (c) 2003 Nara Institute of Science and Technology
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *   notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. The name Nara Institute of Science and Technology may not be used to
  15  *    endorse or promote products derived from this software without
  16  *    specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
  19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  21  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE Nara Institute
  22  * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  24  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  25  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  26  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  27  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * $Id: tokenizer.c,v 1.3 2007/03/25 16:25:46 kazuma-t Exp $
  31  */
  32
  33 #include <string.h>
  34 #include <ctype.h>
  35
  36 #include "chalib.h"
  37 #include "literal.h"
  38 #include "tokenizer.h"
  39
  40 #define is_space(c) (((c) == ' ') || ((c) == '\t'))
  41
  42 enum ja_char_type {
  43     JA_NOSTATE,
  44     JA_SPACE,
  45     PROLONGED,      /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
  46     KATAKANA,       /* KATAKANA LETTER (SMALL) [A-KE] */
  47     SMALL_KATAKANA, /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA */
  48     FULL_LATIN,     /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */
  49     HALF_LATIN,     /* LATIN (CAPITAL|SMALL) LETTER [A-Z] */
  50     JA_OTHER,
  51 };
  52
  53 enum en_char_type {
  54     EN_NOSTATE,
  55     EN_SPACE,
  56     EN_LATIN,
  57     EN_OTHER,
  58 };
  59
  60 static int euc_mblen(unsigned char*, int);
  61 static int sjis_mblen(unsigned char*, int);
  62 static int iso8859_mblen(unsigned char*, int);
  63 static int utf8_mblen(unsigned char*, int);
  64
  65 static int ja_char_type_parse(chasen_tok_t*,int,int*,int);
  66 static int en_char_type_parse(chasen_tok_t*,int,int*,int);
  67
  68 static enum ja_char_type
  69 ja_euc_char_type(chasen_tok_t*, unsigned char *, int);
  70 static enum ja_char_type
  71 ja_sjis_char_type(chasen_tok_t*, unsigned char *, int);
  72 static enum ja_char_type
  73 ja_utf8_char_type(chasen_tok_t*, unsigned char *, int);
  74
  75 static enum en_char_type
  76 en_char_type(chasen_tok_t*, unsigned char *, int);
  77
  78 typedef int (*ja_char_type_get)(chasen_tok_t*,unsigned char*,int);
  79 typedef int (*en_char_type_get)(chasen_tok_t*,unsigned char*,int);
  80
  81 static int is_anno(chasen_tok_t*, unsigned char*, int);
  82 static int is_anno2(anno_info*, unsigned char*, int);
  83
  84 /*
  85  * This function constructs a tokenizer object.
  86  * If an error occurs, it terminates a process.
  87  */
  88 chasen_tok_t *
  89 cha_tok_new(int lang, int encode)
  90 {
  91     chasen_tok_t *tok;
  92
  93     tok = cha_malloc(sizeof(chasen_tok_t));
  94
  95     tok->lang = lang;
  96     tok->encode = encode;
  97     tok->anno = NULL;
  98
  99     if (lang == CHASEN_LANG_JA) {
 100         if (encode == CHASEN_ENCODE_EUCJP) {
 101             tok->mblen = euc_mblen;
 102             tok->char_type_parse = ja_char_type_parse;
 103             tok->get_char_type = (ja_char_type_get)ja_euc_char_type;
 104         } else if (encode == CHASEN_ENCODE_SJIS) {
 105             tok->mblen = sjis_mblen;
 106             tok->char_type_parse = ja_char_type_parse;
 107             tok->get_char_type = (ja_char_type_get)ja_sjis_char_type;
 108         } else if (encode == CHASEN_ENCODE_UTF8) {
 109             tok->mblen = utf8_mblen;
 110             tok->char_type_parse = ja_char_type_parse;
 111             tok->get_char_type = (ja_char_type_get)ja_utf8_char_type;
 112         }
 113     } else if (lang == CHASEN_LANG_EN) {
 114         if (encode == CHASEN_ENCODE_ISO8859) {
 115             tok->mblen = iso8859_mblen;
 116             tok->char_type_parse = en_char_type_parse;
 117             tok->get_char_type = (en_char_type_get)en_char_type;
 118         } else if (encode == CHASEN_ENCODE_UTF8) {
 119             tok->mblen = utf8_mblen;
 120             tok->char_type_parse = en_char_type_parse;
 121             tok->get_char_type = (en_char_type_get)en_char_type;
 122         }
 123     } else {
 124         tok->mblen = iso8859_mblen;
 125         tok->char_type_parse = en_char_type_parse;
 126         tok->get_char_type = (en_char_type_get)en_char_type;
 127     }
 128
 129     return tok;
 130 }
 131
 132 /*
 133  * This function destroys the tokenizer object.
 134  */
 135 void
 136 cha_tok_delete(chasen_tok_t *tok)
 137 {
 138     cha_free(tok);
 139 }
 140
 141 int
 142 cha_tok_parse(chasen_tok_t *tok, unsigned char *str, char *type, int len,
 143               int *anno_no)
 144 {
 145     int cursor, head;
 146     int state, state0;
 147     anno_info *anno = NULL;
 148     int no;
 149
 150     if (anno_no != NULL && (no = is_anno(tok, str, len)) < 0) {
 151         anno = &(tok->anno[-no]);
 152         *anno_no = -no;
 153         for (cursor = anno->len1;
 154              cursor < len;
 155              cursor += tok->mblen(str + cursor, len - cursor)) {
 156             if (is_anno2(anno, str, cursor))
 157                 break;
 158         }
 159         return cursor;
 160     }
 161
 162     state0 = state = 0; /* NOSTATE */
 163     for (cursor = head = 0; cursor < len;
 164          cursor += tok->mblen(str + cursor, len - cursor)) {
 165         if (anno_no != NULL &&
 166             is_anno(tok, str + cursor, len - cursor) < 0) {
 167             return cursor;
 168         } else {
 169             state = tok->get_char_type(tok, str + cursor, len - cursor);
 170             state = tok->char_type_parse(tok, state, &state0, cursor);
 171         }
 172
 173         if (state != state0) {
 174             type[head] = cursor - head;
 175             head = cursor;
 176         }
 177         state0 = state;
 178     }
 179     type[head] = cursor - head;
 180
 181     return cursor;
 182 }
 183
 184 /*
 185  * This function returns the length in bytes of the multibyte character
 186  * str with len bytes.
 187  *
 188  * If the character is `\0', it returns 1.
 189  */
 190 int
 191 cha_tok_mblen(chasen_tok_t *tok, unsigned char *str, int len)
 192 {
 193     return tok->mblen(str, len);
 194 }
 195
 196 /*
 197  * This function sets information of annotation anno in tokenizer tok.
 198  */
 199 void
 200 cha_tok_set_annotation(chasen_tok_t *tok, anno_info *anno)
 201 {
 202     tok->anno = anno;
 203 }
 204
 205 /*
 206  * private functions
 207  */
 208 static int
 209 euc_mblen(unsigned char *str, int len)
 210 {
 211     if (len >= 3 &&
 212         str[0] == 0x8f && (str[1] & 0x80) && (str[2] & 0x80)) {
 213         return 3;
 214     } else if (len >= 2 && (str[0] & 0x80) && (str[1] & 0x80)) {
 215         return 2;
 216     }
 217
 218     return 1;
 219 }
 220
 221 static int
 222 sjis_mblen(unsigned char *str, int len)
 223 {
 224     if (str[0] >= 0xa0 && str[0] <= 0xdf) {
 225         return 1;
 226     } else if (len >= 2 && (str[0] & 0x80)) {
 227         return 2;
 228     }
 229
 230     return 1;
 231 }
 232
 233 static int
 234 iso8859_mblen(unsigned char *str, int len)
 235 {
 236     return 1;
 237 }
 238
 239 static int
 240 utf8_mblen(unsigned char *str, int len)
 241 {
 242     if (len >= 4 && (str[0] & 0xf0) == 0xf0 &&
 243         (str[1] & 0x80) && (str[2] & 0x80) && (str[3] & 0x80)) {
 244         return 4;
 245     } else if (len >= 3 && (str[0] & 0xe0) == 0xe0 &&
 246                (str[1] & 0x80) && (str[2] & 0x80)) {
 247         return 3;
 248     } else if (len >= 2 && (str[0] & 0xc0) == 0xc0 && (str[1] & 0x80)) {
 249         return 2;
 250     }
 251
 252     return 1;
 253 }
 254
 255 static int
 256 ja_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
 257 {
 258     if (state == JA_SPACE) {
 259         /* tok->anno_type[cursor] = 0; */ /* XXX */
 260     } else if ((state == HALF_LATIN) ||
 261                (state == FULL_LATIN)) {
 262         ; /* do nothing */
 263     } else if (((*state0 == KATAKANA) &&
 264                 ((state == PROLONGED) ||
 265                  (state == SMALL_KATAKANA))) ||
 266                (state == KATAKANA)) {
 267         state = KATAKANA;
 268     } else {
 269         state = JA_OTHER;
 270         *state0 = JA_NOSTATE;
 271     }
 272
 273     return state;
 274 }
 275
 276 static int
 277 en_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
 278 {
 279     if (state == EN_SPACE) {
 280         /* tok->anno_type[cursor] = 0; */ /* XXX */
 281     } else if (state == EN_OTHER) {
 282         *state0 = EN_NOSTATE;
 283     }
 284
 285     return state;
 286 }
 287
 288 static enum ja_char_type
 289 ja_euc_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 290 {
 291     int mblen = tok->mblen(str, len);
 292
 293     if (mblen == 1) {
 294         if (isalpha(str[0])) {
 295             return HALF_LATIN;
 296         } else if (is_space(str[0])) {
 297             return JA_SPACE;
 298         }
 299     } else if (mblen == 2) {
 300         if ((str[0] == 0xa1) && (str[1] == 0xbc)) {
 301             return PROLONGED;
 302         } else if (str[0] == 0xa5) {
 303             if ((str[1] == 0xa1) || (str[1] == 0xa3) ||
 304                 (str[1] == 0xa5) || (str[1] == 0xa7) ||
 305                 (str[1] == 0xa9) || (str[1] == 0xc3) ||
 306                 (str[1] == 0xe3) || (str[1] == 0xe5) ||
 307                 (str[1] == 0xe7) || (str[1] == 0xee)) {
 308                 return SMALL_KATAKANA;
 309             } else {
 310                 return KATAKANA;
 311             }
 312         } else if ((str[0] == 0xa3) && (str[1] >= 0xc1)) {
 313             return FULL_LATIN;
 314         }
 315     }
 316
 317     return JA_OTHER;
 318 }
 319
 320 static enum ja_char_type
 321 ja_sjis_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 322 {
 323     int mblen = tok->mblen(str, len);
 324
 325     if (mblen == 1) {
 326         if (isalpha(str[0])) {
 327             return HALF_LATIN;
 328         } else if (is_space(str[0])) {
 329             return JA_SPACE;
 330         }
 331     } else if (mblen == 2) {
 332         if ((str[0] == 0x81) && (str[1] == 0x5b)) {
 333             return PROLONGED;
 334         } else if (str[0] == 0x83) {
 335             if ((str[1] == 0x40) || (str[1] == 0x42) ||
 336                 (str[1] == 0x44) || (str[1] == 0x46) ||
 337                 (str[1] == 0x48) || (str[1] == 0x62) ||
 338                 (str[1] == 0x83) || (str[1] == 0x85) ||
 339                 (str[1] == 0x87) || (str[1] == 0x8e)) {
 340                 return SMALL_KATAKANA;
 341             } else {
 342                 return KATAKANA;
 343             }
 344         } else if ((str[0] == 0x82) &&
 345                    (str[1] >= 0x60) && (str[1] <= 0x9a)) {
 346             return FULL_LATIN;
 347         }
 348     }
 349
 350     return JA_OTHER;
 351 }
 352
 353 static enum ja_char_type
 354 ja_utf8_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 355 {
 356     int mblen = tok->mblen(str, len);
 357
 358     if (mblen == 1) {
 359         if (isalpha(str[0])) {
 360             return HALF_LATIN;
 361         } else if (is_space(str[0])) {
 362             return JA_SPACE;
 363         }
 364     } else if (mblen == 3) {
 365         if ((str[0] == 0xe3) && (str[1] == 0x83) && (str[2] == 0xbc)) {
 366             return PROLONGED;
 367         } else if (str[0] == 0xe3) {
 368             if (((str[1] == 0x82) &&
 369                  ((str[2] == 0xa1) || (str[2] == 0xa3) ||
 370                   (str[2] == 0xa5) || (str[2] == 0xa7) ||
 371                   (str[2] == 0xa9))) ||
 372                 ((str[1] == 0x83) &&
 373                   ((str[2] == 0x83) || (str[2] == 0xa3) ||
 374                    (str[2] == 0xa5) || (str[2] == 0xa7) ||
 375                    (str[2] == 0xae)))) {
 376                 return SMALL_KATAKANA;
 377             } else if (((str[1] == 0x82) &&
 378                         (str[2] >= 0xa1) && (str[2] <= 0xbf)) ||
 379                        ((str[1] == 0x83) &&
 380                         (str[2] >= 0x80) && (str[2] <= 0xBA))) {
 381                 return KATAKANA;
 382             }
 383         } else if ((str[0] == 0xef) &&
 384                    (((str[1] == 0xbc) &&
 385                      (str[2] >= 0xa1) && (str[2] <= 0xba)) ||
 386                     ((str[1] == 0xbd) &&
 387                      (str[2] >= 0x81) && (str[2] <= 0x9a)))) {
 388             return FULL_LATIN;
 389         }
 390     }
 391
 392     return JA_OTHER;
 393 }
 394
 395 static enum en_char_type
 396 en_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 397 {
 398     unsigned char c = str[0];
 399
 400     if (is_space(c)) {
 401         return EN_SPACE;
 402     } else if (isalpha(c)) { /* for English only */
 403         return EN_LATIN;
 404     }
 405
 406     return EN_OTHER;
 407 }
 408
 409
 410 static int
 411 is_anno(chasen_tok_t *tok, unsigned char *string, int len)
 412 {
 413     int i;
 414     anno_info *anno = tok->anno;
 415
 416     if (anno == NULL) {
 417         return 0;
 418     }
 419     for (i = 1; (anno[i].str1 != NULL); i++) {
 420         if (len < anno[i].len1) {
 421             continue;
 422         }
 423         if (!memcmp(string, anno[i].str1, anno[i].len1)) {
 424             return -i;
 425         }
 426     }
 427     return 0;
 428 }
 429
 430 static int
 431 is_anno2(anno_info *anno, unsigned char *bos, int cursor)
 432 {
 433     int len2 = anno->len2;
 434
 435     if (cursor < len2) {
 436         return 0;
 437     }
 438
 439     return (memcmp(bos + cursor - len2, anno->str2, len2) == 0);
 440 }