lib/tokenizer.c

   1 /*
   2  * Copyright (c) 2003 Nara Institute of Science and Technology
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *   notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. The name Nara Institute of Science and Technology may not be used to
  15  *    endorse or promote products derived from this software without
  16  *    specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
  19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  21  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE Nara Institute
  22  * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  24  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  25  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  26  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  27  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * $Id: tokenizer.c,v 1.1 2007/03/13 07:40:10 masayu-a Exp $
  31  */
  32
  33 #include <string.h>
  34 #include <ctype.h>
  35
  36 #include "chalib.h"
  37 #include "literal.h"
  38 #include "tokenizer.h"
  39
  40 #define is_space(c) (((c) == ' ') || ((c) == '\t'))
  41
  42 enum ja_char_type {
  43     JA_NOSTATE,
  44     JA_SPACE,
  45     PROLONGED,      /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
  46     KATAKANA,       /* KATAKANA LETTER (SMALL) [A-KE] */
  47     SMALL_KATAKANA, /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA */
  48     FULL_LATIN,     /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */
  49     HALF_LATIN,     /* LATIN (CAPITAL|SMALL) LETTER [A-Z] */
  50     JA_OTHER,
  51 };
  52
  53 enum en_char_type {
  54     EN_NOSTATE,
  55     EN_SPACE,
  56     EN_LATIN,
  57     EN_OTHER,
  58 };
  59
  60 static int euc_mblen(unsigned char*, int);
  61 static int sjis_mblen(unsigned char*, int);
  62 static int iso8859_mblen(unsigned char*, int);
  63 static int utf8_mblen(unsigned char*, int);
  64
  65 static int ja_char_type_parse(chasen_tok_t*,int,int*,int);
  66 static int en_char_type_parse(chasen_tok_t*,int,int*,int);
  67
  68 static enum ja_char_type
  69 ja_euc_char_type(chasen_tok_t*, unsigned char *, int);
  70 static enum ja_char_type
  71 ja_sjis_char_type(chasen_tok_t*, unsigned char *, int);
  72 static enum ja_char_type
  73 ja_utf8_char_type(chasen_tok_t*, unsigned char *, int);
  74
  75 static enum en_char_type
  76 en_char_type(chasen_tok_t*, unsigned char *, int);
  77
  78 typedef int (*ja_char_type_get)(chasen_tok_t*,unsigned char*,int);
  79 typedef int (*en_char_type_get)(chasen_tok_t*,unsigned char*,int);
  80
  81 static int is_anno(chasen_tok_t*, unsigned char*, int);
  82 static int is_anno2(anno_info*, unsigned char*, int);
  83
  84 /*
  85  * This function constructs a tokenizer object.
  86  * If an error occurs, it terminates a process.
  87  */
  88 chasen_tok_t *
  89 cha_tok_new(int lang, int encode)
  90 {
  91     chasen_tok_t *tok;
  92
  93     tok = cha_malloc(sizeof(chasen_tok_t));
  94
  95     tok->lang = lang;
  96     tok->encode = encode;
  97     tok->anno = NULL;
  98
  99     if (lang == CHASEN_LANG_JA) {
 100         if (encode == CHASEN_ENCODE_EUCJP) {
 101             tok->mblen = euc_mblen;
 102             tok->char_type_parse = ja_char_type_parse;
 103             tok->get_char_type = (ja_char_type_get)ja_euc_char_type;
 104         } else if (encode == CHASEN_ENCODE_SJIS) {
 105             tok->mblen = sjis_mblen;
 106             tok->char_type_parse = ja_char_type_parse;
 107             tok->get_char_type = (ja_char_type_get)ja_sjis_char_type;
 108         } else if (encode == CHASEN_ENCODE_UTF8) {
 109             tok->mblen = utf8_mblen;
 110             tok->char_type_parse = ja_char_type_parse;
 111             tok->get_char_type = (ja_char_type_get)ja_utf8_char_type;
 112         }
 113     } else if (lang == CHASEN_LANG_EN) {
 114         if (encode == CHASEN_ENCODE_ISO8859) {
 115             tok->mblen = iso8859_mblen;
 116             tok->char_type_parse = en_char_type_parse;
 117             tok->get_char_type = (en_char_type_get)en_char_type;
 118         } else if (encode == CHASEN_ENCODE_UTF8) {
 119             tok->mblen = utf8_mblen;
 120             tok->char_type_parse = en_char_type_parse;
 121             tok->get_char_type = (en_char_type_get)en_char_type;
 122         }
 123     } else {
 124         tok->mblen = iso8859_mblen;
 125         tok->char_type_parse = en_char_type_parse;
 126         tok->get_char_type = (en_char_type_get)en_char_type;
 127     }
 128
 129     return tok;
 130 }
 131
 132 /*
 133  * This function destroys the tokenizer object.
 134  */
 135 void
 136 cha_tok_delete(chasen_tok_t *tok)
 137 {
 138     cha_free(tok);
 139 }
 140
 141 int
 142 cha_tok_parse(chasen_tok_t *tok, unsigned char *str, char *type, int len,
 143               int *anno_no)
 144 {
 145     int cursor, head;
 146     int state, state0;
 147     anno_info *anno = NULL;
 148     int no;
 149
 150     if (anno_no != NULL && (no = is_anno(tok, str, len)) < 0) {
 151         anno = &(tok->anno[-no]);
 152         *anno_no = -no;
 153         for (cursor = 0; cursor < len;
 154              cursor += tok->mblen(str + cursor, len - cursor)) {
 155             if (is_anno2(anno, str, cursor))
 156                 break;
 157         }
 158         return cursor;
 159     }
 160
 161     state0 = state = 0; /* NOSTATE */
 162     for (cursor = head = 0; cursor < len;
 163          cursor += tok->mblen(str + cursor, len - cursor)) {
 164         if (anno_no != NULL &&
 165             is_anno(tok, str + cursor, len - cursor) < 0) {
 166             return cursor;
 167         } else {
 168             state = tok->get_char_type(tok, str + cursor, len - cursor);
 169             state = tok->char_type_parse(tok, state, &state0, cursor);
 170         }
 171
 172         if (state != state0) {
 173             type[head] = cursor - head;
 174             head = cursor;
 175         }
 176         state0 = state;
 177     }
 178     type[head] = cursor - head;
 179
 180     return cursor;
 181 }
 182
 183 /*
 184  * This function returns the length in bytes of the multibyte character
 185  * str with len bytes.
 186  *
 187  * If the character is `\0', it returns 1.
 188  */
 189 int
 190 cha_tok_mblen(chasen_tok_t *tok, unsigned char *str, int len)
 191 {
 192     return tok->mblen(str, len);
 193 }
 194
 195 /*
 196  * This function sets information of annotation anno in tokenizer tok.
 197  */
 198 void
 199 cha_tok_set_annotation(chasen_tok_t *tok, anno_info *anno)
 200 {
 201     tok->anno = anno;
 202 }
 203
 204 /*
 205  * private functions
 206  */
 207 static int
 208 euc_mblen(unsigned char *str, int len)
 209 {
 210     if (len >= 3 &&
 211         str[0] == 0x8f && (str[1] & 0x80) && (str[2] & 0x80)) {
 212         return 3;
 213     } else if (len >= 2 && (str[0] & 0x80) && (str[1] & 0x80)) {
 214         return 2;
 215     }
 216
 217     return 1;
 218 }
 219
 220 static int
 221 sjis_mblen(unsigned char *str, int len)
 222 {
 223     if (str[0] >= 0xa0 && str[0] <= 0xdf) {
 224         return 1;
 225     } else if (len >= 2 && (str[0] & 0x80)) {
 226         return 2;
 227     }
 228
 229     return 1;
 230 }
 231
 232 static int
 233 iso8859_mblen(unsigned char *str, int len)
 234 {
 235     return 1;
 236 }
 237
 238 static int
 239 utf8_mblen(unsigned char *str, int len)
 240 {
 241     if (len >= 4 && (str[0] & 0xf0) == 0xf0 &&
 242         (str[1] & 0x80) && (str[2] & 0x80) && (str[3] & 0x80)) {
 243         return 4;
 244     } else if (len >= 3 && (str[0] & 0xe0) == 0xe0 &&
 245                (str[1] & 0x80) && (str[2] & 0x80)) {
 246         return 3;
 247     } else if (len >= 2 && (str[0] & 0xc0) == 0xc0 && (str[1] & 0x80)) {
 248         return 2;
 249     }
 250
 251     return 1;
 252 }
 253
 254 static int
 255 ja_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
 256 {
 257     if (state == JA_SPACE) {
 258         /* tok->anno_type[cursor] = 0; */ /* XXX */
 259     } else if ((state == HALF_LATIN) ||
 260                (state == FULL_LATIN)) {
 261         ; /* do nothing */
 262     } else if (((*state0 == KATAKANA) &&
 263                 ((state == PROLONGED) ||
 264                  (state == SMALL_KATAKANA))) ||
 265                (state == KATAKANA)) {
 266         state = KATAKANA;
 267     } else {
 268         state = JA_OTHER;
 269         *state0 = JA_NOSTATE;
 270     }
 271
 272     return state;
 273 }
 274
 275 static int
 276 en_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor)
 277 {
 278     if (state == EN_SPACE) {
 279         /* tok->anno_type[cursor] = 0; */ /* XXX */
 280     } else if (state == EN_OTHER) {
 281         *state0 = EN_NOSTATE;
 282     }
 283
 284     return state;
 285 }
 286
 287 static enum ja_char_type
 288 ja_euc_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 289 {
 290     int mblen = tok->mblen(str, len);
 291
 292     if (mblen == 1) {
 293         if (isalpha(str[0])) {
 294             return HALF_LATIN;
 295         } else if (is_space(str[0])) {
 296             return JA_SPACE;
 297         }
 298     } else if (mblen == 2) {
 299         if ((str[0] == 0xa1) && (str[1] == 0xbc)) {
 300             return PROLONGED;
 301         } else if (str[0] == 0xa5) {
 302             if ((str[1] == 0xa1) || (str[1] == 0xa3) ||
 303                 (str[1] == 0xa5) || (str[1] == 0xa7) ||
 304                 (str[1] == 0xa9) || (str[1] == 0xc3) ||
 305                 (str[1] == 0xe3) || (str[1] == 0xe5) ||
 306                 (str[1] == 0xe7) || (str[1] == 0xee)) {
 307                 return SMALL_KATAKANA;
 308             } else {
 309                 return KATAKANA;
 310             }
 311         } else if ((str[0] == 0xa3) && (str[1] >= 0xc1)) {
 312             return FULL_LATIN;
 313         }
 314     }
 315
 316     return JA_OTHER;
 317 }
 318
 319 static enum ja_char_type
 320 ja_sjis_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 321 {
 322     int mblen = tok->mblen(str, len);
 323
 324     if (mblen == 1) {
 325         if (isalpha(str[0])) {
 326             return HALF_LATIN;
 327         } else if (is_space(str[0])) {
 328             return JA_SPACE;
 329         }
 330     } else if (mblen == 2) {
 331         if ((str[0] == 0x81) && (str[1] == 0x5b)) {
 332             return PROLONGED;
 333         } else if (str[0] == 0x83) {
 334             if ((str[1] == 0x40) || (str[1] == 0x42) ||
 335                 (str[1] == 0x44) || (str[1] == 0x46) ||
 336                 (str[1] == 0x48) || (str[1] == 0x62) ||
 337                 (str[1] == 0x83) || (str[1] == 0x85) ||
 338                 (str[1] == 0x87) || (str[1] == 0x8e)) {
 339                 return SMALL_KATAKANA;
 340             } else {
 341                 return KATAKANA;
 342             }
 343         } else if ((str[0] == 0x82) &&
 344                    (str[1] >= 0x60) && (str[1] <= 0x9a)) {
 345             return FULL_LATIN;
 346         }
 347     }
 348
 349     return JA_OTHER;
 350 }
 351
 352 static enum ja_char_type
 353 ja_utf8_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 354 {
 355     int mblen = tok->mblen(str, len);
 356
 357     if (mblen == 1) {
 358         if (isalpha(str[0])) {
 359             return HALF_LATIN;
 360         } else if (is_space(str[0])) {
 361             return JA_SPACE;
 362         }
 363     } else if (mblen == 3) {
 364         if ((str[0] == 0xe3) && (str[1] == 0x83) && (str[2] == 0xbc)) {
 365             return PROLONGED;
 366         } else if (str[0] == 0xe3) {
 367             if (((str[1] == 0x82) &&
 368                  ((str[2] == 0xa1) || (str[2] == 0xa3) ||
 369                   (str[2] == 0xa5) || (str[2] == 0xa7) ||
 370                   (str[2] == 0xa9))) ||
 371                 ((str[1] == 0x83) &&
 372                   ((str[2] == 0x83) || (str[2] == 0xa3) ||
 373                    (str[2] == 0xa5) || (str[2] == 0xa7) ||
 374                    (str[2] == 0xae)))) {
 375                 return SMALL_KATAKANA;
 376             } else if (((str[1] == 0x82) &&
 377                         (str[2] >= 0xa1) && (str[2] <= 0xbf)) ||
 378                        ((str[1] == 0x83) &&
 379                         (str[2] >= 0x80) && (str[2] <= 0xBA))) {
 380                 return KATAKANA;
 381             }
 382         } else if ((str[0] == 0xef) &&
 383                    (((str[1] == 0xbc) &&
 384                      (str[2] >= 0xa1) && (str[2] <= 0xba)) ||
 385                     ((str[1] == 0xbd) &&
 386                      (str[2] >= 0x81) && (str[2] <= 0x9a)))) {
 387             return FULL_LATIN;
 388         }
 389     }
 390
 391     return JA_OTHER;
 392 }
 393
 394 static enum en_char_type
 395 en_char_type(chasen_tok_t *tok, unsigned char *str, int len)
 396 {
 397     unsigned char c = str[0];
 398
 399     if (is_space(c)) {
 400         return EN_SPACE;
 401     } else if (isalpha(c)) { /* for English only */
 402         return EN_LATIN;
 403     }
 404
 405     return EN_OTHER;
 406 }
 407
 408
 409 static int
 410 is_anno(chasen_tok_t *tok, unsigned char *string, int len)
 411 {
 412     int i;
 413     anno_info *anno = tok->anno;
 414
 415     if (anno == NULL) {
 416         return 0;
 417     }
 418     for (i = 1; (anno[i].str1 != NULL); i++) {
 419         if (len < anno[i].len1) {
 420             continue;
 421         }
 422         if (!memcmp(string, anno[i].str1, anno[i].len1)) {
 423             return -i;
 424         }
 425     }
 426     return 0;
 427 }
 428
 429 static int
 430 is_anno2(anno_info *anno, unsigned char *bos, int cursor)
 431 {
 432     int len2 = anno->len2;
 433
 434     if (cursor < len2) {
 435         return 0;
 436     }
 437
 438     return (memcmp(bos + cursor - len2, anno->str2, len2) == 0);
 439 }