2 * Copyright (c) 2003 Nara Institute of Science and Technology
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * $Id: chalib.c,v 1.3 2007/03/25 11:57:27 kazuma-t Exp $
/* Only include directive visible in this listing. */
36 #include "tokenizer.h"
/* Program name string. */
38 #define CHA_NAME "ChaSen"
/*
 * Marker strings that seg_tokenize() compares against with strcmp() to
 * select SEGTYPE_UNSPECIFIED and SEGTYPE_ANNOTATION respectively.
 */
40 #define STR_UNSPECIFIED "UNSPEC"
41 #define STR_ANNOTATION "ANNO"
/* One handle per dictionary slot; cha_read_dadic() stores the da_open() result here. */
43 darts_t *Da_dicfile[DIC_NUM];
/* -1 while opt_show is 'b'; otherwise a copy of cost_width0. */
46 int Cha_cost_width = -1;
/* Current language; starts as Japanese and is changed by cha_set_language(). */
47 enum cha_lang Cha_lang = CHASEN_LANG_JA;
/*
 * NOTE(review): two definitions of Cha_encode follow; presumably they sit in
 * alternative preprocessor branches that are not visible here -- confirm.
 */
49 enum cha_encode Cha_encode = CHASEN_ENCODE_SJIS;
51 enum cha_encode Cha_encode = CHASEN_ENCODE_EUCJP;
/* Cost width multiplied by MRPH_DEFAULT_WEIGHT, as stored by cha_set_cost_width(). */
54 static int cost_width0;
/* Dictionary path per slot, filled in by cha_read_dadic() through set_dic_filename(). */
56 static char dadic_filename[DIC_NUM][PATH_MAX];
/* Output options; opt_show starts as 'b' and opt_form as 'f', and both are passed to cha_print_path(). */
58 static int opt_show = 'b', opt_form = 'f', opt_ja, opt_cmd;
/* Output format string assigned by cha_set_opt_form(). */
59 static char *opt_form_string;
/*
 * Writes version information to fp.
 * Only part of the body is visible here: a banner format string and an
 * fprintf() that reports the grammar file format.
 */
65 cha_version(FILE * fp)
/* NOTE(review): presumably an argument of an fprintf() call whose other lines are not shown. */
71 "%s version %s (c) 1996-2007 Nara Institute of Science and Technology\n",
73 fprintf(fp, "Grammar files are in ChaSen's new v-gram format.\n");
/*
 * Chooses the output format string (opt_form_string).
 * format is either a two-character selector made of '-' plus one of
 * "fecdv", or a user-supplied format string that is stored as given.
 * Several built-in format strings are assigned further down; the conditions
 * that pick between them are only partly visible in this listing.
 */
80 cha_set_opt_form(char *format)
/*
 * Selector test: a leading '-', a second character found in "fecdv", and
 * nothing after it.
 * NOTE(review): strchr() also matches the terminating NUL, so for the
 * string "-" the middle test succeeds with format[1] == '\0' and format[2]
 * is then read one byte past the terminator. Checking format[1] != '\0'
 * first would avoid that read -- confirm against the full source.
 */
88 format[0] == '-' && strchr("fecdv", format[1])
89 && format[2] == '\0') {
/* The 'd' and 'v' forms are handled together at this point. */
95 if (opt_form == 'd' || opt_form == 'v')
100 "morph(%pi,%ps,%pe,%pc,'%m','%U(%y)','%M',%U(%P'),NIL,%T0,%F0,'%I0',%c,[%ppc,],[%ppi,])";
104 "%pb%3pi %3ps %3pe %5pc %m\t%U(%y)\t%U(%a)\t%M\t%U(%P-) NIL %T0 %F0 %I0 %c %ppi, %ppc,\n";
107 opt_form_string = "%m\t%y\t%M\t%U(%P-)\t%T \t%F \n";
110 opt_form_string = "%m\t%U(%y)\t%M\t%P- %h %T* %t %F* %f\n";
113 opt_form_string = "%m\t%y\t%M\t%h %t %f\n";
/* A caller-supplied format string is used directly, without copying. */
122 opt_form_string = format;
124 * opt_form_string = cha_convert_escape(cha_strdup(format), 1);
/* f is positioned at the terminating NUL of the chosen format string. */
127 f = opt_form_string + strlen(opt_form_string);
/*
 * Selects the processing language from langstr.
 * Cha_lang is first reset to CHASEN_LANG_JA; a string whose first character
 * is 'j' keeps Japanese and one whose first character is 'e' switches to
 * CHASEN_LANG_EN. Only the first character is examined in the visible lines.
 */
138 cha_set_language(char *langstr)
140 Cha_lang = CHASEN_LANG_JA;
142 if (langstr[0] == 'j') {
143 Cha_lang = CHASEN_LANG_JA;
144 } else if (langstr[0] == 'e') {
145 Cha_lang = CHASEN_LANG_EN;
150 * cha_set_cost_width()
/*
 * Records the requested cost width.
 * cw is multiplied by MRPH_DEFAULT_WEIGHT and kept in cost_width0. Further
 * down, Cha_cost_width becomes -1 when opt_show is 'b' and cost_width0
 * otherwise.
 */
153 cha_set_cost_width(int cw)
155 cost_width0 = cw * MRPH_DEFAULT_WEIGHT;
158 * When solutions other than the best one are also displayed, Cha_cost_width is put into effect
160 Cha_cost_width = opt_show == 'b' ? -1 : cost_width0;
164 * chasen_getopt_argv - initialize and read options
/*
 * Processes the option list in argv in two passes, both driven by
 * cha_getopt_chasen(argv, fp).
 * The first pass applies the encoding and rc-file path options; an
 * initialization step follows when Cha_undef_info_num is still zero; the
 * second pass applies the remaining options.
 * Most case labels are not visible in this listing, so the option letters
 * are not repeated here.
 */
171 chasen_getopt_argv(char **argv, FILE * fp)
/* First pass: encoding and rc-file path. */
179 while ((c = cha_getopt_chasen(argv, fp)) != EOF) {
182 cha_set_encode(Cha_optarg);
188 cha_set_rcpath(Cha_optarg);
196 * initialize if not done
198 if (!Cha_undef_info_num)
/* Second pass: output format, language, cost width and table listings. */
205 while ((c = cha_getopt_chasen(argv, fp)) != EOF) {
218 cha_set_opt_form(NULL);
/* The option argument is duplicated and escape-converted before it becomes the format. */
221 cha_set_opt_form(cha_convert_escape
222 (cha_strdup(Cha_optarg), 0));
225 cha_set_language(Cha_optarg);
227 case 'w': /* specify the cost width */
/* NOTE(review): atoi() gives no error indication for non-numeric input. */
228 cha_set_cost_width(atoi(Cha_optarg));
/* True only when the argument starts with 'c'. */
231 Cha_output_iscompound = *Cha_optarg == 'c';
/* The table listings below are written to stdout. */
234 cha_set_output(stdout);
235 switch (*Cha_optarg) {
238 * display the list of Cha_hinsi table
240 cha_print_hinsi_table();
244 cha_print_ctype_table();
248 cha_print_cform_table();
261 #if 0 /* not necessary */
269 * When solutions other than the best one are also displayed, Cha_cost_width is put into effect
271 Cha_cost_width = opt_show == 'b' ? -1 : cost_width0;
277 * parse a string and output to fp or str
280 * 0 - ok / no result / too many morphs
/*
 * Analyses one input string and emits the result through the output set up
 * by cha_set_output(output).
 * An empty input only prints the sentence boundary via cha_print_bos_eos().
 * Otherwise the text is cut into segments with cha_tok_parse(), each segment
 * is added to the lattice with cha_parse_segment(), and the result is
 * printed with cha_print_path().
 */
284 chasen_sparse_main(char *input, FILE *output)
290 * initialize if not done
292 if (!Cha_undef_info_num)
/* Make sure an output format string has been selected. */
294 if (!opt_form_string)
295 cha_set_opt_form(NULL);
297 cha_set_output(output);
299 if (input[0] == '\0') {
300 cha_print_bos_eos(opt_form);
305 * parse a sentence and print
308 int c = 0, len, cursor;
/* Locate the first CR or LF in the input, if any. */
309 if ((crlf = strpbrk(input, "\r\n")) == NULL)
/* Walk the text one tokenizer segment at a time. */
320 while (cursor < len) {
321 seg.text = input + cursor;
/* cha_tok_parse() returns the segment length and fills char_type and anno_no. */
323 seg.len = cha_tok_parse(Cha_tokenizer, seg.text, seg.char_type,
324 len - cursor, &seg.anno_no);
/* A non-negative anno_no marks the segment as an annotation. */
325 if (seg.anno_no >= 0)
326 seg.type = SEGTYPE_ANNOTATION;
/* NOTE(review): presumably the else branch; the else line is not visible here. */
328 seg.type = SEGTYPE_NORMAL;
329 cha_parse_segment(&lat, &seg);
333 cha_print_path(&lat, opt_show, opt_form, opt_form_string);
/* Detects a CR immediately followed by LF. */
337 if (c == '\r' && crlf[1] == '\n')
347 * read from file/str, parse, and write to file
350 * 0 - ok / no result / too many morphs
/*
 * Reads one line from fp_in with cha_fgets() and analyses it with
 * chasen_sparse_main(), writing to fp_out.
 * The line buffer holds at most CHA_INPUT_SIZE bytes.
 */
357 chasen_fparse(FILE * fp_in, FILE * fp_out)
359 char line[CHA_INPUT_SIZE];
/* cha_fgets() returning NULL means no line could be read. */
361 if (cha_fgets(line, sizeof(line), fp_in) == NULL)
364 return chasen_sparse_main(line, fp_out);
/*
 * Analyses the string str_in and writes the result to fp_out.
 * The input is first converted with cha_jistoeuc() into a freshly allocated
 * buffer, which is then handed to chasen_sparse_main().
 */
370 chasen_sparse(char *str_in, FILE * fp_out)
/*
 * The buffer is sized to the input length plus the terminator.
 * NOTE(review): assumes cha_jistoeuc() never writes more bytes than it
 * reads -- confirm. The release of euc_str is not visible in this listing.
 */
375 euc_str = cha_malloc(strlen(str_in) + 1);
376 cha_jistoeuc(str_in, euc_str);
377 rc = chasen_sparse_main(euc_str, fp_out);
/*
 * Marks seg as a normal segment and runs cha_tok_parse() over its text to
 * fill seg->char_type. The remaining arguments of that call are on a line
 * that is not visible in this listing.
 */
384 set_normal(cha_seg_t *seg)
386 seg->type = SEGTYPE_NORMAL;
387 cha_tok_parse(Cha_tokenizer, seg->text, seg->char_type,
/*
 * Splits one tab-separated input line into a segment description in seg.
 * A line without enough tab-separated fields is treated as a normal segment
 * through set_normal(). Otherwise the segment becomes SEGTYPE_UNSPECIFIED,
 * SEGTYPE_ANNOTATION or SEGTYPE_MORPH, and for a morph the part of speech,
 * inflection type and inflection form are looked up.
 */
394 seg_tokenize(unsigned char *line, cha_seg_t *seg)
/* Advance to the end of the first field. */
401 while (line[len] != '\t' && line[len] != '\0')
404 seg->posid = seg->inf_type = seg->inf_form = 0;
/* Only one field on the line: plain text. */
406 if (line[len] == '\0')
407 return set_normal(seg);
409 /* skip reading and base form */
410 for (i = 0; i < 2; i++) {
412 while (line[len] != '\t' && line[len] != '\0')
/* The line ended before both fields were skipped. */
414 if (line[len] == '\0')
415 return set_normal(seg);
419 if (strcmp(line, STR_UNSPECIFIED) == 0) {
420 seg->type = SEGTYPE_UNSPECIFIED;
421 seg->char_type[0] = seg->len;
422 } else if (strcmp(line, STR_ANNOTATION) == 0) {
423 seg->type = SEGTYPE_ANNOTATION;
/* The tokenizer supplies the annotation number for this text. */
424 cha_tok_parse(Cha_tokenizer, seg->text, seg->char_type,
425 seg->len, &seg->anno_no);
426 seg->char_type[0] = seg->len;
427 } else { /* read POS */
/*
 * NOTE(review): pos has 256 slots and is filled by the '-' split further
 * down; no bound on the number of components is visible in these lines --
 * confirm one exists.
 */
428 char *pos[256], *itype;
431 seg->type = SEGTYPE_MORPH;
432 seg->char_type[0] = seg->len;
/* A tab after the POS field introduces the inflection type. */
433 if ((l = strchr(l, '\t')) != NULL) {
/* A second tab introduces the inflection form. */
436 if ((l = strchr(l, '\t')) != NULL) {
438 seg->inf_type = cha_get_type_id(itype);
439 seg->inf_form = cha_get_form_id(l, seg->inf_type);
441 fprintf(stderr, "invalid format: %s\n", line);
/* The POS name is scanned for '-' separators. */
447 while ((l = strchr(l, '-')) != NULL) {
/* The first POS component is checked against the three unknown-word names. */
452 if (cha_litmatch(pos[0], 3, STR_UNKNOWN_WORD,
453 STR_UNKNOWN_WORD1, STR_UNKNOWN_WORD2))
457 seg->posid = cha_get_nhinsi_str_id(pos);
/*
 * Examines the end of the string s: first for a final newline, then for any
 * run of trailing tab characters.
 * NOTE(review): the newline test reads s[len - 1] without the len > 0 guard
 * that the tab loop has; if len can be 0 here (empty string) this reads one
 * byte before the buffer. The initialisation of len is not visible --
 * confirm.
 */
465 strip(unsigned char *s)
469 if (s[len - 1] == '\n')
472 while (len > 0 && s[len - 1] == '\t')
/*
 * Reads pre-segmented lines from input and writes the analysis to output.
 * Each line is turned into a segment by seg_tokenize() and added to the
 * lattice with cha_parse_segment(). An empty line, or one matching STR_EOS
 * or STR_BOS_EOS, causes the lattice to be printed with cha_print_path().
 */
480 chasen_parse_segments(FILE *input, FILE *output)
/*
 * NOTE(review): buf is unsigned char[] but is passed to fgets(), which
 * takes char *; the pointer signedness differs.
 */
483 unsigned char buf[CHA_INPUT_SIZE]; /* XXX */
487 if (!Cha_undef_info_num)
/* Make sure an output format string has been selected. */
489 if (!opt_form_string)
490 cha_set_opt_form(NULL);
492 cha_set_output(output);
494 while (fgets(buf, CHA_INPUT_SIZE, input) != NULL) {
/* Sentence boundary: empty line or an explicit EOS marker. */
501 if (!buf[0] || cha_litmatch(buf, 2, STR_EOS, STR_BOS_EOS)) {
504 cha_print_path(&lat, opt_show, opt_form, opt_form_string);
/* A negative result from seg_tokenize() is reported on stderr. */
508 if (seg_tokenize(buf, &seg) < 0) {
509 fprintf(stderr, "invalid format: %s\n", buf);
512 cha_parse_segment(&lat, &seg);
516 cha_print_path(&lat, opt_show, opt_form, opt_form_string);
523 * read from file/str, parse, and output to string
525 * return value: string
526 * !NULL - ok / no result / too many morphs
/*
 * Reads one line from fp_in with cha_fgets(), analyses it with
 * chasen_sparse_main() using a NULL output stream, and returns the text
 * obtained from cha_get_output().
 */
534 chasen_fparse_tostr(FILE * fp_in)
536 char line[CHA_INPUT_SIZE];
/* cha_fgets() returning NULL means no line could be read. */
538 if (cha_fgets(line, sizeof(line), fp_in) == NULL)
/* A non-zero result from chasen_sparse_main() is handled separately. */
541 if (chasen_sparse_main(line, NULL))
544 return cha_get_output();
/*
 * Analyses the string str_in and returns the result as text from
 * cha_get_output().
 * The input is converted with cha_jistoeuc() into a freshly allocated
 * buffer and analysed by chasen_sparse_main() with a NULL output stream.
 */
551 chasen_sparse_tostr(char *str_in)
/*
 * The buffer is sized to the input length plus the terminator.
 * NOTE(review): assumes cha_jistoeuc() never writes more bytes than it
 * reads -- confirm. The release of euc_str is not visible in this listing.
 */
555 euc_str = cha_malloc(strlen(str_in) + 1);
556 cha_jistoeuc(str_in, euc_str);
/* A non-zero result from chasen_sparse_main() is handled separately. */
558 if (chasen_sparse_main(euc_str, NULL))
563 return cha_get_output();
/*
 * Reads a line of at most n bytes from fp into s by delegating to either
 * cha_jfgets() or cha_fget_line(). The condition that chooses between the
 * two is not visible in this listing.
 */
567 cha_fgets(char *s, int n, FILE * fp)
570 return cha_jfgets(s, n, fp);
572 return cha_fget_line(s, n, fp);
/*
 * Builds the path of a dictionary file in filename (capacity len) from s.
 * A name starting with PATH_DELIMITER is copied as it is; with
 * PATHTYPE_MSDOS defined, a name whose second character is ':' is treated
 * the same way. The snprintf() call prefixes s with cha_get_grammar_dir().
 * NOTE(review): presumably the snprintf() is the else branch for relative
 * names; the else line is not visible here.
 * NOTE(review): strncpy() does not write a terminating NUL when s is len
 * bytes or longer, so filename can be left unterminated.
 */
576 set_dic_filename(char *filename, size_t len, char *s)
578 #ifdef PATHTYPE_MSDOS
/* && binds tighter than ||, so this reads: delimiter first, or non-empty with ':' second. */
579 if (*s == PATH_DELIMITER || *s && s[1] == ':')
580 strncpy(filename, s, len);
582 if (*s == PATH_DELIMITER)
583 strncpy(filename, s, len);
584 #endif /* PATHTYPE_MSDOS */
586 snprintf(filename, len, "%s%s", cha_get_grammar_dir(), s);
/*
 * Opens the Darts dictionaries named in the list cell.
 * For each list element the base path is built with set_dic_filename(),
 * the ".da", ".lex" and ".dat" file names are derived from it, and the
 * handle returned by da_open() is stored in Da_dicfile[num].
 */
590 cha_read_dadic(chasen_cell_t * cell)
593 char da_filename[PATH_MAX];
594 char lex_filename[PATH_MAX];
595 char dat_filename[PATH_MAX];
/* NOTE(review): presumably skips the work when a dictionary path is already set; the branch body is not visible. */
597 if (dadic_filename[0][0])
/* One dictionary per list element. */
600 for (num = 0; !nullp(cell); num++, cell = cha_cdr(cell)) {
602 cha_exit_file(1, "too many Darts dictionary files");
603 set_dic_filename(dadic_filename[num], PATH_MAX,
604 cha_s_atom(cha_car(cell)));
/*
 * NOTE(review): the base path may already be close to PATH_MAX bytes, so
 * appending a suffix can be truncated by snprintf(); the return values are
 * not checked.
 */
606 snprintf(da_filename, PATH_MAX, "%s.da", dadic_filename[num]);
607 snprintf(lex_filename, PATH_MAX, "%s.lex", dadic_filename[num]);
608 snprintf(dat_filename, PATH_MAX, "%s.dat", dadic_filename[num]);
609 Da_dicfile[num] = da_open(da_filename,
610 lex_filename, dat_filename);