2 * Copyright (c) 2003 Nara Institute of Science and Technology
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * $Id: init.c,v 1.1.1.1 2007/03/13 07:40:10 masayu-a Exp $
36 #include "tokenizer.h"
39 * .chasenrc default values
/* Compile-time defaults for the rc-file settings. */
41 #define POS_COST_DEFAULT 1
42 #define RENSETSU_WEIGHT_DEFAULT 1
43 #define KEITAISO_WEIGHT_DEFAULT 1
44 #define COST_WIDTH_DEFAULT 0
45 #define UNDEF_WORD_DEFAULT 10000
/* Connection-cost weight; replaced by the STR_CONN_WEIGHT directive. */
47 int Cha_con_cost_weight = RENSETSU_WEIGHT_DEFAULT * MRPH_DEFAULT_WEIGHT;
/* Replaced by the STR_DEF_CONN_COST directive. */
48 int Cha_con_cost_undef = 0;
/* Morpheme-cost weight; replaced by the STR_MRPH_WEIGHT directive. */
49 int Cha_mrph_cost_weight = KEITAISO_WEIGHT_DEFAULT;
/* Slot 0 is written by the STR_SPACE_POS directive, slots 1.. by STR_ANNOTATION. */
51 anno_info Cha_anno_info[UNDEF_HINSI_MAX];
/* Per-entry POS id, cost and cost step for unknown words. */
52 undef_info Cha_undef_info[UNDEF_HINSI_MAX];
/* Number of Cha_undef_info entries in use; 0 means "not set yet". */
53 int Cha_undef_info_num = 0;
/* Set to 0 when the STR_OUTPUT_COMPOUND value matches STR_SEG, otherwise 1. */
54 int Cha_output_iscompound = 1;
/* Replaced by the STR_BOS_STR and STR_EOS_STR directives respectively. */
56 char *Cha_bos_string = "";
57 char *Cha_eos_string = "EOS\n";
/* Created during initialization with cha_tok_new(). */
59 chasen_tok_t *Cha_tokenizer;
/*
 * read_class_cost - apply a list of POS cost entries to Cha_hinsi[] and
 * Cha_undef_info[].
 *
 * cell: list whose elements each pair a POS specification (the car of the
 *       element) with cost data (the cdr of the element).
 *
 * NOTE(review): several original lines (return type, local declarations,
 * some conditions and closing braces) are not visible in this listing.
 */
62 read_class_cost(chasen_cell_t * cell)
66 for (; !nullp(cell); cell = cha_cdr(cell)) {
/* cell1: POS specification, cell2: cost data, s: first atom of the specification */
67 chasen_cell_t *cell1 = cha_car(cha_car(cell));
68 chasen_cell_t *cell2 = cha_cdr(cha_car(cell));
69 char *s = cha_s_atom(cha_car(cell1));
/* Entry named with one of the unknown-word literals: fill Cha_undef_info[]. */
70 if (cha_litmatch(s, 3, STR_UNKNOWN_WORD,
71 STR_UNKNOWN_WORD1, STR_UNKNOWN_WORD2)) {
/* One cost item per unknown-word slot, capped at UNDEF_HINSI_MAX. */
73 for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2);
74 i++, cell2 = cha_cdr(cell2)) {
75 chasen_cell_t *cell3 = cha_car(cell2);
/*
 * Two forms are handled: a single atom (cost only, step 0) and a list
 * (cost followed by cost step).
 * NOTE(review): the condition selecting between them is not visible here.
 */
77 Cha_undef_info[i].cost = atoi(cha_s_atom(cell3));
78 Cha_undef_info[i].cost_step = 0;
80 Cha_undef_info[i].cost =
81 atoi(cha_s_atom(cha_car(cell3)));
82 Cha_undef_info[i].cost_step =
83 atoi(cha_s_atom(cha_car(cha_cdr(cell3))));
/* Record the entry count, but never raise a count that is already set. */
86 if (Cha_undef_info_num == 0 || Cha_undef_info_num > i)
87 Cha_undef_info_num = i;
/* "*" entry: give this cost to every POS whose cost is still 0. */
88 } else if (!strcmp(s, "*")) {
89 cost = atoi(cha_s_atom(cha_car(cell2)));
90 for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++)
91 if (Cha_hinsi[hinsi].cost == 0)
92 Cha_hinsi[hinsi].cost = cost;
/* Any other entry: set the cost of every POS matching the specification. */
95 cost = atoi(cha_s_atom(cha_car(cell2)));
96 for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) {
97 if (cha_match_nhinsi(cell1, hinsi)) {
98 Cha_hinsi[hinsi].cost = cost;
/* NOTE(review): the guard for this error exit is not visible; presumably it fires when no POS matched. */
103 cha_exit_file(1, "invalid hinsi name `%s'\n",
/* POS entries still at cost 0 fall back to POS_COST_DEFAULT. */
111 for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++)
112 if (Cha_hinsi[hinsi].cost == 0)
113 Cha_hinsi[hinsi].cost = POS_COST_DEFAULT;
/* Index 0 is always reset to cost 0. */
118 Cha_hinsi[0].cost = 0;
/*
 * read_composition - set the `composit` field of Cha_hinsi[] entries.
 *
 * cell: list of groups. The first element of each group is resolved to a
 *       POS id with cha_get_nhinsi_id(); that id is stored in the
 *       `composit` field of every POS matched by the group's members.
 *
 * NOTE(review): the return type, some local declarations and the closing
 * braces are not visible in this listing.
 */
122 read_composition(chasen_cell_t * cell)
125 chasen_cell_t *cell2, *cell3;
127 for (; !nullp(cell); cell = cha_cdr(cell)) {
128 cell2 = cha_car(cell);
129 composit = cha_get_nhinsi_id(cha_car(cell2));
/*
 * Skip the head only when more elements follow; a one-element group
 * therefore matches against its own head.
 */
130 if (!nullp(cha_cdr(cell2)))
131 cell2 = cha_cdr(cell2);
132 for (; !nullp(cell2); cell2 = cha_cdr(cell2)) {
133 cell3 = cha_car(cell2);
/* Scan all named POS entries, starting at index 1. */
134 for (pos = 1; Cha_hinsi[pos].name; pos++)
135 if (cha_match_nhinsi(cell3, pos))
136 Cha_hinsi[pos].composit = composit;
/*
 * eval_chasenrc_sexp - interpret one rc-file S-expression.
 *
 * cell: the expression; its first atom is the directive name and is
 *       dispatched through the if/else chain below.
 *
 * NOTE(review): the return type, some local declarations, several
 * conditions and the closing braces are not visible in this listing.
 */
142 eval_chasenrc_sexp(chasen_cell_t * cell)
145 chasen_cell_t *cell2;
/* cell1_str: directive name; cell2: the first argument after it. */
147 cell1_str = cha_s_atom(cha_car(cell));
148 cell2 = cha_car(cha_cdr(cell));
/* Dictionary file directive: hand the argument list to cha_read_dadic(). */
155 if (!strcmp(cell1_str, CHA_LIT(STR_DA_FILE)))
156 cha_read_dadic(cha_cdr(cell));
/*
 * Space POS (the mis-encoded note on the next line reads "whitespace part
 * of speech"): its POS id goes into annotation slot 0.
 */
158 * ¶õÇòÉÊ»ì(space pos)
160 else if (cha_litmatch(cell1_str, 1, STR_SPACE_POS)) {
161 Cha_anno_info[0].hinsi = cha_get_nhinsi_id(cell2);
/* Annotation directive: fill slots 1 .. UNDEF_HINSI_MAX - 1, one per argument. */
166 else if (cha_litmatch(cell1_str, 1, STR_ANNOTATION)) {
168 for (i = 1, cell2 = cha_cdr(cell);
169 i < UNDEF_HINSI_MAX && !nullp(cell2);
170 i++, cell2 = cha_cdr(cell2)) {
171 chasen_cell_t *cell3 = cha_car(cell2);
172 chasen_cell_t *cell4;
/* First element of the argument: (str1 [str2]); a missing str2 becomes "". */
176 Cha_anno_info[i].str1 = cha_s_atom(cha_car(cha_car(cell3)));
177 Cha_anno_info[i].len1 = strlen(Cha_anno_info[i].str1);
178 cell4 = cha_car(cha_cdr(cha_car(cell3)));
182 Cha_anno_info[i].str2 = nullp(cell4) ? "" : cha_s_atom(cell4);
183 Cha_anno_info[i].len2 = strlen(Cha_anno_info[i].str2);
/*
 * Second element of the argument is stored either as a format string or
 * as a POS id.
 * NOTE(review): the condition selecting between the two is not visible here.
 */
187 cell4 = cha_car(cha_cdr(cell3));
193 Cha_anno_info[i].format = cha_s_atom(cell4);
198 Cha_anno_info[i].hinsi = cha_get_nhinsi_id(cell4);
/* Unknown-word POS directive: one POS id per Cha_undef_info[] slot. */
206 else if (cha_litmatch(cell1_str, 2,
207 STR_UNKNOWN_POS1, STR_UNKNOWN_POS2)) {
209 cell2 = cha_cdr(cell);
210 for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2);
211 i++, cell2 = cha_cdr(cell2)) {
212 Cha_undef_info[i].hinsi = cha_get_nhinsi_id(cha_car(cell2));
/* Record the entry count, but never raise a count that is already set. */
214 if (Cha_undef_info_num == 0 || Cha_undef_info_num > i)
215 Cha_undef_info_num = i;
/* Connection weight, scaled by MRPH_DEFAULT_WEIGHT. */
220 else if (cha_litmatch(cell1_str, 1, STR_CONN_WEIGHT))
221 Cha_con_cost_weight =
222 atoi(cha_s_atom(cell2)) * MRPH_DEFAULT_WEIGHT;
/* Morpheme weight. */
226 else if (cha_litmatch(cell1_str, 1, STR_MRPH_WEIGHT))
227 Cha_mrph_cost_weight = atoi(cha_s_atom(cell2));
/* Cost width. */
231 else if (cha_litmatch(cell1_str, 1, STR_COST_WIDTH))
232 cha_set_cost_width(atoi(cha_s_atom(cell2)));
/* POS costs: handled by read_class_cost(). */
236 else if (cha_litmatch(cell1_str, 1, STR_POS_COST))
237 read_class_cost(cha_cdr(cell));
/* Default connection cost. */
241 else if (cha_litmatch(cell1_str, 1, STR_DEF_CONN_COST))
242 Cha_con_cost_undef = (int) atoi(cha_s_atom(cell2));
/* POS composition groups: handled by read_composition(). */
246 else if (cha_litmatch(cell1_str, 1, STR_COMPOSIT_POS))
247 read_composition(cha_cdr(cell));
/* Compound output flag: 0 when the value matches STR_SEG, otherwise 1. */
251 else if (cha_litmatch(cell1_str, 1, STR_OUTPUT_COMPOUND))
252 Cha_output_iscompound =
253 cha_litmatch(cha_s_atom(cell2), 1, STR_SEG) ? 0 : 1;
/* Output format string. */
257 else if (cha_litmatch(cell1_str, 1, STR_OUTPUT_FORMAT))
258 cha_set_opt_form(cha_s_atom(cell2));
/* Language setting. */
262 else if (cha_litmatch(cell1_str, 1, STR_LANG))
263 cha_set_language(cha_s_atom(cell2));
/* BOS and EOS strings; the globals point directly at the parsed atom text. */
267 else if (cha_litmatch(cell1_str, 1, STR_BOS_STR))
268 Cha_bos_string = cha_s_atom(cell2);
272 else if (cha_litmatch(cell1_str, 1, STR_EOS_STR))
273 Cha_eos_string = cha_s_atom(cell2);
/* Delimiter passed on to cha_set_jfgets_delimiter(). */
277 else if (cha_litmatch(cell1_str, 1, STR_DELIMITER))
278 cha_set_jfgets_delimiter(cha_s_atom(cell2));
282 * cha_read_rcfile_fp()
/*
 * Read S-expressions from fp with cha_s_read() until cha_s_feof() reports
 * end of input, passing them to eval_chasenrc_sexp().
 *
 * NOTE(review): a line between the read and the evaluation is not visible
 * in this listing; it may guard the call.
 */
285 cha_read_rcfile_fp(FILE * fp)
289 while (!cha_s_feof(fp)) {
290 cell = cha_s_read(fp);
292 eval_chasenrc_sexp(cell);
/*
 * NOTE(review): the header of this function is not visible in this listing.
 * The body locates the rc file, opens it for reading and evaluates it via
 * cha_read_rcfile_fp().
 */
302 rcpath = cha_get_rcpath();
304 fp = cha_fopen(rcpath, "r", 1);
305 cha_read_rcfile_fp(fp);
/* The rc file must have set a POS id for the first unknown-word slot. */
311 if (!Cha_undef_info[0].hinsi)
312 cha_exit(1, "%s: UNKNOWN_POS/michigo-hinsi is not specified",
/* NOTE(review): the condition guarding this exit is not visible here. */
316 cha_exit(1, "%s: dictionary is not specified",
321 * cha_init - ChaSen's initialization
/* Start from the default cost width. */
331 cha_set_cost_width(COST_WIDTH_DEFAULT);
/* NOTE(review): the statement guarded by this check is not visible in this listing. */
333 if (cha_literal[0][2] == NULL)
/* Load the grammar, then the conjugation, table and matrix data. */
336 cha_read_grammar_dir();
337 cha_read_grammar(NULL, 1, 1);
341 cha_read_katuyou(NULL, 1);
342 cha_read_table(NULL, 1);
343 cha_read_matrix(NULL);
/* Store the cha_check_table_for_undef() result for each unknown-word POS in use. */
345 for (i = 0; i < Cha_undef_info_num; i++)
346 Cha_undef_info[i].con_tbl =
347 cha_check_table_for_undef(Cha_undef_info[i].hinsi);
350 * initialize the tokenizer
/* The tokenizer is created from Cha_lang and Cha_encode and given the annotation table. */
352 Cha_tokenizer = cha_tok_new(Cha_lang, Cha_encode);
353 cha_tok_set_annotation(Cha_tokenizer, Cha_anno_info);
/* Block storage sized for MRPH_NUM elements of sizeof(mrph_t) each. */
355 Cha_mrph_block = cha_block_new(sizeof(mrph_t), MRPH_NUM);