2 * Copyright (c) 2003 Nara Institute of Science and Technology
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * $Id: translate.c,v 1.1.1.1 2007/03/13 07:40:10 masayu-a Exp $
/*
 * Largest weight a dictionary entry may carry.  Weights are stored with a
 * cast to unsigned short (see parse_headword), so the limit is USHRT_MAX.
 */
48 #define MRPH_WEIGHT_MAX USHRT_MAX
/*
 * Forward declaration; parse_dic calls this for each successfully parsed
 * entry.  The definition is not part of this file's visible text.
 */
50 int dump_dic(lexicon_t *lexies, FILE *output[], da_build_t *builder);
/*
 * Debugging aid: write every field of one lexicon entry to stderr, one
 * field per line, each line prefixed with `indent'.
 */
54 dump_lex(lexicon_t *lex, char *indent)
/* string-valued fields are printed inside single quotes */
56 fprintf(stderr, "%sheadw:\t '%s'\n", indent, lex->headword);
57 fprintf(stderr, "%sread:\t '%s'\n", indent, lex->reading);
58 fprintf(stderr, "%spron:\t '%s'\n", indent, lex->pron);
59 fprintf(stderr, "%sbase:\t '%s'\n", indent, lex->base);
/* the remaining fields are printed with %d */
60 fprintf(stderr, "%sPOS:\t %d\n", indent, lex->pos);
61 fprintf(stderr, "%sitype:\t %d\n", indent, lex->inf_type);
62 fprintf(stderr, "%siform:\t %d\n", indent, lex->inf_form);
63 fprintf(stderr, "%sweight:\t %d\n", indent, lex->weight);
64 fprintf(stderr, "%scon_t:\t %d\n", indent, lex->con_tbl);
/* the last field is followed by an empty line to separate entries */
65 fprintf(stderr, "%sinfo:\t '%s'\n\n", indent, lex->info);
/*
 * Report a dictionary error on stderr: a line-number prefix, then the
 * offending cell rendered by cha_s_tostr() in quotes, then `msg'.
 *
 * NOTE(review): every caller treats the result as a failure status
 * (`return err_msg(...)', `stat = err_msg(...)', and results tested with
 * `< 0'), so this presumably returns a negative value -- confirm against
 * the return statement.
 */
74 err_msg(char *msg, chasen_cell_t *cell)
/* both line counters agree: print a single line number */
78 else if (Cha_lineno == Cha_lineno_error)
79 fprintf(stderr, "%d: ", Cha_lineno);
/* prefix in range form: Cha_lineno_error, a dash, then Cha_lineno */
81 fprintf(stderr, "%d-%d: ", Cha_lineno_error, Cha_lineno);
/* NOTE(review): parse_dic passes cell == NULL here; verify cha_s_tostr() accepts it */
83 fprintf(stderr, "`%s' %s\n", cha_s_tostr(cell), msg);
/*
 * Copy the atom string held by `cell' into the caller's buffer `dist'.
 *
 * Reports "is too long" through err_msg() and returns its status when the
 * length check fails.  Presumably the check compares the string length
 * against `length' -- TODO confirm.  Callers treat a negative result as
 * failure.
 */
88 get_string(chasen_cell_t *cell, char *dist, int length)
93 string = s_atom_val(cell);
96 return err_msg("is too long", cell);
/* len + 1 bytes: assumes len is strlen(string), so the NUL is copied too -- TODO confirm */
98 memcpy(dist, string, len + 1);
/*
 * Parse the headword part of an entry into `lex'.
 *
 * Two shapes are visible below: one that takes `default_weight' as is,
 * and a list whose first item is the headword atom and whose optional
 * second item is a weight.  The headword text is copied into
 * lex->headword (at most MIDASI_LEN, enforced by get_string) and the
 * weight is stored in lex->weight as an unsigned short.
 *
 * Errors are reported through err_msg() and its status is returned.
 */
104 parse_headword(chasen_cell_t *cell, int default_weight, lexicon_t *lex)
106 chasen_cell_t *headword;
/* first shape (its condition is not visible): no explicit weight, use the default */
110 lex->weight = (unsigned short)default_weight;
111 } else if (atomp(cha_car(cell))) {
/* list shape: first item is the headword atom */
112 headword = cha_car(cell);
113 if (nullp(cha_cdr(cell)))
/* no second item: fall back to the default weight */
114 lex->weight = (unsigned short)default_weight;
115 else if (!atomp(cha_car(cha_cdr(cell))))
116 return err_msg("has invalid form", cell);
/*
 * The second item is a decimal number scaled by MRPH_DEFAULT_WEIGHT.
 * NOTE(review): atof() gives no indication of malformed input.
 */
119 weight = (int)(atof(s_atom_val(cha_car(cha_cdr(cell))))
120 * MRPH_DEFAULT_WEIGHT);
123 return err_msg(": weight must be between 0 and 6553.5", cell);
124 } else if (weight > MRPH_WEIGHT_MAX) {
/* NOTE(review): this assignment has no effect, the function returns on the next line */
125 weight = MRPH_WEIGHT_MAX;
126 return err_msg(": weight must be between 0 and 6553.5", cell);
/* weight passed the range checks, so the narrowing cast is safe */
128 lex->weight = (unsigned short)weight;
/* neither accepted shape matched */
131 return err_msg("has invalid form", cell);
/* copy the headword text; a negative result from get_string means failure */
133 if (get_string(headword, lex->headword, MIDASI_LEN) < 0)
/*
 * Strip the suffix `ending' from `string' in place.
 *
 * If `string' is shorter than `ending' or does not end with it, the
 * conflict is reported through err_msg() and that status is returned.
 * Otherwise `string' is truncated where the suffix began and the length
 * of the remaining stem is returned.
 */
140 stem(char *string, char *ending)
142 int string_len, ending_len;
147 string_len = strlen(string);
148 ending_len = strlen(ending);
/* compare the last ending_len bytes of string with ending */
150 if (string_len < ending_len ||
151 strcmp(string + string_len - ending_len, ending))
152 return err_msg(":ending conflicts headword", cha_tmp_atom(ending));
/* cut the suffix off by writing a terminator where it started */
154 string[string_len - ending_len] = '\0';
156 return string_len - ending_len;
/*
 * Parse one dictionary entry (a list of attribute elements) into
 * lexies[0].  When the entry has a compound attribute, its components
 * are parsed recursively into the slots that follow lexies[0].
 *
 * `weight' is the default weight for the entry.  Problems are reported
 * through err_msg(); the caller (parse_dic) treats a negative result as
 * failure.
 */
160 parse_lexicon(chasen_cell_t *entry, lexicon_t *lexies, int pos, int weight)
162 chasen_cell_t *cell, *cdr;
/* only the first slot is cleared: the size is one lexicon_t */
165 memset(lexies, 0, sizeof(lexicon_t));
/* defaults applied before any attribute of the entry is read */
168 lexies[0].weight = weight;
169 lexies[0].base = lexies[0].info = "";
/* -1 means not given: the stemming code below only touches reading/pron when the length is >= 0 */
170 lexies[0].reading_len = lexies[0].pron_len = -1;
173 return err_msg("is not list", entry);
/* walk the entry: cell is the current element, cdr the rest of the list */
175 for (cell = cha_car(entry), cdr = cha_cdr(entry); !nullp(cell);
176 cell = cha_car(cdr), cdr = cha_cdr(cdr)) {
181 return err_msg("is not list", entry);
/* pred is the atom string heading the element, val is the element's second item */
182 pred = s_atom_val(cha_car(cell));
183 val = cha_car(cha_cdr(cell));
/* dispatch on the attribute name */
184 if (cha_litmatch(pred, 1, STR_POS)) {
185 lexies[0].pos = cha_get_nhinsi_id(val);
186 } else if (cha_litmatch(pred, 1, STR_WORD)) {
187 stat = parse_headword(val, weight, lexies);
188 } else if (cha_litmatch(pred, 1, STR_READING)) {
/* reading and pron buffers are allowed twice the headword limit */
189 stat = get_string(val, lexies[0].reading, MIDASI_LEN * 2);
190 lexies[0].reading_len = strlen(lexies[0].reading);
191 } else if (cha_litmatch(pred, 1, STR_PRON)) {
192 stat = get_string(val, lexies[0].pron, MIDASI_LEN * 2);
193 lexies[0].pron_len = strlen(lexies[0].pron);
194 } else if (cha_litmatch(pred, 1, STR_BASE)) {
/* base and info keep the pointer returned by s_atom_val(); nothing is copied */
195 lexies[0].base = s_atom_val(val);
196 } else if (cha_litmatch(pred, 1, STR_CTYPE)) {
197 lexies[0].inf_type = cha_get_type_id(s_atom_val(val));
198 } else if (cha_litmatch(pred, 1, STR_CFORM)) {
199 lexies[0].inf_form = cha_get_form_id(s_atom_val(val),
/* the info attribute is accepted under either of two names */
201 } else if (cha_litmatch(pred, 2, STR_INFO1, STR_INFO2)) {
202 lexies[0].info = s_atom_val(val);
203 } else if (cha_litmatch(pred, 1, STR_COMPOUND)) {
204 chasen_cell_t *head, *tail;
/*
 * Components go into the slots after lexies[0], each parsed with a
 * default weight of 0.
 * NOTE(review): no bound on the number of slots is visible here;
 * parse_dic supplies an array of 256 entries -- confirm it cannot be
 * overrun.
 */
205 lexicon_t *lex = lexies + 1;
206 for (head = val, tail = cha_cdr(cha_cdr(cell));
208 head = cha_car(tail), tail = cha_cdr(tail))
209 stat = parse_lexicon(head, lex++, pos, 0);
/*
 * lex[-1] is the slot filled last by the loop above.  An entry that is
 * inflected but has no explicit form must share its conjugation type
 * with that component.
 */
210 if (lexies[0].inf_type > 0 && lexies[0].inf_form == 0 &&
211 lexies[0].inf_type != lex[-1].inf_type)
212 stat = err_msg(": conjugation type is different from that of the compound word", entry);
/* attribute name matched none of the cases above */
214 stat = err_msg("is not defined", cha_car(cell));
/* a result <= 0 from cha_check_table() is treated as failure */
220 if (cha_check_table(lexies) <= 0)
221 return err_msg("is invalid connection", cell);
223 if (lexies[0].inf_type > 0) {
224 if (lexies[0].inf_form == 0) {
/*
 * Inflected entry given without a form: strip the endings of the
 * type's basic form so that only the stem is kept.
 */
226 basic_form = &Cha_form[lexies[0].inf_type]
227 [Cha_type[lexies[0].inf_type].basic];
/* NOTE(review): stat is overwritten by the stem() calls below without being checked in between */
228 stat = stem(lexies[0].headword, basic_form->gobi);
229 if (lexies[0].reading_len >= 0) {
230 stat = stem(lexies[0].reading, basic_form->ygobi);
231 lexies[0].reading_len = strlen(lexies[0].reading);
233 if (lexies[0].pron_len >= 0) {
234 stat = stem(lexies[0].pron, basic_form->pgobi);
235 lexies[0].pron_len = strlen(lexies[0].pron);
237 lexies[0].stem_len = strlen(lexies[0].headword);
/* explicit form: stem_len is set to -1 and a non-empty base form is required */
240 form = &Cha_form[lexies[0].inf_type][lexies[0].inf_form];
241 lexies[0].stem_len = -1;
242 if (!lexies[0].base[0])
243 return err_msg("needs base form",
244 cha_tmp_atom(lexies[0].headword));
/* remaining case: reset type and form to 0; the whole headword is the stem */
248 lexies[0].inf_type = 0;
249 lexies[0].inf_form = 0;
250 lexies[0].stem_len = strlen(lexies[0].headword);
/*
 * Read S-expressions from `input' until end of file.
 *
 * A form headed by an atom is a directive that sets the current part of
 * speech or the default weight.  Any other form is parsed as a lexicon
 * entry with parse_lexicon() and handed to dump_dic() together with
 * `output' and `builder'.
 */
257 parse_dic(FILE *input, FILE *output[], da_build_t *builder)
/* slot 0 holds the entry itself; parse_lexicon puts compound components in the following slots */
260 lexicon_t lexicons[256]; /* XXX */
/* default weight used until a STR_DEF_POS_COST directive changes it */
262 int weight = MRPH_WEIGHT_MAX;
265 while (!cha_s_feof(input)) {
266 cell = cha_s_read(input);
268 return err_msg("is not list", cell);
/* directive: the head of the form is an atom */
269 if (atomp(cha_car(cell))) {
270 char *s = s_atom_val(cha_car(cell));
271 if (cha_litmatch(s, 1, STR_POS))
272 pos = cha_get_nhinsi_id(cha_car(cha_cdr(cell)));
273 else if (cha_litmatch(s, 1, STR_DEF_POS_COST))
/* NOTE(review): atoi() cannot report malformed numbers */
274 weight = atoi(s_atom_val(cha_car(cha_cdr(cell))));
276 stat = err_msg("is not defined", cell);
/* no cell is available for this message, so NULL is passed */
279 stat = err_msg("POS is not specified", NULL);
280 else if (parse_lexicon(cell, lexicons, pos, weight) < 0)
/* entries with both a type and an explicit form get con_tbl advanced by (form - 1) */
283 if (lexicons[0].inf_type > 0 &&
284 lexicons[0].inf_form > 0) {
285 lexicons[0].con_tbl += lexicons[0].inf_form - 1;
287 stat = dump_dic(lexicons, output, builder);
/*
 * Translate a single dictionary source file: open `path' for reading,
 * print the path on stderr, and run parse_dic() over it.
 */
297 translate(char *path, FILE *output[], da_build_t *builder)
302 input = cha_fopen(path, "r", 1);
/* print the path of the file being processed */
303 fprintf(stderr, "%s\n", path);
304 stat = parse_dic(input, output, builder);
/*
 * Translate every dictionary file named in the NULL-terminated array
 * `pathes'.
 *
 * When HAVE_IO_H is defined and the build is not Cygwin, each argument
 * is passed to _findfirst()/_findnext() and every match is translated.
 * Otherwise each argument is passed to translate() unchanged.
 */
311 translate_files(char *pathes[], FILE *output[], da_build_t *builder)
313 #if defined HAVE_IO_H && !defined __CYGWIN__
314 struct _finddata_t fileinfo;
319 fputs("parsing dictionaries...\n", stderr);
321 for (; *pathes != NULL; pathes++) {
322 #if defined HAVE_IO_H && !defined __CYGWIN__
323 handle = _findfirst(*pathes, &fileinfo);
/*
 * NOTE(review): fileinfo.name is passed as is; verify whether it
 * carries the directory part of the pattern, since a bare file name
 * would only open relative to the current directory.
 */
327 if (translate(fileinfo.name, output, builder) < 0)
/* _findnext() returning 0 means another match was found */
330 } while (!_findnext(handle, &fileinfo));
/* other platforms: the argument is used as the file path directly */
333 if (translate(*pathes, output, builder) < 0)
/* print the command-line synopsis on stderr */
346 fputs("usage: makeda [-i encode] output dicfile...\n", stderr);
/*
 * Entry point of the dictionary builder (the usage text names it makeda).
 *
 * Visible steps: record the program path, handle the -i option, load the
 * grammar, conjugation and connection-table data, open the output files
 * derived from `dic_base', translate the dictionary sources, then pass
 * the mapped temporary file and the builder to da_build_dump().
 */
351 main(int argc, char *argv[])
360 cha_set_progpath(argv[0]);
/* "i:" -- the only option is -i, which takes an argument */
363 while ((c = cha_getopt(argv, "i:", stderr)) != EOF) {
/* the -i argument is passed to cha_set_encode() */
366 cha_set_encode(Cha_optarg);
/* load the grammar, conjugation and connection-table data; messages go to stderr */
381 cha_read_grammar(stderr, 1, 2);
382 cha_read_katuyou(stderr, 2);
383 cha_read_table(stderr, 2);
/* `path' is reused for each output name: <dic_base>.da, .dat, .lex, .tmp */
385 snprintf(path, PATH_MAX, "%s.da", dic_base);
386 builder = da_build_new(path);
387 snprintf(path, PATH_MAX, "%s.dat", dic_base);
388 output[0] = cha_fopen(path, "wb", 1);
389 snprintf(path, PATH_MAX, "%s.lex", dic_base);
390 output[1] = cha_fopen(path, "wb", 1);
391 snprintf(path, PATH_MAX, "%s.tmp", dic_base);
392 output[2] = cha_fopen(path, "wb", 1);
/* NOTE(review): assumes argv was advanced so that it now starts at the dictionary file arguments -- confirm */
394 if (translate_files(argv, output, builder) < 0)
/*
 * `path' was last set to the .tmp name above.  Presumably output[2] is
 * flushed or closed before the file is mapped -- TODO confirm.
 */
398 tmpfile = cha_mmap_file(path);
/* the mapped temporary data and the builder are dumped into output[1], the .lex file */
399 da_build_dump(builder, cha_mmap_map(tmpfile), output[1]);
400 cha_munmap_file(tmpfile);