2 * makemat.c - make table file and matrix file
4 * $Id: makemat.c,v 1.1 2007/03/13 07:40:10 masayu-a Exp $
11 #define REN_TBL_MAX 5000
12 #define RENSETU_CELL_MAX (8192*4)
14 #define HINSI_ID_MAX USHRT_MAX
16 typedef struct _kankei_t {
21 typedef struct _rensetu_pair2_t {
22 short i_pos; /* the POS index in the current state (= preceding morpheme) */
23 short j_pos; /* the POS index in the input (= current morpheme) */
24 chasen_cell_t *hinsi; /* POS */
25 unsigned char type; /* CTYPE */
26 char *form; /* CFORM */
27 char *goi; /* Lexicalized POS */
30 typedef struct _rensetu_chasen_cell_t {
34 } rensetu_chasen_cell_t;
36 static kankei_t kankei_tbl[CTYPE_MAX];
38 static rensetu_pair_t rensetu_tbl[RENSETU_CELL_MAX];
42 static connect_rule_t **connect_mtr;
43 typedef unsigned char rensetu_mtr_t;
46 * read_kankei - read chasen's kankei file
52 chasen_cell_t *cell1, *cell2;
57 * read only from current directory
59 fp = cha_fopen(CTYPE_FILE, "r", 1);
61 fprintf(stderr, "parsing %s\n", CTYPE_FILE);
63 while (!cha_s_feof(fp)) {
64 cell1 = cha_s_read(fp);
66 hinsi = cha_get_nhinsi_id(cha_car(cell1));
67 cell1 = cha_car(cha_cdr(cell1));
69 while (!nullp(cell2 = cha_car(cell1))) {
70 type = cha_get_type_id(cha_s_atom(cell2));
71 kankei_tbl[j].hinsi = hinsi;
72 kankei_tbl[j].type = type;
75 cha_exit(1, "not enough size for CTYPE_MAX");
76 cell1 = cha_cdr(cell1);
78 kankei_tbl[j].hinsi = HINSI_ID_MAX;
86 get_pair1(chasen_cell_t * cell, rensetu_pair_t * pair)
88 chasen_cell_t *cell_p;
95 if (nullp(cell_p = cha_car(cell)))
97 pair->hinsi = cha_get_nhinsi_id(cell_p);
99 if (nullp(cell_p = cha_car(cell = cha_cdr(cell))))
101 pair->type = cha_get_type_id(cha_s_atom(cell_p));
103 if (nullp(cell_p = cha_car(cell = cha_cdr(cell))))
105 pair->form = cha_get_form_id(cha_s_atom(cell_p), pair->type);
107 if (nullp(cell_p = cha_car(cell = cha_cdr(cell))))
109 pair->goi = cha_strdup(cha_s_atom(cell_p));
116 get_pair2(chasen_cell_t * cell, rensetu_pair2_t * pair)
118 chasen_cell_t *cell_p;
126 if (nullp(cell_p = cha_car(cell)))
129 s = cha_s_atom(cha_car(cell_p));
130 if (cha_litmatch(s, 2, STR_BOS, STR_EOS)) {
134 pair->hinsi = cell_p;
136 if (nullp(cell_p = cha_car(cell = cha_cdr(cell))))
138 pair->type = cha_get_type_id(cha_s_atom(cell_p));
140 if (nullp(cell_p = cha_car(cell = cha_cdr(cell))))
143 if (strcmp(s = cha_s_atom(cell_p), "*"))
144 pair->form = cha_strdup(s);
146 if (nullp(cell_p = cha_car(cell = cha_cdr(cell))))
148 pair->goi = cha_strdup(cha_s_atom(cell_p));
155 match_pair1(rensetu_pair_t * pair1, rensetu_pair_t * pair2)
157 if (pair1->hinsi == pair2->hinsi &&
158 pair1->type == pair2->type &&
159 (!pair2->form || pair1->form == pair2->form) &&
160 !strcmp(pair1->goi, pair2->goi))
170 match_pair2(rensetu_pair2_t * pair, rensetu_pair_t * tbl)
172 if (pair->hinsi == NULL) /* BOS/EOS */
173 return tbl->hinsi == 0;
175 if (cha_match_nhinsi(pair->hinsi, (int) tbl->hinsi) &&
176 (!pair->type || pair->type == tbl->type) &&
179 && !strcmp(pair->form, Cha_form[tbl->type][tbl->form].name)))
180 && (!pair->goi || (tbl->goi && !strcmp(pair->goi, tbl->goi))))
187 * make_rensetu_tbl1 - register hinsi with goi(Lexicalized POS)
190 make_rensetu_tbl1(chasen_cell_t * cell1, int *cnt)
193 rensetu_pair_t r_pair;
194 chasen_cell_t *cell11;
196 for (; !nullp(cell11 = cha_car(cell1)); cell1 = cha_cdr(cell1)) {
197 if (nullp(cha_car(cha_cdr(cha_cdr(cha_cdr(cell11))))))
200 get_pair1(cell11, &r_pair);
202 for (i = 1; i < *cnt; i++)
203 if (match_pair1(&rensetu_tbl[i], &r_pair))
209 for (i = 1; Cha_form[r_pair.type][i].name != NULL; i++) {
210 rensetu_tbl[*cnt].hinsi = r_pair.hinsi;
211 rensetu_tbl[*cnt].type = r_pair.type;
212 rensetu_tbl[*cnt].form = i;
213 rensetu_tbl[*cnt].goi = r_pair.goi;
214 if (++*cnt >= REN_TBL_MAX)
215 cha_exit(1, "not enough size for table");
218 rensetu_tbl[*cnt].hinsi = r_pair.hinsi;
219 rensetu_tbl[*cnt].type = r_pair.type;
220 rensetu_tbl[*cnt].form = r_pair.form;
221 rensetu_tbl[*cnt].goi = r_pair.goi;
222 if (++*cnt >= REN_TBL_MAX)
223 cha_exit(1, "not enough size for table");
229 * make_rensetu_tbl2 - register hinsi
232 make_rensetu_tbl2(int hinsi, int bunrui, int *cnt)
236 if (Cha_hinsi[hinsi].kt == 1) { /* with conjugation */
237 for (i = 0; kankei_tbl[i].hinsi != HINSI_ID_MAX; i++) {
238 if (kankei_tbl[i].hinsi == hinsi) {
239 for (j = 1; Cha_form[kankei_tbl[i].type][j].name != NULL;
241 rensetu_tbl[*cnt].hinsi = hinsi;
242 rensetu_tbl[*cnt].type = kankei_tbl[i].type;
243 rensetu_tbl[*cnt].form = j;
244 rensetu_tbl[*cnt].goi = NULL;
245 if (++*cnt >= REN_TBL_MAX)
246 cha_exit(1, "not enough size for table");
250 } else { /* without conjugation */
251 rensetu_tbl[*cnt].hinsi = hinsi;
252 rensetu_tbl[*cnt].type = 0;
253 rensetu_tbl[*cnt].form = 0;
254 rensetu_tbl[*cnt].goi = NULL;
256 if (++*cnt >= REN_TBL_MAX)
257 cha_exit(1, "not enough size for table");
262 * make_rensetu_tbl - register hinsi into table
265 make_rensetu_tbl(FILE * fp)
269 int tbl_count = 1; /* 0 is for BOS/EOS */
271 /* register POS entries with lexicalization:
272 entries that specify a lexical form (goi) are registered in the table */
273 for (lines = 0; !cha_s_feof(fp); lines++) {
274 for (cell = cha_car(cha_s_read(fp)); !nullp(cell);
275 cell = cha_cdr(cell))
276 make_rensetu_tbl1(cha_car(cell), &tbl_count);
279 /* register POS entries with expanded conjugation:
280 each conjugation form is expanded and registered in the table */
281 for (i = 1; Cha_hinsi[i].name; i++)
282 make_rensetu_tbl2(i, 0, &tbl_count); /* second argument is dummy for compatibility */
285 /* print for check */
286 fprintf(stderr, "table size: %d\n", tbl_num);
292 * variables and functions for rensetu_cell
294 static rensetu_chasen_cell_t rensetu_cell[RENSETU_CELL_MAX];
297 static int new_cell1[RENSETU_CELL_MAX], new_cell2[RENSETU_CELL_MAX];
298 static int new_cell1_num, new_cell2_num;
301 search_rensetu_cell(int tbl, int prev)
305 for (i = 0; i < cell_num; i++)
306 if (rensetu_cell[i].tbl == tbl)
307 if (rensetu_cell[i].prev == prev)
314 * check whether c2 is a suffix of c1
317 match_rensetu_cell_suffix(int c1, int c2)
321 for (n = 0; c2 >= 0; n++) {
322 if (rensetu_cell[c1].tbl != rensetu_cell[c2].tbl)
324 c1 = rensetu_cell[c1].prev;
325 c2 = rensetu_cell[c2].prev;
332 match_rensetu_cell_tbl(int tbl, int *cells)
338 for (i = tbl_num; i < cell_num; i++)
339 if (tbl == rensetu_cell[i].tbl)
348 add_connect_rule(int in, int prev, int cost, int is_last, int *in_cells,
351 int cur, next, *curp, *cellp;
352 int suffix_len, suffix_len_max;
354 next = 0; /* to avoid warning */
355 match_rensetu_cell_tbl(rensetu_cell[prev].tbl, cur_cells);
358 * search the cells for cur (the current state)
360 for (curp = cur_cells; (cur = *curp) >= 0; curp++) {
362 * ok if prev is a suffix of cur
364 if (!match_rensetu_cell_suffix(cur, prev))
367 * do not overwrite an existing rule unless this is the last POS
369 if (!is_last && connect_mtr[cur][in].cost)
373 * search the cells for next (the next state)
375 for (cellp = in_cells; *cellp >= 0; cellp++) {
377 * find the longest suffix of cur+in
380 match_rensetu_cell_suffix(cur,
381 rensetu_cell[*cellp].prev) + 1;
382 if (suffix_len_max < suffix_len) {
383 suffix_len_max = suffix_len;
388 if (suffix_len_max > 1) {
389 printf("suffix_len:%d,prev:%d,cur:%d,in:%d,next:%d,cost:%d\n",
390 suffix_len_max, prev, cur, in, next, cost);
396 if (suffix_len_max) {
397 connect_mtr[cur][in].next = next - in;
398 connect_mtr[cur][in].cost = cost < 0 ? 0 : cost + 1;
407 read_rensetu(FILE * fp, int lines)
409 chasen_cell_t **rule;
411 rensetu_pair2_t pair;
412 chasen_cell_t *cell, *cell1;
413 int rule_len_max, rlen;
414 int prev, in, c1, ln, linenum, linecnt;
416 int *in_cells, *cur_cells;
419 rule = (chasen_cell_t **) cha_malloc(sizeof(chasen_cell_t *) * lines);
420 rule_len = (int *) cha_malloc(sizeof(int) * lines);
422 fputs("lines: ", stderr);
424 * initialize rensetu_cell
426 if (cell_num >= RENSETU_CELL_MAX)
427 cha_exit(1, "not enough size for cell");
428 for (cell_num = 0; cell_num < tbl_num; cell_num++) {
429 rensetu_cell[cell_num].tbl = cell_num;
430 rensetu_cell[cell_num].prev = -1;
434 for (ln = 0; !cha_s_feof(fp); ln++) {
435 rule[ln] = cha_s_read(fp);
436 if ((ln % 500) == 0) {
442 * find the longest rule
444 rule_len[ln] = cha_s_length(cha_car(rule[ln]));
445 if (rule_len[ln] < 2)
446 cha_exit_file(1, "too few morphemes");
447 if (rule_len_max < rule_len[ln])
448 rule_len_max = rule_len[ln];
451 * new_cell2: the rensetu_cells registered for each POS
453 new_cell2[0] = -1; /* ʸƬ¡¦Ê¸Ëö */
459 * cell: list of POS groups
461 for (cell = cha_car(rule[ln]); !nullp(cha_cdr(cell));
462 cell = cha_cdr(cell)) {
464 * copy new_cell2 to new_cell1
466 memcpy(new_cell1, new_cell2, sizeof(int) * new_cell2_num);
467 new_cell1_num = new_cell2_num;
472 for (cell1 = cha_car(cell); !nullp(cell1);
473 cell1 = cha_cdr(cell1)) {
476 * pair: POS with wildcards
478 get_pair2(cha_car(cell1), &pair);
480 * take each tbl entry (one POS at a time) covered by pair and process it
482 for (tbl = 0; tbl < tbl_num; tbl++) {
483 if (!match_pair2(&pair, &rensetu_tbl[tbl]))
486 * c1, prev: cells registered for the previous POS
488 for (c1 = 0; c1 < new_cell1_num; c1++) {
489 int prev = new_cell1[c1], cellno;
490 if ((cellno = search_rensetu_cell(tbl, prev)) < 0) {
492 if (++cell_num >= RENSETU_CELL_MAX)
494 "not enough size for cell");
495 rensetu_cell[cellno].tbl = tbl;
496 rensetu_cell[cellno].prev = prev;
498 printf("cellno:%d,tbl:%d,prev:%d\n", cellno,
502 new_cell2[new_cell2_num++] = cellno;
509 fprintf(stderr, " %d\n", ln);
510 fprintf(stderr, "number of states: %d\n", cell_num);
513 (connect_rule_t *) cha_malloc(sizeof(connect_rule_t) * cell_num *
515 memset(ptr, 0, sizeof(connect_rule_t) * cell_num * tbl_num);
517 (connect_rule_t **) cha_malloc(sizeof(connect_rule_t *) *
519 for (c1 = 0; c1 < cell_num; c1++)
520 connect_mtr[c1] = ptr + c1 * tbl_num;
522 in_cells = cha_malloc(sizeof(int) * cell_num);
523 cur_cells = cha_malloc(sizeof(int) * cell_num);
531 for (rlen = 2; rlen <= rule_len_max; rlen++) {
532 /* fprintf(stderr, rlen == 2 ? "bi%s" : rlen == 3 ? "tri%s" : "%d%s",
535 fprintf(stderr, rlen == 2 ? "bi%s" : rlen == 3 ? "tri%s" : "%s",
538 fprintf(stderr, "%d-gram: ", rlen);
540 for (ln = 0; ln < linenum; ln++) {
541 if (rule_len[ln] != rlen)
543 Cha_lineno_error = Cha_lineno = ln + 1;
545 printf("Line: %d/%d\n", ln + 1, linenum);
547 if ((++linecnt % 500) == 0) {
549 if ((linecnt % 20000) == 0)
550 fprintf(stderr, " %d\n", linecnt);
554 cell = cha_car(cha_cdr(rule[ln]));
555 cost = nullp(cell) ? DEFAULT_C_WEIGHT : atoi(cha_s_atom(cell));
558 * new_cell2: the rensetu_cells registered for each POS
560 new_cell2[0] = -1; /* ʸƬ¡¦Ê¸Ëö */
563 * cell: list of POS groups
565 for (cell = cha_car(rule[ln]); !is_last; cell = cha_cdr(cell)) {
566 is_last = nullp(cha_cdr(cell));
568 * copy new_cell2 to new_cell1
570 memcpy(new_cell1, new_cell2, sizeof(int) * new_cell2_num);
571 new_cell1_num = new_cell2_num;
576 for (cell1 = cha_car(cell); !nullp(cell1);
577 cell1 = cha_cdr(cell1)) {
579 * pair: POS with wildcards
581 get_pair2(cha_car(cell1), &pair);
583 * take each in entry (one POS at a time) covered by pair and process it
585 for (in = 0; in < tbl_num; in++) {
586 if (!match_pair2(&pair, &rensetu_tbl[in]))
588 match_rensetu_cell_tbl(in, in_cells);
590 * c1, prev: cells registered for the previous POS
592 for (c1 = 0; c1 < new_cell1_num; c1++) {
593 prev = new_cell1[c1];
595 int cellno = search_rensetu_cell(in, prev);
596 new_cell2[new_cell2_num++] = cellno;
601 add_connect_rule(in, prev, cost, is_last,
602 in_cells, cur_cells);
608 printf(" %d\n", linecnt);
613 compare_vector1(int k, int j, int num)
617 for (i = 0; i < num; i++)
618 if (connect_mtr[i][k].next != connect_mtr[i][j].next ||
619 connect_mtr[i][k].cost != connect_mtr[i][j].cost)
626 copy_vector1(int j, int j_n, int num)
630 for (i = 0; i < num; i++) {
631 connect_mtr[i][j_n].next = connect_mtr[i][j].next;
632 connect_mtr[i][j_n].cost = connect_mtr[i][j].cost;
637 compare_vector2(int k, int i, int num)
641 for (j = 0; j < num; j++)
642 if (connect_mtr[i][j].next != connect_mtr[k][j].next ||
643 connect_mtr[i][j].cost != connect_mtr[k][j].cost)
650 copy_vector2(int i, int i_n, int num)
654 for (j = 0; j < num; j++) {
655 connect_mtr[i_n][j].next = connect_mtr[i][j].next;
656 connect_mtr[i_n][j].cost = connect_mtr[i][j].cost;
664 condense_matrix(void)
670 fprintf(stderr, "matrix size: %dx%d", cell_num, tbl_num);
672 for (j = 0; j < tbl_num; j++) {
675 for (k = 0; k < j_n; k++) {
676 if (compare_vector1(k, j, cell_num)) {
677 rensetu_tbl[j].j_pos = k;
684 copy_vector1(j, j_n, cell_num);
685 rensetu_tbl[j].j_pos = j_n++;
690 for (i = 0; i < cell_num; i++) {
693 for (k = 0; k < i_n; k++) {
694 if (compare_vector2(k, i, j_num)) {
695 rensetu_tbl[i].i_pos = k;
702 copy_vector2(i, i_n, j_num);
703 rensetu_tbl[i].i_pos = i_n++;
711 fprintf(stderr, " -> %dx%d\n", i_num, j_num);
715 * write_table, write_matrix
724 fp = cha_fopen(TABLE_FILE, "w", 1);
725 fprintf(fp, "%d\n", cell_num);
726 for (i = 0, tbl = &rensetu_tbl[0]; i < tbl_num; i++, tbl++) {
730 fprintf(fp, "%s %s %s %s\n",
731 Cha_hinsi[tbl->hinsi].name ?
732 Cha_hinsi[tbl->hinsi].name : "(null)",
733 tbl->type ? Cha_type[tbl->type].name : "",
734 tbl->form ? Cha_form[tbl->type][tbl->form].name : "",
735 tbl->goi ? tbl->goi : "");
739 fprintf(fp, "%d %d %d %d %d %s\n",
740 tbl->i_pos, tbl->j_pos, tbl->hinsi,
741 tbl->type, tbl->form, tbl->goi ? tbl->goi : "*");
743 for (; i < cell_num; i++, tbl++)
744 fprintf(fp, ";\n%d -1 0 0 0 *\n", tbl->i_pos);
755 fp = cha_fopen(MATRIX_FILE, "w", 1);
756 fprintf(fp, "%d %d\n", i_num, j_num);
758 for (i = 0; i < i_num; i++) {
760 int next0 = connect_mtr[i][0].next;
761 int cost0 = connect_mtr[i][0].cost;
762 for (j = 0; j < j_num; j++) {
763 if (connect_mtr[i][j].next == next0 &&
764 connect_mtr[i][j].cost == cost0) {
767 if (next0 == 0 && cost0 == 0)
768 fprintf(fp, "o%d ", nval);
770 fprintf(fp, "%d,%d ", next0, cost0);
772 fprintf(fp, "%d,%dx%d ", next0, cost0, nval);
774 next0 = connect_mtr[i][j].next;
775 cost0 = connect_mtr[i][j].cost;
779 if (next0 == 0 && cost0 == 0)
780 fprintf(fp, "o%d ", nval);
782 fprintf(fp, "%d,%d ", next0, cost0);
784 fprintf(fp, "%d,%dx%d ", next0, cost0, nval);
795 main(int argc, char *argv[])
802 cha_set_progpath(argv[0]);
805 while ((c = cha_getopt(argv, "i:", stderr)) != EOF) {
808 cha_set_encode(Cha_optarg);
815 con_filename = CONNECT_FILE;
817 con_filename = argv[0];
820 * there is no need to read .chasenrc
824 * grammar, conjugation, and relation (kankei) files
826 cha_read_grammar(stderr, 0, 0);
827 cha_read_katuyou(stderr, 0);
831 * open the connection rule file
833 fpc = cha_fopen(con_filename, "r", 1);
836 * process the connection rule file
838 fprintf(stderr, "parsing %s\n", con_filename);
839 cha_set_skip_char('#');
840 lines = make_rensetu_tbl(fpc);
843 read_rensetu(fpc, lines);
857 /*---------------------------------------------------
859 Memo for connection matrix.
863 current state : the preceding morpheme state number
864 input: the current morpheme state number
865 output: the next state number & connection cost
870 +-----------+ +------------+
871 | the cur |---->| the prec |
872 | mrph state| | mrph state | i_pos in rensetu_pair_t
873 +-----------+ +============+
877 | the next | lib/chadic.h
878 | state | typedef struct _connect_rule_t {
879 +------------+ unsigned short next;
880 | connection | unsigned short cost;
881 | cost | } connect_rule_t;
886 接続助詞 ながら [POS name] [lexicalized POS]
887 3 3 64 0 0 ながら i_pos j_pos hinsi type form goi
892 0,394 0,8001 0,3430 0,8001 0,3766 0,8001x2 0,3094 ....
894 Hcolumn: current state = preceding morpheme (cell_num)
895 Vcolumn: input = current morpheme (tbl_num)
897 the entry X,Yx2 means as follows:
899 X = next state number (nonzero for tri-gram context, zero for bi-gram context)
902 `x2' means the same entry is repeated for two consecutive inputs (run-length compressed notation)