2 * Copyright (c) 2003 Nara Institute of Science and Technology
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * $Id: chasen.c,v 1.1.1.1 2007/03/13 07:40:10 masayu-a Exp $
44 #define isatty(handle) _isatty(handle)
46 #endif /* HAVE_UNISTD_H */
/* Output path taken from the command line; while it is NULL, main() writes to stdout. */
50 static char *output_file = NULL;
/* NOTE(review): presumably switched on by the -s (partial analyzing mode) option; the assignment is not visible in this listing - confirm. */
51 static int is_partial = 0;
/* Product name and program name; both are spliced into the usage text as separate table entries. */
53 #define CHA_NAME "ChaSen"
54 #define CHA_PROG "chasen"
/*
 * opt_form_usage() - help text describing the conversion characters of the
 * -F option.
 *
 * fp: stream supplied by the caller (the -Fh handler passes stdout).
 *
 * The text is kept as a table of string fragments; the loop at the end walks
 * the table until it reaches a null entry.
 * NOTE(review): the terminating entry and the loop body are on lines absent
 * from this listing - presumably each fragment is written to fp; confirm.
 */
60 opt_form_usage(FILE * fp)
62 static char *message[] = {
63 "Conversion characters of -F option:\n",
64 " %m surface form (inflected form)\n",
65 " %M surface form (base form)\n",
66 " %y,%y1 first candidate of reading (inflected form)\n",
67 " %Y,%Y1 first candidate of reading (base form)\n",
68 " %y0 reading (inflected form)\n",
69 " %Y0 reading (base form)\n",
70 " %a,%a1 first candidate of pronunciation (inflected form)\n",
71 " %A,%A1 first candidate of pronunciation (base form)\n",
72 " %a0 pronunciation (inflected form)\n",
73 " %A0 pronunciation (base form)\n",
74 " %rABC surface form with ruby (the format is \"AkanjiBkanaC\")\n",
75 " %i,%i1 first candidate of semantic information\n",
76 " %i0 semantic information\n",
77 " %Ic semantic information (if NIL, print character 'c'.)\n",
78 " %Pc part of speech separated by character 'c'\n",
79 " %Pnc part of speech separated by character 'c'\n",
80 " %h part of speech (code)\n",
81 " %H part of speech (name)\n",
82 " %Hn the part of speech (name) at the n-th layer\n",
83 " (if NIL, the part of speech at the most specific layer)\n",
84 " %b sub-part of speech (code)\n",
85 " %BB sub-part of speech (name)(if NIL, print part of speech)\n",
86 " %Bc sub-part of speech (name)(if NIL, print character 'c')\n",
87 " %t inflection type (code)\n",
88 " %Tc inflection type (name)(if NIL, print character 'c')\n",
89 " %f inflected form (code)\n",
90 " %Fc inflected form (name)(if NIL, print character 'c')\n",
91 " %c cost value of the morpheme\n",
92 " %S the input sentence\n",
93 " %pb if the best path, '*', otherwise, ' '\n",
94 " %pi the index of the path of the output lattice\n",
95 " %ps the starting position of the morpheme\n",
96 " at the path of the output lattice\n",
97 " %pe the ending position of the morpheme\n",
98 " at the path of the output lattice\n",
99 " %pc the cost of the path of the output lattice\n",
100 " %ppiC the indices of the preceding paths,\n",
101 " concatenated with the character 'C'\n",
102 " %ppcC the costs of the preceding paths,\n",
103 " concatenated with the character 'C'\n",
105 " if sub-part of speech exists, STR1, otherwise, STR2\n",
107 " unless the semantic information is NIL and \"\", STR1,\n",
108 " otherwise, STR2\n",
110 " if conjugative, STR1, otherwise, STR2\n",
112 " same as %?T/STR1/STR2/\n",
114 " if unknown word, STR1, otherwise, STR2\n",
116 " if unknown word, \"UNKNOWN\", otherwise, STR\n",
118 " . specify the field width\n",
119 " - specify the field width\n",
120 " 1-9 specify the field width\n",
121 " \\n carriage return\n",
123 " \\\\ back slash\n",
124 " \\' single quotation mark\n",
125 " \\\" double quotation mark\n",
128 " \"%m \" split words by space (wakachi-gaki)\n",
129 " \"%y\" Kanji to Kana conversion\n",
130 " \"%r ()\" print surface form with ruby as \"kanji(kana)\"\n",
131 " \"%m\\t%y\\t%M\\t%U(%P-)\\t%T \\t%F \\n\" same as -f option (default)\n",
132 " \"%m\\t%U(%y)\\t%M\\t%P- %h %T* %t %F* %f\\n\" same as -e option\n",
135 " If the format ends with `\\n' then outputs `EOS',\n",
136 " otherwise outputs newline every sentence.\n",
142 for (mes = message; *mes; mes++)
/*
 * Command-line usage text, stored as a table of string fragments.
 * CHA_PROG, CHA_NAME and RCPATH are separate table entries, so the program
 * name, product name and default rc-file path appear in the message wherever
 * those entries sit between the literal fragments.
 * The loop at the end walks the table until it reaches a null entry.
 * NOTE(review): the enclosing function header, the terminating entry and the
 * loop body are on lines absent from this listing - confirm against the
 * full file.
 */
152 static char *message[] = {
153 "Usage: ", CHA_PROG, " [options] [file...]\n",
154 " -s partial analyzing mode\n",
155 " (how to print ambiguous results)\n",
156 " -b show the best path (default)\n",
157 " -m show all morphemes\n",
158 " -p show all paths\n",
159 " (output format)\n",
160 " -f show formatted morpheme data (default)\n",
161 " -e show entire morpheme data\n",
162 " -c show coded morpheme data\n",
163 " -d show detailed morpheme data for Prolog\n",
164 " -v show detailed morpheme data for VisualMorphs\n",
165 " -F format show morpheme with formatted output\n",
166 " -Fh print help of -F option\n",
167 " (miscellaneous)\n",
168 " -i encoding character encoding.\n",
169 " e: EUC-JP, s: Shift_JIS, ",
170 "w: UTF-8, a: ISO-8859-1\n",
171 " -j Japanese sentence mode\n",
172 " -o file write output to `file'\n",
173 " -w width specify the cost width\n",
174 " -C use command mode\n",
175 " -r rc-file use rc-file as a ", CHA_PROG,
176 "rc file other than the default\n",
177 " -R with -D, do not read ", CHA_PROG,
178 "rc file, without -D, read the\n",
179 " default chasenrc file `", RCPATH, "'\n",
180 " -L lang specify languages\n",
181 " -O[c|s] output with compound words or their segments\n",
182 " -lp print the list of parts of speech\n",
183 " -lt print the list of conjugation types\n",
184 " -lf print the list of conjugation forms\n",
185 " -h print this help\n",
186 " -V print ", CHA_NAME, " version number\n",
193 for (mes = message; *mes; mes++)
/*
 * getopt_argv() - handle the command-line options owned by this front end.
 *
 * argv: argument vector handed to cha_getopt_chasen().
 *
 * Options are fetched one at a time from cha_getopt_chasen(argv, stderr)
 * until it returns EOF. In the visible cases:
 *   - 'r' passes the option argument to cha_set_rcpath();
 *   - one case stores the option argument in output_file (its label is not
 *     visible - presumably 'o', matching "-o file" in the usage text);
 *   - an option argument of exactly "h" prints the -F help on stdout;
 *   - a hint pointing at -h is written to stderr.
 * NOTE(review): the remaining case labels, the 'R' body, the arguments of
 * the final fprintf and the return statements are on lines absent from this
 * listing.
 */
201 getopt_argv(char **argv)
206 while ((c = cha_getopt_chasen(argv, stderr)) != EOF) {
211 case 'r': /* chasenrc file */
212 cha_set_rcpath(Cha_optarg);
214 case 'R': /* don't read chasenrc file */
218 output_file = Cha_optarg;
222 * -Fh: print help of -F
224 if (Cha_optarg[0] == 'h' && Cha_optarg[1] == '\0') {
225 opt_form_usage(stdout);
236 fprintf(stderr, "Try `%s -h' for more information.\n",
/*
 * Hands one input stream and one output stream to the parser entry points.
 *
 * ifp: input stream; ofp: output stream.
 *
 * istty is true only when ofp is stdout and stdout is attached to a
 * terminal. Two parse paths are visible: a single call to
 * chasen_parse_segments(ifp, ofp), or chasen_fparse(ifp, ofp) repeated until
 * it returns non-zero.
 * NOTE(review): the condition choosing between the two paths, the body of
 * the while loop and the use of istty are on lines absent from this listing;
 * the selector is presumably is_partial - confirm.
 */
244 * do_chasen_standalone()
247 do_chasen_standalone(FILE * ifp, FILE * ofp)
252 * output: whether `stdout' or not
254 istty = ofp == stdout && isatty(fileno(stdout));
257 chasen_parse_segments(ifp, ofp);
259 while (!chasen_fparse(ifp, ofp))
/*
 * Processes the command line and then runs do_chasen_standalone() over the
 * inputs.
 *
 * argv: argument vector; output: stream that receives the results.
 *
 * chasen_getopt_argv(argv, stderr) is called first and its result is tested.
 * Input then comes either from stdin, or from each remaining argv entry in
 * turn, opened for reading with cha_fopen(*argv, "r", 1).
 * NOTE(review): what happens on a non-zero chasen_getopt_argv() result, and
 * the condition choosing stdin over the file loop, are on lines absent from
 * this listing.
 * NOTE(review): the FILE * returned by cha_fopen() is passed straight through
 * and never stored here, so this function cannot close it; check that it is
 * closed elsewhere, otherwise one stream is leaked per input file.
 */
266 * chasen_standalone()
271 chasen_standalone(char **argv, FILE * output)
276 if (chasen_getopt_argv(argv, stderr))
281 do_chasen_standalone(stdin, output);
283 for (; *argv; argv++)
284 do_chasen_standalone(cha_fopen(*argv, "r", 1), output);
/*
 * Program entry point.
 *
 * Records argv[0] through cha_set_progpath(), chooses the output stream
 * (output_file opened for writing via cha_fopen() when it is set, stdout
 * otherwise), and keeps the result of chasen_standalone(argv, output) in rc.
 * NOTE(review): the statements between cha_set_progpath() and the output
 * selection, the action taken when output is not stdout (presumably closing
 * it) and the return statement are on lines absent from this listing.
 */
293 main(int argc, char *argv[])
298 cha_set_progpath(argv[0]);
302 output = output_file ? cha_fopen(output_file, "w", 1) : stdout;
303 rc = chasen_standalone(argv, output);
304 if (output != stdout)