1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
3 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
4 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 /* ----------------------------------------------------------------- */
7 /* The Japanese TTS System "Open JTalk" */
8 /* developed by HTS Working Group */
9 /* http://open-jtalk.sourceforge.net/ */
10 /* ----------------------------------------------------------------- */
12 /* Copyright (c) 2008-2011 Nagoya Institute of Technology */
13 /* Department of Computer Science */
15 /* All rights reserved. */
17 /* Redistribution and use in source and binary forms, with or */
18 /* without modification, are permitted provided that the following */
19 /* conditions are met: */
21 /* - Redistributions of source code must retain the above copyright */
22 /* notice, this list of conditions and the following disclaimer. */
23 /* - Redistributions in binary form must reproduce the above */
24 /* copyright notice, this list of conditions and the following */
25 /* disclaimer in the documentation and/or other materials provided */
26 /* with the distribution. */
27 /* - Neither the name of the HTS working group nor the names of its */
28 /* contributors may be used to endorse or promote products derived */
29 /* from this software without specific prior written permission. */
31 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
32 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
33 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
34 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
35 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
36 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
37 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
38 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
39 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
40 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
41 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
42 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
43 /* POSSIBILITY OF SUCH DAMAGE. */
44 /* ----------------------------------------------------------------- */
51 #include "dictionary_rewriter.h"
52 #include "char_property.h"
54 #include "connector.h"
55 #include "dictionary.h"
// Command-line driver for building MeCab dictionary data. It exposes a single
// static entry point, run(), which parses the command line and then invokes
// the Dictionary, CharProperty and Connector compile steps.
// NOTE(review): the class name is spelled "Complier"; mecab_dict_index refers
// to it by this exact spelling, so it must not be "corrected" in isolation.
// NOTE(review): some statements this code relies on (for example the
// declaration of `param`) are not visible in this excerpt.
63 class DictionaryComplier {
// Parses argc/argv and runs the requested compile steps.
// Returns 0 when help_version() reports false (see below); other return
// paths are not visible in this excerpt.
65 static int run(int argc, char **argv) {
// Option table handed to param.open(). Each entry appears to be
// { long name, short flag, default value, argument placeholder, help text }
// -- TODO confirm against the MeCab::Option declaration.
66 static const MeCab::Option long_options[] = {
// NOTE(review): the help text below reads "dicdi"; this looks like a typo for
// "dicdir" / "dic dir".
67 { "dicdir", 'd', ".", "DIR", "set DIR as dicdi (default \".\")" },
68 { "outdir", 'o', ".", "DIR",
69 "set DIR as output dir (default \".\")" },
70 { "unknown", 'U', 0, 0, "build parameters for unknown words" },
71 { "userdic", 'u', 0, "FILE", "build user dictionary" },
72 { "charcategory", 'C', 0, 0, "build character category maps" },
73 { "matrix", 'm', 0, 0, "build connection matrix" },
// "charset" is registered twice: once with -c and once with -t, the latter
// described as an alias of -c.
74 { "charset", 'c', MECAB_DEFAULT_CHARSET, "ENC",
75 "make charset of binary dictionary ENC (default "
76 MECAB_DEFAULT_CHARSET ")" },
77 { "charset", 't', MECAB_DEFAULT_CHARSET, "ENC", "alias of -c" },
78 { "dictionary-charset", 'f', MECAB_DEFAULT_CHARSET,
79 "ENC", "assume charset of input CSVs as ENC (default "
80 MECAB_DEFAULT_CHARSET ")" },
81 { "wakati", 'w', 0, 0, "build wakati-gaki only dictionary", },
82 { "posid", 'p', 0, 0, "assign Part-of-speech id" },
83 { "node-format", 'F', 0, "STR",
84 "use STR as the user defined node format" },
85 { "version", 'v', 0, 0, "show the version and exit." },
86 { "help", 'h', 0, 0, "show this help and exit." },
// Parse the command line. On failure, print the parser's message, the
// COPYRIGHT text and a hint pointing at --help.
92 if (!param.open(argc, argv, long_options)) {
93 std::cout << param.what() << "\n\n" << COPYRIGHT
94 << "\ntry '--help' for more information." << std::endl;
// Exit with status 0 when help_version() returns false -- presumably because
// it has already printed the help or version text; confirm in Param.
98 if (!param.help_version()) return 0;
// Snapshot the options that steer the rest of the run.
100 const std::string dicdir = param.get<std::string>("dicdir");
101 const std::string outdir = param.get<std::string>("outdir");
102 bool opt_unknown = param.get<bool>("unknown");
103 bool opt_matrix = param.get<bool>("matrix");
104 bool opt_charcategory = param.get<bool>("charcategory");
// NOTE(review): "sysdic" is queried here, but no "sysdic" entry appears in the
// option table visible above -- confirm whether this flag can ever be set
// from the command line.
105 bool opt_sysdic = param.get<bool>("sysdic");
106 const std::string userdic = param.get<std::string>("userdic");
// DCONF(file) / OCONF(file): path of `file` under dicdir / outdir, built with
// create_filename() and exposed as a C string via c_str().
// NOTE(review): if create_filename() returns by value, the pointer is only
// valid until the end of the full expression that uses the macro, so the
// result must not be stored in a variable -- confirm.
108 #define DCONF(file) create_filename(dicdir, std::string(file)).c_str()
109 #define OCONF(file) create_filename(outdir, std::string(file)).c_str()
// Load the DICRC configuration file from dicdir into param; the streamed text
// is the diagnostic attached to the check (CHECK_DIE presumably aborts when
// the condition is false -- confirm).
112 CHECK_DIE(param.load(DCONF(DICRC)))
113 << "no such file or directory: " << DCONF(DICRC);
// List of input dictionary files to compile.
// NOTE(review): as shown here, `dic` is filled by enum_csv_dictionaries() and
// then overwritten with param.rest_args(); a guarding condition is probably
// expected around these two statements -- confirm against the full source.
116 std::vector<std::string> dic;
118 enum_csv_dictionaries(dicdir.c_str(), &dic);
120 dic = param.rest_args();
// User-dictionary mode (a -u/--userdic value was given): require at least one
// input, tag the build as MECAB_USR_DIC and compile.
122 if (!userdic.empty()) {
123 CHECK_DIE(dic.size()) << "no dictionaries are specified";
125 param.set("type", MECAB_USR_DIC);
126 Dictionary::compile(param, dic,
127 DCONF(MATRIX_DEF_FILE),
130 DCONF(RIGHT_ID_FILE),
// If none of the unknown/matrix/charcategory/sysdic flags was set, enable all
// four of them.
135 if (!opt_unknown && !opt_matrix && !opt_charcategory && !opt_sysdic) {
136 opt_unknown = opt_matrix = opt_charcategory = opt_sysdic = true;
// Character property data is compiled when either the character-category or
// the unknown-word step is requested.
139 if (opt_charcategory || opt_unknown) {
140 CharProperty::compile(DCONF(CHAR_PROPERTY_DEF_FILE),
142 OCONF(CHAR_PROPERTY_FILE));
// Unknown-word dictionary: the only input is UNK_DEF_FILE from dicdir (copied
// into a std::string by push_back); output goes to UNK_DIC_FILE in outdir.
146 std::vector<std::string> tmp;
147 tmp.push_back(DCONF(UNK_DEF_FILE));
148 param.set("type", MECAB_UNK_DIC);
149 Dictionary::compile(param, tmp,
150 DCONF(MATRIX_DEF_FILE),
153 DCONF(RIGHT_ID_FILE),
156 OCONF(UNK_DIC_FILE));
// System dictionary: requires at least one input in `dic`; output goes to
// SYS_DIC_FILE in outdir.
160 CHECK_DIE(dic.size()) << "no dictionaries are specified";
161 param.set("type", MECAB_SYS_DIC);
162 Dictionary::compile(param, dic,
163 DCONF(MATRIX_DEF_FILE),
166 DCONF(RIGHT_ID_FILE),
169 OCONF(SYS_DIC_FILE));
// Connection matrix, compiled from MATRIX_DEF_FILE in dicdir.
173 Connector::compile(DCONF(MATRIX_DEF_FILE),
// Final progress message.
178 std::cout << "\ndone!\n";
// Free-function entry point for the dictionary indexer: forwards argc/argv
// unchanged to MeCab::DictionaryComplier::run() and returns its result.
188 int mecab_dict_index(int argc, char **argv) {
189 return MeCab::DictionaryComplier::run(argc, argv);