1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
3 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
4 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
11 #include "char_property.h"
// Category names attached to one code-point range: compile() appends to it
// with r.c.push_back(...) and later hands it to encode() as it->c.
// NOTE(review): the enclosing declaration is not visible in this listing --
// presumably the Range struct that compile() stores in std::vector<Range>.
23 std::vector<std::string> c;
// Parses a "0x"/"0X"-prefixed hexadecimal literal given as a C string.
// compile() uses the result for the low/high bounds of a code-point range.
// NOTE(review): the accumulation, loop and return statements are not part of
// this listing; only the validation and digit-classification lines are shown.
26 int atohex(const char *s) {
// The input must be at least three characters long and start with 0x or 0X;
// otherwise CHECK_DIE fires with the offending string in the message.
29 CHECK_DIE(std::strlen(s) >= 3
30 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
31 << "no hex value: " << s;
// Digit classification: decimal digits, then upper-case A-F, then lower-case
// a-f are accepted.
37 if (*s >= '0' && *s <= '9')
39 else if (*s >= 'A' && *s <= 'F')
41 else if (*s >= 'a' && *s <= 'f')
// Any other character is rejected. The message prints p.
// NOTE(review): p is declared on a line not shown -- presumably a saved copy
// of the original string pointer; confirm.
44 CHECK_DIE(false) << "no hex value: " << p;
// Builds the CharInfo for a character that belongs to the categories named
// in c.
//   c        - category names; must be non-empty, c[0] supplies the base
//              CharInfo.
//   category - map from category name to its CharInfo; every name in c must
//              be present, otherwise CHECK_DIE fires.
// NOTE(review): the return statement is not visible in this listing --
// presumably the function returns `base`.
53 CharInfo encode(const std::vector<std::string> &c,
54 std::map<std::string, CharInfo> *category) {
55 CHECK_DIE(c.size()) << "category size is empty";
// The first listed category provides the starting CharInfo (copied by value).
57 std::map<std::string, CharInfo>::const_iterator it = category->find(c[0]);
58 CHECK_DIE(it != category->end())
59 << "category [" << c[0] << "] is undefined";
61 CharInfo base = it->second;
// The loop starts at 0, so c[0] also contributes its own bit to base.type.
62 for (size_t i = 0; i < c.size(); ++i) {
// NOTE(review): the right-hand side of this initialisation is on a line not
// shown -- presumably category->find(c[i]), given the check that follows.
63 std::map<std::string, CharInfo>::const_iterator it =
65 CHECK_DIE(it != category->end())
66 << "category [" << c[i] << "] is undefined";
// Each category contributes the single bit (1 << default_type) to the type
// field.
// NOTE(review): this uses += rather than |=, so a category name that occurs
// twice in c would be added twice -- confirm callers never pass duplicates.
67 base.type += (1 << it->second.default_type);
// Opens the character-property table that belongs to a dictionary directory.
// Reads the "dicdir" entry from param, combines it with CHAR_PROPERTY_FILE
// through create_filename(), and forwards the resulting path to
// open(const char *); the result of that call is returned unchanged.
// The parameter declarator is spelled `&param`: the previous text had been
// mangled by an HTML-entity decode ("&para" -> pilcrow sign), which left the
// name `param` used on the next line undeclared.
74 bool CharProperty::open(const Param &param) {
75 const std::string prefix = param.get<std::string>("dicdir");
76 const std::string filename = create_filename(prefix, CHAR_PROPERTY_FILE);
77 return open(filename.c_str());
// Maps a compiled character-property file and sets up pointers into it.
//   filename - path of the binary file to open read-only through cmmap_.
// The layout implied by the size check below is: one unsigned int (csize),
// then csize records of 32 bytes each, then 0xffff table entries.
// NOTE(review): several statements (declaration of csize, the loop body's
// remaining lines, the return) are not visible in this listing.
80 bool CharProperty::open(const char *filename) {
// NOTE(review): `error` is not referenced in any line visible here.
81 std::ostringstream error;
82 CHECK_FALSE(cmmap_->open(filename, "r"));
// ptr walks through the mapped bytes; the read helpers take its address.
// NOTE(review): presumably read_static/read_ptr advance ptr -- confirm.
84 const char *ptr = cmmap_->begin();
86 read_static<unsigned int>(&ptr, csize);
// Expected total size for the layout described above.
// NOTE(review): the table part is sized with sizeof(unsigned int), whereas
// compile() writes sizeof(CharInfo) per entry -- this presumably relies on
// the two sizes being equal; confirm.
88 size_t fsize = sizeof(unsigned int) +
89 (32 * csize) + sizeof(unsigned int) * 0xffff;
91 CHECK_FALSE(fsize == cmmap_->size())
92 << "invalid file size: " << filename;
// One 32-byte record per category.
// NOTE(review): presumably each s is stored into clist_, which name() and
// id() read -- the storing line is not shown.
95 for (unsigned int i = 0; i < csize; ++i) {
96 const char *s = read_ptr(&ptr, 32);
// Whatever follows the name records is used in place as the CharInfo table.
100 map_ = reinterpret_cast<const CharInfo *>(ptr);
// Counterpart of open().
// NOTE(review): the body is not visible in this listing -- presumably it
// releases the mapping held by cmmap_; confirm against the full file.
105 void CharProperty::close() {
// Returns the number of entries in clist_, the list that name() indexes and
// id() searches.
109 size_t CharProperty::size() const { return clist_.size(); }
// Returns the i-th entry of clist_ as a C string.
// This function performs no range check of its own on i.
111 const char *CharProperty::name(size_t i) const {
112 return const_cast<const char*>(clist_[i]);
// Sets charset_ to the value decode_charset() returns for the charset name
// ct.
115 // this function must be rewritten.
116 void CharProperty::set_charset(const char *ct) {
117 charset_ = decode_charset(ct);
// Looks up key in clist_ by linear scan, comparing with std::strcmp.
// NOTE(review): the return statements are not visible in this listing --
// presumably the matching index is returned on a hit and a sentinel value
// otherwise; confirm.
120 int CharProperty::id(const char *key) const {
// The index is an int, so the container size is cast to a signed type for
// the comparison.
121 for (int i = 0; i < static_cast<long>(clist_.size()); ++i) {
122 if (std::strcmp(key, clist_[i]) == 0) {
// Compiles the textual character-category definition (cfile) into the binary
// table that open() reads, after cross-checking it against the unknown-word
// definition (ufile); the result is written to ofile.
// NOTE(review): the rest of the signature is on lines not shown; ufile and
// ofile are used below and are presumably the remaining const char *
// parameters. Many statements of the body are likewise absent from this
// listing, so the notes below describe only what the visible lines establish.
129 bool CharProperty::compile(const char *cfile,
// Scratch buffers: one input line and up to 512 column pointers into it.
132 scoped_fixed_array<char, BUF_SIZE> line;
133 scoped_fixed_array<char *, 512> col;
// range        - code-point ranges with their category names, in file order
// category     - category name -> CharInfo
// category_ary - category names in definition order (used for output order)
135 std::vector<Range> range;
136 std::map<std::string, CharInfo> category;
137 std::vector<std::string> category_ary;
// Input comes from cfile; a built-in default definition is also prepared.
// NOTE(review): the line that switches `is` to `iss` when cfile cannot be
// opened is not shown -- presumably guarded by a check on ifs.
138 std::ifstream ifs(WPATH(cfile));
139 std::istringstream iss(CHAR_PROPERTY_DEF_DEFAULT);
140 std::istream *is = &ifs;
144 << " is not found. minimum setting is used" << std::endl;
// ---- pass 1: parse the category definition, line by line ----
148 while (is->getline(line.get(), line.size())) {
// Empty lines and lines starting with '#' are tested for here.
// NOTE(review): the action taken is on a line not shown -- presumably skip.
149 if (std::strlen(line.get()) == 0 || line[0] == '#') {
// Split on tabs and spaces; every meaningful line needs at least 2 columns.
152 const size_t size = tokenize2(line.get(), "\t ", col.get(), col.size());
153 CHECK_DIE(size >= 2) << "format error: " << line.get();
155 // range line, e.g.: 0xFFFF..0xFFFF CATEGORY1 CATEGORY2 ... # comment
156 if (std::strncmp(col[0], "0x", 2) == 0) {
157 std::string low = col[0];
// "low..high" form: split the first column around the "..".
// NOTE(review): the declaration of `high`, and its value when no ".." is
// present, are on lines not shown.
159 size_t pos = low.find("..");
161 if (pos != std::string::npos) {
162 high = low.substr(pos + 2, low.size() - pos - 2);
163 low = low.substr(0, pos);
169 r.low = atohex(low.c_str());
170 r.high = atohex(high.c_str());
// Both bounds must lie in [0, 0xffff); the table built below has exactly
// 0xffff entries, so 0xFFFF itself is rejected.
// NOTE(review): a third condition of this check is on a line not shown.
172 CHECK_DIE(r.low >= 0 && r.low < 0xffff &&
173 r.high >= 0 && r.high < 0xffff &&
175 << "range error: low=" << r.low << " high=" << r.high;
// Remaining columns name the categories of this range; each must already
// have been defined earlier in the file.
177 for (size_t i = 1; i < size; ++i) {
178 if (col[i][0] == '#') {
179 break; // skip comments
181 CHECK_DIE(category.find(std::string(col[i])) != category.end())
182 << "category [" << col[i] << "] is undefined";
183 r.c.push_back(col[i]);
// Category definition line: NAME invoke group length (at least 4 columns).
187 CHECK_DIE(size >= 4) << "format error: " << line.get();
189 std::string key = col[0];
190 CHECK_DIE(category.find(key) == category.end())
191 << "category " << key << " is already defined";
// Start from an all-zero CharInfo, then fill the fields from columns 1..3.
// default_type receives the running counter `id`, i.e. the definition order.
// NOTE(review): the declarations of `c` and `id` are on lines not shown.
194 std::memset(&c, 0, sizeof(c));
195 c.invoke = std::atoi(col[1]);
196 c.group = std::atoi(col[2]);
197 c.length = std::atoi(col[3]);
198 c.default_type = id++;
200 category.insert(std::pair<std::string, CharInfo>(key, c));
201 category_ary.push_back(key);
// ---- sanity checks on the parsed categories ----
// encode() sets bit (1 << default_type) per category.
// NOTE(review): presumably the limit of 18 reflects the width of the type
// field -- confirm against CharInfo.
205 CHECK_DIE(category.size() < 18) << "too many categories(>= 18)";
// DEFAULT and SPACE are mandatory.
207 CHECK_DIE(category.find("DEFAULT") != category.end())
208 << "category [DEFAULT] is undefined";
210 CHECK_DIE(category.find("SPACE") != category.end())
211 << "category [SPACE] is undefined";
// ---- pass 2: cross-check against the unknown-word definition ----
// Same pattern as above: read ufile, with a built-in default prepared.
// NOTE(review): the line that switches `is2` to `iss2` is not shown.
213 std::istringstream iss2(UNK_DEF_DEFAULT);
214 std::ifstream ifs2(WPATH(ufile));
215 std::istream *is2 = &ifs2;
219 << " is not found. minimum setting is used." << std::endl;
// Every first CSV column of ufile must be a category defined in cfile.
// NOTE(review): the insertion of key into `unk` is on a line not shown.
223 std::set<std::string> unk;
224 while (is2->getline(line.get(), line.size())) {
225 const size_t n = tokenizeCSV(line.get(), col.get(), 2);
226 CHECK_DIE(n >= 1) << "format error: " << line.get();
227 const std::string key = col[0];
228 CHECK_DIE(category.find(key) != category.end())
229 << "category [" << key << "] is undefined in " << cfile;
// Conversely, every category from cfile must be found in `unk`.
233 for (std::map<std::string, CharInfo>::const_iterator it = category.begin();
234 it != category.end();
236 CHECK_DIE(unk.find(it->first) != unk.end())
237 << "category [" << it->first << "] is undefined in " << ufile;
// ---- build the lookup table ----
// 0xffff entries, all initialised to the encoding of the DEFAULT category.
240 std::vector<CharInfo> table(0xffff);
242 std::vector<std::string> tmp;
243 tmp.push_back("DEFAULT");
244 const CharInfo c = encode(tmp, &category);
245 std::fill(table.begin(), table.end(), c);
// Then each range, in file order, is encoded and applied to low..high
// inclusive.
// NOTE(review): the statement inside the inner loop is not shown --
// presumably table[i] = c, so later ranges override earlier ones.
248 for (std::vector<Range>::const_iterator it = range.begin();
251 const CharInfo c = encode(it->c, &category);
252 for (int i = it->low; i <= it->high; ++i) {
257 // output binary table
259 std::ofstream ofs(WPATH(ofile), std::ios::binary|std::ios::out);
260 CHECK_DIE(ofs) << "permission denied: " << ofile;
// File layout: category count, one fixed-size zero-padded name record per
// category in definition order, then the raw CharInfo table.
262 unsigned int size = static_cast<unsigned int>(category.size());
263 ofs.write(reinterpret_cast<const char*>(&size), sizeof(size));
264 for (std::vector<std::string>::const_iterator it = category_ary.begin();
265 it != category_ary.end();
// buf is zero-filled first and at most sizeof(buf) - 1 bytes are copied, so
// the record is always NUL-terminated; longer names are truncated.
// NOTE(review): buf's declaration is not shown -- open() reads 32-byte
// records, so it is presumably char[32].
268 std::fill(buf, buf + sizeof(buf), '\0');
269 std::strncpy(buf, it->c_str(), sizeof(buf) - 1);
270 ofs.write(reinterpret_cast<const char*>(buf), sizeof(buf));
272 ofs.write(reinterpret_cast<const char*>(&table[0]),
273 sizeof(CharInfo) * table.size());