1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
4 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
14 #include "learner_tagger.h"
16 #include "feature_index.h"
17 #include "string_buffer.h"
// Returns log(f1 / f2) minus VERY_SMALL_LOGPROB.
// NOTE(review): neither f1 == 0 nor f2 == 0 is guarded here — confirm that
// callers never pass them.
20 double toLogProb(double f1, double f2) {
21 return std::log(1.0 * f1 / f2) - VERY_SMALL_LOGPROB; // avoid 0
// Expands to the C string of create_filename(dicdir, file). The expansion
// refers to a variable named `dicdir`, which must be in scope at the use site.
// NOTE(review): the pointer is taken from the call result directly, so it is
// presumably only valid inside the enclosing full expression — confirm that
// create_filename returns by value.
27 #define DCONF(file) create_filename(dicdir, std::string(file)).c_str()
// Trainer entry point that reads the "em-hmm" and "bos-feature" options
// (presumably HMMLearner::run — the enclosing class is not visible here).
// Accumulates `transition` and `emission` counts from the corpus file and
// from the CSV dictionaries found under dicdir, writes them as toLogProb()
// values on lines tagged "B:" / "U:", and finally calls
// EncoderFeatureIndex::convert on the text model.
// Expects exactly two positional arguments: corpus and model.
31 static int run(Param *param) {
32 DictionaryRewriter rewrite;
35 const std::string dicdir = param->get<std::string>("dicdir");
36 CHECK_DIE(param->load(DCONF(DICRC)))
37 << "no such file or directory: " << DCONF(DICRC);
39 CHECK_DIE(rewrite.open(DCONF(REWRITE_FILE)))
40 << "no such file or directory: " << DCONF(REWRITE_FILE);
42 const std::vector<std::string> files = param->rest_args();
43 if (files.size() != 2) {
44 std::cout << "Usage: " <<
45 param->program_name() << " corpus model" << std::endl;
49 const std::string ifile = files[0];
50 const std::string model = files[1];
52 const bool text_only = param->get<bool>("text-only");
53 const bool em_hmm = param->get<bool>("em-hmm");
54 const std::string bos_feature = param->get<std::string>("bos-feature");
56 CHECK_DIE(!bos_feature.empty()) << "bos-feature is empty";
60 std::string word, feature;
61 std::string ufeature, lfeature, rfeature;
62 std::string plfeature, prfeature;
63 std::map<std::string, std::map<std::string, double> > emission;
64 std::map<std::string, std::map<std::string, double> > transition;
68 std::ifstream ifs(ifile.c_str());
69 CHECK_DIE(ifs) << "no such file or directory: " << ifile;
71 while (ifs.getline(line, sizeof(line))) {
72 if (std::strcmp("EOS", line) == 0) {
73 if (++size % 100 == 0)
74 std::cout << size << "... " << std::flush;
78 CHECK_DIE(tokenize(line, "\t", col, 4) == 4)
// Column 0 is the record type and must be either "B" or "U". The two
// comparisons are alternatives: with && the condition could never be true
// and every input line would abort.
80 CHECK_DIE(std::strcmp("B", col[0]) == 0 ||
81 std::strcmp("U", col[0]) == 0)
83 if (col[0][0] == 'B') { // bigram
85 CHECK_DIE(rewrite.rewrite(feature,
92 CHECK_DIE(rewrite.rewrite(feature,
98 transition[prfeature][plfeature] += std::atof(col[3]);
101 CHECK_DIE(rewrite.rewrite(feature,
106 std::strncpy(line, ufeature.c_str(), sizeof(line));
107 size_t n = tokenize2(line, "\t ", col, 2);
108 CHECK_DIE(n == 2) << "format error in rewrite.def: " << ufeature;
111 emission[ufeature][word] += atof(col[3]);
116 std::ifstream ifs(ifile.c_str());
117 CHECK_DIE(ifs) << "no such file or directory: " << ifile;
119 CHECK_DIE(rewrite.rewrite(bos_feature,
122 &prfeature)) << "rewrite failed";
125 while (ifs.getline(line, sizeof(line))) {
126 if (std::strcmp("EOS", line) == 0) {
127 if (++size % 100 == 0)
128 std::cout << size << "... " << std::flush;
129 feature = bos_feature;
131 CHECK_DIE(tokenize(line, "\t", col, 2) == 2)
136 CHECK_DIE(rewrite.rewrite(feature,
142 std::strncpy(line, ufeature.c_str(), sizeof(line));
143 // unigram rule must contain ' '
144 const size_t n = tokenize2(line, "\t ", col, 2);
145 CHECK_DIE(n == 2) << "format error in rewrite.def: " << ufeature;
148 transition[prfeature][lfeature] += 1.0;
149 emission[ufeature][word] += 1.0;
150 plfeature = lfeature;
151 prfeature = rfeature;
157 std::vector<std::string> dic;
158 enum_csv_dictionaries(dicdir.c_str(), &dic);
160 const double freq = param->get<double>("default-emission-freq");
161 CHECK_DIE(freq >= 0.0) << " default-emission-freq must be >= 0 "
164 for (std::vector<std::string>::const_iterator it = dic.begin();
165 it != dic.end(); ++it) {
166 std::cout << "reading " << *it << " ... " << std::flush;
168 std::ifstream ifs(it->c_str());
169 CHECK_DIE(ifs) << "no such file or directory: " << *it;
171 while (ifs.getline(line, sizeof(line))) {
172 CHECK_DIE(tokenizeCSV(line, col, 5) == 5) << "format error";
174 CHECK_DIE(rewrite.rewrite(feature,
177 &rfeature)) << "rewrite failed";
178 std::strncpy(line, ufeature.c_str(), sizeof(line));
179 const size_t n = tokenize2(line, "\t ", col, 2);
180 CHECK_DIE(n == 2) << "format error: " << ufeature;
183 emission[ufeature][word] += freq;
186 std::cout << std::endl;
191 std::cout << std::endl;
192 std::string txtfile = model;
// Open the text model for writing. On failure report the path that was
// actually opened (txtfile) rather than the model argument.
195 std::ofstream ofs(txtfile.c_str());
196 CHECK_DIE(ofs) << "permission denied: " << txtfile;
198 ofs.setf(std::ios::fixed, std::ios::floatfield);
202 for (std::map<std::string, std::map<std::string, double> >
204 it = transition.begin();
205 it != transition.end(); ++it) {
207 for (std::map<std::string, double>::
208 const_iterator it2 = it->second.begin();
209 it2 != it->second.end(); ++it2) {
213 for (std::map<std::string, double>
214 ::const_iterator it2 = it->second.begin();
215 it2 != it->second.end(); ++it2)
216 ofs << toLogProb(it2->second, freq) << '\t'
217 << 'B' << ':' << it->first << '/' << it2->first << std::endl;
221 for (std::map<std::string, std::map<std::string, double> >
223 it = emission.begin();
224 it != emission.end(); ++it) {
226 for (std::map<std::string, double>
227 ::const_iterator it2 = it->second.begin();
228 it2 != it->second.end(); ++it2)
231 for (std::map<std::string, double>
232 ::const_iterator it2 = it->second.begin();
233 it2 != it->second.end(); ++it2) {
234 std::string w = it2->first;
235 CHECK_DIE(escape_csv_element(&w));
236 ofs << toLogProb(it2->second, freq) << '\t'
237 << 'U' << ':' << it->first << ' ' << w << std::endl;
// Convert the text model (txtfile) into the final model file. No L-BFGS step
// is involved in this call, so report the feature index's own error text,
// as the other trainers in this file do for the same call.
244 EncoderFeatureIndex feature_index;
245 CHECK_DIE(feature_index.convert(txtfile.c_str(), model.c_str()))
246 << feature_index.what();
249 std::cout << "Done!" << std::endl;
// Online-learning trainer entry point (presumably OLLearner::run — the
// enclosing class is not visible here). Repeatedly reads the corpus file
// into a single EncoderLearnerTagger, calls online_update() and adjusts
// `alpha` with the update labelled "Passive Aggressive I" in the body, then
// saves the feature index and converts it to the final model.
// Expects exactly two positional arguments: corpus and model.
258 static int run(Param *param) {
259 const std::string dicdir = param->get<std::string>("dicdir");
260 CHECK_DIE(param->load(DCONF(DICRC)))
261 << "no such file or directory: " << DCONF(DICRC);
263 const std::vector<std::string> files = param->rest_args();
264 if (files.size() != 2) {
265 std::cout << "Usage: " <<
266 param->program_name() << " corpus model" << std::endl;
270 const std::string ifile = files[0];
271 const std::string model = files[1];
273 const double C = param->get<double>("cost");
274 const bool text_only = param->get<bool>("text-only");
275 const size_t eval_size = param->get<size_t>("eval-size");
276 const size_t unk_eval_size = param->get<size_t>("unk-eval-size");
277 const size_t iter = param->get<size_t>("iteration");
278 const size_t freq = param->get<size_t>("freq");
280 EncoderFeatureIndex feature_index;
281 LearnerTokenizer tokenizer;
282 FreeList<LearnerPath> path_freelist(PATH_FREELIST_SIZE);
283 std::vector<double> expected;
284 std::vector<double> observed;
285 std::vector<double> alpha;
287 std::cout.setf(std::ios::fixed, std::ios::floatfield);
288 std::cout.precision(5);
291 CHECK_DIE(C > 0) << "cost parameter is out of range: " << C;
292 CHECK_DIE(eval_size > 0) << "eval-size is out of range: " << eval_size;
293 CHECK_DIE(unk_eval_size > 0) <<
294 "unk-eval-size is out of range: " << unk_eval_size;
295 CHECK_DIE(tokenizer.open(*param)) << tokenizer.what();
296 CHECK_DIE(feature_index.open(*param)) << feature_index.what();
// The "iteration" option must lie in [1, 100]; the message names both bounds
// because the condition rejects 0 as well as values above 100.
297 CHECK_DIE(iter >= 1 && iter <= 100) << "iteration should be >= 1 and <= 100";
298 CHECK_DIE(freq == 1) << "freq must be 1";
301 std::cout << "reading corpus ..." << std::flush;
303 EncoderLearnerTagger x;
// One pass over the corpus per requested iteration. The option's default is
// "10", so the default run is unchanged, but --iteration now takes effect.
304 for (size_t i = 0; i < iter; ++i) {
305 std::ifstream ifs(ifile.c_str());
306 CHECK_DIE(ifs) << "no such file or directory: " << ifile;
308 path_freelist.free();
310 std::fill(expected.begin(), expected.end(), 0.0);
311 std::fill(observed.begin(), observed.end(), 0.0);
313 CHECK_DIE(x.open(&tokenizer,
317 unk_eval_size)) << x.what();
318 CHECK_DIE(x.read(&ifs, &observed)) << x.what();
324 alpha.resize(feature_index.size());
325 expected.resize(feature_index.size());
326 observed.resize(feature_index.size());
327 feature_index.set_alpha(&alpha[0]);
329 x.online_update(&expected[0]);
// Pass the three counters by address; they are printed on the next line.
334 size_t err = x.eval(&micro_c, &micro_p, &micro_r);
335 std::cout << micro_p << " " << micro_r << " " << micro_c << " " << err << std::endl;
340 for (size_t k = 0; k < feature_index.size(); ++k) {
341 const double tmp = (observed[k] - expected[k]);
342 margin += alpha[k] * tmp;
346 // Passive Aggressive I algorithm
348 const double diff = _max(0.0, 10 - margin) / s;
350 for (size_t k = 0; k < feature_index.size(); ++k) {
351 alpha[k] += diff * (observed[k] - expected[k]);
358 std::cout << "\nDone! writing model file ... " << std::endl;
360 std::string txtfile = model;
363 CHECK_DIE(feature_index.save(txtfile.c_str()))
364 << feature_index.what();
367 CHECK_DIE(feature_index.convert(txtfile.c_str(), model.c_str()))
368 << feature_index.what();
376 #ifdef MECAB_USE_THREAD
// Worker used when MECAB_USE_THREAD is defined. Each instance walks the
// taggers x[start_i], x[start_i + thread_num], ... below `size`, adding
// gradient() results to its own `expected` vector and eval() results to its
// own err / micro_* counters.
377 class learner_thread: public thread {
379 unsigned short start_i;
380 unsigned short thread_num;
387 EncoderLearnerTagger **x;
388 std::vector<double> expected;
390 micro_p = micro_r = micro_c = err = 0;
392 std::fill(expected.begin(), expected.end(), 0.0);
393 for (size_t i = start_i; i < size; i += thread_num) {
394 f += x[i]->gradient(&expected[0]);
// Pass this worker's counters by address so eval() can update them.
395 err += x[i]->eval(&micro_c, &micro_p, &micro_r);
// Batch trainer entry point (presumably CRFLearner::run — the enclosing
// class is not visible here). Reads the corpus into one EncoderLearnerTagger
// per sentence, shrinks the feature index by `freq`, then loops: accumulate
// gradient()/eval() over all sentences (spread over learner_thread workers
// when MECAB_USE_THREAD is defined and thread_num > 1), add the
// alpha^2 / (2C) term, and call lbfgs.optimize(). Finally saves the feature
// index and converts it to the final model.
// Expects exactly two positional arguments: corpus and model.
403 static int run(Param *param) {
404 const std::string dicdir = param->get<std::string>("dicdir");
405 CHECK_DIE(param->load(DCONF(DICRC)))
406 << "no such file or directory: " << DCONF(DICRC);
408 const std::vector<std::string> &files = param->rest_args();
409 if (files.size() != 2) {
410 std::cout << "Usage: " <<
411 param->program_name() << " corpus model" << std::endl;
415 const std::string ifile = files[0];
416 const std::string model = files[1];
418 const double C = param->get<double>("cost");
419 const double eta = param->get<double>("eta");
420 const bool text_only = param->get<bool>("text-only");
421 const size_t eval_size = param->get<size_t>("eval-size");
422 const size_t unk_eval_size = param->get<size_t>("unk-eval-size");
423 const size_t freq = param->get<size_t>("freq");
424 const size_t thread_num = param->get<size_t>("thread");
426 EncoderFeatureIndex feature_index;
427 LearnerTokenizer tokenizer;
428 FreeList<LearnerPath> path_freelist(PATH_FREELIST_SIZE);
429 std::vector<double> expected;
430 std::vector<double> observed;
431 std::vector<double> alpha;
432 std::vector<EncoderLearnerTagger *> x_;
434 std::cout.setf(std::ios::fixed, std::ios::floatfield);
435 std::cout.precision(5);
437 std::ifstream ifs(ifile.c_str());
// Range checks on the numeric options. Each message streams the value that
// was actually tested.
439 CHECK_DIE(C > 0) << "cost parameter is out of range: " << C;
440 CHECK_DIE(eta > 0) << "eta is out of range: " << eta;
441 CHECK_DIE(eval_size > 0) << "eval-size is out of range: " << eval_size;
442 CHECK_DIE(unk_eval_size > 0) <<
443 "unk-eval-size is out of range: " << unk_eval_size;
444 CHECK_DIE(freq > 0) <<
445 "freq is out of range: " << freq;
446 CHECK_DIE(thread_num > 0 && thread_num <= 512)
447 << "# thread is invalid: " << thread_num;
448 CHECK_DIE(tokenizer.open(*param)) << tokenizer.what();
449 CHECK_DIE(feature_index.open(*param)) << feature_index.what();
450 CHECK_DIE(ifs) << "no such file or directory: " << ifile;
453 std::cout << "reading corpus ..." << std::flush;
456 EncoderLearnerTagger *_x = new EncoderLearnerTagger();
458 CHECK_DIE(_x->open(&tokenizer, &path_freelist,
464 CHECK_DIE(_x->read(&ifs, &observed)) << _x->what();
471 if (x_.size() % 100 == 0)
472 std::cout << x_.size() << "... " << std::flush;
475 feature_index.shrink(freq, &observed);
476 feature_index.clearcache();
480 size_t psize = feature_index.size();
481 observed.resize(psize);
485 expected.resize(psize);
486 std::fill(alpha.begin(), alpha.end(), 0.0);
488 feature_index.set_alpha(&alpha[0]);
490 std::cout << std::endl;
491 std::cout << "Number of sentences: " << x_.size() << std::endl;
492 std::cout << "Number of features: " << psize << std::endl;
493 std::cout << "eta: " << eta << std::endl;
494 std::cout << "freq: " << freq << std::endl;
495 #ifdef MECAB_USE_THREAD
496 std::cout << "threads: " << thread_num << std::endl;
498 std::cout << "C(sigma^2): " << C << std::endl
501 #ifdef MECAB_USE_THREAD
502 std::vector<learner_thread> thread;
503 if (thread_num > 1) {
504 thread.resize(thread_num);
505 for (size_t i = 0; i < thread_num; ++i) {
506 thread[i].start_i = i;
507 thread[i].size = x_.size();
508 thread[i].thread_num = thread_num;
509 thread[i].x = &x_[0];
510 thread[i].expected.resize(expected.size());
515 for (size_t itr = 0; ; ++itr) {
516 std::fill(expected.begin(), expected.end(), 0.0);
524 #ifdef MECAB_USE_THREAD
525 if (thread_num > 1) {
526 for (size_t i = 0; i < thread_num; ++i)
529 for (size_t i = 0; i < thread_num; ++i)
532 for (size_t i = 0; i < thread_num; ++i) {
534 err += thread[i].err;
535 micro_r += thread[i].micro_r;
536 micro_p += thread[i].micro_p;
537 micro_c += thread[i].micro_c;
538 for (size_t k = 0; k < psize; ++k)
539 expected[k] += thread[i].expected[k];
545 for (size_t i = 0; i < x_.size(); ++i) {
546 f += x_[i]->gradient(&expected[0]);
// Pass the counters by address so eval() can update them; they feed the
// precision / recall computation that follows.
547 err += x_[i]->eval(&micro_c, &micro_p, &micro_r);
551 const double p = 1.0 * micro_c / micro_p;
552 const double r = 1.0 * micro_c / micro_r;
553 const double micro_f = 2 * p * r /(p + r);
555 for (size_t i = 0; i < psize; ++i) {
556 f += (alpha[i] * alpha[i]/(2.0 * C));
557 expected[i] = expected[i] - observed[i] + alpha[i]/C;
560 double diff = (itr == 0 ? 1.0 : std::fabs(1.0 *(old_f - f) )/old_f);
561 std::cout << "iter=" << itr
562 << " err=" << 1.0 * err/x_.size()
565 << " diff=" << diff << std::endl;
574 break; // 3 is ad-hoc
576 int ret = lbfgs.optimize(psize, &alpha[0], f, &expected[0], false, C);
578 CHECK_DIE(ret > 0) << "unexpected error in LBFGS routin";
581 std::cout << "\nDone! writing model file ... " << std::endl;
583 std::string txtfile = model;
586 CHECK_DIE(feature_index.save(txtfile.c_str()))
587 << feature_index.what();
590 CHECK_DIE(feature_index.convert(txtfile.c_str(), model.c_str()))
591 << feature_index.what();
// Command-line entry point of the learner: declares the option table, parses
// argv, then either builds a binary model ("build") or dispatches to the
// trainer selected by "training-algorithm".
600 static bool run(int argc, char **argv) {
// Each entry below lists: long name, short name, default value, argument
// label, help text.
601 static const MeCab::Option long_options[] = {
602 { "dicdir", 'd', ".", "DIR",
603 "set DIR as dicdir(default \".\" )" },
604 { "cost", 'c', "1.0", "FLOAT",
605 "set FLOAT for cost C for constraints violation" },
606 { "training-algorithm", 'a', "crf",
607 "(crf|hmm|oll)", "set training algorithm" },
// NOTE(review): 'E' is also the short name of "default-emission-freq" below;
// one of the two needs a different letter — confirm which one callers rely on.
608 { "em-hmm", 'E', 0, 0, "use EM in HMM training (experimental)" },
609 { "freq", 'f', "1", "INT",
610 "set the frequency cut-off (default 1)" },
611 { "default-emission-freq", 'E', "0.5", "FLOAT",
612 "set the default emission frequency for HMM (default 0.5)" },
// NOTE(review): the default value here is "0.5" but the help text says 0.0;
// confirm which one is intended before changing either.
613 { "default-transition-freq", 'T', "0.5", "FLOAT",
614 "set the default transition frequency for HMM (default 0.0)" },
615 { "eta", 'e', "0.001", "FLOAT",
616 "set FLOAT for tolerance of termination criterion" },
617 { "iteration", 'N', "10", "INT",
618 "number of iterations in online learning (default 10)" },
619 { "thread", 'p', "1", "INT", "number of threads(default 1)" },
620 { "build", 'b', 0, 0, "build binary model from text model"},
621 { "text-only", 'y', 0, 0, "output text model only" },
622 { "version", 'v', 0, 0, "show the version and exit" },
623 { "help", 'h', 0, 0, "show this help and exit." },
629 if (!param.open(argc, argv, long_options)) {
630 std::cout << param.what() << "\n\n" << COPYRIGHT
631 << "\ntry '--help' for more information." << std::endl;
635 if (!param.help_version()) {
641 const bool build = param.get<bool>("build");
643 const std::vector<std::string> files = param.rest_args();
644 if (files.size() != 2) {
645 std::cout << "Usage: " <<
646 param.program_name() << " corpus model" << std::endl;
649 const std::string ifile = files[0];
650 const std::string model = files[1];
651 EncoderFeatureIndex feature_index;
652 CHECK_DIE(feature_index.convert(ifile.c_str(), model.c_str()))
653 << feature_index.what();
// Dispatch on the "training-algorithm" option. Each trainer takes a Param*,
// so the local `param` object is passed by address.
658 std::string type = param.get<std::string>("training-algorithm");
661 return CRFLearner::run(&param);
662 } else if (type == "hmm") {
663 return HMMLearner::run(&param);
664 } else if (type == "oll") {
665 return OLLearner::run(&param);
667 std::cerr << "unknown type: " << type << std::endl;
// Entry point for the cost trainer: forwards argc/argv unchanged to
// MeCab::Learner::run and returns its result.
676 int mecab_cost_train(int argc, char **argv) {
677 return MeCab::Learner::run(argc, argv);