third_party/mecab-0.996/src/learner_tagger.cpp

   1 //  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
   2 //
   3 //
   4 //  Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
   5 //  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
   6 #include <algorithm>
   7 #include <functional>
   8 #include <iostream>
   9 #include <iterator>
  10 #include <vector>
  11 #include "common.h"
  12 #include "learner_node.h"
  13 #include "learner_tagger.h"
  14 #include "utils.h"
  15
  16 namespace MeCab {
  17 namespace {
  18 char *mystrdup(const char *str) {
  19   const size_t l = std::strlen(str);
  20   char *r = new char[l + 1];
  21   std::strncpy(r, str, l+1);
  22   return r;
  23 }
  24
  25 char *mystrdup(const std::string &str) {
  26   return mystrdup(str.c_str());
  27 }
  28 }  // namespace
  29
  30 bool EncoderLearnerTagger::open(Tokenizer<LearnerNode, LearnerPath> *tokenizer,
  31                                 Allocator<LearnerNode, LearnerPath> *allocator,
  32                                 FeatureIndex               *feature_index,
  33                                 size_t eval_size,
  34                                 size_t unk_eval_size) {
  35   close();
  36   tokenizer_      = tokenizer;
  37   allocator_      = allocator;
  38   feature_index_  = feature_index;
  39   eval_size_      = eval_size;
  40   unk_eval_size_  = unk_eval_size;
  41   return true;
  42 }
  43
  44 bool DecoderLearnerTagger::open(const Param &param) {
  45   close();
  46   allocator_data_.reset(new Allocator<LearnerNode, LearnerPath>());
  47   tokenizer_data_.reset(new Tokenizer<LearnerNode, LearnerPath>());
  48   feature_index_data_.reset(new DecoderFeatureIndex);
  49   allocator_ = allocator_data_.get();
  50   tokenizer_ = tokenizer_data_.get();
  51   feature_index_ = feature_index_data_.get();
  52
  53   CHECK_DIE(tokenizer_->open(param)) << tokenizer_->what();
  54   CHECK_DIE(feature_index_->open(param));
  55
  56   return true;
  57 }
  58
  59 bool EncoderLearnerTagger::read(std::istream *is,
  60                                 std::vector<double> *observed) {
  61   scoped_fixed_array<char, BUF_SIZE> line;
  62   char *column[8];
  63   std::string sentence;
  64   std::vector<LearnerNode *> corpus;
  65   ans_path_list_.clear();
  66
  67   bool eos = false;
  68
  69   for (;;) {
  70     if (!is->getline(line.get(), line.size())) {
  71       is->clear(std::ios::eofbit|std::ios::badbit);
  72       return true;
  73     }
  74
  75     eos = (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0');
  76
  77     LearnerNode *m = new LearnerNode;
  78     std::memset(m, 0, sizeof(LearnerNode));
  79
  80     if (eos) {
  81       m->stat = MECAB_EOS_NODE;
  82     } else {
  83       const size_t size = tokenize(line.get(), "\t", column, 2);
  84       CHECK_DIE(size == 2) << "format error: " << line.get();
  85       m->stat    = MECAB_NOR_NODE;
  86       m->surface = mystrdup(column[0]);
  87       m->feature = mystrdup(column[1]);
  88       m->length  = m->rlength = std::strlen(column[0]);
  89     }
  90
  91     corpus.push_back(m);
  92
  93     if (eos) {
  94       break;
  95     }
  96
  97     sentence.append(column[0]);
  98   }
  99
 100   CHECK_DIE(!sentence.empty()) << "empty sentence";
 101
 102   CHECK_DIE(eos) << "\"EOS\" is not found";
 103
 104   begin_data_.reset_string(sentence);
 105   begin_ = begin_data_.get();
 106
 107   initList();
 108
 109   size_t pos = 0;
 110   for (size_t i = 0; corpus[i]->stat != MECAB_EOS_NODE; ++i) {
 111     LearnerNode *found = 0;
 112     for (LearnerNode *node = lookup(pos); node; node = node->bnext) {
 113       if (node_cmp_eq(*(corpus[i]), *node, eval_size_, unk_eval_size_)) {
 114         found = node;
 115         break;
 116       }
 117     }
 118
 119     // cannot find node even using UNKNOWN WORD PROSESSING
 120     if (!found) {
 121       LearnerNode *node = allocator_->newNode();
 122       node->surface  = begin_ + pos;
 123       node->length   = node->rlength = std::strlen(corpus[i]->surface);
 124       node->feature  = feature_index_->strdup(corpus[i]->feature);
 125       node->stat     = MECAB_NOR_NODE;
 126       node->fvector  = 0;
 127       node->wcost    = 0.0;
 128       node->bnext    = begin_node_list_[pos];
 129       begin_node_list_[pos] = node;
 130       std::cout << "adding virtual node: " << node->feature << std::endl;
 131     }
 132
 133     pos += corpus[i]->length;
 134   }
 135
 136   buildLattice();
 137
 138   LearnerNode* prev = end_node_list_[0];  // BOS
 139   prev->anext = 0;
 140   pos = 0;
 141
 142   for (size_t i = 0; i < corpus.size(); ++i) {
 143     LearnerNode *rNode = 0;
 144     for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
 145       if (corpus[i]->stat == MECAB_EOS_NODE ||
 146           node_cmp_eq(*(corpus[i]), *node, eval_size_, unk_eval_size_)) {
 147         rNode = node;  // take last node
 148       }
 149     }
 150
 151     LearnerPath *lpath = 0;
 152     for (LearnerPath *path = rNode->lpath; path; path = path->lnext) {
 153       if (prev == path->lnode) {
 154         lpath = path;
 155         break;
 156       }
 157     }
 158
 159     CHECK_DIE(lpath->fvector) << "lpath is NULL";
 160     for (const int *f = lpath->fvector; *f != -1; ++f) {
 161       if (*f >= static_cast<long>(observed->size())) {
 162         observed->resize(*f + 1);
 163       }
 164       ++(*observed)[*f];
 165     }
 166
 167     if (lpath->rnode->stat != MECAB_EOS_NODE) {
 168       for (const int *f = lpath->rnode->fvector; *f != -1; ++f) {
 169         if (*f >= static_cast<long>(observed->size())) {
 170           observed->resize(*f + 1);
 171         }
 172         ++(*observed)[*f];
 173       }
 174     }
 175
 176     ans_path_list_.push_back(lpath);
 177
 178     prev->anext = rNode;
 179     prev = rNode;
 180
 181     if (corpus[i]->stat == MECAB_EOS_NODE) {
 182       break;
 183     }
 184
 185     pos += std::strlen(corpus[i]->surface);
 186   }
 187
 188   prev->anext = begin_node_list_[len_];  // connect to EOS
 189   begin_node_list_[len_]->anext = 0;
 190
 191   for (size_t i = 0 ; i < corpus.size(); ++i) {
 192     delete [] corpus[i]->surface;
 193     delete [] corpus[i]->feature;
 194     delete corpus[i];
 195   }
 196
 197   return true;
 198 }
 199
 200 int EncoderLearnerTagger::eval(size_t *crr,
 201                                size_t *prec, size_t *recall) const {
 202   int zeroone = 0;
 203
 204   LearnerNode *res = end_node_list_[0]->next;
 205   LearnerNode *ans = end_node_list_[0]->anext;
 206
 207   size_t resp = 0;
 208   size_t ansp = 0;
 209
 210   while (ans->anext && res->next) {
 211     if (resp == ansp) {
 212       if (node_cmp_eq(*ans, *res, eval_size_, unk_eval_size_)) {
 213         ++(*crr);  // same
 214       } else {
 215         zeroone = 1;
 216       }
 217       ++(*prec);
 218       ++(*recall);
 219       res = res->next;
 220       ans = ans->anext;
 221       resp += res->rlength;
 222       ansp += ans->rlength;
 223     } else if (resp < ansp) {
 224       res = res->next;
 225       resp += res->rlength;
 226       zeroone = 1;
 227       ++(*recall);
 228     } else {
 229       ans = ans->anext;
 230       ansp += ans->rlength;
 231       zeroone = 1;
 232       ++(*prec);
 233     }
 234   }
 235
 236   while (ans->anext) {
 237     ++(*prec);
 238     ans = ans->anext;
 239   }
 240
 241   while (res->next) {
 242     ++(*recall);
 243     res = res->next;
 244   }
 245
 246   return zeroone;
 247 }
 248
 249 bool DecoderLearnerTagger::parse(std::istream* is, std::ostream *os) {
 250   allocator_->free();
 251   feature_index_->clear();
 252
 253   if (!begin_) {
 254     begin_data_.reset(new char[BUF_SIZE * 16]);
 255     begin_ = begin_data_.get();
 256   }
 257
 258   if (!is->getline(const_cast<char *>(begin_), BUF_SIZE * 16)) {
 259     is->clear(std::ios::eofbit|std::ios::badbit);
 260     return false;
 261   }
 262
 263   initList();
 264   buildLattice();
 265   viterbi();
 266
 267   for (LearnerNode *node = end_node_list_[0]->next;
 268        node->next; node = node->next) {
 269     os->write(node->surface, node->length);
 270     *os << '\t' << node->feature << '\n';
 271   }
 272   *os << "EOS\n";
 273
 274   return true;
 275 }
 276
 277 LearnerNode *LearnerTagger::lookup(size_t pos) {
 278   if (begin_node_list_[pos]) {
 279     return begin_node_list_[pos];
 280   }
 281   LearnerNode *m = tokenizer_->lookup<false>(begin_ + pos, end_, allocator_, 0);
 282   begin_node_list_[pos] = m;
 283   return m;
 284 }
 285
 286 bool LearnerTagger::connect(size_t pos, LearnerNode *_rNode) {
 287   for (LearnerNode *rNode = _rNode ; rNode; rNode = rNode->bnext) {
 288     for (LearnerNode *lNode = end_node_list_[pos]; lNode;
 289          lNode = lNode->enext) {
 290       LearnerPath *path   = allocator_->newPath();
 291       std::memset(path, 0, sizeof(Path));
 292       path->rnode   = rNode;
 293       path->lnode   = lNode;
 294       path->fvector = 0;
 295       path->cost    = 0.0;
 296       path->rnode   = rNode;
 297       path->lnode   = lNode;
 298       path->lnext   = rNode->lpath;
 299       rNode->lpath  = path;
 300       path->rnext   = lNode->rpath;
 301       lNode->rpath  = path;
 302       CHECK_DIE(feature_index_->buildFeature(path));
 303       CHECK_DIE(path->fvector);
 304     }
 305     const size_t x = rNode->rlength + pos;
 306     rNode->enext  = end_node_list_[x];
 307     end_node_list_[x] = rNode;
 308   }
 309
 310   return true;
 311 }
 312
 313 bool LearnerTagger::initList() {
 314   if (!begin_) {
 315     return false;
 316   }
 317
 318   len_ = std::strlen(begin_);
 319   end_ = begin_ + len_;
 320
 321   end_node_list_.resize(len_ + 2);
 322   std::fill(end_node_list_.begin(), end_node_list_.end(),
 323             reinterpret_cast<LearnerNode *>(0));
 324
 325   begin_node_list_.resize(len_ + 2);
 326   std::fill(begin_node_list_.begin(), begin_node_list_.end(),
 327             reinterpret_cast<LearnerNode *>(0));
 328
 329   end_node_list_[0] = tokenizer_->getBOSNode(allocator_);
 330   end_node_list_[0]->surface = begin_;
 331   begin_node_list_[len_] = tokenizer_->getEOSNode(allocator_);
 332
 333   return true;
 334 }
 335
 336 bool LearnerTagger::buildLattice() {
 337   for (int pos = 0; pos <= static_cast<long>(len_);  pos++) {
 338     if (!end_node_list_[pos]) {
 339       continue;
 340     }
 341     connect(pos, lookup(pos));
 342   }
 343
 344   if (!end_node_list_[len_]) {
 345     begin_node_list_[len_] = lookup(len_);
 346     for (size_t pos = len_; static_cast<long>(pos) >= 0;  pos--) {
 347       if (end_node_list_[pos]) {
 348         connect(pos, begin_node_list_[len_]);
 349         break;
 350       }
 351     }
 352   }
 353
 354   return true;
 355 }
 356
 357 bool LearnerTagger::viterbi() {
 358   for (int pos = 0;   pos <= static_cast<long>(len_);  ++pos) {
 359     for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
 360       double bestc = -1e37;
 361       LearnerNode *best = 0;
 362       feature_index_->calcCost(node);
 363       for (LearnerPath *path = node->lpath; path; path = path->lnext) {
 364         feature_index_->calcCost(path);
 365         double cost = path->cost + path->lnode->cost;
 366         if (cost > bestc) {
 367           bestc = cost;
 368           best  = path->lnode;
 369         }
 370       }
 371
 372       node->prev = best;
 373       node->cost = bestc;
 374     }
 375   }
 376
 377   LearnerNode *node = begin_node_list_[len_];  // EOS
 378   for (LearnerNode *prev; node->prev;) {
 379     prev = node->prev;
 380     prev->next = node;
 381     node = prev;
 382   }
 383
 384   return true;
 385 }
 386
 387 double EncoderLearnerTagger::gradient(double *expected) {
 388   viterbi();
 389
 390   for (int pos = 0;   pos <= static_cast<long>(len_);  ++pos) {
 391     for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
 392       calc_alpha(node);
 393     }
 394   }
 395
 396   for (int pos = static_cast<long>(len_); pos >=0;    --pos) {
 397     for (LearnerNode *node = end_node_list_[pos]; node; node = node->enext) {
 398       calc_beta(node);
 399     }
 400   }
 401
 402   double Z = begin_node_list_[len_]->alpha;  // alpha of EOS
 403
 404   for (int pos = 0;   pos <= static_cast<long>(len_);  ++pos) {
 405     for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
 406       for (LearnerPath *path = node->lpath; path; path = path->lnext) {
 407         calc_expectation(path, expected, Z);
 408       }
 409     }
 410   }
 411
 412   for (size_t i = 0; i < ans_path_list_.size(); ++i) {
 413     Z -= ans_path_list_[i]->cost;
 414   }
 415
 416   return Z;
 417 }
 418 }