1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
4 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
12 #include "learner_node.h"
13 #include "learner_tagger.h"
18 char *mystrdup(const char *str) {
19 const size_t l = std::strlen(str);
20 char *r = new char[l + 1];
21 std::strncpy(r, str, l+1);
25 char *mystrdup(const std::string &str) {
26 return mystrdup(str.c_str());
30 bool EncoderLearnerTagger::open(Tokenizer<LearnerNode, LearnerPath> *tokenizer,
31 Allocator<LearnerNode, LearnerPath> *allocator,
32 FeatureIndex *feature_index,
34 size_t unk_eval_size) {
36 tokenizer_ = tokenizer;
37 allocator_ = allocator;
38 feature_index_ = feature_index;
39 eval_size_ = eval_size;
40 unk_eval_size_ = unk_eval_size;
44 bool DecoderLearnerTagger::open(const Param ¶m) {
46 allocator_data_.reset(new Allocator<LearnerNode, LearnerPath>());
47 tokenizer_data_.reset(new Tokenizer<LearnerNode, LearnerPath>());
48 feature_index_data_.reset(new DecoderFeatureIndex);
49 allocator_ = allocator_data_.get();
50 tokenizer_ = tokenizer_data_.get();
51 feature_index_ = feature_index_data_.get();
53 CHECK_DIE(tokenizer_->open(param)) << tokenizer_->what();
54 CHECK_DIE(feature_index_->open(param));
59 bool EncoderLearnerTagger::read(std::istream *is,
60 std::vector<double> *observed) {
61 scoped_fixed_array<char, BUF_SIZE> line;
64 std::vector<LearnerNode *> corpus;
65 ans_path_list_.clear();
70 if (!is->getline(line.get(), line.size())) {
71 is->clear(std::ios::eofbit|std::ios::badbit);
75 eos = (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0');
77 LearnerNode *m = new LearnerNode;
78 std::memset(m, 0, sizeof(LearnerNode));
81 m->stat = MECAB_EOS_NODE;
83 const size_t size = tokenize(line.get(), "\t", column, 2);
84 CHECK_DIE(size == 2) << "format error: " << line.get();
85 m->stat = MECAB_NOR_NODE;
86 m->surface = mystrdup(column[0]);
87 m->feature = mystrdup(column[1]);
88 m->length = m->rlength = std::strlen(column[0]);
97 sentence.append(column[0]);
100 CHECK_DIE(!sentence.empty()) << "empty sentence";
102 CHECK_DIE(eos) << "\"EOS\" is not found";
104 begin_data_.reset_string(sentence);
105 begin_ = begin_data_.get();
110 for (size_t i = 0; corpus[i]->stat != MECAB_EOS_NODE; ++i) {
111 LearnerNode *found = 0;
112 for (LearnerNode *node = lookup(pos); node; node = node->bnext) {
113 if (node_cmp_eq(*(corpus[i]), *node, eval_size_, unk_eval_size_)) {
119 // cannot find node even using UNKNOWN WORD PROSESSING
121 LearnerNode *node = allocator_->newNode();
122 node->surface = begin_ + pos;
123 node->length = node->rlength = std::strlen(corpus[i]->surface);
124 node->feature = feature_index_->strdup(corpus[i]->feature);
125 node->stat = MECAB_NOR_NODE;
128 node->bnext = begin_node_list_[pos];
129 begin_node_list_[pos] = node;
130 std::cout << "adding virtual node: " << node->feature << std::endl;
133 pos += corpus[i]->length;
138 LearnerNode* prev = end_node_list_[0]; // BOS
142 for (size_t i = 0; i < corpus.size(); ++i) {
143 LearnerNode *rNode = 0;
144 for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
145 if (corpus[i]->stat == MECAB_EOS_NODE ||
146 node_cmp_eq(*(corpus[i]), *node, eval_size_, unk_eval_size_)) {
147 rNode = node; // take last node
151 LearnerPath *lpath = 0;
152 for (LearnerPath *path = rNode->lpath; path; path = path->lnext) {
153 if (prev == path->lnode) {
159 CHECK_DIE(lpath->fvector) << "lpath is NULL";
160 for (const int *f = lpath->fvector; *f != -1; ++f) {
161 if (*f >= static_cast<long>(observed->size())) {
162 observed->resize(*f + 1);
167 if (lpath->rnode->stat != MECAB_EOS_NODE) {
168 for (const int *f = lpath->rnode->fvector; *f != -1; ++f) {
169 if (*f >= static_cast<long>(observed->size())) {
170 observed->resize(*f + 1);
176 ans_path_list_.push_back(lpath);
181 if (corpus[i]->stat == MECAB_EOS_NODE) {
185 pos += std::strlen(corpus[i]->surface);
188 prev->anext = begin_node_list_[len_]; // connect to EOS
189 begin_node_list_[len_]->anext = 0;
191 for (size_t i = 0 ; i < corpus.size(); ++i) {
192 delete [] corpus[i]->surface;
193 delete [] corpus[i]->feature;
200 int EncoderLearnerTagger::eval(size_t *crr,
201 size_t *prec, size_t *recall) const {
204 LearnerNode *res = end_node_list_[0]->next;
205 LearnerNode *ans = end_node_list_[0]->anext;
210 while (ans->anext && res->next) {
212 if (node_cmp_eq(*ans, *res, eval_size_, unk_eval_size_)) {
221 resp += res->rlength;
222 ansp += ans->rlength;
223 } else if (resp < ansp) {
225 resp += res->rlength;
230 ansp += ans->rlength;
249 bool DecoderLearnerTagger::parse(std::istream* is, std::ostream *os) {
251 feature_index_->clear();
254 begin_data_.reset(new char[BUF_SIZE * 16]);
255 begin_ = begin_data_.get();
258 if (!is->getline(const_cast<char *>(begin_), BUF_SIZE * 16)) {
259 is->clear(std::ios::eofbit|std::ios::badbit);
267 for (LearnerNode *node = end_node_list_[0]->next;
268 node->next; node = node->next) {
269 os->write(node->surface, node->length);
270 *os << '\t' << node->feature << '\n';
277 LearnerNode *LearnerTagger::lookup(size_t pos) {
278 if (begin_node_list_[pos]) {
279 return begin_node_list_[pos];
281 LearnerNode *m = tokenizer_->lookup<false>(begin_ + pos, end_, allocator_, 0);
282 begin_node_list_[pos] = m;
286 bool LearnerTagger::connect(size_t pos, LearnerNode *_rNode) {
287 for (LearnerNode *rNode = _rNode ; rNode; rNode = rNode->bnext) {
288 for (LearnerNode *lNode = end_node_list_[pos]; lNode;
289 lNode = lNode->enext) {
290 LearnerPath *path = allocator_->newPath();
291 std::memset(path, 0, sizeof(Path));
298 path->lnext = rNode->lpath;
300 path->rnext = lNode->rpath;
302 CHECK_DIE(feature_index_->buildFeature(path));
303 CHECK_DIE(path->fvector);
305 const size_t x = rNode->rlength + pos;
306 rNode->enext = end_node_list_[x];
307 end_node_list_[x] = rNode;
313 bool LearnerTagger::initList() {
318 len_ = std::strlen(begin_);
319 end_ = begin_ + len_;
321 end_node_list_.resize(len_ + 2);
322 std::fill(end_node_list_.begin(), end_node_list_.end(),
323 reinterpret_cast<LearnerNode *>(0));
325 begin_node_list_.resize(len_ + 2);
326 std::fill(begin_node_list_.begin(), begin_node_list_.end(),
327 reinterpret_cast<LearnerNode *>(0));
329 end_node_list_[0] = tokenizer_->getBOSNode(allocator_);
330 end_node_list_[0]->surface = begin_;
331 begin_node_list_[len_] = tokenizer_->getEOSNode(allocator_);
336 bool LearnerTagger::buildLattice() {
337 for (int pos = 0; pos <= static_cast<long>(len_); pos++) {
338 if (!end_node_list_[pos]) {
341 connect(pos, lookup(pos));
344 if (!end_node_list_[len_]) {
345 begin_node_list_[len_] = lookup(len_);
346 for (size_t pos = len_; static_cast<long>(pos) >= 0; pos--) {
347 if (end_node_list_[pos]) {
348 connect(pos, begin_node_list_[len_]);
357 bool LearnerTagger::viterbi() {
358 for (int pos = 0; pos <= static_cast<long>(len_); ++pos) {
359 for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
360 double bestc = -1e37;
361 LearnerNode *best = 0;
362 feature_index_->calcCost(node);
363 for (LearnerPath *path = node->lpath; path; path = path->lnext) {
364 feature_index_->calcCost(path);
365 double cost = path->cost + path->lnode->cost;
377 LearnerNode *node = begin_node_list_[len_]; // EOS
378 for (LearnerNode *prev; node->prev;) {
387 double EncoderLearnerTagger::gradient(double *expected) {
390 for (int pos = 0; pos <= static_cast<long>(len_); ++pos) {
391 for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
396 for (int pos = static_cast<long>(len_); pos >=0; --pos) {
397 for (LearnerNode *node = end_node_list_[pos]; node; node = node->enext) {
402 double Z = begin_node_list_[len_]->alpha; // alpha of EOS
404 for (int pos = 0; pos <= static_cast<long>(len_); ++pos) {
405 for (LearnerNode *node = begin_node_list_[pos]; node; node = node->bnext) {
406 for (LearnerPath *path = node->lpath; path; path = path->lnext) {
407 calc_expectation(path, expected, Z);
412 for (size_t i = 0; i < ans_path_list_.size(); ++i) {
413 Z -= ans_path_list_[i]->cost;