1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
4 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
10 #include "connector.h"
12 #include "nbest_generator.h"
14 #include "scoped_ptr.h"
15 #include "stream_wrapper.h"
16 #include "string_buffer.h"
18 #include "tokenizer.h"
// Getter/setter for the shared error string. Only the declarations are
// visible here; in this file setGlobalError() is called by the Model/Tagger
// factory paths and getGlobalError() backs getLastError().
26 const char *getGlobalError();
27 void setGlobalError(const char *str);
// Default theta used by the TaggerImpl and LatticeImpl constructors; the same
// value appears as the "0.75" default of the "theta" option below.
32 const float kDefaultTheta = 0.75;
// Command-line option table handed to Param::open() by ModelImpl::open() and
// mecab_do(). Each entry looks like
//   { long name, short name, default value (0 = none),
//     argument placeholder (0 = plain flag), help text }
// -- TODO confirm the field order against the MeCab::Option declaration.
34 const MeCab::Option long_options[] = {
35 { "rcfile", 'r', 0, "FILE", "use FILE as resource file" },
36 { "dicdir", 'd', 0, "DIR", "set DIR as a system dicdir" },
37 { "userdic", 'u', 0, "FILE", "use FILE as a user dictionary" },
38 { "lattice-level", 'l', "0", "INT",
39 "lattice information level (DEPRECATED)" },
40 { "dictionary-info", 'D', 0, 0, "show dictionary information and exit" },
41 { "output-format-type", 'O', 0, "TYPE",
42 "set output format type (wakati,none,...)" },
43 { "all-morphs", 'a', 0, 0, "output all morphs(default false)" },
// NOTE(review): the line that opens the next entry is missing from this
// extract; presumably it is the "nbest" option that mecab_do() reads.
45 "INT", "output N best results (default 1)" },
46 { "partial", 'p', 0, 0,
47 "partial parsing mode (default false)" },
48 { "marginal", 'm', 0, 0,
49 "output marginal probability (default false)" },
50 { "max-grouping-size", 'M', "24",
51 "INT", "maximum grouping size for unknown words (default 24)" },
52 { "node-format", 'F', "%m\\t%H\\n", "STR",
53 "use STR as the user-defined node format" },
54 { "unk-format", 'U', "%m\\t%H\\n", "STR",
55 "use STR as the user-defined unknown node format" },
56 { "bos-format", 'B', "", "STR",
57 "use STR as the user-defined beginning-of-sentence format" },
58 { "eos-format", 'E', "EOS\\n", "STR",
59 "use STR as the user-defined end-of-sentence format" },
60 { "eon-format", 'S', "", "STR",
61 "use STR as the user-defined end-of-NBest format" },
62 { "unk-feature", 'x', 0, "STR",
63 "use STR as the feature for unknown word" },
64 { "input-buffer-size", 'b', 0, "INT",
65 "set input buffer size (default 8192)" },
66 { "dump-config", 'P', 0, 0, "dump MeCab parameters" },
67 { "allocate-sentence", 'C', 0, 0,
68 "allocate new memory for input sentence" },
// NOTE(review): the help text below misspells "temperature"; it is a runtime
// string, so it is left untouched here.
69 { "theta", 't', "0.75", "FLOAT",
70 "set temparature parameter theta (default 0.75)" },
71 { "cost-factor", 'c', "700", "INT",
72 "set cost factor (default 700)" },
73 { "output", 'o', 0, "FILE", "set the output file name" },
74 { "version", 'v', 0, 0, "show the version and exit." },
75 { "help", 'h', 0, 0, "show this help and exit." },
// Concrete Model. From the visible members it holds a Viterbi through a raw
// pointer (see take_viterbi() and swap()) and a Writer through scoped_ptr,
// plus a default request type and theta that createTagger() copies into new
// taggers.
// NOTE(review): access specifiers, closing braces and several bodies of this
// class are not visible in this extract.
79 class ModelImpl: public Model {
// Load the model from argv-style arguments, a single argument string, or an
// already parsed Param.
84 bool open(int argc, char **argv);
85 bool open(const char *arg);
// NOTE(review): "¶m" looks like a mis-encoded "&param" (HTML entity "&para");
// confirm against the upstream source.
86 bool open(const Param ¶m);
// Replaces this model's Viterbi with the one taken from |model|.
88 bool swap(Model *model);
// True when both the Viterbi and the Writer are non-null.
90 bool is_available() const {
91 return (viterbi_ && writer_.get());
94 int request_type() const {
98 double theta() const {
// Dictionary info from the tokenizer, or 0 when the Viterbi has no tokenizer.
102 const DictionaryInfo *dictionary_info() const {
103 return viterbi_->tokenizer() ?
104 viterbi_->tokenizer()->dictionary_info() : 0;
// Forwards to the connector owned by the Viterbi.
107 int transition_cost(unsigned short rcAttr,
108 unsigned short lcAttr) const {
109 return viterbi_->connector()->transition_cost(rcAttr, lcAttr);
// Forwards to the tokenizer's lookup<false>, passing the lattice's allocator
// and the lattice itself (the first argument line is not visible here).
112 Node *lookup(const char *begin, const char *end,
113 Lattice *lattice) const {
114 return viterbi_->tokenizer()->lookup<false>(
116 lattice->allocator(), lattice);
119 Tagger *createTagger() const;
121 Lattice *createLattice() const;
123 const Viterbi *viterbi() const {
127 // moves the ownership.
128 Viterbi *take_viterbi() {
129 Viterbi *result = viterbi_;
// Non-owning access to the Writer.
134 const Writer *writer() const {
135 return writer_.get();
// The mutex accessor and member exist only when HAVE_ATOMIC_OPS is defined;
// swap() takes a writer lock on it and TaggerImpl::parse() a reader lock.
138 #ifdef HAVE_ATOMIC_OPS
139 read_write_mutex *mutex() const {
146 scoped_ptr<Writer> writer_;
150 #ifdef HAVE_ATOMIC_OPS
151 mutable read_write_mutex mutex_;
// Concrete Tagger. It works on a model reached through current_model_ and on
// a lazily created Lattice; request_type_ and theta_ hold the per-tagger
// settings that initRequestType() pushes into that lattice.
// NOTE(review): access specifiers and several lines of this class are not
// visible in this extract.
155 class TaggerImpl: public Tagger {
// open(argc, argv) and open(arg) create and own a ModelImpl; open(model)
// borrows an existing one.
157 bool open(int argc, char **argv);
158 bool open(const char *arg);
159 bool open(const ModelImpl &model);
// Runs the analysis on a caller-supplied lattice.
161 bool parse(Lattice *lattice) const;
163 void set_request_type(int request_type);
164 int request_type() const;
// String-based convenience API; these overloads use the tagger's own lattice.
166 const char* parse(const char*);
167 const char* parse(const char*, size_t);
168 const char* parse(const char*, size_t, char*, size_t);
169 const Node* parseToNode(const char*);
// NOTE(review): the default argument makes a one-argument call made directly
// on TaggerImpl ambiguous with the overload above; the definitions in this
// file always pass both arguments.
170 const Node* parseToNode(const char*, size_t = 0);
171 const char* parseNBest(size_t, const char*);
172 const char* parseNBest(size_t, const char*, size_t);
173 const char* parseNBest(size_t, const char*,
174 size_t, char *, size_t);
175 bool parseNBestInit(const char*);
176 bool parseNBestInit(const char*, size_t);
177 const Node* nextNode();
179 const char* next(char*, size_t);
181 const char *formatNode(const Node *);
182 const char *formatNode(const Node *, char *, size_t);
184 const DictionaryInfo *dictionary_info() const;
// The setters below toggle bits in request_type_ (see their definitions).
186 void set_partial(bool partial);
187 bool partial() const;
188 void set_theta(float theta);
190 void set_lattice_level(int level);
191 int lattice_level() const;
192 void set_all_morphs(bool all_morphs);
193 bool all_morphs() const;
// Last error message recorded by this tagger.
195 const char* what() const;
198 virtual ~TaggerImpl();
201 const ModelImpl *model() const { return current_model_; }
203 void set_what(const char *str) {
// Copies the tagger's request type and theta into its lattice.
207 void initRequestType() {
208 mutable_lattice()->set_request_type(request_type_);
209 mutable_lattice()->set_theta(theta_);
// Returns the tagger's lattice, creating it from the current model on first
// use.
212 Lattice *mutable_lattice() {
213 if (!lattice_.get()) {
214 lattice_.reset(model()->createLattice());
216 return lattice_.get();
// Model currently in use; not owned through this pointer.
219 const ModelImpl *current_model_;
// Set only by the open() overloads that build their own model.
220 scoped_ptr<ModelImpl> model_;
221 scoped_ptr<Lattice> lattice_;
// Concrete Lattice. It stores the input sentence pointer, per-position
// begin/end node arrays, optional boundary/feature constraints, and the
// allocator and string buffer used while producing results.
// NOTE(review): access specifiers and several lines of this class are not
// visible in this extract.
227 class LatticeImpl : public Lattice {
// |writer| may be 0; the value is stored in writer_.
229 explicit LatticeImpl(const Writer *writer = 0);
232 // clear internal lattice
// Part of the condition is not visible; the visible part requires both node
// arrays to be non-empty.
235 bool is_available() const {
237 !begin_nodes_.empty() &&
238 !end_nodes_.empty());
244 // return bos/eos node
// BOS is kept in end_nodes_[0] and EOS in begin_nodes_[size()].
245 Node *bos_node() const { return end_nodes_[0]; }
246 Node *eos_node() const { return begin_nodes_[size()]; }
// Raw access to the node arrays; const is cast away from the vector storage.
247 Node **begin_nodes() const { return const_cast<Node **>(&begin_nodes_[0]); }
248 Node **end_nodes() const { return const_cast<Node **>(&end_nodes_[0]); }
// No bounds checking is performed on |pos|.
249 Node *begin_nodes(size_t pos) const { return begin_nodes_[pos]; }
250 Node *end_nodes(size_t pos) const { return end_nodes_[pos]; }
252 const char *sentence() const { return sentence_; }
253 void set_sentence(const char *sentence);
254 void set_sentence(const char *sentence, size_t len);
255 size_t size() const { return size_; }
257 void set_Z(double Z) { Z_ = Z; }
258 double Z() const { return Z_; }
260 float theta() const { return theta_; }
261 void set_theta(float theta) { theta_ = theta; }
// request_type_ is a bit mask; the helpers below test, set and clear bits.
263 int request_type() const { return request_type_; }
265 void set_request_type(int request_type) {
266 request_type_ = request_type;
// True when any bit of |request_type| is set in request_type_.
268 bool has_request_type(int request_type) const {
269 return request_type & request_type_;
271 void add_request_type(int request_type) {
272 request_type_ |= request_type;
274 void remove_request_type(int request_type) {
275 request_type_ &= ~request_type;
278 Allocator<Node, Path> *allocator() const {
279 return allocator_.get();
// NOTE(review): the signature owning the next statement is not visible here.
283 return allocator_->newNode();
286 bool has_constraint() const;
287 int boundary_constraint(size_t pos) const;
288 const char *feature_constraint(size_t begin_pos) const;
290 void set_boundary_constraint(size_t pos,
291 int boundary_constraint_type);
293 void set_feature_constraint(size_t begin_pos, size_t end_pos,
294 const char *feature);
// Rebuilds the lattice from an already formatted analysis result.
296 void set_result(const char *result);
298 const char *what() const { return what_.c_str(); }
300 void set_what(const char *str) {
// Formatting entry points: overloads without a buffer write into the
// internal StringBuffer, the others into the caller-supplied buffer.
304 const char *toString();
305 const char *toString(char *buf, size_t size);
306 const char *toString(const Node *node);
307 const char *toString(const Node *node,
308 char *buf, size_t size);
309 const char *enumNBestAsString(size_t N);
310 const char *enumNBestAsString(size_t N, char *buf, size_t size);
313 const char *sentence_;
319 std::vector<Node *> end_nodes_;
320 std::vector<Node *> begin_nodes_;
321 std::vector<const char *> feature_constraint_;
322 std::vector<unsigned char> boundary_constraint_;
323 const Writer *writer_;
324 scoped_ptr<StringBuffer> ostrs_;
325 scoped_ptr<Allocator<Node, Path> > allocator_;
// Returns the internal StringBuffer; the visible line allocates it, presumably
// only when it does not exist yet (the guard is not visible).
327 StringBuffer *stream() {
329 ostrs_.reset(new StringBuffer);
334 const char *toStringInternal(StringBuffer *os);
335 const char *toStringInternal(const Node *node, StringBuffer *os);
336 const char *enumNBestAsStringInternal(size_t N, StringBuffer *os);
// Allocates a fresh Viterbi and Writer and starts with MECAB_ONE_BEST and
// theta 0.0; open(const Param &) later overwrites request_type_ and theta_.
339 ModelImpl::ModelImpl()
340 : viterbi_(new Viterbi), writer_(new Writer),
341 request_type_(MECAB_ONE_BEST), theta_(0.0) {}
343 ModelImpl::~ModelImpl() {
// Parses argv against long_options and loads the dictionary resource; when
// either step fails, param.what() becomes the global error. The rest of the
// function (presumably a call to the Param overload) is not visible here.
// NOTE(review): "¶m" looks like a mis-encoded "&param".
348 bool ModelImpl::open(int argc, char **argv) {
350 if (!param.open(argc, argv, long_options) ||
351 !load_dictionary_resource(¶m)) {
352 setGlobalError(param.what());
// Same as the argv overload, but parses a single argument string.
// NOTE(review): "¶m" looks like a mis-encoded "&param".
358 bool ModelImpl::open(const char *arg) {
360 if (!param.open(arg, long_options) ||
361 !load_dictionary_resource(¶m)) {
362 setGlobalError(param.what());
// Opens the Writer and then the Viterbi from |param|. Because of the
// short-circuit ||, the Viterbi is not opened when the Writer fails. On
// failure the global error is built from viterbi_->what() followed by
// writer_->what(). On success the default request type and theta are cached
// and is_available() is returned.
// NOTE(review): "¶m" looks like a mis-encoded "&param".
368 bool ModelImpl::open(const Param ¶m) {
369 if (!writer_->open(param) || !viterbi_->open(param)) {
370 std::string error = viterbi_->what();
371 if (!error.empty()) {
374 error.append(writer_->what());
375 setGlobalError(error.c_str());
379 request_type_ = load_request_type(param);
380 theta_ = param.get<double>("theta");
382 return is_available();
// Replaces this model's Viterbi and request type with those of |model|.
// |model| is placed in a scoped_ptr on entry, so it is presumably destroyed
// when this function returns, whether or not the swap succeeded -- TODO
// confirm scoped_ptr semantics.
385 bool ModelImpl::swap(Model *model) {
386 scoped_ptr<Model> model_data(model);
388 if (!is_available()) {
389 setGlobalError("current model is not available");
// Builds without HAVE_ATOMIC_OPS only report that swapping is unsupported.
392 #ifndef HAVE_ATOMIC_OPS
393 setGlobalError("atomic model replacement is not supported");
// NOTE(review): static_cast performs no run-time type check; the guard for
// the error below is not visible and presumably only detects a null pointer.
396 ModelImpl *m = static_cast<ModelImpl *>(model_data.get());
398 setGlobalError("Invalid model is passed");
402 if (!m->is_available()) {
403 setGlobalError("Passed model is not available");
// Remember the old Viterbi, install the new one under the writer lock, then
// delete the old one.
407 Viterbi *current_viterbi = viterbi_;
409 scoped_writer_lock l(mutex());
410 viterbi_ = m->take_viterbi();
411 request_type_ = m->request_type();
415 delete current_viterbi;
// Creates a TaggerImpl bound to this model and seeds it with the model's
// theta and request type. Failures are reported through the global error.
// NOTE(review): the lines that handle a failed open() are not visible, so
// whether the new tagger is released on that path cannot be confirmed here.
421 Tagger *ModelImpl::createTagger() const {
422 if (!is_available()) {
423 setGlobalError("Model is not available");
426 TaggerImpl *tagger = new TaggerImpl;
427 if (!tagger->open(*this)) {
428 setGlobalError(tagger->what());
432 tagger->set_theta(theta_);
433 tagger->set_request_type(request_type_);
// Creates a LatticeImpl that formats output through this model's Writer.
// Records a global error when the model is not available.
437 Lattice *ModelImpl::createLattice() const {
438 if (!is_available()) {
439 setGlobalError("Model is not available");
442 return new LatticeImpl(writer_.get());
// Starts with MECAB_ONE_BEST and kDefaultTheta; the first part of the
// initializer list is not visible in this extract.
445 TaggerImpl::TaggerImpl()
447 request_type_(MECAB_ONE_BEST), theta_(kDefaultTheta) {}
// Empty body: model_ and lattice_ are scoped_ptr members.
449 TaggerImpl::~TaggerImpl() {}
// Returns the tagger's stored error message as a C string.
451 const char *TaggerImpl::what() const {
452 return what_.c_str();
// Builds and opens a private ModelImpl from argv-style arguments. On success
// the tagger points current_model_ at it and copies its default request type
// and theta. The failure branch body is not visible here.
455 bool TaggerImpl::open(int argc, char **argv) {
456 model_.reset(new ModelImpl);
457 if (!model_->open(argc, argv)) {
461 current_model_ = model_.get();
462 request_type_ = model()->request_type();
463 theta_ = model()->theta();
// Same as the argv overload, but the private model is opened from a single
// argument string.
467 bool TaggerImpl::open(const char *arg) {
468 model_.reset(new ModelImpl);
469 if (!model_->open(arg)) {
473 current_model_ = model_.get();
474 request_type_ = model()->request_type();
475 theta_ = model()->theta();
// Binds the tagger to an existing model without taking ownership: only the
// address is stored, so |model| has to outlive this tagger. The model's
// default request type and theta are copied.
479 bool TaggerImpl::open(const ModelImpl &model) {
480 if (!model.is_available()) {
484 current_model_ = &model;
485 request_type_ = current_model_->request_type();
486 theta_ = current_model_->theta();
// Replaces the whole request-type bit mask.
490 void TaggerImpl::set_request_type(int request_type) {
491 request_type_ = request_type;
// Returns the current request-type bit mask.
494 int TaggerImpl::request_type() const {
495 return request_type_;
// Sets or clears the MECAB_PARTIAL bit; the if/else lines that choose between
// the two statements are not visible here.
498 void TaggerImpl::set_partial(bool partial) {
500 request_type_ |= MECAB_PARTIAL;
502 request_type_ &= ~MECAB_PARTIAL;
// True when the MECAB_PARTIAL bit is set.
506 bool TaggerImpl::partial() const {
507 return request_type_ & MECAB_PARTIAL;
510 void TaggerImpl::set_theta(float theta) {
514 float TaggerImpl::theta() const {
// Maps the deprecated lattice level onto request-type bits:
// 0 -> MECAB_ONE_BEST, 1 -> MECAB_NBEST, 2 -> MECAB_MARGINAL_PROB.
// The visible cases only OR bits in; none of them clears a bit.
518 void TaggerImpl::set_lattice_level(int level) {
520 case 0: request_type_ |= MECAB_ONE_BEST;
522 case 1: request_type_ |= MECAB_NBEST;
524 case 2: request_type_ |= MECAB_MARGINAL_PROB;
// Derives the deprecated lattice level from the request-type bits, testing
// MECAB_MARGINAL_PROB before MECAB_NBEST. The returned values are not visible
// in this extract.
531 int TaggerImpl::lattice_level() const {
532 if (request_type_ & MECAB_MARGINAL_PROB) {
534 } else if (request_type_ & MECAB_NBEST) {
// Sets or clears the MECAB_ALL_MORPHS bit; the if/else lines that choose
// between the two statements are not visible here.
541 void TaggerImpl::set_all_morphs(bool all_morphs) {
543 request_type_ |= MECAB_ALL_MORPHS;
545 request_type_ &= ~MECAB_ALL_MORPHS;
// True when the MECAB_ALL_MORPHS bit is set.
549 bool TaggerImpl::all_morphs() const {
550 return request_type_ & MECAB_ALL_MORPHS;
// Analyzes |lattice| with the current model's Viterbi. When HAVE_ATOMIC_OPS
// is defined a reader lock on the model's mutex is taken first, which pairs
// with the writer lock in ModelImpl::swap().
553 bool TaggerImpl::parse(Lattice *lattice) const {
554 #ifdef HAVE_ATOMIC_OPS
555 scoped_reader_lock l(model()->mutex());
558 return model()->viterbi()->analyze(lattice);
// Convenience overload: |str| is passed to std::strlen, so it has to be a
// non-null, NUL-terminated string.
561 const char *TaggerImpl::parse(const char *str) {
562 return parse(str, std::strlen(str));
// Parses |len| bytes of |str| on the tagger's own lattice and formats the
// result with lattice->toString(). The lattice's error text is copied into
// the tagger with set_what(), both after a failed parse and, later, around
// the toString() result (that guard is not visible here).
// The returned string is produced by the tagger's lattice, so it presumably
// stays valid only until that lattice is reused -- TODO confirm.
565 const char *TaggerImpl::parse(const char *str, size_t len) {
566 Lattice *lattice = mutable_lattice();
567 lattice->set_sentence(str, len);
569 if (!parse(lattice)) {
570 set_what(lattice->what());
573 const char *result = lattice->toString();
575 set_what(lattice->what());
// Same as parse(str, len), but the formatted result is written into the
// caller's buffer |out| of |len2| bytes through lattice->toString(out, len2).
581 const char *TaggerImpl::parse(const char *str, size_t len,
582 char *out, size_t len2) {
583 Lattice *lattice = mutable_lattice();
584 lattice->set_sentence(str, len);
586 if (!parse(lattice)) {
587 set_what(lattice->what());
590 const char *result = lattice->toString(out, len2);
592 set_what(lattice->what());
// Convenience overload: measures |str| with std::strlen (so it has to be
// NUL-terminated) and forwards to the two-argument overload.
598 const Node *TaggerImpl::parseToNode(const char *str) {
599 return parseToNode(str, std::strlen(str));
// Parses |len| bytes of |str| on the tagger's own lattice and returns its BOS
// node; on failure the lattice's error text is copied into the tagger.
602 const Node *TaggerImpl::parseToNode(const char *str, size_t len) {
603 Lattice *lattice = mutable_lattice();
604 lattice->set_sentence(str, len);
606 if (!parse(lattice)) {
607 set_what(lattice->what());
610 return lattice->bos_node();
// Convenience overload: measures |str| with std::strlen and forwards.
613 bool TaggerImpl::parseNBestInit(const char *str) {
614 return parseNBestInit(str, std::strlen(str));
// Prepares N-best enumeration: loads the sentence, adds MECAB_NBEST to the
// lattice's request type and runs the parse. Results are then fetched with
// next() / nextNode().
617 bool TaggerImpl::parseNBestInit(const char *str, size_t len) {
618 Lattice *lattice = mutable_lattice();
619 lattice->set_sentence(str, len);
621 lattice->add_request_type(MECAB_NBEST);
622 if (!parse(lattice)) {
623 set_what(lattice->what());
// Advances the lattice to the next N-best result and returns its BOS node.
// NOTE(review): in the visible lines "no more results" is stored on the
// lattice, not on the tagger via set_what(); confirm that callers reading
// Tagger::what() still see it.
629 const Node* TaggerImpl::nextNode() {
630 Lattice *lattice = mutable_lattice();
631 if (!lattice->next()) {
632 lattice->set_what("no more results");
635 return lattice->bos_node();
// Advances to the next N-best result and formats it with lattice->toString().
// When the lattice is exhausted, "no more results" is stored on the lattice.
638 const char* TaggerImpl::next() {
639 Lattice *lattice = mutable_lattice();
640 if (!lattice->next()) {
641 lattice->set_what("no more results");
644 const char *result = lattice->toString();
646 set_what(lattice->what());
// Same as next(), but the formatted result goes into the caller's buffer
// |out| of |len2| bytes.
652 const char* TaggerImpl::next(char *out, size_t len2) {
653 Lattice *lattice = mutable_lattice();
654 if (!lattice->next()) {
655 lattice->set_what("no more results");
658 const char *result = lattice->toString(out, len2);
660 set_what(lattice->what());
// Convenience overload: measures |str| with std::strlen and forwards.
666 const char* TaggerImpl::parseNBest(size_t N, const char* str) {
667 return parseNBest(N, str, std::strlen(str));
// Parses the sentence with MECAB_NBEST added to the lattice's request type
// and returns the |N| best results formatted by enumNBestAsString(). The
// lattice's error text is copied into the tagger on failure.
670 const char* TaggerImpl::parseNBest(size_t N,
671 const char* str, size_t len) {
672 Lattice *lattice = mutable_lattice();
673 lattice->set_sentence(str, len);
675 lattice->add_request_type(MECAB_NBEST);
677 if (!parse(lattice)) {
678 set_what(lattice->what());
682 const char *result = lattice->enumNBestAsString(N);
684 set_what(lattice->what());
// Same as parseNBest(N, str, len), but the formatted N-best output is written
// into the caller's buffer |out| of |len2| bytes.
690 const char* TaggerImpl::parseNBest(size_t N, const char* str, size_t len,
691 char *out, size_t len2) {
692 Lattice *lattice = mutable_lattice();
693 lattice->set_sentence(str, len);
695 lattice->add_request_type(MECAB_NBEST);
697 if (!parse(lattice)) {
698 set_what(lattice->what());
702 const char *result = lattice->enumNBestAsString(N, out, len2);
704 set_what(lattice->what());
// Formats a single node through the tagger's lattice; the lattice's error
// text is copied into the tagger (the guard around that is not visible).
710 const char* TaggerImpl::formatNode(const Node* node) {
711 const char *result = mutable_lattice()->toString(node);
713 set_what(mutable_lattice()->what());
// Same as formatNode(node), but writes into the caller's buffer |out| of
// |len| bytes.
719 const char* TaggerImpl::formatNode(const Node* node,
720 char *out, size_t len) {
721 const char *result = mutable_lattice()->toString(node, out, len);
723 set_what(mutable_lattice()->what());
// Forwards to the current model's dictionary_info().
729 const DictionaryInfo *TaggerImpl::dictionary_info() const {
730 return model()->dictionary_info();
// Starts with an empty sentence, kDefaultTheta, Z = 0.0 and MECAB_ONE_BEST,
// creates the node/path allocator and reserves MIN_INPUT_BUFFER_SIZE slots in
// both node arrays. Part of the initializer list is not visible here.
733 LatticeImpl::LatticeImpl(const Writer *writer)
734 : sentence_(0), size_(0), theta_(kDefaultTheta), Z_(0.0),
735 request_type_(MECAB_ONE_BEST),
738 allocator_(new Allocator<Node, Path>) {
739 begin_nodes_.reserve(MIN_INPUT_BUFFER_SIZE);
740 end_nodes_.reserve(MIN_INPUT_BUFFER_SIZE);
// Empty body: the buffer and allocator are scoped_ptr members and the node
// arrays are std::vector members.
743 LatticeImpl::~LatticeImpl() {}
// Resets the lattice. The visible lines empty begin_nodes_ and both
// constraint vectors and restore theta to kDefaultTheta; the remaining resets
// are not visible in this extract.
745 void LatticeImpl::clear() {
750 begin_nodes_.clear();
752 feature_constraint_.clear();
753 boundary_constraint_.clear();
755 theta_ = kDefaultTheta;
// Convenience overload: |sentence| is measured with strlen, so it has to be a
// non-null, NUL-terminated string.
760 void LatticeImpl::set_sentence(const char *sentence) {
761 return set_sentence(sentence, strlen(sentence));
// Installs a new input sentence of |len| bytes.
// Both node arrays are resized to len + 4 entries and zero-filled. When
// MECAB_ALLOCATE_SENTENCE or MECAB_PARTIAL is requested the text is copied
// into the lattice's allocator; otherwise only the caller's pointer is
// stored, so the caller's buffer has to stay alive while the lattice is used.
764 void LatticeImpl::set_sentence(const char *sentence, size_t len) {
766 end_nodes_.resize(len + 4);
767 begin_nodes_.resize(len + 4);
769 if (has_request_type(MECAB_ALLOCATE_SENTENCE) ||
770 has_request_type(MECAB_PARTIAL)) {
771 char *new_sentence = allocator()->strdup(sentence, len);
772 sentence_ = new_sentence;
774 sentence_ = sentence;
// Clear every slot so that stale node pointers from a previous sentence are
// not seen.
778 std::memset(&end_nodes_[0], 0,
779 sizeof(end_nodes_[0]) * (len + 4));
780 std::memset(&begin_nodes_[0], 0,
781 sizeof(begin_nodes_[0]) * (len + 4));
// Advances to the next N-best path. Requires MECAB_NBEST in the request type
// (otherwise an error message is recorded); asks the allocator's N-best
// generator for the next path and rebuilds the result with
// Viterbi::buildResultForNBest().
784 bool LatticeImpl::next() {
785 if (!has_request_type(MECAB_NBEST)) {
786 set_what("MECAB_NBEST request type is not set");
790 if (!allocator()->nbest_generator()->next()) {
794 Viterbi::buildResultForNBest(this);
// Rebuilds the lattice from an already formatted result: one
// "surface<TAB>feature" pair per line, with a line equal to "EOS" handled
// specially (its branch body is not visible here). The surfaces form the new
// sentence and a BOS -> tokens -> EOS chain is created.
// NOTE(review): several lines (declarations of |cols| and |offset|, the
// prev/next linking inside the loop) are not visible in this extract.
798 void LatticeImpl::set_result(const char *result) {
// Work on a private copy because tokenize() receives a mutable buffer.
799 char *str = allocator()->strdup(result, std::strlen(result));
800 std::vector<char *> lines;
801 const size_t lsize = tokenize(str, "\n",
802 std::back_inserter(lines),
803 std::strlen(result));
804 CHECK_DIE(lsize == lines.size());
806 std::string sentence;
807 std::vector<std::string> surfaces, features;
808 for (size_t i = 0; i < lines.size(); ++i) {
809 if (::strcmp("EOS", lines[i]) == 0) {
// Each remaining line has to split into exactly two tab-separated columns.
813 if (tokenize(lines[i], "\t", cols, 2) != 2) {
817 surfaces.push_back(cols[0]);
818 features.push_back(cols[1]);
821 CHECK_DIE(features.size() == surfaces.size());
// The sentence text is copied into the allocator before being installed.
823 set_sentence(allocator()->strdup(sentence.c_str(), sentence.size()));
825 Node *bos_node = allocator()->newNode();
826 bos_node->surface = const_cast<const char *>(BOS_KEY); // dummy
827 bos_node->feature = "BOS/EOS";
828 bos_node->isbest = 1;
829 bos_node->stat = MECAB_BOS_NODE;
831 Node *eos_node = allocator()->newNode();
832 eos_node->surface = const_cast<const char *>(BOS_KEY); // dummy
833 eos_node->feature = "BOS/EOS";
834 eos_node->isbest = 1;
835 eos_node->stat = MECAB_EOS_NODE;
// The dummy BOS surface assigned above is replaced by the sentence start.
837 bos_node->surface = sentence_;
838 end_nodes_[0] = bos_node;
// Create one node per token and register it at its begin and end offsets.
841 Node *prev = bos_node;
842 for (size_t i = 0; i < surfaces.size(); ++i) {
843 Node *node = allocator()->newNode();
846 node->surface = sentence_ + offset;
847 node->length = surfaces[i].size();
848 node->rlength = surfaces[i].size();
850 node->stat = MECAB_NOR_NODE;
853 node->feature = allocator()->strdup(features[i].c_str(),
855 begin_nodes_[offset] = node;
856 end_nodes_[offset + node->length] = node;
857 offset += node->length;
// Close the chain with the EOS node.
861 prev->next = eos_node;
862 eos_node->prev = prev;
865 // default implementation of Lattice formatter.
// Walks the result from the node after BOS and stops at the node whose |next|
// is null, so the last node is not printed. For each node it writes
// "surface<TAB>feature"; the line terminator is not visible in this extract.
867 void writeLattice(Lattice *lattice, StringBuffer *os) {
868 for (const Node *node = lattice->bos_node()->next;
869 node->next; node = node->next) {
870 os->write(node->surface, node->length);
871 *os << '\t' << node->feature;
// Formats the current result into the lattice's internal StringBuffer.
878 const char *LatticeImpl::toString() {
879 return toStringInternal(stream());
// Formats the current result into the caller's buffer |buf| of |size| bytes,
// wrapped in a local StringBuffer.
882 const char *LatticeImpl::toString(char *buf, size_t size) {
883 StringBuffer os(buf, size);
884 return toStringInternal(&os);
// Shared formatter: uses writer_->write() or falls back to writeLattice().
// The condition choosing between them is not visible; presumably the fallback
// is taken when no Writer was supplied. Records "output buffer overflow" when
// the buffer could not hold the output (that check is also not visible).
887 const char *LatticeImpl::toStringInternal(StringBuffer *os) {
890 if (!writer_->write(this, os)) {
894 writeLattice(this, os);
898 set_what("output buffer overflow");
// Formats a single node into the lattice's internal StringBuffer.
904 const char *LatticeImpl::toString(const Node *node) {
905 return toStringInternal(node, stream());
// Formats a single node into the caller's buffer |buf| of |size| bytes.
908 const char *LatticeImpl::toString(const Node *node,
909 char *buf, size_t size) {
910 StringBuffer os(buf, size);
911 return toStringInternal(node, &os);
// Shared single-node formatter. Records "node is NULL" for a null node
// (guard not visible), uses writer_->writeNode() or a plain
// "surface<TAB>feature" fallback, and records "output buffer overflow" when
// the output did not fit.
914 const char *LatticeImpl::toStringInternal(const Node *node,
918 set_what("node is NULL");
922 if (!writer_->writeNode(this, node, os)) {
926 os->write(node->surface, node->length);
927 *os << '\t' << node->feature;
931 set_what("output buffer overflow");
// Formats up to |N| best results into the lattice's internal StringBuffer.
937 const char *LatticeImpl::enumNBestAsString(size_t N) {
938 return enumNBestAsStringInternal(N, stream());
// Formats up to |N| best results into the caller's buffer |buf| of |size|
// bytes.
941 const char *LatticeImpl::enumNBestAsString(size_t N, char *buf, size_t size) {
942 StringBuffer os(buf, size);
943 return enumNBestAsStringInternal(N, &os);
// Shared N-best formatter. Rejects N == 0 and N > NBEST_MAX, then formats up
// to N results (advancing between them happens on lines not visible here) and
// finally emits an end-of-N-best marker through a dummy EON node.
946 const char *LatticeImpl::enumNBestAsStringInternal(size_t N,
// NOTE(review): the message hard-codes 512 while the check uses NBEST_MAX;
// confirm the two agree.
950 if (N == 0 || N > NBEST_MAX) {
951 set_what("nbest size must be 1 <= nbest <= 512");
955 for (size_t i = 0; i < N; ++i) {
960 if (!writer_->write(this, os)) {
964 writeLattice(this, os);
968 // make a dummy node for EON
// The node is zero-filled, marked MECAB_EON_NODE and pointed at the end of
// the sentence before being passed to the writer.
971 memset(&eon_node, 0, sizeof(eon_node));
972 eon_node.stat = MECAB_EON_NODE;
974 eon_node.surface = this->sentence() + this->size();
975 if (!writer_->writeNode(this, &eon_node, os)) {
982 set_what("output buffer overflow");
// True once any boundary constraint storage exists. set_feature_constraint()
// also fills the boundary vector, so it makes this true as well.
989 bool LatticeImpl::has_constraint() const {
990 return !boundary_constraint_.empty();
// Boundary constraint at |pos|, or MECAB_ANY_BOUNDARY when none were set.
// |pos| is not range-checked.
993 int LatticeImpl::boundary_constraint(size_t pos) const {
994 if (!boundary_constraint_.empty()) {
995 return boundary_constraint_[pos];
997 return MECAB_ANY_BOUNDARY;
// Feature constraint registered at |begin_pos| when any were set; the value
// returned otherwise is not visible here. |begin_pos| is not range-checked.
1000 const char *LatticeImpl::feature_constraint(size_t begin_pos) const {
1001 if (!feature_constraint_.empty()) {
1002 return feature_constraint_[begin_pos];
// Stores a boundary constraint at |pos|. On first use the vector is sized to
// size() + 4 entries filled with MECAB_ANY_BOUNDARY, so the sentence has to
// be set beforehand.
// NOTE(review): |pos| is not range-checked against that size.
1007 void LatticeImpl::set_boundary_constraint(size_t pos,
1008 int boundary_constraint_type) {
1009 if (boundary_constraint_.empty()) {
1010 boundary_constraint_.resize(size() + 4, MECAB_ANY_BOUNDARY);
1012 boundary_constraint_[pos] = boundary_constraint_type;
// Constrains the span [begin_pos, end_pos) to carry |feature|. Empty spans
// and a null feature are rejected by the first check (its body is not
// visible). The span ends become token boundaries, positions strictly inside
// are marked MECAB_INSIDE_TOKEN, and the feature pointer is stored at
// |begin_pos| without being copied.
// NOTE(review): only end_pos is clamped to size(); begin_pos is not
// range-checked in the visible lines.
1015 void LatticeImpl::set_feature_constraint(size_t begin_pos, size_t end_pos,
1016 const char *feature) {
1017 if (begin_pos >= end_pos || !feature) {
1021 if (feature_constraint_.empty()) {
1022 feature_constraint_.resize(size() + 4, 0);
1025 end_pos = std::min(end_pos, size());
1027 set_boundary_constraint(begin_pos, MECAB_TOKEN_BOUNDARY);
1028 set_boundary_constraint(end_pos, MECAB_TOKEN_BOUNDARY);
1029 for (size_t i = begin_pos + 1; i < end_pos; ++i) {
1030 set_boundary_constraint(i, MECAB_INSIDE_TOKEN);
1033 feature_constraint_[begin_pos] = feature;
// Static factory; forwards to the free function createTagger(argc, argv).
1037 Tagger *Tagger::create(int argc, char **argv) {
1038 return createTagger(argc, argv);
// Static factory; forwards to the free function createTagger(arg).
1041 Tagger *Tagger::create(const char *arg) {
1042 return createTagger(arg);
1045 const char *Tagger::version() {
// Creates a TaggerImpl and opens it from argv-style arguments; when open()
// fails the tagger's message is published as the global error. The cleanup
// and return statements are not visible in this extract.
1049 Tagger *createTagger(int argc, char **argv) {
1050 TaggerImpl *tagger = new TaggerImpl();
1051 if (!tagger->open(argc, argv)) {
1052 setGlobalError(tagger->what());
// Same as createTagger(argc, argv), but opens the tagger from a single
// argument string.
1059 Tagger *createTagger(const char *argv) {
1060 TaggerImpl *tagger = new TaggerImpl();
1061 if (!tagger->open(argv)) {
1062 setGlobalError(tagger->what());
1069 void deleteTagger(Tagger *tagger) {
// Alias of getLastError().
1073 const char *getTaggerError() {
1074 return getLastError();
// Returns the message most recently stored with setGlobalError().
1077 const char *getLastError() {
1078 return getGlobalError();
// Creates a ModelImpl and opens it from argv-style arguments. The failure
// handling and return statement are not visible in this extract.
1081 Model *createModel(int argc, char **argv) {
1082 ModelImpl *model = new ModelImpl;
1083 if (!model->open(argc, argv)) {
// Same as createModel(argc, argv), but opens the model from a single argument
// string.
1090 Model *createModel(const char *arg) {
1091 ModelImpl *model = new ModelImpl;
1092 if (!model->open(arg)) {
1099 void deleteModel(Model *model) {
// Static factory; forwards to the free function createModel(argc, argv).
1103 Model *Model::create(int argc, char **argv) {
1104 return createModel(argc, argv);
// Static factory; forwards to the free function createModel(arg).
1107 Model *Model::create(const char *arg) {
1108 return createModel(arg);
1111 const char *Model::version() {
// One-shot helper: creates a temporary tagger from |model|, parses |lattice|
// with it and releases the tagger when the scoped_ptr goes out of scope.
// NOTE(review): ModelImpl::createTagger() records an error for an unavailable
// model and presumably returns null in that case; the result is dereferenced
// here without a check -- confirm.
1115 bool Tagger::parse(const Model &model, Lattice *lattice) {
1116 scoped_ptr<Tagger> tagger(model.createTagger());
1117 return tagger->parse(lattice);
// Static factory; forwards to the free function createLattice().
1120 Lattice *Lattice::create() {
1121 return createLattice();
// Creates a LatticeImpl with the default (null) Writer; the caller owns the
// returned object.
1124 Lattice *createLattice() {
1125 return new LatticeImpl;
1128 void deleteLattice(Lattice *lattice) {
// Command-line driver. It parses options, handles the informational flags
// (help, version, dump-config, dictionary-info), opens a model and a tagger,
// then reads every input named on the command line and writes the analysis to
// the output stream. Returns EXIT_SUCCESS or EXIT_FAILURE.
// WHAT_ERROR(msg) prints msg to std::cout and returns EXIT_FAILURE.
// NOTE(review): many lines of this function (declarations, loop headers,
// closing braces) are not visible in this extract.
1133 int mecab_do(int argc, char **argv) {
1134 #define WHAT_ERROR(msg) do { \
1135 std::cout << msg << std::endl; \
1136 return EXIT_FAILURE; } \
1140 if (!param.open(argc, argv, MeCab::long_options)) {
1141 std::cout << param.what() << std::endl;
1142 return EXIT_FAILURE;
1145 if (param.get<bool>("help")) {
1146 std::cout << param.help() << std::endl;
1147 return EXIT_SUCCESS;
1150 if (param.get<bool>("version")) {
1151 std::cout << param.version() << std::endl;
1152 return EXIT_SUCCESS;
// NOTE(review): a failed dictionary-resource load prints the error but
// returns EXIT_SUCCESS; this looks unintended -- confirm.
// NOTE(review): "¶m" looks like a mis-encoded "&param".
1155 if (!load_dictionary_resource(¶m)) {
1156 std::cout << param.what() << std::endl;
1157 return EXIT_SUCCESS;
// Deprecation warning only; processing continues.
// NOTE(review): the message misspells "DEPRECATED" (runtime string, left
// untouched).
1160 if (param.get<int>("lattice-level") >= 1) {
1161 std::cerr << "lattice-level is DEPERCATED. "
1162 << "use --marginal or --nbest." << std::endl;
1165 MeCab::scoped_ptr<MeCab::ModelImpl> model(new MeCab::ModelImpl);
1166 if (!model->open(param)) {
1167 std::cout << MeCab::getLastError() << std::endl;
1168 return EXIT_FAILURE;
// Output file name; the handling of an empty name is not visible here.
1171 std::string ofilename = param.get<std::string>("output");
1172 if (ofilename.empty()) {
// N-best size has to satisfy 1 <= nbest <= NBEST_MAX.
1176 const int nbest = param.get<int>("nbest");
1177 if (nbest <= 0 || nbest > NBEST_MAX) {
1178 WHAT_ERROR("invalid N value");
1181 MeCab::ostream_wrapper ofs(ofilename.c_str());
1183 WHAT_ERROR("no such file or directory: " << ofilename);
// NOTE(review): both informational modes below return EXIT_FAILURE after
// printing successfully; confirm whether that exit status is intended.
1186 if (param.get<bool>("dump-config")) {
1187 param.dump_config(&*ofs);
1188 return EXIT_FAILURE;
1191 if (param.get<bool>("dictionary-info")) {
1192 for (const MeCab::DictionaryInfo *d = model->dictionary_info();
1194 *ofs << "filename:\t" << d->filename << std::endl;
1195 *ofs << "version:\t" << d->version << std::endl;
1196 *ofs << "charset:\t" << d->charset << std::endl;
1197 *ofs << "type:\t" << d->type << std::endl;
1198 *ofs << "size:\t" << d->size << std::endl;
1199 *ofs << "left size:\t" << d->lsize << std::endl;
1200 *ofs << "right size:\t" << d->rsize << std::endl;
1203 return EXIT_FAILURE;
// Remaining arguments are the inputs. "-" is appended on a path whose guard
// is not visible (presumably when no input was given).
1206 const std::vector<std::string>& rest_ = param.rest_args();
1207 std::vector<std::string> rest = rest_;
1210 rest.push_back("-");
// Clamp the requested buffer size to
// [MIN_INPUT_BUFFER_SIZE, MAX_INPUT_BUFFER_SIZE].
1213 size_t ibufsize = std::min(MAX_INPUT_BUFFER_SIZE,
1214 std::max(param.get<int>
1215 ("input-buffer-size"),
1216 MIN_INPUT_BUFFER_SIZE));
1218 const bool partial = param.get<bool>("partial");
1223 MeCab::scoped_array<char> ibuf_data(new char[ibufsize]);
1224 char *ibuf = ibuf_data.get();
1226 MeCab::scoped_ptr<MeCab::Tagger> tagger(model->createTagger());
1228 if (!tagger.get()) {
1229 WHAT_ERROR("cannot create tagger");
// Process every input in turn.
1232 for (size_t i = 0; i < rest.size(); ++i) {
1233 MeCab::istream_wrapper ifs(rest[i].c_str());
1235 WHAT_ERROR("no such file or directory: " << rest[i]);
// Two reading strategies are visible: a single getline() into ibuf, and a
// loop that accumulates lines into |sentence| until "EOS" or an empty line.
// Presumably the second one is the partial-parsing path; the branch is not
// visible here.
1240 ifs->getline(ibuf, ibufsize);
1242 std::string sentence;
1243 MeCab::scoped_fixed_array<char, BUF_SIZE> line;
1245 if (!ifs->getline(line.get(), line.size())) {
1246 ifs->clear(std::ios::eofbit|std::ios::badbit);
1249 sentence += line.get();
1251 if (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0') {
// NOTE(review): std::strncpy does not NUL-terminate ibuf when
// sentence.size() >= ibufsize; confirm that case is handled.
1255 std::strncpy(ibuf, sentence.c_str(), ibufsize);
1257 if (ifs->eof() && !ibuf[0]) {
1261 std::cerr << "input-buffer overflow. "
1262 << "The line is split. use -b #SIZE option." << std::endl;
// Use N-best output only when at least two results were requested.
1265 const char *r = (nbest >= 2) ? tagger->parseNBest(nbest, ibuf) :
1266 tagger->parse(ibuf);
1268 WHAT_ERROR(tagger->what());
1270 *ofs << r << std::flush;
1274 return EXIT_SUCCESS;