// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
// Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
// Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
14 #ifdef HAVE_SYS_TYPES_H
15 #include <sys/types.h>
30 #if defined(_WIN32) && !defined(__CYGWIN__)
31 extern HINSTANCE DllInstance;
#if defined(_WIN32) && !defined(__CYGWIN__)
// Converts a UTF-8 string to a wide (UTF-16) string for the *W Win32 APIs.
// The conversion stops at the first NUL in |input|. Returns an empty string
// when |input| is empty or the conversion fails.
std::wstring Utf8ToWide(const std::string &input) {
  // First pass: query the required length. Because the source length is
  // given as -1, the count includes the terminating NUL.
  int output_length = ::MultiByteToWideChar(CP_UTF8, 0,
                                            input.c_str(), -1, NULL, 0);
  output_length = output_length <= 0 ? 0 : output_length - 1;
  if (output_length == 0) {
    return L"";
  }
  // Second pass: convert into a buffer with room for the terminator.
  scoped_array<wchar_t> input_wide(new wchar_t[output_length + 1]);
  const int result = ::MultiByteToWideChar(CP_UTF8, 0, input.c_str(), -1,
                                           input_wide.get(), output_length + 1);
  std::wstring output;
  if (result > 0) {
    output.assign(input_wide.get());
  }
  return output;
}

// Converts a wide (UTF-16) string to UTF-8. The conversion stops at the first
// NUL in |input|. Returns an empty string when the conversion fails.
std::string WideToUtf8(const std::wstring &input) {
  // First pass: query the required byte count (terminating NUL included).
  const int output_length = ::WideCharToMultiByte(CP_UTF8, 0,
                                                  input.c_str(), -1, NULL, 0,
                                                  NULL, NULL);
  if (output_length == 0) {
    return "";
  }

  // Second pass: perform the conversion.
  scoped_array<char> input_encoded(new char[output_length + 1]);
  const int result = ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1,
                                           input_encoded.get(),
                                           output_length + 1, NULL, NULL);
  std::string output;
  if (result > 0) {
    output.assign(input_encoded.get());
  }
  return output;
}
#endif
79 int decode_charset(const char *charset) {
80 std::string tmp = charset;
82 if (tmp == "sjis" || tmp == "shift-jis" ||
83 tmp == "shift_jis" || tmp == "cp932")
85 else if (tmp == "euc" || tmp == "euc_jp" ||
88 else if (tmp == "utf8" || tmp == "utf_8" ||
91 else if (tmp == "utf16" || tmp == "utf_16" ||
94 else if (tmp == "utf16be" || tmp == "utf_16be" ||
97 else if (tmp == "utf16le" || tmp == "utf_16le" ||
100 else if (tmp == "ascii")
103 return UTF8; // default is UTF8
// Joins a directory and a file name. A path separator ('\\' on native
// Windows, '/' elsewhere) is inserted only when |path| is non-empty and does
// not already end with one; an empty |path| yields |file| unchanged.
std::string create_filename(const std::string &path,
                            const std::string &file) {
  std::string s = path;
#if defined(_WIN32) && !defined(__CYGWIN__)
  if (s.size() && s[s.size()-1] != '\\') s += '\\';
#else
  if (s.size() && s[s.size()-1] != '/') s += '/';
#endif
  s += file;
  return s;
}
// Strips the last path component from |*s| in place, leaving the directory
// part without its trailing separator ("/a/b/c" -> "/a/b"). When |*s|
// contains no separator it becomes ".". The separator is '\\' on native
// Windows and '/' elsewhere.
// Note that "/file" becomes "" (not "/"), since everything from the last
// separator onwards is dropped.
void remove_filename(std::string *s) {
#if defined(_WIN32) && !defined(__CYGWIN__)
  const char kSeparator = '\\';
#else
  const char kSeparator = '/';
#endif
  const std::string::size_type pos = s->rfind(kSeparator);
  if (pos != std::string::npos) {
    *s = s->substr(0, pos);
  } else {
    *s = ".";
  }
}
// Strips the directory part from |*s| in place, keeping what follows the last
// path separator ("/a/b/c.csv" -> "c.csv"). The separator is '\\' on native
// Windows and '/' elsewhere.
// NOTE(review): when |*s| contains no separator it is replaced by "." rather
// than being left unchanged. This fallback was restored from lines missing in
// this listing and looks surprising for a basename helper -- confirm that
// callers rely on it before changing it.
void remove_pathname(std::string *s) {
#if defined(_WIN32) && !defined(__CYGWIN__)
  const char kSeparator = '\\';
#else
  const char kSeparator = '/';
#endif
  const std::string::size_type pos = s->rfind(kSeparator);
  if (pos != std::string::npos) {
    *s = s->substr(pos + 1);
  } else {
    *s = ".";
  }
}
// Replaces the FIRST occurrence of |src| in |*s| with |dst|. Later
// occurrences are left alone, and |*s| is unchanged when |src| is not found.
void replace_string(std::string *s,
                    const std::string &src,
                    const std::string &dst) {
  const std::string::size_type pos = s->find(src);
  if (pos != std::string::npos) {
    s->replace(pos, src.size(), dst);
  }
}
171 void enum_csv_dictionaries(const char *path,
172 std::vector<std::string> *dics) {
175 #if defined(_WIN32) && !defined(__CYGWIN__)
176 WIN32_FIND_DATAW wfd;
178 const std::wstring pat = Utf8ToWide(create_filename(path, "*.csv"));
179 hFind = ::FindFirstFileW(pat.c_str(), &wfd);
180 CHECK_DIE(hFind != INVALID_HANDLE_VALUE)
181 << "Invalid File Handle. Get Last Error reports";
183 std::string tmp = create_filename(path, WideToUtf8(wfd.cFileName));
184 dics->push_back(tmp);
185 } while (::FindNextFileW(hFind, &wfd));
188 DIR *dir = opendir(path);
189 CHECK_DIE(dir) << "no such directory: " << path;
191 for (struct dirent *dp = readdir(dir);
194 const std::string tmp = dp->d_name;
195 if (tmp.size() >= 5) {
196 std::string ext = tmp.substr(tmp.size() - 4, 4);
199 dics->push_back(create_filename(path, tmp));
// Lower-cases the ASCII letters 'A'..'Z' of |*s| in place; every other byte
// is left untouched. Always returns true.
bool toLower(std::string *s) {
  for (size_t i = 0; i < s->size(); ++i) {
    char c = (*s)[i];
    if ((c >= 'A') && (c <= 'Z')) {
      c += 'a' - 'A';
      (*s)[i] = c;
    }
  }
  return true;
}
// Quotes a CSV field in place when necessary: if |*w| contains a comma or a
// double quote, it is wrapped in double quotes and every embedded double
// quote is doubled. Fields without those characters are left unchanged.
// Always returns true.
bool escape_csv_element(std::string *w) {
  if (w->find(',') != std::string::npos ||
      w->find('"') != std::string::npos) {
    std::string tmp = "\"";
    for (size_t j = 0; j < w->size(); j++) {
      if ((*w)[j] == '"') tmp += '"';  // double the embedded quote
      tmp += (*w)[j];
    }
    tmp += '"';
    *w = tmp;
  }
  return true;
}
// Prints a one-line textual progress bar ("message:  42% |####    | ") to
// stdout. The line is redrawn only when the integer percentage differs from
// the one printed by the previous call; it ends with '\r' so the next update
// overwrites it, or with '\n' once 100% is reached. Always returns 1.
//
// |current| is clamped to |total| so the bar never overflows its frame, and
// total == 0 is reported as 100% instead of dividing by zero.
// The last printed percentage lives in a function-local static, so calls for
// different tasks share it.
int progress_bar(const char* message, size_t current, size_t total) {
  static char bar[] = "###########################################";
  static int scale = sizeof(bar) - 1;
  static int prev = 0;

  int cur_percentage = 100;
  int bar_len = scale;
  if (total != 0) {
    const size_t done = current < total ? current : total;
    cur_percentage = static_cast<int>(100.0 * done/total);
    bar_len = static_cast<int>(1.0 * done*scale/total);
  }

  if (prev != cur_percentage) {
    printf("%s: %3d%% |%.*s%*s| ", message, cur_percentage,
           bar_len, bar, scale - bar_len, "");
    if (cur_percentage == 100)
      printf("\n");
    else
      printf("\r");
    fflush(stdout);
  }

  prev = cur_percentage;

  return 1;
}
255 int load_request_type(const Param ¶m) {
256 int request_type = MECAB_ONE_BEST;
258 if (param.get<bool>("allocate-sentence")) {
259 request_type |= MECAB_ALLOCATE_SENTENCE;
262 if (param.get<bool>("partial")) {
263 request_type |= MECAB_PARTIAL;
266 if (param.get<bool>("all-morphs")) {
267 request_type |= MECAB_ALL_MORPHS;
270 if (param.get<bool>("marginal")) {
271 request_type |= MECAB_MARGINAL_PROB;
274 const int nbest = param.get<int>("nbest");
276 request_type |= MECAB_NBEST;
280 const int lattice_level = param.get<int>("lattice-level");
281 if (lattice_level >= 1) {
282 request_type |= MECAB_NBEST;
285 if (lattice_level >= 2) {
286 request_type |= MECAB_MARGINAL_PROB;
292 bool load_dictionary_resource(Param *param) {
293 std::string rcfile = param->get<std::string>("rcfile");
296 if (rcfile.empty()) {
297 const char *homedir = getenv("HOME");
299 const std::string s = MeCab::create_filename(std::string(homedir),
301 std::ifstream ifs(WPATH(s.c_str()));
308 if (rcfile.empty()) {
309 const char *rcenv = getenv("MECABRC");
316 #if defined (HAVE_GETENV) && defined(_WIN32) && !defined(__CYGWIN__)
317 if (rcfile.empty()) {
318 scoped_fixed_array<wchar_t, BUF_SIZE> buf;
319 const DWORD len = ::GetEnvironmentVariableW(L"MECABRC",
322 if (len < buf.size() && len > 0) {
323 rcfile = WideToUtf8(buf.get());
328 #if defined(_WIN32) && !defined(__CYGWIN__)
330 scoped_fixed_array<wchar_t, BUF_SIZE> v;
332 DWORD size = v.size() * sizeof(v[0]);
334 if (rcfile.empty()) {
335 ::RegOpenKeyExW(HKEY_LOCAL_MACHINE, L"software\\mecab", 0, KEY_READ, &hKey);
336 ::RegQueryValueExW(hKey, L"mecabrc", 0, &vt,
337 reinterpret_cast<BYTE *>(v.get()), &size);
340 rcfile = WideToUtf8(v.get());
344 if (rcfile.empty()) {
345 ::RegOpenKeyExW(HKEY_CURRENT_USER, L"software\\mecab", 0, KEY_READ, &hKey);
346 ::RegQueryValueExW(hKey, L"mecabrc", 0, &vt,
347 reinterpret_cast<BYTE *>(v.get()), &size);
350 rcfile = WideToUtf8(v.get());
354 if (rcfile.empty()) {
355 vt = ::GetModuleFileNameW(DllInstance, v.get(), size);
357 scoped_fixed_array<wchar_t, _MAX_DRIVE> drive;
358 scoped_fixed_array<wchar_t, _MAX_DIR> dir;
359 _wsplitpath(v.get(), drive.get(), dir.get(), NULL, NULL);
360 const std::wstring path =
361 std::wstring(drive.get()) + std::wstring(dir.get()) + L"mecabrc";
362 if (::GetFileAttributesW(path.c_str()) != -1) {
363 rcfile = WideToUtf8(path);
369 if (rcfile.empty()) {
370 rcfile = MECAB_DEFAULT_RC;
373 if (!param->load(rcfile.c_str())) {
377 std::string dicdir = param->get<std::string>("dicdir");
378 if (dicdir.empty()) {
379 dicdir = "."; // current
381 remove_filename(&rcfile);
382 replace_string(&dicdir, "$(rcpath)", rcfile);
383 param->set<std::string>("dicdir", dicdir, true);
384 dicdir = create_filename(dicdir, DICRC);
386 if (!param->load(dicdir.c_str())) {
// Copied from MurmurHash3.cpp
// http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio
#if defined(_MSC_VER)

#define FORCE_INLINE __forceinline

#define ROTL32(x,y) _rotl(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers
#else  // defined(_MSC_VER)

#define FORCE_INLINE inline __attribute__((always_inline))

// Rotates |x| left by |r| bits. |r| must be in [1, 31]: r == 0 would shift
// by 32, which is undefined. Every call in this file uses a constant in range.
inline uint32_t rotl32 ( uint32_t x, uint8_t r ) {
  return (x << r) | (x >> (32 - r));
}

#define ROTL32(x,y) rotl32(x,y)

#endif  // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

// Returns the 32-bit word at index |i| relative to |p| (|i| may be negative).
// The word is read in host byte order through memcpy, so the underlying key
// does not need to be 4-byte aligned.
FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) {
  uint32_t value;
  std::memcpy(&value, p + i, sizeof(value));
  return value;
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

FORCE_INLINE uint32_t fmix (uint32_t h) {
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

// Computes the 128-bit MurmurHash3 (x86 variant) of the |len| bytes at |key|
// using |seed|, and writes the 16 result bytes to |out| as four 32-bit words
// in host byte order. |out| must have room for 16 bytes; |len| must be >= 0.
void MurmurHash3_x86_128(const void * key, const int len,
                         uint32_t seed, char *out) {
  const uint8_t * data = static_cast<const uint8_t *>(key);
  const int nblocks = len / 16;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  uint32_t c1 = 0x239b961b;
  uint32_t c2 = 0xab0e9789;
  uint32_t c3 = 0x38b34ae5;
  uint32_t c4 = 0xa1e38b93;

  //----------
  // body

  // |blocks| points just past the last full 16-byte block; the loop indexes
  // backwards from it with negative block numbers.
  const uint32_t * blocks =
      reinterpret_cast<const uint32_t *>(data + nblocks*16);

  for (int i = -nblocks; i; i++) {
    uint32_t k1 = getblock(blocks,i*4+0);
    uint32_t k2 = getblock(blocks,i*4+1);
    uint32_t k3 = getblock(blocks,i*4+2);
    uint32_t k4 = getblock(blocks,i*4+3);

    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;

    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }

  //----------
  // tail

  const uint8_t * tail = data + nblocks*16;

  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;

  // Every case deliberately falls through to the next one. The bytes are
  // widened to uint32_t first so that "<< 24" never shifts into the sign bit
  // of a promoted int.
  switch (len & 15) {
    case 15: k4 ^= static_cast<uint32_t>(tail[14]) << 16;
    case 14: k4 ^= static_cast<uint32_t>(tail[13]) << 8;
    case 13: k4 ^= static_cast<uint32_t>(tail[12]) << 0;
      k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

    case 12: k3 ^= static_cast<uint32_t>(tail[11]) << 24;
    case 11: k3 ^= static_cast<uint32_t>(tail[10]) << 16;
    case 10: k3 ^= static_cast<uint32_t>(tail[ 9]) << 8;
    case  9: k3 ^= static_cast<uint32_t>(tail[ 8]) << 0;
      k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

    case  8: k2 ^= static_cast<uint32_t>(tail[ 7]) << 24;
    case  7: k2 ^= static_cast<uint32_t>(tail[ 6]) << 16;
    case  6: k2 ^= static_cast<uint32_t>(tail[ 5]) << 8;
    case  5: k2 ^= static_cast<uint32_t>(tail[ 4]) << 0;
      k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

    case  4: k1 ^= static_cast<uint32_t>(tail[ 3]) << 24;
    case  3: k1 ^= static_cast<uint32_t>(tail[ 2]) << 16;
    case  2: k1 ^= static_cast<uint32_t>(tail[ 1]) << 8;
    case  1: k1 ^= static_cast<uint32_t>(tail[ 0]) << 0;
      k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  }

  //----------
  // finalization

  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  h1 = fmix(h1);
  h2 = fmix(h2);
  h3 = fmix(h3);
  h4 = fmix(h4);

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  std::memcpy(out, reinterpret_cast<char *>(&h1), 4);
  std::memcpy(out + 4, reinterpret_cast<char *>(&h2), 4);
  std::memcpy(out + 8, reinterpret_cast<char *>(&h3), 4);
  std::memcpy(out+ 12, reinterpret_cast<char *>(&h4), 4);
}
544 uint64_t fingerprint(const char *str, size_t size) {
545 uint64_t result[2] = { 0 };
546 const uint32_t kFingerPrint32Seed = 0xfd14deff;
547 MurmurHash3_x86_128(str, size, kFingerPrint32Seed,
548 reinterpret_cast<char *>(result));
552 uint64_t fingerprint(const std::string &str) {
553 return fingerprint(str.data(), str.size());
556 bool file_exists(const char *filename) {
557 std::ifstream ifs(WPATH(filename));