2 Darts -- Double-ARray Trie System
5 Copyright(C) 2001-2007 Taku Kudo <taku@chasen.org>
10 #define DARTS_VERSION "0.31"
19 #define SH(p)((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8))
20 #define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16))
27 template <class T> inline T _max(T x, T y) { return(x > y) ? x : y; }
28 template <class T> inline T* _resize(T* ptr, size_t n, size_t l, T v) {
30 for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i];
31 for (size_t i = n; i < l; ++i) tmp[i] = v;
38 public: size_t operator()(const T *key) const
39 { size_t i; for (i = 0; key[i] != (T)0; ++i) {} return i; }
42 template <> class Length<char> {
43 public: size_t operator()(const char *key) const
44 { return std::strlen(key); }
47 template <class node_type_, class node_u_type_,
48 class array_type_, class array_u_type_,
49 class length_func_ = Length<node_type_> >
50 class DoubleArrayImpl {
74 size_t next_check_pos_;
77 int (*progress_func_)(size_t, size_t);
79 size_t resize(const size_t new_size) {
83 array_ = _resize(array_, alloc_size_, new_size, tmp);
84 used_ = _resize(used_, alloc_size_, new_size,
85 static_cast<unsigned char>(0));
86 alloc_size_ = new_size;
90 size_t fetch(const node_t &parent, std::vector <node_t> &siblings) {
91 if (error_ < 0) return 0;
93 array_u_type_ prev = 0;
95 for (size_t i = parent.left; i < parent.right; ++i) {
96 if ((length_ ? length_[i] : length_func_()(key_[i])) < parent.depth)
99 const node_u_type_ *tmp = reinterpret_cast<node_u_type_ *>(key_[i]);
101 array_u_type_ cur = 0;
102 if ((length_ ? length_[i] : length_func_()(key_[i])) != parent.depth)
103 cur = (array_u_type_)tmp[parent.depth] + 1;
110 if (cur != prev || siblings.empty()) {
112 tmp_node.depth = parent.depth + 1;
115 if (!siblings.empty()) siblings[siblings.size()-1].right = i;
117 siblings.push_back(tmp_node);
123 if (!siblings.empty())
124 siblings[siblings.size()-1].right = parent.right;
126 return siblings.size();
129 size_t insert(const std::vector <node_t> &siblings) {
130 if (error_ < 0) return 0;
133 size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1;
134 size_t nonzero_num = 0;
137 if (alloc_size_ <= pos) resize(pos + 1);
143 if (alloc_size_ <= pos) resize(pos + 1);
145 if (array_[pos].check) {
149 next_check_pos_ = pos;
153 begin = pos - siblings[0].code;
154 if (alloc_size_ <= (begin + siblings[siblings.size()-1].code))
155 resize(static_cast<size_t>(alloc_size_ *
156 _max(1.05, 1.0 * key_size_ / progress_)));
158 if (used_[begin]) continue;
160 for (size_t i = 1; i < siblings.size(); ++i)
161 if (array_[begin + siblings[i].code].check != 0) goto next;
166 // -- Simple heuristics --
167 // if the percentage of non-empty contents in check between the index
168 // 'next_check_pos' and 'check' is greater than some constant
170 // new 'next_check_pos' index is written by 'check'.
171 if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95)
172 next_check_pos_ = pos;
177 static_cast<size_t>(siblings[siblings.size() - 1].code + 1));
179 for (size_t i = 0; i < siblings.size(); ++i)
180 array_[begin + siblings[i].code].check = begin;
182 for (size_t i = 0; i < siblings.size(); ++i) {
183 std::vector <node_t> new_siblings;
185 if (!fetch(siblings[i], new_siblings)) {
186 array_[begin + siblings[i].code].base =
188 static_cast<array_type_>(-value_[siblings[i].left]-1) :
189 static_cast<array_type_>(-siblings[i].left-1);
191 if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) {
197 if (progress_func_)(*progress_func_)(progress_, key_size_);
200 size_t h = insert(new_siblings);
201 array_[begin + siblings[i].code].base = h;
210 typedef array_type_ value_type;
211 typedef node_type_ key_type;
212 typedef array_type_ result_type; // for compatibility
214 struct result_pair_type {
219 explicit DoubleArrayImpl(): array_(0), used_(0),
220 size_(0), alloc_size_(0),
221 no_delete_(0), error_(0) {}
222 ~DoubleArrayImpl() { clear(); }
224 void set_result(value_type& x, value_type r, size_t) const {
228 void set_result(result_pair_type& x, value_type r, size_t l) const {
233 void set_array(void *ptr, size_t size = 0) {
235 array_ = reinterpret_cast<unit_t *>(ptr);
240 const void *array() const {
241 return const_cast<const void *>(reinterpret_cast<void *>(array_));
255 size_t unit_size() const { return sizeof(unit_t); }
256 size_t size() const { return size_; }
257 size_t total_size() const { return size_ * sizeof(unit_t); }
259 size_t nonzero_size() const {
261 for (size_t i = 0; i < size_; ++i)
262 if (array_[i].check) ++result;
266 int build(size_t key_size,
269 value_type *value = 0,
270 int (*progress_func)(size_t, size_t) = 0) {
271 if (!key_size || !key) return 0;
273 progress_func_ = progress_func;
276 key_size_ = key_size;
287 root_node.right = key_size;
290 std::vector <node_t> siblings;
291 fetch(root_node, siblings);
294 size_ += (1 << 8 * sizeof(key_type)) + 1;
295 if (size_ >= alloc_size_) resize(size_);
303 int open(const char *file,
304 const char *mode = "rb",
307 std::FILE *fp = std::fopen(file, mode);
309 if (std::fseek(fp, offset, SEEK_SET) != 0) return -1;
312 if (std::fseek(fp, 0L, SEEK_END) != 0) return -1;
313 size = std::ftell(fp);
314 if (std::fseek(fp, offset, SEEK_SET) != 0) return -1;
320 size_ /= sizeof(unit_t);
321 array_ = new unit_t[size_];
322 if (size_ != std::fread(reinterpret_cast<unit_t *>(array_),
323 sizeof(unit_t), size_, fp)) return -1;
329 int save(const char *file,
330 const char *mode = "wb",
332 if (!size_) return -1;
333 std::FILE *fp = std::fopen(file, mode);
335 if (size_ != std::fwrite(reinterpret_cast<unit_t *>(array_),
336 sizeof(unit_t), size_, fp))
343 int gzopen(const char *file,
344 const char *mode = "rb",
347 std::FILE *fp = std::fopen(file, mode);
353 if (-1L != static_cast<long>(std::fseek(fp, -8, SEEK_END))) {
355 if (std::fread(static_cast<char*>(buf),
356 1, 8, fp) != sizeof(buf)) {
361 size_ /= sizeof(unit_t);
366 if (!size_) return -1;
368 zlib::gzFile gzfp = zlib::gzopen(file, mode);
369 if (!gzfp) return -1;
370 array_ = new unit_t[size_];
371 if (zlib::gzseek(gzfp, offset, SEEK_SET) != 0) return -1;
372 zlib::gzread(gzfp, reinterpret_cast<unit_t *>(array_),
373 sizeof(unit_t) * size_);
378 int gzsave(const char *file, const char *mode = "wb",
380 zlib::gzFile gzfp = zlib::gzopen(file, mode);
381 if (!gzfp) return -1;
382 zlib::gzwrite(gzfp, reinterpret_cast<unit_t *>(array_),
383 sizeof(unit_t) * size_);
390 inline void exactMatchSearch(const key_type *key,
393 size_t node_pos = 0) const {
394 result = exactMatchSearch<T>(key, len, node_pos);
399 inline T exactMatchSearch(const key_type *key,
401 size_t node_pos = 0) const {
402 if (!len) len = length_func_()(key);
405 set_result(result, -1, 0);
407 register array_type_ b = array_[node_pos].base;
408 register array_u_type_ p;
410 for (register size_t i = 0; i < len; ++i) {
411 p = b +(node_u_type_)(key[i]) + 1;
412 if (static_cast<array_u_type_>(b) == array_[p].check)
419 array_type_ n = array_[p].base;
420 if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
421 set_result(result, -n-1, len);
427 size_t commonPrefixSearch(const key_type *key,
431 size_t node_pos = 0) const {
432 if (!len) len = length_func_()(key);
434 register array_type_ b = array_[node_pos].base;
435 register size_t num = 0;
436 register array_type_ n;
437 register array_u_type_ p;
439 for (register size_t i = 0; i < len; ++i) {
442 if ((array_u_type_) b == array_[p].check && n < 0) {
443 // result[num] = -n-1;
444 if (num < result_len) set_result(result[num], -n-1, i);
448 p = b +(node_u_type_)(key[i]) + 1;
449 if ((array_u_type_) b == array_[p].check)
458 if ((array_u_type_)b == array_[p].check && n < 0) {
459 if (num < result_len) set_result(result[num], -n-1, len);
466 value_type traverse(const key_type *key,
469 size_t len = 0) const {
470 if (!len) len = length_func_()(key);
472 register array_type_ b = array_[node_pos].base;
473 register array_u_type_ p;
475 for (; key_pos < len; ++key_pos) {
476 p = b +(node_u_type_)(key[key_pos]) + 1;
477 if (static_cast<array_u_type_>(b) == array_[p].check) {
481 return -2; // no node
486 array_type_ n = array_[p].base;
487 if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
490 return -1; // found, but no value
495 typedef Darts::DoubleArrayImpl<char, unsigned char, short,
496 unsigned short> DoubleArray;
497 #define DARTS_ARRAY_SIZE_IS_DEFINED 1
500 #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
501 typedef Darts::DoubleArrayImpl<char, unsigned char, int,
502 unsigned int> DoubleArray;
503 #define DARTS_ARRAY_SIZE_IS_DEFINED 1
506 #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
507 typedef Darts::DoubleArrayImpl<char, unsigned char, long,
508 unsigned long> DoubleArray;
509 #define DARTS_ARRAY_SIZE_IS_DEFINED 1
512 #if 4 == 8 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
513 typedef Darts::DoubleArrayImpl<char, unsigned char, long long,
514 unsigned long long> DoubleArray;