2 * Copyright (C) 2009 The Android Open Source Project
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef PINYINIME_INCLUDE_DICTDEF_H__
18 #define PINYINIME_INCLUDE_DICTDEF_H__
21 #include "./utf16char.h"
23 namespace ime_pinyin {
25 // Enable the following line when building the binary dictionary model.
26 // #define ___BUILD_MODEL___
// Short names for the built-in integer types used by the dictionary code.
// NOTE(review): the widths in the names assume the usual 8/16/32/64-bit sizes
// of char/short/int/long long -- confirm for any new target platform.

// Unsigned aliases.
using uint8 = unsigned char;
using uint16 = unsigned short;
using uint32 = unsigned int;
using uint64 = unsigned long long;

// Signed aliases.
using int8 = signed char;
using int64 = long long;
// Compile-time debug switches; all three are set to false.
// NOTE(review): presumably each flag gates a separate group of debug output
// elsewhere in the engine -- confirm against the code that reads them.
constexpr bool kPrintDebug0 = false;
constexpr bool kPrintDebug1 = false;
constexpr bool kPrintDebug2 = false;
// Upper bound on the length of a lemma.
constexpr size_t kMaxLemmaSize = 8;

// Upper bound on the length of one Pinyin spelling.
constexpr size_t kMaxPinyinSize = 6;

// Count of half spelling ids; SpellingTrie.h has the details.
// NOTE(review): the earlier wording of this comment said Chinese Pinyin has 30
// half ids, while the value here is 29 -- confirm which one is intended.
constexpr size_t kHalfSpellingIdNum = 29;

// Upper bound on the number of full spellings; Chinese Pinyin only has about
// 410 of them. Computed as 512 minus the half spelling ids, minus one more
// because 0 is never used.
// If this value is made bigger (so that more bits are needed), other
// structures such as SpellingNode must be updated too, to make sure that a
// spelling id can still be stored.
constexpr size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;

// Cap on the number of search steps.
constexpr size_t kMaxSearchSteps = 40;

// A character predicts the characters that follow it, so the prediction
// length is one less than the lemma limit.
constexpr size_t kMaxPredictSize = kMaxLemmaSize - 1;
// Type of a lemma id; it must always stay size_t.
using LemmaIdType = size_t;

// In storage an id actually occupies 3 bytes.
constexpr size_t kLemmaIdSize = 3;

// 0xffffff is the largest value that fits in 3 bytes.
// NOTE(review): judging by the name, this id stands for the phrase that is
// currently being composed -- confirm against the code that uses it.
constexpr size_t kLemmaIdComposing = 0xffffff;
69 typedef uint16 LmaScoreType;
70 typedef uint16 KeyScoreType;
// How many of the highest-scoring items are retained for prediction.
constexpr size_t kTopScoreLemmaNum = 10;

// Caps on the number of predictions.
// NOTE(review): the suffixes presumably refer to the number of preceding
// characters the prediction is based on (more than 3, exactly 3, exactly 2)
// -- confirm against the prediction code.
constexpr size_t kMaxPredictNumByGt3 = 1;
constexpr size_t kMaxPredictNumBy3 = 2;
constexpr size_t kMaxPredictNumBy2 = 2;
79 // The last lemma id (included) for the system dictionary. The system
80 // dictionary's ids always start from 1.
81 const LemmaIdType kSysDictIdEnd = 500000;
83 // The first lemma id for the user dictionary.
84 const LemmaIdType kUserDictIdStart = 500001;
86 // The last lemma id (included) for the user dictionary.
87 const LemmaIdType kUserDictIdEnd = 600000;
92 } SpellingId, *PSpellingId;
/**
 * We use different node types for different layers.
 * Statistical data of the building result for a testing dictionary:
 *                                root, level 0, level 1, level 2, level 3
 * max son num of one node:        406      280       41        2        -
 * max homo num of one node:         0       90       23        2        2
 * total node num of a layer:        1      406    31766    13516      993
 * total homo num of a layer:        9     5674    44609    12667      995
 *
 * The number of nodes for root and level 0 won't be larger than 500.
 * According to the information above, two kinds of nodes can be used: one for
 * root and level 0, the other for the layers deeper than 0.
 *
 * LE = less than or equal to.
 * A node occupies 16 bytes, so in total less than 16 * 500 = 8K bytes.
 */
113 size_t homo_idx_buf_off;
/**
 * GE = greater than or equal to.
 * A node occupies 8 bytes.
 */
124 uint16 son_1st_off_l; // Low bits of the son_1st_off
125 uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1
127 unsigned char num_of_son; // number of son nodes
128 unsigned char num_of_homo; // number of homo words
129 unsigned char son_1st_off_h; // high bits of the son_1st_off
130 unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off
133 #ifdef ___BUILD_MODEL___
134 struct SingleCharItem {
141 LemmaIdType idx_by_py;
142 LemmaIdType idx_by_hz;
143 char16 hanzi_str[kMaxLemmaSize + 1];
145 // The SingleCharItem id for each Hanzi.
146 uint16 hanzi_scis_ids[kMaxLemmaSize];
148 uint16 spl_idx_arr[kMaxLemmaSize + 1];
149 char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
150 unsigned char hz_str_len;
153 #endif // ___BUILD_MODEL___
155 } // namespace ime_pinyin
157 #endif // PINYINIME_INCLUDE_DICTDEF_H__