OSDN Git Service

fixed reading of latin character words such as: japanese, mouse, one.
[nvdajp/nvdajpmiscdep.git] / include / jtalk / roma_dic_maker.py
1 # roma_dic_maker.py for nvdajp_jtalk\r
2 # -*- coding: utf-8 -*-\r
3 # since 2011-04-06 by Takuya Nishimoto\r
4 from __future__ import unicode_literals\r
5 \r
# File name of the generated dictionary CSV; make_dic() joins it onto the
# output directory it is given.
OUT_FILE = 'nvdajp-roma-dic.csv'
7 \r
8 import sys\r
9 import re\r
10 import os\r
11 from os import path\r
12 from alpha2mb import alpha2mb\r
13 \r
# Romaji-to-katakana table from which make_dic() generates dictionary rows.
# Each entry is [romaji spelling, katakana reading, number of morae].
# make_dic() adds the extra morae of any suffix it appends to the third item.
romadic = [
                # third item is number of morae
                # doubled consonants: reading starts with a small tsu (2 morae)
                ['bba',                 'ッバ',                       2],
                ['bbi',                 'ッビ',                       2],
                ['bbu',                 'ッブ',                       2],
                ['bbe',                 'ッベ',                       2],
                ['bbo',                 'ッボ',                       2],
                #
                ['ccha',                'ッチャ',                    2],
                ['cchi',                'ッチ',                       2],
                ['cchu',                'ッチュ',                    2],
                ['cche',                'ッチェ',                    2],
                ['ccho',                'ッチョ',                    2],
                #
                ['dda',                 'ッダ',                       2],
                ['ddi',                 'ッジ',                       2],
                ['ddu',                 'ッヅ',                       2],
                ['dde',                 'ッデ',                       2],
                ['ddo',                 'ッド',                       2],
                #
                ['ffa',                 'ッファ',                    2],
                ['ffi',                 'ッフィ',                    2],
                ['ffu',                 'ッフ',                       2],
                ['ffe',                 'ッフェ',                    2],
                ['ffo',                 'ッフォ',                    2],
                #
                ['gga',                 'ッガ',                       2],
                ['ggi',                 'ッギ',                       2],
                ['ggu',                 'ッグ',                       2],
                ['gge',                 'ッゲ',                       2],
                ['ggo',                 'ッゴ',                       2],
                #
                ['hha',                 'ッハ',                       2],
                ['hhi',                 'ッヒ',                       2],
                ['hhu',                 'ッフ',                       2],
                ['hhe',                 'ッヘ',                       2],
                ['hho',                 'ッホ',                       2],
                #
                ['jja',                 'ッジャ',                    2],
                ['jji',                 'ッジ',                       2],
                ['jju',                 'ッジュ',                    2],
                ['jje',                 'ッジェ',                    2],
                ['jjo',                 'ッジョ',                    2],
                #
                ['kka',                 'ッカ',                       2],
                ['kki',                 'ッキ',                       2],
                ['kku',                 'ック',                       2],
                ['kke',                 'ッケ',                       2],
                ['kko',                 'ッコ',                       2],
                #
                ['ppa',                 'ッパ',                       2],
                ['ppi',                 'ッピ',                       2],
                ['ppu',                 'ップ',                       2],
                ['ppe',                 'ッペ',                       2],
                ['ppo',                 'ッポ',                       2],
                #
                ['ssa',                 'ッサ',                       2],
                ['ssi',                 'ッシ',                       2],
                ['ssu',                 'ッス',                       2],
                ['sse',                 'ッセ',                       2],
                ['sso',                 'ッソ',                       2],
                #
                ['tta',                 'ッタ',                       2],
                ['tti',                 'ッチ',                       2],
                ['ttu',                 'ッツ',                       2],
                ['tte',                 'ッテ',                       2],
                ['tto',                 'ット',                       2],
                #
                ['zza',                 'ッザ',                       2],
                ['zzi',                 'ッジ',                       2],
                ['zzu',                 'ッズ',                       2],
                ['zze',                 'ッゼ',                       2],
                ['zzo',                 'ッゾ',                       2],
                # single-mora syllables start here
                ['cha',                 'チャ',                       1],
                ['chu',                 'チュ',                       1],
                ['cho',                 'チョ',                       1],
                #
                ['tsu',                 'ツ',                          1],
                #
                ['ka',                  'カ',                          1],
                ['ki',                  'キ',                          1],
                ['ku',                  'ク',                          1],
                ['ke',                  'ケ',                          1],
                ['ko',                  'コ',                          1],
                #
                ['tya',                 'チャ',                       1],
                ['tyu',                 'チュ',                       1],
                ['tyo',                 'チョ',                       1],
                #
                ['jya',                 'ジャ',                       1],
                ['jyu',                 'ジュ',                       1],
                ['jyo',                 'ジョ',                       1],
                #
                ['kya',                 'キャ',                       1],
                ['kyu',                 'キュ',                       1],
                ['kyo',                 'キョ',                       1],
                #
                ['ga',                  'ガ',                          1],
                ['gi',                  'ギ',                          1],
                ['gu',                  'グ',                          1],
                ['ge',                  'ゲ',                          1],
                ['go',                  'ゴ',                          1],
                #
                ['gya',                 'ギャ',                       1],
                ['gyu',                 'ギュ',                       1],
                ['gyo',                 'ギョ',                       1],
                #
                ['sa',                  'サ',                          1],
                ['si',                  'シ',                          1],
                ['shi',                 'シ',                          1],
                ['su',                  'ス',                          1],
                ['se',                  'セ',                          1],
                ['so',                  'ソ',                          1],
                #
                ['sya',                 'シャ',                       1],
                ['syu',                 'シュ',                       1],
                ['syo',                 'ショ',                       1],
                #
                ['sha',                 'シャ',                       1],
                ['shu',                 'シュ',                       1],
                ['sho',                 'ショ',                       1],
                #
                ['za',                  'ザ',                          1],
                ['zi',                  'ジ',                          1],
                ['ji',                  'ジ',                          1],
                ['zu',                  'ズ',                          1],
                ['ze',                  'ゼ',                          1],
                ['zo',                  'ゾ',                          1],
                #
                ['ja',                  'ジャ',                       1],
                ['ju',                  'ジュ',                       1],
                ['jo',                  'ジョ',                       1],
                #
                ['ta',                  'タ',                          1],
                ['ti',                  'チ',                          1],
                ['chi',                 'チ',                          1],
                ['tu',                  'ツ',                          1],
                ['te',                  'テ',                          1],
                ['to',                  'ト',                          1],
                #
                ['da',                  'ダ',                          1],
                ['di',                  'ヂ',                          1],
                ['du',                  'ヅ',                          1],
                ['de',                  'デ',                          1],
                ['do',                  'ド',                          1],
                #
                ['na',                  'ナ',                          1],
                ['ni',                  'ニ',                          1],
                ['nu',                  'ヌ',                          1],
                ['ne',                  'ネ',                          1],
                ['no',                  'ノ',                          1],
                # 'nn' is skipped by make_dic() when it appends 'x' or 'n'
                ['nn',                  'ン',                          1],
                #
                ['nya',                 'ニャ',                       1],
                ['nyu',                 'ニュ',                       1],
                ['nyo',                 'ニョ',                       1],
                #
                ['ha',                  'ハ',                          1],
                ['hi',                  'ヒ',                          1],
                ['hu',                  'フ',                          1],
                ['he',                  'ヘ',                          1],
                ['ho',                  'ホ',                          1],
                #
                ['hya',                 'ヒャ',                       1],
                ['hyu',                 'ヒュ',                       1],
                ['hyo',                 'ヒョ',                       1],
                #
                ['fa',                  'ファ',                       1],
                ['fi',                  'フィ',                       1],
                ['fu',                  'フ',                          1],
                ['fe',                  'フェ',                       1],
                ['fo',                  'フォ',                       1],
                #
                ['ba',                  'バ',                          1],
                ['bi',                  'ビ',                          1],
                ['bu',                  'ブ',                          1],
                ['be',                  'ベ',                          1],
                ['bo',                  'ボ',                          1],
                #
                ['pa',                  'パ',                          1],
                ['pi',                  'ピ',                          1],
                ['pu',                  'プ',                          1],
                ['pe',                  'ペ',                          1],
                ['po',                  'ポ',                          1],
                #
                ['pya',                 'ピャ',                       1],
                ['pyu',                 'ピュ',                       1],
                ['pyo',                 'ピョ',                       1],
                #
                ['ma',                  'マ',                          1],
                ['mi',                  'ミ',                          1],
                ['mu',                  'ム',                          1],
                ['me',                  'メ',                          1],
                ['mo',                  'モ',                          1],
                #
                ['mya',                 'ミャ',                       1],
                ['myu',                 'ミュ',                       1],
                ['myo',                 'ミョ',                       1],
                #
                ['rya',                 'リャ',                       1],
                ['ryu',                 'リュ',                       1],
                ['ryo',                 'リョ',                       1],
                #
                ['ya',                  'ヤ',                          1],
                ['yu',                  'ユ',                          1],
                ['yo',                  'ヨ',                          1],
                #
                ['ra',                  'ラ',                          1],
                ['ri',                  'リ',                          1],
                ['ru',                  'ル',                          1],
                ['re',                  'レ',                          1],
                ['ro',                  'ロ',                          1],
                #
                ['wa',                  'ワ',                          1],
                ['wi',                  'ウィ',                               1],
                ['wo',                  'オ',                          1],
                # bare vowels; make_dic() does not emit single-letter keys on their own
                ['a',                   'ア',                          1],
                ['i',                   'イ',                          1],
                ['u',                   'ウ',                          1],
                ['e',                   'エ',                          1],
                ['o',                   'オ',                          1],
        ]
239 \r
# Surface form -> generated pronunciation that must be suppressed for it.
# make_dic() produces these by gluing a vowel onto a syllable
# ('e' + 'cho', 'u' + 'se', 'o' + 'ne').
_REJECTED_READINGS = {
    'echo': 'エチョー',
    'use': 'ウセー',
    'one': 'オネー',
}


def isGoodEntry(s):
    """Return False when the CSV row *s* is a known-bad generated entry.

    *s* is one comma-separated dictionary row. Field 0 is the surface form
    and field 12 is the pronunciation. A row is rejected only when both
    match a pair in ``_REJECTED_READINGS``; every other row is accepted.

    Rows with fewer than 13 fields are accepted rather than raising
    IndexError, so short rows behave the same whatever their surface form.

    NOTE(review): make_dic() passes the surface through alpha2mb() before
    formatting the row; confirm the result still compares equal to these
    plain ASCII keys.
    """
    fields = s.split(',')
    rejected = _REJECTED_READINGS.get(fields[0])
    if rejected is None or len(fields) <= 12:
        return True
    return fields[12] != rejected
249 \r
def _roma_row(surface, reading, morae, cost):
    """Format one dictionary CSV row for *surface* pronounced as *reading*."""
    pros = "%d/%d" % (0, morae)
    # Columns: surface, left context ID, right context ID, cost, part of
    # speech, POS subcategory 1-3, conjugation form, conjugation type,
    # base form, reading, pronunciation; then the "0/<morae>" field and "C0".
    return "%s,-1,-1,%.1f,名詞,一般,*,*,*,*,%s,%s,%s,%s,C0\n" % (
        surface, cost, surface, reading, reading, pros)


def make_dic(CODE, THISDIR):
    """Generate the romaji dictionary CSV from ``romadic``.

    Args:
        CODE: name of the text encoding used for the output file.
        THISDIR: directory in which ``OUT_FILE`` is created (overwritten).

    Rows are written in this order, each only if isGoodEntry() accepts it:

    1. per entry: entry + vowel, then vowel + entry (reading ends in 'ー');
    2. entry + 'x', read as reading + 'ックスー' ('nn' is skipped);
    3. entry + 'n', read as reading + 'ンー' ('nn' is skipped);
    4. the entry alone, read as reading + 'ー' (single letters are skipped).

    The cost starts at 500.0 and grows by 0.5 after every vowel group in
    step 1 and after every row in steps 2-4, so earlier rows cost less.
    """
    # Local import keeps this block self-contained. io.open() encodes on
    # write, so the same code runs on Python 2 and 3. The previous
    # text-mode open() combined with write(s.encode(CODE)) raised
    # TypeError on Python 3.
    import io

    vowels = [('a', 'ア'), ('i', 'イ'), ('u', 'ウ'), ('e', 'エ'), ('o', 'オ')]
    cost = 500.0
    step = 0.5

    with io.open(path.join(THISDIR, OUT_FILE), "w", encoding=CODE) as out:

        def emit(roma, reading, morae, row_cost):
            # Write one row unless it is a blacklisted surface/reading pair.
            row = _roma_row(alpha2mb(roma), reading, morae, row_cost)
            if isGoodEntry(row):
                out.write(row)

        ## romadic
        for entry in romadic:
            key = entry[0].lower()
            # syllable followed by a vowel: +1 mora for the vowel, +1 for 'ー'
            for letter, kana in vowels:
                emit(key + letter, entry[1] + kana + 'ー', entry[2] + 2, cost)
            cost += step
            # vowel followed by the syllable
            for letter, kana in vowels:
                emit(letter + key, kana + entry[1] + 'ー', entry[2] + 2, cost)
            cost += step

        for entry in romadic:
            key = entry[0]
            if key != 'nn':
                # 'ックスー' adds four morae
                emit(key.lower() + 'x', entry[1] + 'ックスー', entry[2] + 4, cost)
                cost += step

        for entry in romadic:
            key = entry[0]
            if key != 'nn':
                # 'ンー' adds two morae
                emit(key.lower() + 'n', entry[1] + 'ンー', entry[2] + 2, cost)
                cost += step

        for entry in romadic:
            key = entry[0]
            if len(key) != 1:
                # the syllable itself, lengthened by 'ー'
                emit(key.lower(), entry[1] + 'ー', entry[2] + 1, cost)
                cost += step
303 \r
if __name__ == '__main__':
    # make_dic() has two required parameters, so the former bare make_dic()
    # call always raised TypeError. Take them from the command line instead.
    if len(sys.argv) < 2:
        sys.exit("usage: roma_dic_maker.py CODE [OUT_DIR]")
    if len(sys.argv) > 2:
        out_dir = sys.argv[2]
    else:
        # NOTE(review): the parameter is named THISDIR, so the script's own
        # directory is assumed to be the intended default - confirm.
        out_dir = path.dirname(path.abspath(__file__))
    make_dic(sys.argv[1], out_dir)