OSDN Git Service

Merge branch 'master' of git.sourceforge.jp:/gitroot/hengband/hengband
[hengband/hengband.git] / src / japanese.c
1 /*!
2  *  @file japanese.c
3  *  @brief 日本語処理関数
4  *  @date 2014/07/07
5  */
6
7 #include "angband.h"
8
9 #ifdef JP
10
11 typedef struct convert_key convert_key;
12
13 struct convert_key
14 {
15         cptr key1;
16         cptr key2;
17 };
18
19 static const convert_key s2j_table[] = {
20         {"mb","nb"}, {"mp","np"}, {"mv","nv"}, {"mm","nm"},
21         {"x","ks"},
22         /* sindar:シンダール  parantir:パランティア  feanor:フェアノール */
23         {"ar$","a-ru$"}, {"ir$","ia$"}, {"or$","o-ru$"},
24         {"ra","ラ"}, {"ri","リ"}, {"ru","ル"}, {"re","レ"}, {"ro","ロ"},
25         {"ir","ia"}, {"ur","ua"}, {"er","ea"}, {"ar","aル"},
26         {"sha","シャ"}, {"shi","シ"}, {"shu","シュ"}, {"she","シェ"}, {"sho","ショ"},
27         {"tha","サ"}, {"thi","シ"}, {"thu","ス"}, {"the","セ"}, {"tho","ソ"},
28         {"cha","ハ"}, {"chi","ヒ"}, {"chu","フ"}, {"che","ヘ"}, {"cho","ホ"},
29         {"dha","ザ"}, {"dhi","ジ"}, {"dhu","ズ"}, {"dhe","ゼ"}, {"dho","ゾ"},
30         {"ba","バ"}, {"bi","ビ"}, {"bu","ブ"}, {"be","ベ"}, {"bo","ボ"},
31         {"ca","カ"}, {"ci","キ"}, {"cu","ク"}, {"ce","ケ"}, {"co","コ"},
32         {"da","ダ"}, {"di","ディ"}, {"du","ドゥ"}, {"de","デ"}, {"do","ド"},
33         {"fa","ファ"}, {"fi","フィ"}, {"fu","フ"}, {"fe","フェ"}, {"fo","フォ"},
34         {"ga","ガ"}, {"gi","ギ"}, {"gu","グ"}, {"ge","ゲ"}, {"go","ゴ"},
35         {"ha","ハ"}, {"hi","ヒ"}, {"hu","フ"}, {"he","ヘ"}, {"ho","ホ"},
36         {"ja","ジャ"}, {"ji","ジ"}, {"ju","ジュ"}, {"je","ジェ"}, {"jo","ジョ"},
37         {"ka","カ"}, {"ki","キ"}, {"ku","ク"}, {"ke","ケ"}, {"ko","コ"},
38         {"la","ラ"}, {"li","リ"}, {"lu","ル"}, {"le","レ"}, {"lo","ロ"},
39         {"ma","マ"}, {"mi","ミ"}, {"mu","ム"}, {"me","メ"}, {"mo","モ"},
40         {"na","ナ"}, {"ni","ニ"}, {"nu","ヌ"}, {"ne","ネ"}, {"no","ノ"},
41         {"pa","パ"}, {"pi","ピ"}, {"pu","プ"}, {"pe","ペ"}, {"po","ポ"},
42         {"qu","ク"},
43         {"sa","サ"}, {"si","シ"}, {"su","ス"}, {"se","セ"}, {"so","ソ"},
44         {"ta","タ"}, {"ti","ティ"}, {"tu","トゥ"}, {"te","テ"}, {"to","ト"},
45         {"va","ヴァ"}, {"vi","ヴィ"}, {"vu","ヴ"}, {"ve","ヴェ"}, {"vo","ヴォ"},
46         {"wa","ワ"}, {"wi","ウィ"}, {"wu","ウ"}, {"we","ウェ"}, {"wo","ウォ"},
47         {"ya","ヤ"}, {"yu","ユ"}, {"yo","ヨ"},
48         {"za","ザ"}, {"zi","ジ"}, {"zu","ズ"}, {"ze","ゼ"}, {"zo","ゾ"},
49         {"dh","ズ"}, {"ch","フ"}, {"th","ス"},
50         {"b","ブ"}, {"c","ク"}, {"d","ド"}, {"f","フ"}, {"g","グ"},
51         {"h","フ"}, {"j","ジュ"}, {"k","ク"}, {"l","ル"}, {"m","ム"},
52         {"n","ン"}, {"p","プ"}, {"q","ク"}, {"r","ル"}, {"s","ス"},
53         {"t","ト"}, {"v","ヴ"}, {"w","ウ"}, {"y","イ"},
54         {"a","ア"}, {"i","イ"}, {"u","ウ"}, {"e","エ"}, {"o","オ"},
55         {"-","ー"},
56         {NULL,NULL}
57 };
58
59 /*!
60  * @brief シンダリンを日本語の読みに変換する
61  * @param kana 変換後の日本語文字列ポインタ
62  * @param sindarin 変換前のシンダリン文字列ポインタ
63  * @return なし
64  * @details
65  */
66 void sindarin_to_kana(char *kana, const char *sindarin)
67 {
68         char buf[256];
69         int idx;
70
71         sprintf(kana, "%s$", sindarin);
72         for (idx = 0; kana[idx]; idx++)
73                 if (isupper(kana[idx])) kana[idx] = (char)tolower(kana[idx]);
74
75         for (idx = 0; s2j_table[idx].key1 != NULL; idx++)
76         {
77                 cptr pat1 = s2j_table[idx].key1;
78                 cptr pat2 = s2j_table[idx].key2;
79                 int len = strlen(pat1);
80                 char *src = kana;
81                 char *dest = buf;
82
83                 while (*src)
84                 {
85                         if (strncmp(src, pat1, len) == 0)
86                         {
87                                 strcpy(dest, pat2);
88                                 src += len;
89                                 dest += strlen(pat2);
90                         }
91                         else
92                         {
93                                 if (iskanji(*src))
94                                 {
95                                         *dest = *src;
96                                         src++;
97                                         dest++;
98                                 }
99                                 *dest = *src;
100                                 src++;
101                                 dest++;
102                         }
103                 }
104
105                 *dest = 0;
106                 strcpy(kana, buf);
107         }
108
109         idx = 0;
110
111         while (kana[idx] != '$') idx++;
112
113         kana[idx] = '\0';
114 }
115
116
/*!
 * Japanese verb conjugation table (e.g. 打つ -> 打って, 打ち).
 * `from` is a dictionary-form ending; `to` holds the three replacement
 * endings, selected in jverb() by (flag - 1):
 *   JVERB_AND: 殴る,蹴る > 殴り,蹴る
 *   JVERB_TO:  殴る,蹴る > 殴って蹴る
 *   JVERB_OR:  殴る,蹴る > 殴ったり蹴ったり
 * (presumably JVERB_AND = 1, JVERB_TO = 2, JVERB_OR = 3; the constants are
 * defined outside this file -- TODO confirm)
 * Entries are tried in order, so the two-character endings must stay in
 * front of the one-character ones.  The final entry (from == NULL) holds the
 * text that is appended when no ending matches.
 *
 * NOTE(review): the "てる" entry yields "てって"/"てった"; for verbs such as
 * 捨てる one would expect "てて"/"てた" -- left unchanged, please confirm.
 */
static const struct jverb_table_t {
	const char* from;
	const char* to[3];
} jverb_table[] = {
	{ "する", {"し", "して", "した"}},
	{ "いる", {"いて", "いて", "いた"}},

	{ "える", {"え", "えて", "えた"}},
	{ "ける", {"け", "けて", "けた"}},
	{ "げる", {"げ", "げて", "げた"}},
	{ "せる", {"せ", "せて", "せた"}},
	{ "ぜる", {"ぜ", "ぜて", "ぜた"}},
	{ "てる", {"て", "てって", "てった"}},
	{ "でる", {"で", "でて", "でた"}},
	{ "ねる", {"ね", "ねて", "ねた"}},
	{ "へる", {"へ", "へて", "へた"}},
	{ "べる", {"べ", "べて", "べた"}},
	{ "める", {"め", "めて", "めた"}},
	{ "れる", {"れ", "れて", "れた"}},

	{ "う", {"い", "って", "った"}},
	{ "く", {"き", "いて", "いた"}},
	{ "ぐ", {"ぎ", "いで", "いだ"}},
	{ "す", {"し", "して", "した"}},
	{ "ず", {"じ", "じて", "じた"}},
	{ "つ", {"ち", "って", "った"}},
	{ "づ", {"ぢ", "って", "った"}},
	{ "ぬ", {"に", "ねて", "ねた"}},
	{ "ふ", {"ひ", "へて", "へた"}},
	{ "ぶ", {"び", "んで", "んだ"}},
	{ "む", {"み", "んで", "んだ"}},
	{ "る", {"り", "って", "った"}},
	{ NULL, {"そして", "ことにより", "ことや"}},
};

/*!
 * @brief Conjugate a verb according to jverb_table.
 * @param in Verb in dictionary form.
 * @param out Destination buffer; must be able to hold the input plus the
 *            longest replacement ending.
 * @param flag Conjugation selector (JVERB_AND/JVERB_TO/JVERB_OR); the
 *             ending used is to[flag - 1].
 * @return None.
 * @details
 * The first table entry whose `from` is a suffix of `in` decides the result:
 * that suffix is replaced by the selected ending.  If no entry matches, the
 * fallback text of the terminating entry is appended to the unchanged verb.
 * If flag is outside 1..3, `out` receives an unmodified copy of `in`.
 */
void jverb(const char *in, char *out, int flag)
{
	const struct jverb_table_t * p;
	size_t in_len = strlen(in);

	strcpy(out, in);

	/* to[] has exactly three slots; never index outside of it */
	if (flag < 1 || flag > 3) return;

	for (p = jverb_table; p->from; p++) {
		size_t from_len = strlen(p->from);

		/* An ending longer than the whole input cannot be its suffix */
		if (from_len > in_len) continue;

		if (strncmp(&in[in_len - from_len], p->from, from_len) == 0) {
			strcpy(&out[in_len - from_len], p->to[flag - 1]);
			return;
		}
	}

	/* No ending matched: p now points at the terminating fallback entry */
	strcpy(&out[in_len], p->to[flag - 1]);
}
182
/*!
 * @brief Convert a SJIS string to an EUC string, in place.
 * @param str String to convert; it is overwritten with the result.
 * @return None.
 * @details
 * Every byte with the high bit set is treated as the first byte of a
 * two-byte character and converted together with the byte that follows it;
 * all other bytes are left untouched.  The conversion never changes the
 * length of the string, so it is done directly in the caller's buffer and
 * needs no temporary allocation.
 * A high-bit byte that is directly followed by the terminator (a truncated
 * character) is left unchanged.
 */
void sjis2euc(char *str)
{
	unsigned char *p = (unsigned char *)str;

	while (*p)
	{
		unsigned char c1 = p[0];
		unsigned char c2;

		/* Single-byte character: nothing to do */
		if (!(c1 & 0x80))
		{
			p++;
			continue;
		}

		/* Truncated two-byte character at the end of the string */
		c2 = p[1];
		if (c2 == 0) break;

		if (c2 >= 0x9f)
		{
			c1 = c1 * 2 - (c1 >= 0xe0 ? 0xe0 : 0x60);
			c2 += 2;
		}
		else
		{
			c1 = c1 * 2 - (c1 >= 0xe0 ? 0xe1 : 0x61);
			c2 += 0x60 + (c2 < 0x7f);
		}

		p[0] = c1;
		p[1] = c2;
		p += 2;
	}
}
227
228
/*!
 * @brief Convert an EUC string to a SJIS string, in place.
 * @param str String to convert; it is overwritten with the result.
 * @return None.
 * @details
 * Every byte with the high bit set is treated as the first byte of a
 * two-byte character and converted together with the byte that follows it;
 * all other bytes are left untouched.  The conversion never changes the
 * length of the string, so it is done directly in the caller's buffer and
 * needs no temporary allocation.
 * A high-bit byte that is directly followed by the terminator (a truncated
 * character) is left unchanged.
 */
void euc2sjis(char *str)
{
	unsigned char *p = (unsigned char *)str;

	while (*p)
	{
		unsigned char c1 = p[0];
		unsigned char c2;

		/* Single-byte character: nothing to do */
		if (!(c1 & 0x80))
		{
			p++;
			continue;
		}

		/* Truncated two-byte character at the end of the string */
		c2 = p[1];
		if (c2 == 0) break;

		if (c1 % 2)
		{
			c1 = (c1 >> 1) + (c1 < 0xdf ? 0x31 : 0x71);
			c2 -= 0x60 + (c2 < 0xe0);
		}
		else
		{
			c1 = (c1 >> 1) + (c1 < 0xdf ? 0x30 : 0x70);
			c2 -= 2;
		}

		p[0] = c1;
		p[1] = c2;
		p += 2;
	}
}
274
275
276 /*!
277  * @brief strを環境に合った文字コードに変換し、変換前の文字コードを返す。strの長さに制限はない。
278  * @param str 変換する文字列のポインタ
279  * @return 
280  * 0: Unknown<br>
281  * 1: ASCII (Never known to be ASCII in this function.)<br>
282  * 2: EUC<br>
283  * 3: SJIS<br>
284  */
285 byte codeconv(char *str)
286 {
287         byte code = 0;
288         int i;
289
290         for (i = 0; str[i]; i++)
291         {
292                 unsigned char c1;
293                 unsigned char c2;
294
295                 /* First byte */
296                 c1 = str[i];
297
298                 /* ASCII? */
299                 if (!(c1 & 0x80)) continue;
300
301                 /* Second byte */
302                 i++;
303                 c2 = str[i];
304
305                 if (((0xa1 <= c1 && c1 <= 0xdf) || (0xfd <= c1 && c1 <= 0xfe)) &&
306                     (0xa1 <= c2 && c2 <= 0xfe))
307                 {
308                         /* Only EUC is allowed */
309                         if (!code)
310                         {
311                                 /* EUC */
312                                 code = 2;
313                         }
314
315                         /* Broken string? */
316                         else if (code != 2)
317                         {
318                                 /* No conversion */
319                                 return 0;
320                         }
321                 }
322
323                 else if (((0x81 <= c1 && c1 <= 0x9f) &&
324                           ((0x40 <= c2 && c2 <= 0x7e) || (0x80 <= c2 && c2 <= 0xfc))) ||
325                          ((0xe0 <= c1 && c1 <= 0xfc) &&
326                           (0x40 <= c2 && c2 <= 0x7e)))
327                 {
328                         /* Only SJIS is allowed */
329                         if (!code)
330                         {
331                                 /* SJIS */
332                                 code = 3;
333                         }
334
335                         /* Broken string? */
336                         else if (code != 3)
337                         {
338                                 /* No conversion */
339                                 return 0;
340                         }
341                 }
342         }
343
344
345         switch (code)
346         {
347 #ifdef EUC
348         case 3:
349                 /* SJIS -> EUC */
350                 sjis2euc(str);
351                 break;
352 #endif
353
354 #ifdef SJIS
355         case 2:
356                 /* EUC -> SJIS */
357                 euc2sjis(str);
358
359                 break;
360 #endif
361         }
362
363         /* Return kanji code */
364         return code;
365 }
366
367 /*!
368  * @brief 文字列sのxバイト目が漢字の1バイト目かどうか判定する
369  * @param s 判定する文字列のポインタ
370  * @param x 判定する位置(バイト)
371  * @return 漢字の1バイト目ならばTRUE
372  */
373 bool iskanji2(cptr s, int x)
374 {
375         int i;
376
377         for (i = 0; i < x; i++)
378         {
379                 if (iskanji(s[i])) i++;
380         }
381         if ((x == i) && iskanji(s[x])) return TRUE;
382
383         return FALSE;
384 }
385
386 /*!
387  * @brief 文字列の文字コードがASCIIかどうかを判定する
388  * @param str 判定する文字列へのポインタ
389  * @return 文字列の文字コードがASCIIならTRUE、そうでなければFALSE
390  */
391 static bool is_ascii_str(cptr str)
392 {
393         for (;*str; str++) {
394                 if (!(0x00 < *str && *str <= 0x7f))
395                         return FALSE;
396         }
397         return TRUE;
398 }
399
400 /*!
401  * @brief 文字列の文字コードがUTF-8かどうかを判定する
402  * @param str 判定する文字列へのポインタ
403  * @return 文字列の文字コードがUTF-8ならTRUE、そうでなければFALSE
404  */
405 static bool is_utf8_str(cptr str)
406 {
407         const unsigned char* p;
408         for (p = (const unsigned char*)str; *p; p++) {
409                 int subseq_num = 0;
410                 if (0x00 < *p && *p <= 0x7f) continue;
411                 
412                 if ((*p & 0xe0) == 0xc0) subseq_num = 1;
413                 if ((*p & 0xf0) == 0xe0) subseq_num = 2;
414                 if ((*p & 0xf8) == 0xf0) subseq_num = 3;
415
416                 if (subseq_num == 0) return FALSE;
417                 while (subseq_num--) {
418                         p++;
419                         if (!*p || (*p & 0xc0) != 0x80) return FALSE;
420                 }
421         }
422         return TRUE;
423 }
424
425 #if defined(EUC)
426 #include <iconv.h>
427
/*
 * Three-byte UTF-8 sequences that are replaced by another three-byte
 * sequence before the text is handed to iconv.
 * The bytes are stored as unsigned char because all values are above 0x7f.
 */
static const struct ms_to_jis_unicode_conv_t {
	unsigned char from[3];
	unsigned char to[3];
} ms_to_jis_unicode_conv[] = {
	{{0xef, 0xbd, 0x9e}, {0xe3, 0x80, 0x9c}}, /* FULLWIDTH TILDE -> WAVE DASH */
	{{0xef, 0xbc, 0x8d}, {0xe2, 0x88, 0x92}}, /* FULLWIDTH HYPHEN-MINUS -> MINUS SIGN */
};

/*!
 * @brief Helper for builds whose system encoding is EUC: rewrite, in place,
 *        the characters listed in ms_to_jis_unicode_conv inside a UTF-8 string.
 * @param str UTF-8 string to rewrite; replacements have the same byte
 *            length as the originals, so the string length does not change.
 * @return None.
 * @details
 * The string is walked one UTF-8 sequence at a time.  The walk never moves
 * past the terminator, even if the last sequence is cut short.
 */
static void ms_to_jis_unicode(char* str)
{
	unsigned char *p = (unsigned char *)str;

	while (*p) {
		int subseq_num = 0;

		/* ASCII byte */
		if (*p <= 0x7f) {
			p++;
			continue;
		}

		if ((*p & 0xe0) == 0xc0) {
			subseq_num = 1;
		}
		else if ((*p & 0xf0) == 0xe0) {
			subseq_num = 2;

			/* Only compare when all three bytes are inside the string */
			if (p[1] && p[2]) {
				size_t i;
				for (i = 0; i < sizeof(ms_to_jis_unicode_conv) / sizeof(ms_to_jis_unicode_conv[0]); ++i) {
					const struct ms_to_jis_unicode_conv_t *c = &ms_to_jis_unicode_conv[i];
					if (memcmp(p, c->from, 3) == 0) {
						memcpy(p, c->to, 3);
						break;
					}
				}
			}
		}
		else if ((*p & 0xf8) == 0xf0) {
			subseq_num = 3;
		}

		/* Skip the lead byte, then its trailing bytes, stopping at the terminator */
		p++;
		while (subseq_num-- > 0 && *p) p++;
	}
}
464
465 #elif defined(SJIS) && defined(WINDOWS)
466 #include <Windows.h>
467 #endif
468 /*!
469  * @brief 文字コードがUTF-8の文字列をシステムの文字コードに変換する
470  * @param utf8_str 変換するUTF-8の文字列へのポインタ
471  * @param sys_str_buffer 変換したシステムの文字コードの文字列を格納するバッファへのポインタ
472  * @param sys_str_buflen 変換したシステムの文字コードの文字列を格納するバッファの長さ
473  * @return 変換に成功した場合TRUE、失敗した場合FALSEを返す
474  */
475 static bool utf8_to_sys(char* utf8_str, char* sys_str_buffer, size_t sys_str_buflen)
476 {
477 #if defined(EUC)
478
479         iconv_t cd = iconv_open("EUC-JP", "UTF-8");
480         size_t utf8_len = strlen(utf8_str) + 1; /* include termination character */
481         char *from = utf8_str;
482         int ret;
483
484         ms_to_jis_unicode(utf8_str);
485         ret = iconv(cd, &from, &utf8_len, &sys_str_buffer, &sys_str_buflen);
486         iconv_close(cd);
487         return (ret >= 0);
488
489 #elif defined(SJIS) && defined(WINDOWS)
490
491         LPWSTR utf16buf;
492         int input_len = strlen(utf8_str) + 1; /* include termination character */
493
494         C_MAKE(utf16buf, input_len, WCHAR);
495
496         /* UTF-8 -> UTF-16 */
497         if (MultiByteToWideChar( CP_UTF8, 0, utf8_str, input_len, utf16buf, input_len) == 0) {
498                 C_KILL(utf16buf, input_len, WCHAR);
499                 return FALSE;
500         }
501
502         /* UTF-8 -> SJIS(CP932) */
503         if (WideCharToMultiByte( CP_ACP, 0, utf16buf, -1, sys_str_buffer, sys_str_buflen, NULL, NULL ) == 0) {
504                 C_KILL(utf16buf, input_len, WCHAR);
505                 return FALSE;
506         }
507
508         C_KILL(utf16buf, input_len, WCHAR);
509         return TRUE;
510
511 #endif
512 }
513
514 /*!
515  * @brief 受け取った文字列の文字コードを推定し、システムの文字コードへ変換する
516  * @param strbuf 変換する文字列を格納したバッファへのポインタ。
517  *               バッファは変換した文字列で上書きされる。
518  *               UTF-8からSJISもしくはEUCへの変換を想定しているのでバッファの長さが足りなくなることはない。
519  * @param buflen バッファの長さ。
520  * @return なし
521  */
522 void guess_convert_to_system_encoding(char* strbuf, int buflen)
523 {
524         if (is_ascii_str(strbuf)) return;
525
526         if (is_utf8_str(strbuf)) {
527                 char* work;
528                 C_MAKE(work, buflen, char);
529                 my_strcpy(work, strbuf, buflen);
530                 if (!utf8_to_sys(work, strbuf, buflen)) {
531                         msg_print("警告:文字コードの変換に失敗しました");
532                         msg_print(NULL);
533                 }
534                 C_KILL(work, buflen, char);
535         }
536 }
537
538 #endif /* JP */