2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.0.9"
24 #define NKF_RELEASE_DATE "2009-06-23"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2009, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
213 {"ISO-2022-JP", ISO_2022_JP},
214 {"ISO2022JP-CP932", CP50220},
215 {"CP50220", CP50220},
216 {"CP50221", CP50221},
217 {"CSISO2022JP", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_WINDOWS_31J)
266 #define DEFAULT_ENCIDX WINDOWS_31J
267 #elif defined(DEFAULT_CODE_EUC)
268 #define DEFAULT_ENCIDX EUC_JP
269 #elif defined(DEFAULT_CODE_UTF8)
270 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII-only character classification and conversion helpers.
 *
 * Every macro argument is wrapped in parentheses so that an expression
 * argument (e.g. nkf_toupper(ch & 0x7F) or bin2hex(x + 1)) is treated as a
 * single operand.  Arguments are still evaluated more than once, so do not
 * pass expressions with side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; any other character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when bits 8-15 of c2 (taken as unsigned short) equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and the characters between SP and DEL (exclusive)
 * other than ? = _ ( ) . and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
300 #define HOLD_SIZE 1024
301 #if defined(INT_IS_SHORT)
302 #define IOBUF_SIZE 2048
304 #define IOBUF_SIZE 16384
307 #define DEFAULT_J 'B'
308 #define DEFAULT_R 'B'
315 /* MIME preprocessor */
317 #ifdef EASYWIN /*Easy Win */
318 extern POINT _BufferSize;
327 void (*status_func)(struct input_code *, nkf_char);
328 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
332 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
333 static nkf_encoding *input_encoding = NULL;
334 static nkf_encoding *output_encoding = NULL;
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* rot14/43 mode */
388 static int hira_f = FALSE; /* hira/kata henkan */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
400 static int nfc_f = FALSE;
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Tag bits carried inside an nkf_char.
 *
 * PREFIX_EUCG3 is OR-ed in by nkf_char_euc3_new(); CLASS_UNICODE is OR-ed in
 * by nkf_char_unicode_new() and recognised by nkf_char_unicode_p() through
 * CLASS_MASK.  VALUE_MASK extracts the low 24 bits, which the *_bmp_p and
 * *_value_p predicates compare against the BMP / Unicode upper bounds.
 *
 * Macro arguments are parenthesized so that an expression argument such as
 * (a | b) is masked as a whole.
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
428 static int numchar_f = FALSE;
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(const char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
471 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
472 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
477 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
478 static int base64_count = 0;
480 /* X0208 -> ASCII converter */
483 static int f_line = 0; /* chars in line */
484 static int f_prev = 0;
485 static int fold_preserve_f = FALSE; /* preserve new lines */
486 static int fold_f = FALSE;
487 static int fold_len = 0;
490 static unsigned char kanji_intro = DEFAULT_J;
491 static unsigned char ascii_intro = DEFAULT_R;
495 #define FOLD_MARGIN 10
496 #define DEFAULT_FOLD 60
498 static int fold_margin = FOLD_MARGIN;
500 /* process default */
503 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
505 fprintf(stderr,"nkf internal module connection failure.\n");
511 no_connection(nkf_char c2, nkf_char c1)
513 no_connection2(c2,c1,0);
516 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
517 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
524 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
525 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
527 /* static redirections */
529 static void (*o_putc)(nkf_char c) = std_putc;
531 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
532 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
534 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
535 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
537 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
539 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
540 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
542 /* for strict mime */
543 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
544 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
547 static int output_mode = ASCII; /* output kanji mode */
548 static int input_mode = ASCII; /* input kanji mode */
549 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
551 /* X0201 / X0208 conversion tables */
553 /* X0201 kana conversion table */
555 static const unsigned char cv[]= {
556 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
557 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
558 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
559 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
560 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
561 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
562 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
563 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
564 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
565 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
566 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
567 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
568 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
569 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
570 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
571 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
575 /* X0201 kana conversion table for dakuten (voiced sound mark) */
577 static const unsigned char dv[]= {
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
583 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
584 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
585 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
586 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
587 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
589 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
598 static const unsigned char ev[]= {
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
610 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 /* X0208 kigou conversion table */
619 /* 0x8140 - 0x819e */
620 static const unsigned char fv[] = {
622 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
623 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
624 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
626 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
627 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
628 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
630 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
638 static int option_mode = 0;
639 static int file_out_f = FALSE;
641 static int overwrite_f = FALSE;
642 static int preserve_time_f = FALSE;
643 static int backup_f = FALSE;
644 static char *backup_suffix = "";
647 static int eolmode_f = 0; /* CR, LF, CRLF */
648 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
649 static nkf_char prev_cr = 0; /* CR or 0 */
650 #ifdef EASYWIN /*Easy Win */
651 static int end_check;
/*
 * Allocation wrapper taking the requested size in bytes.
 * A request for 0 bytes is turned into a request for 1 byte.
 * Reports "can't malloc" through perror() (presumably when the allocation
 * fails -- the guarding condition is not visible here; confirm).
 */
655 nkf_xmalloc(size_t size)
659 if (size == 0) size = 1;
663 perror("can't malloc");
/*
 * Reallocation wrapper: resizes ptr to size bytes with realloc().
 * A request for 0 bytes is turned into a request for 1 byte.
 * Reports "can't realloc" through perror() (presumably when realloc()
 * fails -- the guarding condition is not visible here; confirm).
 */
671 nkf_xrealloc(void *ptr, size_t size)
673 if (size == 0) size = 1;
/* the old pointer is overwritten with realloc()'s result */
675 ptr = realloc(ptr, size);
677 perror("can't realloc");
684 #define nkf_xfree(ptr) free(ptr)
/*
 * Compare src and target while ignoring the case of ASCII letters
 * (both sides are folded with nkf_toupper).
 * Returns FALSE at the first differing character, or when one string is
 * longer than the other.
 */
687 nkf_str_caseeql(const char *src, const char *target)
690 for (i = 0; src[i] && target[i]; i++) {
691 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
/* the loop stopped at a NUL; a remaining character on either side means the lengths differ */
693 if (src[i] || target[i]) return FALSE;
/*
 * Map an encoding index to its entry in nkf_encoding_table.
 * Indices outside 0 .. NKF_ENCODING_TABLE_SIZE-1 are handled separately by
 * the range check; valid indices yield a pointer to the table entry.
 */
698 nkf_enc_from_index(int idx)
700 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
703 return &nkf_encoding_table[idx];
/*
 * Look up an encoding name in encoding_name_to_id_table and return the id of
 * the first entry whose name matches case-insensitively (nkf_str_caseeql).
 * NOTE(review): the result for an unknown name is not visible here; callers
 * appear to treat a negative value as "not found" -- confirm.
 */
707 nkf_enc_find_index(const char *name)
/* a leading "X-" is skipped before the lookup; only an upper-case 'X' is recognised */
710 if (name[0] == 'X' && *(name+1) == '-') name += 2;
/* the table ends at the first entry whose id is negative */
711 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
712 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
713 return encoding_name_to_id_table[i].id;
720 nkf_enc_find(const char *name)
723 idx = nkf_enc_find_index(name);
724 if (idx < 0) return 0;
725 return nkf_enc_from_index(idx);
728 #define nkf_enc_name(enc) (enc)->name
729 #define nkf_enc_to_index(enc) (enc)->id
730 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
731 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
732 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
733 #define nkf_enc_asciicompat(enc) (\
734 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
735 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
736 #define nkf_enc_unicode_p(enc) (\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
738 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
739 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
740 #define nkf_enc_cp5022x_p(enc) (\
741 nkf_enc_to_index(enc) == CP50220 ||\
742 nkf_enc_to_index(enc) == CP50221 ||\
743 nkf_enc_to_index(enc) == CP50222)
745 #ifdef DEFAULT_CODE_LOCALE
749 #ifdef HAVE_LANGINFO_H
750 return nl_langinfo(CODESET);
751 #elif defined(__WIN32__)
753 sprintf(buf, "CP%d", GetACP());
755 #elif defined(__OS2__)
756 # if defined(INT_IS_SHORT)
762 ULONG ulCP[1], ulncp;
763 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
764 if (ulCP[0] == 932 || ulCP[0] == 943)
765 strcpy(buf, "Shift_JIS");
767 sprintf(buf, "CP%lu", ulCP[0]);
775 nkf_locale_encoding()
777 nkf_encoding *enc = 0;
778 const char *encname = nkf_locale_charmap();
780 enc = nkf_enc_find(encname);
783 #endif /* DEFAULT_CODE_LOCALE */
788 return &nkf_encoding_table[UTF_8];
/*
 * Choose the default encoding.
 * With DEFAULT_CODE_LOCALE the result of nkf_locale_encoding() is used;
 * otherwise, when DEFAULT_ENCIDX is defined, the table entry for that index.
 * If neither produced an encoding, nkf_utf8_encoding() is the fallback.
 */
792 nkf_default_encoding()
794 nkf_encoding *enc = 0;
795 #ifdef DEFAULT_CODE_LOCALE
796 enc = nkf_locale_encoding();
797 #elif defined(DEFAULT_ENCIDX)
798 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
/* enc is still null when no default is configured or the lookup returned nothing */
800 if (!enc) enc = nkf_utf8_encoding();
811 nkf_buf_new(int length)
813 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
814 buf->ptr = nkf_xmalloc(length);
822 nkf_buf_dispose(nkf_buf_t *buf)
829 #define nkf_buf_length(buf) ((buf)->len)
830 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
/*
 * Return the element stored at position index of buf without removing it.
 * NOTE(review): the assertion admits index == buf->len, which is one past
 * the last element written by nkf_buf_push; it may have been meant as
 * `index < buf->len` -- confirm against the callers before tightening.
 */
833 nkf_buf_at(nkf_buf_t *buf, int index)
835 assert(index <= buf->len);
836 return buf->ptr[index];
840 nkf_buf_clear(nkf_buf_t *buf)
846 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
848 if (buf->capa <= buf->len) {
851 buf->ptr[buf->len++] = c;
855 nkf_buf_pop(nkf_buf_t *buf)
857 assert(!nkf_buf_empty_p(buf));
858 return buf->ptr[--buf->len];
861 /* Normalization Form C */
864 #define fprintf dllprintf
870 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
877 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
878 #ifdef UTF8_OUTPUT_ENABLE
879 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
880 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
/* Upper-case J/S/E[/W] select the INPUT encoding (lower-case j/s/e/w select
 * the output encoding), so both variants of this help line say "input". */
883 #ifdef UTF8_INPUT_ENABLE
884 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
885 " UTF option is -W[8,[16,32][B,L]]\n"
887 " J/S/E Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
891 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
892 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
893 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
896 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
897 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
898 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
899 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
902 " O Output to File (DEFAULT 'nkf.out')\n"
903 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
906 " --ic=<encoding> Specify the input encoding\n"
907 " --oc=<encoding> Specify the output encoding\n"
908 " --hiragana --katakana Hiragana/Katakana Conversion\n"
909 " --katakana-hiragana Converts each other\n"
913 " --{cap, url}-input Convert hex after ':' or '%%'\n"
915 #ifdef NUMCHAR_OPTION
916 " --numchar-input Convert Unicode Character Reference\n"
918 #ifdef UTF8_INPUT_ENABLE
919 " --fb-{skip, html, xml, perl, java, subchar}\n"
920 " Specify unassigned character's replacement\n"
925 " --in-place[=SUF] Overwrite original files\n"
926 " --overwrite[=SUF] Preserve timestamp of original files\n"
928 " -g --guess Guess the input code\n"
929 " -v --version Print the version\n"
930 " --help/-V Print this help / configuration\n"
936 show_configuration(void)
939 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
940 " Compile-time options:\n"
941 " Compiled at: " __DATE__ " " __TIME__ "\n"
944 " Default output encoding: "
945 #ifdef DEFAULT_CODE_LOCALE
946 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
947 #elif defined(DEFAULT_ENCIDX)
948 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
954 " Default output end of line: "
955 #if DEFAULT_NEWLINE == CR
957 #elif DEFAULT_NEWLINE == CRLF
963 " Decode MIME encoded string: "
964 #if MIME_DECODE_DEFAULT
970 " Convert JIS X 0201 Katakana: "
977 " --help, --version output: "
978 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the name of the backup file for filename from suffix.
 * In one path every '*' in suffix is replaced by the whole filename; in the
 * other path the result is simply filename followed by suffix (presumably
 * chosen when suffix contains no '*' -- the selecting condition is not
 * visible here; confirm).
 * The returned string is allocated with nkf_xmalloc.
 */
989 get_backup_filename(const char *suffix, const char *filename)
991 char *backup_filename;
992 int asterisk_count = 0;
994 int filename_length = strlen(filename);
/* count the '*' placeholders in suffix */
996 for(i = 0; suffix[i]; i++){
997 if(suffix[i] == '*') asterisk_count++;
/* each one-character '*' grows into filename_length characters, i.e.
 * (filename_length - 1) extra bytes per placeholder, plus 1 for the NUL */
1001 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
/* i walks suffix, j is the write position in backup_filename */
1002 for(i = 0, j = 0; suffix[i];){
1003 if(suffix[i] == '*'){
/* terminate what has been written so far so that strncat appends at j */
1004 backup_filename[j] = '\0';
1005 strncat(backup_filename, filename, filename_length);
1007 j += filename_length;
/* any other character is copied through unchanged */
1009 backup_filename[j++] = suffix[i++];
1012 backup_filename[j] = '\0';
/* plain concatenation: filename followed by suffix */
1014 j = filename_length + strlen(suffix);
1015 backup_filename = nkf_xmalloc(j + 1);
1016 strcpy(backup_filename, filename);
1017 strcat(backup_filename, suffix);
1018 backup_filename[j] = '\0';
1020 return backup_filename;
1024 #ifdef UTF8_INPUT_ENABLE
1026 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1033 (*f)(0, bin2hex(c>>shift));
1044 encode_fallback_html(nkf_char c)
1049 if(c >= NKF_INT32_C(1000000))
1050 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1051 if(c >= NKF_INT32_C(100000))
1052 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1054 (*oconv)(0, 0x30+(c/10000 )%10);
1056 (*oconv)(0, 0x30+(c/1000 )%10);
1058 (*oconv)(0, 0x30+(c/100 )%10);
1060 (*oconv)(0, 0x30+(c/10 )%10);
1062 (*oconv)(0, 0x30+ c %10);
1068 encode_fallback_xml(nkf_char c)
1073 nkf_each_char_to_hex(oconv, c);
1079 encode_fallback_java(nkf_char c)
1083 if(!nkf_char_unicode_bmp_p(c)){
1087 (*oconv)(0, bin2hex(c>>20));
1088 (*oconv)(0, bin2hex(c>>16));
1092 (*oconv)(0, bin2hex(c>>12));
1093 (*oconv)(0, bin2hex(c>> 8));
1094 (*oconv)(0, bin2hex(c>> 4));
1095 (*oconv)(0, bin2hex(c ));
1100 encode_fallback_perl(nkf_char c)
1105 nkf_each_char_to_hex(oconv, c);
/*
 * Fallback that emits the substitution character: the incoming c is
 * discarded and unicode_subchar is sent to oconv instead, split into its
 * second-lowest byte (first argument) and lowest byte (second argument).
 */
1111 encode_fallback_subchar(nkf_char c)
1113 c = unicode_subchar;
1114 (*oconv)((c>>8)&0xFF, c&0xFF);
1119 static const struct {
1143 {"katakana-hiragana","h3"},
1151 #ifdef UTF8_OUTPUT_ENABLE
1161 {"fb-subchar=", ""},
1163 #ifdef UTF8_INPUT_ENABLE
1164 {"utf8-input", "W"},
1165 {"utf16-input", "W16"},
1166 {"no-cp932ext", ""},
1167 {"no-best-fit-chars",""},
1169 #ifdef UNICODE_NORMALIZATION
1170 {"utf8mac-input", ""},
1182 #ifdef NUMCHAR_OPTION
1183 {"numchar-input", ""},
1189 #ifdef SHIFTJIS_CP932
1200 set_input_encoding(nkf_encoding *enc)
1202 switch (nkf_enc_to_index(enc)) {
1209 #ifdef SHIFTJIS_CP932
1212 #ifdef UTF8_OUTPUT_ENABLE
1213 ms_ucs_map_f = UCS_MAP_CP932;
1223 case ISO_2022_JP_2004:
1230 #ifdef SHIFTJIS_CP932
1233 #ifdef UTF8_OUTPUT_ENABLE
1234 ms_ucs_map_f = UCS_MAP_CP932;
1239 #ifdef SHIFTJIS_CP932
1242 #ifdef UTF8_OUTPUT_ENABLE
1243 ms_ucs_map_f = UCS_MAP_CP10001;
1251 #ifdef SHIFTJIS_CP932
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_CP932;
1259 #ifdef SHIFTJIS_CP932
1262 #ifdef UTF8_OUTPUT_ENABLE
1263 ms_ucs_map_f = UCS_MAP_MS;
1267 #ifdef SHIFTJIS_CP932
1270 #ifdef UTF8_OUTPUT_ENABLE
1271 ms_ucs_map_f = UCS_MAP_ASCII;
1274 case SHIFT_JISX0213:
1275 case SHIFT_JIS_2004:
1277 #ifdef SHIFTJIS_CP932
1284 #ifdef SHIFTJIS_CP932
1288 #ifdef UTF8_INPUT_ENABLE
1289 #ifdef UNICODE_NORMALIZATION
1297 input_endian = ENDIAN_BIG;
1301 input_endian = ENDIAN_LITTLE;
1306 input_endian = ENDIAN_BIG;
1310 input_endian = ENDIAN_LITTLE;
1317 set_output_encoding(nkf_encoding *enc)
1319 switch (nkf_enc_to_index(enc)) {
1322 #ifdef SHIFTJIS_CP932
1323 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1325 #ifdef UTF8_OUTPUT_ENABLE
1326 ms_ucs_map_f = UCS_MAP_CP932;
1330 #ifdef SHIFTJIS_CP932
1331 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1333 #ifdef UTF8_OUTPUT_ENABLE
1334 ms_ucs_map_f = UCS_MAP_CP932;
1338 #ifdef SHIFTJIS_CP932
1339 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1344 #ifdef SHIFTJIS_CP932
1345 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1351 #ifdef SHIFTJIS_CP932
1352 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1358 #ifdef UTF8_OUTPUT_ENABLE
1359 ms_ucs_map_f = UCS_MAP_CP932;
1363 #ifdef UTF8_OUTPUT_ENABLE
1364 ms_ucs_map_f = UCS_MAP_CP10001;
1369 #ifdef SHIFTJIS_CP932
1370 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1372 #ifdef UTF8_OUTPUT_ENABLE
1373 ms_ucs_map_f = UCS_MAP_ASCII;
1378 #ifdef SHIFTJIS_CP932
1379 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1381 #ifdef UTF8_OUTPUT_ENABLE
1382 ms_ucs_map_f = UCS_MAP_ASCII;
1386 #ifdef SHIFTJIS_CP932
1387 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1389 #ifdef UTF8_OUTPUT_ENABLE
1390 ms_ucs_map_f = UCS_MAP_CP932;
1395 #ifdef UTF8_OUTPUT_ENABLE
1396 ms_ucs_map_f = UCS_MAP_MS;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_ASCII;
1405 case SHIFT_JISX0213:
1406 case SHIFT_JIS_2004:
1408 #ifdef SHIFTJIS_CP932
1409 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1416 #ifdef SHIFTJIS_CP932
1417 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1420 #ifdef UTF8_OUTPUT_ENABLE
1422 output_bom_f = TRUE;
1426 output_bom_f = TRUE;
1429 output_endian = ENDIAN_LITTLE;
1430 output_bom_f = FALSE;
1433 output_endian = ENDIAN_LITTLE;
1434 output_bom_f = TRUE;
1438 output_bom_f = TRUE;
1441 output_endian = ENDIAN_LITTLE;
1442 output_bom_f = FALSE;
1445 output_endian = ENDIAN_LITTLE;
1446 output_bom_f = TRUE;
1452 static struct input_code*
1453 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1456 struct input_code *p = input_code_list;
1458 if (iconv_func == p->iconv_func){
1468 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1470 #ifdef INPUT_CODE_FIX
1471 if (f || !input_encoding)
1478 #ifdef INPUT_CODE_FIX
1479 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1485 if (estab_f && iconv_for_check != iconv){
1486 struct input_code *p = find_inputcode_byfunc(iconv);
1488 set_input_codename(p->name);
1491 iconv_for_check = iconv;
1498 x0212_shift(nkf_char c)
1503 if (0x75 <= c && c <= 0x7f){
1504 ret = c + (0x109 - 0x75);
1507 if (0x75 <= c && c <= 0x7f){
1508 ret = c + (0x113 - 0x75);
1516 x0212_unshift(nkf_char c)
1519 if (0x7f <= c && c <= 0x88){
1520 ret = c + (0x75 - 0x7f);
1521 }else if (0x89 <= c && c <= 0x92){
1522 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1526 #endif /* X0212_ENABLE */
1529 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1535 if((0x21 <= ndx && ndx <= 0x2F)){
1536 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1537 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1539 }else if(0x6E <= ndx && ndx <= 0x7E){
1540 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1541 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1547 else if(nkf_isgraph(ndx)){
1549 const unsigned short *ptr;
1550 ptr = x0212_shiftjis[ndx - 0x21];
1552 val = ptr[(c1 & 0x7f) - 0x21];
1561 c2 = x0212_shift(c2);
1563 #endif /* X0212_ENABLE */
1565 if(0x7F < c2) return 1;
1566 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1567 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert a Shift_JIS byte pair (c2 lead, c1 trail) into the
 * internal two-byte code.
 *
 * p2, p1: presumably receive the converted pair (the stores are on lines
 *         not shown; callers pass the addresses of their own c2/c1).
 * Returns 1 when c1 is above 0xFC. Callers test the result against 0 or
 * treat non-zero as an error.
 *
 * NOTE(review): the table lookups below index with [c1 - 0x40]; no lower
 * bound check on c1 is visible in this function -- confirm that every
 * caller guarantees c1 >= 0x40.
 */
1572 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1574 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* Indexed by [c2 - 0xF0][0x9E < c1]; used only on the x0213_f path below. */
1577 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1578 if (0xFC < c1) return 1;
1579 #ifdef SHIFTJIS_CP932
/* IBM-extension lead bytes are remapped through shiftjis_cp932 unless cp932inv_f is set. */
1580 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1581 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1588 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1589 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1595 #endif /* SHIFTJIS_CP932 */
/* Without x0213_f, IBM-extension lead bytes go through shiftjis_x0212 and
 * the result is tagged with PREFIX_EUCG3. */
1597 if (!x0213_f && is_ibmext_in_sjis(c2)){
1598 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1601 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
/* With x0213_f, lead bytes 0xF0 and above are computed into PREFIX_EUCG3-tagged rows. */
1614 if(x0213_f && c2 >= 0xF0){
1615 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1616 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1617 }else{ /* 78<=k<=94 */
1618 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1619 if (0x9E < c1) c2++;
1622 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1623 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* Generic path: each lead byte covers two rows; a trail byte above 0x9E selects the second. */
1624 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1625 if (0x9E < c1) c2++;
1628 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1635 c2 = x0212_unshift(c2);
1642 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8.
 *
 * p1..p4: receive the encoded bytes in order. The branches visible here
 *         write two, three or four of them; the one-byte case and the
 *         handling of unused outputs are on lines not shown (callers such
 *         as w_oconv skip c2..c4 when they are zero, so unused outputs are
 *         presumably set to 0 -- confirm).
 */
1644 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1652 }else if (val < 0x800){
/* two bytes: 110xxxxx 10xxxxxx */
1653 *p1 = 0xc0 | (val >> 6);
1654 *p2 = 0x80 | (val & 0x3f);
1657 } else if (nkf_char_unicode_bmp_p(val)) {
/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
1658 *p1 = 0xe0 | (val >> 12);
1659 *p2 = 0x80 | ((val >> 6) & 0x3f);
1660 *p3 = 0x80 | ( val & 0x3f);
1662 } else if (nkf_char_unicode_value_p(val)) {
/* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
1663 *p1 = 0xf0 | (val >> 18);
1664 *p2 = 0x80 | ((val >> 12) & 0x3f);
1665 *p3 = 0x80 | ((val >> 6) & 0x3f);
1666 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: decode one UTF-8 sequence into a code point.
 *
 * c1:     lead byte; every branch below classifies the sequence by c1.
 * c2..c4: following bytes; only as many as the lead byte calls for are
 *         read, and only their low 6 bits are used.
 * The value is accumulated in wc. The statements that reject a byte and the
 * final return are on lines not shown here.
 */
1676 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* Only 0x80..0xC1 is rejected here: 0xC2 and 0xC3 are valid two-byte lead
 * bytes (U+0080..U+00FF) and must reach the two-byte branch. */
1683 else if (c1 <= 0xC1) {
1684 /* trail byte (0x80-0xBF) or invalid lead byte (0xC0, 0xC1) */
1687 else if (c1 <= 0xDF) {
1689 wc = (c1 & 0x1F) << 6;
1692 else if (c1 <= 0xEF) {
1694 wc = (c1 & 0x0F) << 12;
1695 wc |= (c2 & 0x3F) << 6;
/* Test the lead byte, not c2: a continuation byte is never above 0xBF, so a
 * test on c2 would accept lead bytes 0xF5..0xFF as four-byte sequences. */
1698 else if (c1 <= 0xF4) {
1700 wc = (c1 & 0x0F) << 18;
1701 wc |= (c2 & 0x3F) << 12;
1702 wc |= (c3 & 0x3F) << 6;
1712 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: two-level table lookup for one character.
 *
 * c1:     index into pp (selects a row table); must be in 0..psize-1.
 * c0:     index into the selected row; must be below sizeof_utf8_to_euc_C2.
 * pp:     array of row tables; a null pp or a null row means "no mapping".
 * psize:  number of entries in pp.
 * p2, p1: presumably receive the converted pair (the stores are on lines
 *         not shown).
 * Returns 1 for every miss visible here: null table, index out of range,
 * null row, or a zero table entry.
 */
1714 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1715 const unsigned short *const *pp, nkf_char psize,
1716 nkf_char *p2, nkf_char *p1)
1719 const unsigned short *p;
1722 if (pp == 0) return 1;
1725 if (c1 < 0 || psize <= c1) return 1;
1727 if (p == 0) return 1;
1730 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1732 if (val == 0) return 1;
/* With no_cp932ext_f set, vendor-extension results are filtered by value range. */
1733 if (no_cp932ext_f && (
1734 (val>>8) == 0x2D || /* NEC special characters */
1735 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1743 if (c2 == SO) c2 = JIS_X_0201_1976_K; /* an SO lead value is reported as JIS_X_0201_1976_K */
/*
 * unicode_to_jis_common: map a UTF-8 byte triple to a two-byte code.
 *
 * c2, c1, c0: UTF-8 bytes, lead byte first (c2). The branch taken depends
 *             on the lead byte: below 0xe0 uses the two-byte tables, below
 *             0xF0 uses the three-byte tables.
 * p2, p1:     receive the result through unicode_to_jis_common2.
 * Returns 1 from the visible lines when no_best_fit_chars_f is set and the
 * character is on one of the exclusion lists below; otherwise the lookup
 * result ret (the final return is on a line not shown).
 */
1751 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1753 const unsigned short *const *pp;
1754 const unsigned short *const *const *ppp;
/* Exclusion tables for two-byte sequences, indexed by c1 & 0x3F; a non-zero
 * entry makes the function return 1 when no_best_fit_chars_f is set. */
1755 static const char no_best_fit_chars_table_C2[] =
1756 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1757 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1758 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1759 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1760 static const char no_best_fit_chars_table_C2_ms[] =
1761 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1762 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1763 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1764 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1765 static const char no_best_fit_chars_table_932_C2[] =
1766 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1767 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1768 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1769 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1770 static const char no_best_fit_chars_table_932_C3[] =
1771 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1772 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1773 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1774 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (lead byte below 0xe0). */
1780 }else if(c2 < 0xe0){
1781 if(no_best_fit_chars_f){
1782 if(ms_ucs_map_f == UCS_MAP_CP932){
1785 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1788 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1791 }else if(!cp932inv_f){
1794 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1797 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1800 }else if(ms_ucs_map_f == UCS_MAP_MS){
1801 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1802 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* The mapping table set follows ms_ucs_map_f. */
1820 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1821 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1822 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1824 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* Three-byte sequences. The guard is on the lead byte c2 because ppp is
 * indexed with c2 - 0xE0 below; guarding on the last byte c0 would let a
 * lead byte of 0xF0 or above index past the table set. */
1825 }else if(c2 < 0xF0){
1826 if(no_best_fit_chars_f){
1827 if(ms_ucs_map_f == UCS_MAP_CP932){
1828 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1829 }else if(ms_ucs_map_f == UCS_MAP_MS){
1834 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1837 if(c0 == 0x92) return 1;
1842 if(c1 == 0x80 || c0 == 0x9C) return 1;
1845 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1850 if(c0 == 0x94) return 1;
1853 if(c0 == 0xBB) return 1;
1863 if(c0 == 0x95) return 1;
1866 if(c0 == 0xA5) return 1;
1873 if(c0 == 0x8D) return 1;
1876 if(c0 == 0x9E && !cp932inv_f) return 1;
1879 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1887 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1888 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1889 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1891 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1893 #ifdef SHIFTJIS_CP932
/* A PREFIX_EUCG3-tagged result is round-tripped through e2s_conv and
 * s2e_conv when cp932inv_f is clear. */
1894 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1896 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1897 s2e_conv(s2, s1, p2, p1);
1906 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the Unicode value for an internal two-byte code.
 *
 * c2: row / plane selector (JIS_X_0201_1976_K, a PREFIX_EUCG3-tagged row,
 *     or a plain row).
 * c1: cell within the row.
 * The row table p is chosen first, then indexed by c1. The return
 * statements are on lines not shown; code_score treats a zero result as
 * "no such character".
 */
1908 e2w_conv(nkf_char c2, nkf_char c1)
1910 const unsigned short *p;
1912 if (c2 == JIS_X_0201_1976_K) {
1913 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1921 p = euc_to_utf8_1byte;
1923 } else if (is_eucg3(c2)){
/* One code point (0x8F22, cell 0x43) is special-cased under UCS_MAP_ASCII. */
1924 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* Row index: strip the high bits and rebase so that row 0x21 becomes 0. */
1927 c2 = (c2&0x7f) - 0x21;
1928 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1929 p = x0212_to_utf8_2bytes[c2];
1935 c2 = (c2&0x7f) - 0x21;
1936 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
/* The table set follows ms_ucs_map_f; the _ms tables are the fallback. */
1938 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1939 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1940 euc_to_utf8_2bytes_ms[c2];
/* Cell index, rebased the same way and range-checked before use. */
1945 c1 = (c1 & 0x7f) - 0x21;
1946 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w2e_conv: convert UTF-8 bytes (c2 lead, then c1, c0) to the internal
 * two-byte code via unicode_to_jis_common.
 * p2, p1: outputs, passed straight through to unicode_to_jis_common.
 * Handling of lead bytes outside 0xc0..0xef is on lines not shown.
 */
1953 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1960 }else if (0xc0 <= c2 && c2 <= 0xef) {
1961 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1962 #ifdef NUMCHAR_OPTION
/* Under NUMCHAR_OPTION the raw code point can be stored in *p1 instead; the
 * condition guarding this store is not visible (presumably a failed lookup). */
1965 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1973 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert a Unicode code point to the internal two-byte code.
 *
 * val:    code point.
 * p2, p1: outputs. For BMP values the code point is re-encoded as UTF-8 and
 *         looked up with unicode_to_jis_common. On the other visible paths
 *         *p1 receives nkf_char_unicode_new(val); the conditions selecting
 *         those paths are not visible.
 */
1975 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1977 nkf_char c1, c2, c3, c4;
1984 else if (nkf_char_unicode_bmp_p(val)){
1985 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1986 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1989 *p1 = nkf_char_unicode_new(val);
1995 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter for EUC-style byte sequences.
 *
 * c2, c1, c0: input bytes; c0 is read only on the 0x8f (three-byte) path.
 * What is done with the converted pair and the return value are on lines
 * not shown here.
 */
2002 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2004 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
/* With iso2022jp_f set and x0201_f clear, the character is replaced by the GETA1/GETA2 pair. */
2005 if (iso2022jp_f && !x0201_f) {
2006 c2 = GETA1; c1 = GETA2;
2008 c2 = JIS_X_0201_1976_K;
2012 }else if (c2 == 0x8f){
/* Rows 0xF5..0xFE of the 0x8f plane map linearly (94 cells per row) to code
 * points starting at 0xE3AC, i.e. 0xE000 + 10 * 94. */
2016 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2017 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2018 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2021 c2 = (c2 << 8) | (c1 & 0x7f); /* fold the 0x8f prefix and the row byte into one value */
2023 #ifdef SHIFTJIS_CP932
/* Round trip through Shift_JIS; the guarding condition is on a line not shown. */
2026 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2027 s2e_conv(s2, s1, &c2, &c1);
2034 #endif /* SHIFTJIS_CP932 */
2036 #endif /* X0212_ENABLE */
2037 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
/* Rows 0xF5..0xFE of the two-byte plane map linearly to code points from 0xE000. */
2040 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2041 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2042 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2047 #ifdef SHIFTJIS_CP932
/* With cp51932_f, rows 0x79..0x7c are round-tripped through Shift_JIS. */
2048 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2050 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2051 s2e_conv(s2, s1, &c2, &c1);
2058 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter for Shift_JIS byte sequences.
 *
 * c2, c1: lead and trail byte. c0 is not read in the visible lines.
 * Returns 0 for the trail byte 0x7F in the 0xF0..0xF9 range, and the
 * s2e_conv result when that conversion fails; other returns are on lines
 * not shown.
 */
2066 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2068 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
/* With iso2022jp_f set and x0201_f clear, the character is replaced by the GETA1/GETA2 pair. */
2069 if (iso2022jp_f && !x0201_f) {
2070 c2 = GETA1; c1 = GETA2;
2074 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* Without x0213_f, lead bytes 0xF0..0xF9 map linearly to code points from
 * 0xE000: 188 cells per lead byte, with the gap at trail byte 0x7F closed
 * up by the (0x7E < c1) term. */
2076 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2078 if(c1 == 0x7F) return 0;
2079 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
/* Everything else is converted in place by s2e_conv. */
2082 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2083 if (ret) return ret;
/*
 * w_iconv: input converter for UTF-8 byte sequences.
 *
 * c1: lead byte (0 is handled as the one-byte case), c2, c3: following
 *     bytes. h_conv passes the third and fourth bytes packed into the last
 *     argument as (c3<<8)|c4; the unpacking into the local c4 is presumably
 *     on a line not shown.
 * Visible return values: 0 when a byte fails validation, -1 / -2 when c3 is
 * 0 in the three-byte / four-byte checks (h_conv then fetches more bytes
 * and retries).
 */
2090 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2092 nkf_char ret = 0, c4 = 0;
/* Class of each lead byte 0xC0..0xFF, indexed by c1 - 0xC0; the switch
 * below applies a different continuation-byte range check per class. */
2093 static const char w_iconv_utf8_1st_byte[] =
2095 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2096 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2097 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2098 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2105 if (c1 < 0 || 0xff < c1) {
2106 }else if (c1 == 0) { /* 0 : 1 byte*/
2108 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2111 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
/* The case labels are not visible; each group below validates c2 against a
 * class-specific range and the remaining bytes against 10xxxxxx. */
2113 if (c2 < 0x80 || 0xBF < c2) return 0;
2116 if (c3 == 0) return -1;
2117 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2122 if (c3 == 0) return -1;
2123 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2127 if (c3 == 0) return -1;
2128 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2132 if (c3 == 0) return -2;
2133 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2137 if (c3 == 0) return -2;
2138 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2142 if (c3 == 0) return -2;
2143 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2151 if (c1 == 0 || c1 == EOF){
/* Four-byte sequences are carried as a raw code point; shorter ones go through w2e_conv. */
2152 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2153 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2156 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2164 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * unicode_iconv: convert one decoded code point wc for the UTF-16/UTF-32
 * input paths.
 * Returns NKF_ICONV_INVALID_CODE_RANGE for surrogates and for values not
 * covered by the range tests below, or the non-zero result of w16e_conv.
 * Other returns are on lines not shown.
 *
 * NOTE(review): both upper bounds are exclusive. U+FFFF therefore takes the
 * supplementary branch, and U+10FFFF is rejected as out of range, while
 * w_oconv16 accepts values up to UNICODE_MAX -- confirm this is intended.
 */
2166 unicode_iconv(nkf_char wc)
2174 }else if ((wc>>11) == 27) { /* 27 << 11 == 0xD800, so this matches 0xD800..0xDFFF */
2175 /* unpaired surrogate */
2176 return NKF_ICONV_INVALID_CODE_RANGE;
2177 }else if (wc < 0xFFFF) {
2178 ret = w16e_conv(wc, &c2, &c1);
2179 if (ret) return ret;
2180 }else if (wc < 0x10FFFF) {
2182 c1 = nkf_char_unicode_new(wc); /* carried as a raw code point, no table lookup */
2184 return NKF_ICONV_INVALID_CODE_RANGE;
2190 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2191 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2192 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * nkf_iconv_utf_16: decode one UTF-16 unit, or a surrogate pair, from raw
 * bytes and hand the code point to unicode_iconv.
 *
 * c1..c4: input bytes in stream order; c3 and c4 are read only when the
 *         first unit is a high surrogate.
 * Returns NKF_ICONV_NEED_TWO_MORE_BYTES when a high surrogate is not
 * followed by a low-surrogate byte (0xDC..0xDF), otherwise the result of
 * unicode_iconv.
 */
2194 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2203 if (input_endian == ENDIAN_BIG) {
/* Big endian: the high byte of each unit comes first (c1, c3). */
2204 if (0xD8 <= c1 && c1 <= 0xDB) {
2205 if (0xDC <= c3 && c3 <= 0xDF) {
2206 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2207 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* Other byte order: the high byte of each unit comes second (c2, c4). */
2212 if (0xD8 <= c2 && c2 <= 0xDB) {
2213 if (0xDC <= c4 && c4 <= 0xDF) {
2214 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2215 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2221 return (*unicode_iconv)(wc);
2225 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2231 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * nkf_iconv_utf_32: assemble a code point from four raw bytes according to
 * input_endian and hand it to unicode_iconv.
 *
 * Each arm combines three of the four bytes; the byte left out would be the
 * most significant one in that byte order. The case labels are not visible;
 * the order notes below are inferred from which byte is left out and how
 * the rest are shifted.
 * Returns NKF_ICONV_INVALID_CODE_RANGE on the remaining switch path,
 * otherwise the result of unicode_iconv.
 */
2237 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2246 switch(input_endian){
2248 wc = c2 << 16 | c3 << 8 | c4; /* byte order 1-2-3-4 (c1 left out) */
2251 wc = c3 << 16 | c2 << 8 | c1; /* byte order 4-3-2-1 (c4 left out) */
2254 wc = c1 << 16 | c4 << 8 | c3; /* byte order 2-1-4-3 (c2 left out) */
2257 wc = c4 << 16 | c1 << 8 | c2; /* byte order 3-4-1-2 (c3 left out) */
2260 return NKF_ICONV_INVALID_CODE_RANGE;
2263 return (*unicode_iconv)(wc);
2267 #define output_ascii_escape_sequence(mode) do { \
2268 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2271 (*o_putc)(ascii_intro); \
2272 output_mode = mode; \
2277 output_escape_sequence(int mode)
2279 if (output_mode == mode)
2287 case JIS_X_0201_1976_K:
2295 (*o_putc)(kanji_intro);
2320 j_oconv(nkf_char c2, nkf_char c1)
2322 #ifdef NUMCHAR_OPTION
2323 if (c2 == 0 && nkf_char_unicode_p(c1)){
2324 w16e_conv(c1, &c2, &c1);
2325 if (c2 == 0 && nkf_char_unicode_p(c1)){
2326 c2 = c1 & VALUE_MASK;
2327 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2330 c2 = 0x7F + c1 / 94;
2331 c1 = 0x21 + c1 % 94;
2333 if (encode_fallback) (*encode_fallback)(c1);
2340 output_ascii_escape_sequence(ASCII);
2343 else if (c2 == EOF) {
2344 output_ascii_escape_sequence(ASCII);
2347 else if (c2 == ISO_8859_1) {
2348 output_ascii_escape_sequence(ISO_8859_1);
2351 else if (c2 == JIS_X_0201_1976_K) {
2352 output_escape_sequence(JIS_X_0201_1976_K);
2355 } else if (is_eucg3(c2)){
2356 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2357 (*o_putc)(c2 & 0x7f);
2362 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2363 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2364 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2371 e_oconv(nkf_char c2, nkf_char c1)
2373 if (c2 == 0 && nkf_char_unicode_p(c1)){
2374 w16e_conv(c1, &c2, &c1);
2375 if (c2 == 0 && nkf_char_unicode_p(c1)){
2376 c2 = c1 & VALUE_MASK;
2377 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2381 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2382 c1 = 0x21 + c1 % 94;
2385 (*o_putc)((c2 & 0x7f) | 0x080);
2386 (*o_putc)(c1 | 0x080);
2388 (*o_putc)((c2 & 0x7f) | 0x080);
2389 (*o_putc)(c1 | 0x080);
2393 if (encode_fallback) (*encode_fallback)(c1);
2401 } else if (c2 == 0) {
2402 output_mode = ASCII;
2404 } else if (c2 == JIS_X_0201_1976_K) {
2405 output_mode = EUC_JP;
2406 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2407 } else if (c2 == ISO_8859_1) {
2408 output_mode = ISO_8859_1;
2409 (*o_putc)(c1 | 0x080);
2411 } else if (is_eucg3(c2)){
2412 output_mode = EUC_JP;
2413 #ifdef SHIFTJIS_CP932
2416 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2417 s2e_conv(s2, s1, &c2, &c1);
2422 output_mode = ASCII;
2424 }else if (is_eucg3(c2)){
2427 (*o_putc)((c2 & 0x7f) | 0x080);
2428 (*o_putc)(c1 | 0x080);
2431 (*o_putc)((c2 & 0x7f) | 0x080);
2432 (*o_putc)(c1 | 0x080);
2436 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2437 set_iconv(FALSE, 0);
2438 return; /* too late to rescue this char */
2440 output_mode = EUC_JP;
2441 (*o_putc)(c2 | 0x080);
2442 (*o_putc)(c1 | 0x080);
2447 s_oconv(nkf_char c2, nkf_char c1)
2449 #ifdef NUMCHAR_OPTION
2450 if (c2 == 0 && nkf_char_unicode_p(c1)){
2451 w16e_conv(c1, &c2, &c1);
2452 if (c2 == 0 && nkf_char_unicode_p(c1)){
2453 c2 = c1 & VALUE_MASK;
2454 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2457 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2459 c1 += 0x40 + (c1 > 0x3e);
2464 if(encode_fallback)(*encode_fallback)(c1);
2473 } else if (c2 == 0) {
2474 output_mode = ASCII;
2476 } else if (c2 == JIS_X_0201_1976_K) {
2477 output_mode = SHIFT_JIS;
2479 } else if (c2 == ISO_8859_1) {
2480 output_mode = ISO_8859_1;
2481 (*o_putc)(c1 | 0x080);
2483 } else if (is_eucg3(c2)){
2484 output_mode = SHIFT_JIS;
2485 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2491 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2492 set_iconv(FALSE, 0);
2493 return; /* too late to rescue this char */
2495 output_mode = SHIFT_JIS;
2496 e2s_conv(c2, c1, &c2, &c1);
2498 #ifdef SHIFTJIS_CP932
2500 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2501 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2507 #endif /* SHIFTJIS_CP932 */
2510 if (prefix_table[(unsigned char)c1]){
2511 (*o_putc)(prefix_table[(unsigned char)c1]);
2517 #ifdef UTF8_OUTPUT_ENABLE
2519 w_oconv(nkf_char c2, nkf_char c1)
2525 output_bom_f = FALSE;
2536 if (c2 == 0 && nkf_char_unicode_p(c1)){
2537 val = c1 & VALUE_MASK;
2538 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2540 if (c2) (*o_putc)(c2);
2541 if (c3) (*o_putc)(c3);
2542 if (c4) (*o_putc)(c4);
2549 val = e2w_conv(c2, c1);
2551 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2553 if (c2) (*o_putc)(c2);
2554 if (c3) (*o_putc)(c3);
2555 if (c4) (*o_putc)(c4);
2561 w_oconv16(nkf_char c2, nkf_char c1)
2564 output_bom_f = FALSE;
2565 if (output_endian == ENDIAN_LITTLE){
2579 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2580 if (nkf_char_unicode_bmp_p(c1)) {
2581 c2 = (c1 >> 8) & 0xff;
2585 if (c1 <= UNICODE_MAX) {
2586 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2587 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2588 if (output_endian == ENDIAN_LITTLE){
2589 (*o_putc)(c2 & 0xff);
2590 (*o_putc)((c2 >> 8) & 0xff);
2591 (*o_putc)(c1 & 0xff);
2592 (*o_putc)((c1 >> 8) & 0xff);
2594 (*o_putc)((c2 >> 8) & 0xff);
2595 (*o_putc)(c2 & 0xff);
2596 (*o_putc)((c1 >> 8) & 0xff);
2597 (*o_putc)(c1 & 0xff);
2603 nkf_char val = e2w_conv(c2, c1);
2604 c2 = (val >> 8) & 0xff;
2609 if (output_endian == ENDIAN_LITTLE){
2619 w_oconv32(nkf_char c2, nkf_char c1)
2622 output_bom_f = FALSE;
2623 if (output_endian == ENDIAN_LITTLE){
2641 if (c2 == ISO_8859_1) {
2643 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2646 c1 = e2w_conv(c2, c1);
2649 if (output_endian == ENDIAN_LITTLE){
2650 (*o_putc)( c1 & 0xFF);
2651 (*o_putc)((c1 >> 8) & 0xFF);
2652 (*o_putc)((c1 >> 16) & 0xFF);
2656 (*o_putc)((c1 >> 16) & 0xFF);
2657 (*o_putc)((c1 >> 8) & 0xFF);
2658 (*o_putc)( c1 & 0xFF);
2663 #define SCORE_L2 (1) /* Kanji Level 2 */
2664 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2665 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2666 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2667 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2668 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2669 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2670 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
2672 #define SCORE_INIT (SCORE_iMIME)
2674 static const nkf_char score_table_A0[] = {
2677 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2678 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2681 static const nkf_char score_table_F0[] = {
2682 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2683 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2684 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2685 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: OR the given SCORE_* bits into ptr->score.
 * Lines between the signature and this statement are not shown.
 */
2689 set_code_score(struct input_code *ptr, nkf_char score)
2692 ptr->score |= score;
/*
 * clr_code_score: clear the given SCORE_* bits in ptr->score.
 * Lines between the signature and this statement are not shown.
 */
2697 clr_code_score(struct input_code *ptr, nkf_char score)
2700 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf and add the matching
 * SCORE_* flag to ptr->score.
 *
 * The first byte c2 = ptr->buf[0] drives the classification; the second
 * byte c1 = ptr->buf[1] is read only when UTF8_OUTPUT_ENABLE is defined
 * (for the e2w_conv existence test). The condition of the first branch
 * (SCORE_ERROR) is on a line not shown.
 */
2705 code_score(struct input_code *ptr)
2707 nkf_char c2 = ptr->buf[0];
2708 #ifdef UTF8_OUTPUT_ENABLE
2709 nkf_char c1 = ptr->buf[1];
2712 set_code_score(ptr, SCORE_ERROR);
2713 }else if (c2 == SS2){
2714 set_code_score(ptr, SCORE_KANA);
2715 }else if (c2 == 0x8f){
2716 set_code_score(ptr, SCORE_X0212);
2717 #ifdef UTF8_OUTPUT_ENABLE
/* A zero result from e2w_conv is scored as an undefined character. */
2718 }else if (!e2w_conv(c2, c1)){
2719 set_code_score(ptr, SCORE_NO_EXIST);
/* Bits 4-6 of c2 select a score table or a fixed score; the low nibble indexes the table. */
2721 }else if ((c2 & 0x70) == 0x20){
2722 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2723 }else if ((c2 & 0x70) == 0x70){
2724 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2725 }else if ((c2 & 0x70) >= 0x50){
2726 set_code_score(ptr, SCORE_L2);
2731 status_disable(struct input_code *ptr)
2736 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2740 status_push_ch(struct input_code *ptr, nkf_char c)
2742 ptr->buf[ptr->index++] = c;
2746 status_clear(struct input_code *ptr)
2753 status_reset(struct input_code *ptr)
2756 ptr->score = SCORE_INIT;
2760 status_reinit(struct input_code *ptr)
2763 ptr->_file_stat = 0;
2767 status_check(struct input_code *ptr, nkf_char c)
2769 if (c <= DEL && estab_f){
2775 s_status(struct input_code *ptr, nkf_char c)
2779 status_check(ptr, c);
2784 }else if (nkf_char_unicode_p(c)){
2786 }else if (0xa1 <= c && c <= 0xdf){
2787 status_push_ch(ptr, SS2);
2788 status_push_ch(ptr, c);
2791 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2793 status_push_ch(ptr, c);
2794 }else if (0xed <= c && c <= 0xee){
2796 status_push_ch(ptr, c);
2797 #ifdef SHIFTJIS_CP932
2798 }else if (is_ibmext_in_sjis(c)){
2800 status_push_ch(ptr, c);
2801 #endif /* SHIFTJIS_CP932 */
2803 }else if (0xf0 <= c && c <= 0xfc){
2805 status_push_ch(ptr, c);
2806 #endif /* X0212_ENABLE */
2808 status_disable(ptr);
2812 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2813 status_push_ch(ptr, c);
2814 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2818 status_disable(ptr);
2822 #ifdef SHIFTJIS_CP932
2823 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2824 status_push_ch(ptr, c);
2825 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2826 set_code_score(ptr, SCORE_CP932);
2831 #endif /* SHIFTJIS_CP932 */
2832 status_disable(ptr);
2835 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2836 status_push_ch(ptr, c);
2837 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2838 set_code_score(ptr, SCORE_CP932);
2841 status_disable(ptr);
2848 e_status(struct input_code *ptr, nkf_char c)
2852 status_check(ptr, c);
2857 }else if (nkf_char_unicode_p(c)){
2859 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2861 status_push_ch(ptr, c);
2863 }else if (0x8f == c){
2865 status_push_ch(ptr, c);
2866 #endif /* X0212_ENABLE */
2868 status_disable(ptr);
2872 if (0xa1 <= c && c <= 0xfe){
2873 status_push_ch(ptr, c);
2877 status_disable(ptr);
2882 if (0xa1 <= c && c <= 0xfe){
2884 status_push_ch(ptr, c);
2886 status_disable(ptr);
2888 #endif /* X0212_ENABLE */
2892 #ifdef UTF8_INPUT_ENABLE
2894 w_status(struct input_code *ptr, nkf_char c)
2898 status_check(ptr, c);
2903 }else if (nkf_char_unicode_p(c)){
2905 }else if (0xc0 <= c && c <= 0xdf){
2907 status_push_ch(ptr, c);
2908 }else if (0xe0 <= c && c <= 0xef){
2910 status_push_ch(ptr, c);
2911 }else if (0xf0 <= c && c <= 0xf4){
2913 status_push_ch(ptr, c);
2915 status_disable(ptr);
2920 if (0x80 <= c && c <= 0xbf){
2921 status_push_ch(ptr, c);
2922 if (ptr->index > ptr->stat){
2923 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2924 && ptr->buf[2] == 0xbf);
2925 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2926 &ptr->buf[0], &ptr->buf[1]);
2933 status_disable(ptr);
2937 if (0x80 <= c && c <= 0xbf){
2938 if (ptr->index < ptr->stat){
2939 status_push_ch(ptr, c);
2944 status_disable(ptr);
2952 code_status(nkf_char c)
2954 int action_flag = 1;
2955 struct input_code *result = 0;
2956 struct input_code *p = input_code_list;
2958 if (!p->status_func) {
2962 if (!p->status_func)
2964 (p->status_func)(p, c);
2967 }else if(p->stat == 0){
2978 if (result && !estab_f){
2979 set_iconv(TRUE, result->iconv_func);
2980 }else if (c <= DEL){
2981 struct input_code *ptr = input_code_list;
2991 nkf_buf_t *std_gc_buf;
2992 nkf_char broken_state;
2993 nkf_buf_t *broken_buf;
2994 nkf_char mimeout_state;
2998 static nkf_state_t *nkf_state = NULL;
3000 #define STD_GC_BUFSIZE (256)
3003 nkf_state_init(void)
3006 nkf_buf_clear(nkf_state->std_gc_buf);
3007 nkf_buf_clear(nkf_state->broken_buf);
3008 nkf_buf_clear(nkf_state->nfc_buf);
3011 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3012 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3013 nkf_state->broken_buf = nkf_buf_new(3);
3014 nkf_state->nfc_buf = nkf_buf_new(9);
3016 nkf_state->broken_state = 0;
3017 nkf_state->mimeout_state = 0;
3024 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3025 return nkf_buf_pop(nkf_state->std_gc_buf);
3032 std_ungetc(nkf_char c, FILE *f)
3034 nkf_buf_push(nkf_state->std_gc_buf, c);
3040 std_putc(nkf_char c)
3047 static unsigned char hold_buf[HOLD_SIZE*2];
3048 static int hold_count = 0;
/*
 * push_hold_buf: append one byte to hold_buf.
 *
 * c2: value to store; it is truncated to unsigned char.
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new hold_count. The statement executed when the
 * buffer is already full on entry is on a line not shown (h_conv treats
 * EOF as "buffer full").
 */
3050 push_hold_buf(nkf_char c2)
3052 if (hold_count >= HOLD_SIZE*2)
3054 hold_buf[hold_count++] = (unsigned char)c2;
3055 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3059 h_conv(FILE *f, int c1, int c2)
3065 /** it must NOT be in the kanji shifted sequence */
3066 /** it must NOT be written in JIS7 */
3067 /** and it must be after 2 byte 8bit code */
3073 while ((c2 = (*i_getc)(f)) != EOF) {
3079 if (push_hold_buf(c2) == EOF || estab_f) {
3085 struct input_code *p = input_code_list;
3086 struct input_code *result = p;
3091 if (p->status_func && p->score < result->score) {
3096 set_iconv(TRUE, result->iconv_func);
3101 ** 1) EOF is detected, or
3102 ** 2) Code is established, or
3103 ** 3) Buffer is FULL (but last word is pushed)
3105 ** in 1) and 3) cases, we continue to use
3106 ** Kanji codes by oconv and leave estab_f unchanged.
3111 while (hold_index < hold_count){
3112 c1 = hold_buf[hold_index++];
3116 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3117 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3120 if (hold_index < hold_count){
3121 c2 = hold_buf[hold_index++];
3131 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3134 if (hold_index < hold_count){
3135 c3 = hold_buf[hold_index++];
3136 } else if ((c3 = (*i_getc)(f)) == EOF) {
3141 if (hold_index < hold_count){
3142 c4 = hold_buf[hold_index++];
3143 } else if ((c4 = (*i_getc)(f)) == EOF) {
3148 (*iconv)(c1, c2, (c3<<8)|c4);
3151 /* 3 bytes EUC or UTF-8 */
3152 if (hold_index < hold_count){
3153 c3 = hold_buf[hold_index++];
3154 } else if ((c3 = (*i_getc)(f)) == EOF) {
3160 (*iconv)(c1, c2, c3);
3163 if (c3 == EOF) break;
3169 * Check and Ignore BOM
3175 switch(c2 = (*i_getc)(f)){
3177 if((c2 = (*i_getc)(f)) == 0x00){
3178 if((c2 = (*i_getc)(f)) == 0xFE){
3179 if((c2 = (*i_getc)(f)) == 0xFF){
3180 if(!input_encoding){
3181 set_iconv(TRUE, w_iconv32);
3183 if (iconv == w_iconv32) {
3184 input_endian = ENDIAN_BIG;
3187 (*i_ungetc)(0xFF,f);
3188 }else (*i_ungetc)(c2,f);
3189 (*i_ungetc)(0xFE,f);
3190 }else if(c2 == 0xFF){
3191 if((c2 = (*i_getc)(f)) == 0xFE){
3192 if(!input_encoding){
3193 set_iconv(TRUE, w_iconv32);
3195 if (iconv == w_iconv32) {
3196 input_endian = ENDIAN_2143;
3199 (*i_ungetc)(0xFF,f);
3200 }else (*i_ungetc)(c2,f);
3201 (*i_ungetc)(0xFF,f);
3202 }else (*i_ungetc)(c2,f);
3203 (*i_ungetc)(0x00,f);
3204 }else (*i_ungetc)(c2,f);
3205 (*i_ungetc)(0x00,f);
3208 if((c2 = (*i_getc)(f)) == 0xBB){
3209 if((c2 = (*i_getc)(f)) == 0xBF){
3210 if(!input_encoding){
3211 set_iconv(TRUE, w_iconv);
3213 if (iconv == w_iconv) {
3216 (*i_ungetc)(0xBF,f);
3217 }else (*i_ungetc)(c2,f);
3218 (*i_ungetc)(0xBB,f);
3219 }else (*i_ungetc)(c2,f);
3220 (*i_ungetc)(0xEF,f);
3223 if((c2 = (*i_getc)(f)) == 0xFF){
3224 if((c2 = (*i_getc)(f)) == 0x00){
3225 if((c2 = (*i_getc)(f)) == 0x00){
3226 if(!input_encoding){
3227 set_iconv(TRUE, w_iconv32);
3229 if (iconv == w_iconv32) {
3230 input_endian = ENDIAN_3412;
3233 (*i_ungetc)(0x00,f);
3234 }else (*i_ungetc)(c2,f);
3235 (*i_ungetc)(0x00,f);
3236 }else (*i_ungetc)(c2,f);
3237 if(!input_encoding){
3238 set_iconv(TRUE, w_iconv16);
3240 if (iconv == w_iconv16) {
3241 input_endian = ENDIAN_BIG;
3244 (*i_ungetc)(0xFF,f);
3245 }else (*i_ungetc)(c2,f);
3246 (*i_ungetc)(0xFE,f);
3249 if((c2 = (*i_getc)(f)) == 0xFE){
3250 if((c2 = (*i_getc)(f)) == 0x00){
3251 if((c2 = (*i_getc)(f)) == 0x00){
3252 if(!input_encoding){
3253 set_iconv(TRUE, w_iconv32);
3255 if (iconv == w_iconv32) {
3256 input_endian = ENDIAN_LITTLE;
3259 (*i_ungetc)(0x00,f);
3260 }else (*i_ungetc)(c2,f);
3261 (*i_ungetc)(0x00,f);
3262 }else (*i_ungetc)(c2,f);
3263 if(!input_encoding){
3264 set_iconv(TRUE, w_iconv16);
3266 if (iconv == w_iconv16) {
3267 input_endian = ENDIAN_LITTLE;
3270 (*i_ungetc)(0xFE,f);
3271 }else (*i_ungetc)(c2,f);
3272 (*i_ungetc)(0xFF,f);
3281 broken_getc(FILE *f)
3285 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3286 return nkf_buf_pop(nkf_state->broken_buf);
3289 if (c=='$' && nkf_state->broken_state != ESC
3290 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3292 nkf_state->broken_state = 0;
3293 if (c1=='@'|| c1=='B') {
3294 nkf_buf_push(nkf_state->broken_buf, c1);
3295 nkf_buf_push(nkf_state->broken_buf, c);
3301 } else if (c=='(' && nkf_state->broken_state != ESC
3302 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3304 nkf_state->broken_state = 0;
3305 if (c1=='J'|| c1=='B') {
3306 nkf_buf_push(nkf_state->broken_buf, c1);
3307 nkf_buf_push(nkf_state->broken_buf, c);
3314 nkf_state->broken_state = c;
/*
 * broken_ungetc: push a character back for broken_getc.
 *
 * c: character to push onto nkf_state->broken_buf.
 * f: not used in the visible lines.
 * The push happens only while the buffer holds fewer than 2 characters;
 * what is returned, and whether c is otherwise dropped, is on lines not
 * shown.
 */
3320 broken_ungetc(nkf_char c, FILE *f)
3322 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3323 nkf_buf_push(nkf_state->broken_buf, c);
/*
 * eol_conv: end-of-line filter placed in front of o_eol_conv.
 *
 * c2, c1: character to output (c2 == 0 marks a single-byte character).
 * Two jobs are visible:
 *  - when guess_f is set, track the input's EOL style in input_eol (LF,
 *    CRLF or CR); once two different styles are seen, input_eol becomes
 *    EOF and tracking stops;
 *  - rewrite line ends according to eolmode_f, holding back a CR in
 *    prev_cr until the next character shows whether it was part of CRLF.
 */
3328 eol_conv(nkf_char c2, nkf_char c1)
3330 if (guess_f && input_eol != EOF) {
3331 if (c2 == 0 && c1 == LF) {
3332 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3333 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3334 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3336 else if (!input_eol) input_eol = CR;
3337 else if (input_eol != CR) input_eol = EOF;
/* A pending CR or a bare LF ends a line: emit CR and/or LF as eolmode_f allows. */
3339 if (prev_cr || (c2 == 0 && c1 == LF)) {
3341 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3342 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
/* CR is held back rather than output; LF was handled above; everything else passes through. */
3344 if (c2 == 0 && c1 == CR) prev_cr = CR;
3345 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3349 Return value of fold_conv()
3351 LF add newline and output char
3352 CR add newline and output nothing
3355 1 (or else) normal output
3357 fold state in prev (previous character)
3359 >0x80 Japanese (X0208/X0201)
3364 This fold algorithm does not preserve heading space in a line.
3365 This is the main difference from fmt.
3368 #define char_size(c2,c1) (c2?2:1)
3371 fold_conv(nkf_char c2, nkf_char c1)
3374 nkf_char fold_state;
3376 if (c1== CR && !fold_preserve_f) {
3377 fold_state=0; /* ignore cr */
3378 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3380 fold_state=0; /* ignore cr */
3381 } else if (c1== BS) {
3382 if (f_line>0) f_line--;
3384 } else if (c2==EOF && f_line != 0) { /* close open last line */
3386 } else if ((c1==LF && !fold_preserve_f)
3387 || ((c1==CR||(c1==LF&&f_prev!=CR))
3388 && fold_preserve_f)) {
3390 if (fold_preserve_f) {
3394 } else if ((f_prev == c1 && !fold_preserve_f)
3395 || (f_prev == LF && fold_preserve_f)
3396 ) { /* duplicate newline */
3399 fold_state = LF; /* output two newline */
3405 if (f_prev&0x80) { /* Japanese? */
3407 fold_state = 0; /* ignore given single newline */
3408 } else if (f_prev==SP) {
3412 if (++f_line<=fold_len)
3416 fold_state = CR; /* fold and output nothing */
3420 } else if (c1=='\f') {
3423 fold_state = LF; /* output newline and clear */
3424 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3425 /* X0208 kankaku or ascii space */
3427 fold_state = 0; /* remove duplicate spaces */
3430 if (++f_line<=fold_len)
3431 fold_state = SP; /* output ASCII space only */
3433 f_prev = SP; f_line = 0;
3434 fold_state = CR; /* fold and output nothing */
3438 prev0 = f_prev; /* we still need this one... , but almost done */
3440 if (c2 || c2 == JIS_X_0201_1976_K)
3441 f_prev |= 0x80; /* this is Japanese */
3442 f_line += char_size(c2,c1);
3443 if (f_line<=fold_len) { /* normal case */
3446 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3447 f_line = char_size(c2,c1);
3448 fold_state = LF; /* We can't wait, do fold now */
3449 } else if (c2 == JIS_X_0201_1976_K) {
3450 /* simple kinsoku rules return 1 means no folding */
3451 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3452 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3453 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3454 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3455 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3456 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3457 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3459 fold_state = LF;/* add one new f_line before this character */
3462 fold_state = LF;/* add one new f_line before this character */
3465 /* kinsoku point in ASCII */
3466 if ( c1==')'|| /* { [ ( */
3477 /* just after special */
3478 } else if (!is_alnum(prev0)) {
3479 f_line = char_size(c2,c1);
3481 } else if ((prev0==SP) || /* ignored new f_line */
3482 (prev0==LF)|| /* ignored new f_line */
3483 (prev0&0x80)) { /* X0208 - ASCII */
3484 f_line = char_size(c2,c1);
3485 fold_state = LF;/* add one new f_line before this character */
3487 fold_state = 1; /* default no fold in ASCII */
/* The notes below name the JIS X 0208 character (row 0x21) that each test stands for. */
3491 if (c1=='"') fold_state = 1; /* 0x2122: ideographic comma */
3492 else if (c1=='#') fold_state = 1; /* 0x2123: ideographic full stop */
3493 else if (c1=='W') fold_state = 1; /* 0x2157: right corner bracket */
3494 else if (c1=='K') fold_state = 1; /* 0x214B: fullwidth right parenthesis */
3495 else if (c1=='$') fold_state = 1; /* 0x2124: fullwidth comma */
3496 else if (c1=='%') fold_state = 1; /* 0x2125: fullwidth full stop */
3497 else if (c1=='\'') fold_state = 1; /* 0x2127: fullwidth colon */
3498 else if (c1=='(') fold_state = 1; /* 0x2128: fullwidth semicolon */
3499 else if (c1==')') fold_state = 1; /* 0x2129: fullwidth question mark */
3500 else if (c1=='*') fold_state = 1; /* 0x212A: fullwidth exclamation mark */
3501 else if (c1=='+') fold_state = 1; /* 0x212B: voiced sound mark (dakuten) */
3502 else if (c1==',') fold_state = 1; /* 0x212C: semi-voiced sound mark (handakuten) */
3503 /* default no fold in kinsoku */
3506 f_line = char_size(c2,c1);
3507 /* add one new f_line before this character */
3510 f_line = char_size(c2,c1);
3512 /* add one new f_line before this character */
3517 /* terminator process */
3518 switch(fold_state) {
3520 OCONV_NEWLINE((*o_fconv));
3526 OCONV_NEWLINE((*o_fconv));
3537 static nkf_char z_prev2=0,z_prev1=0;
3540 z_conv(nkf_char c2, nkf_char c1)
3543 /* if (c2) c1 &= 0x7f; assertion */
3545 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
/* z_prev2/z_prev1 presumably hold a halfwidth katakana kept back from the
   previous call (their assignment is outside this excerpt -- TODO confirm).
   dv[], ev[] and cv[] are pair tables indexed by (code - SP) * 2; from their
   use here dv looks like the voiced form, ev the semi-voiced form and cv the
   plain form -- verify against the table definitions. */
3551 if (z_prev2 == JIS_X_0201_1976_K) {
3552 if (c2 == JIS_X_0201_1976_K) {
3553 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3555 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3557 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3559 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3564 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3566 if (c2 == JIS_X_0201_1976_K) {
3567 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3568 /* wait for dakuten or handakuten */
3573 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3584 if (alpha_f&1 && c2 == 0x23) {
3585 /* JISX0208 Alphabet */
3587 } else if (c2 == 0x21) {
3588 /* JISX0208 Kigou */
3593 } else if (alpha_f&4) {
3598 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3604 if (alpha_f&8 && c2 == 0) {
3606 const char *entity = 0;
/* With alpha_f&8 set, markup-significant ASCII characters are emitted as
   HTML character entity references instead of the raw character. */
3608 case '>': entity = "&gt;"; break;
3609 case '<': entity = "&lt;"; break;
3610 case '\"': entity = "&quot;"; break;
3611 case '&': entity = "&amp;"; break;
3614 while (*entity) (*o_zconv)(0, *entity++);
3620 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3625 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3629 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3633 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3637 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3641 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3645 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3649 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3653 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3658 (*o_zconv)(JIS_X_0201_1976_K, c);
3661 } else if (c2 == 0x25) {
3662 /* JISX0208 Katakana */
3663 static const int fullwidth_to_halfwidth[] =
3665 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3666 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3667 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3668 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3669 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3670 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3671 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3672 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3673 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3674 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3675 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3676 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3678 if (fullwidth_to_halfwidth[c1-0x20]){
3679 c2 = fullwidth_to_halfwidth[c1-0x20];
3680 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3682 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3692 #define rot13(c) ( \
3694 (c <= 'M') ? (c + 13): \
3695 (c <= 'Z') ? (c - 13): \
3697 (c <= 'm') ? (c + 13): \
3698 (c <= 'z') ? (c - 13): \
3702 #define rot47(c) ( \
3704 ( c <= 'O') ? (c + 47) : \
3705 ( c <= '~') ? (c - 47) : \
3710 rot_conv(nkf_char c2, nkf_char c1)
3712 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3718 (*o_rot_conv)(c2,c1);
3722 hira_conv(nkf_char c2, nkf_char c1)
3726 if (0x20 < c1 && c1 < 0x74) {
3728 (*o_hira_conv)(c2,c1);
3730 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3732 c1 = nkf_char_unicode_new(0x3094);
3733 (*o_hira_conv)(c2,c1);
3736 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3738 (*o_hira_conv)(c2,c1);
3743 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3746 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3748 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3752 (*o_hira_conv)(c2,c1);
3757 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3759 #define RANGE_NUM_MAX 18
3760 static const nkf_char range[RANGE_NUM_MAX][2] = {
3781 nkf_char start, end, c;
3783 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3787 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3792 for (i = 0; i < RANGE_NUM_MAX; i++) {
3793 start = range[i][0];
3796 if (c >= start && c <= end) {
3801 (*o_iso2022jp_check_conv)(c2,c1);
3805 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/* Encoded-word prefixes recognized in the input, written upper-case with
   "\075" standing for '='.  mime_priority_func[], mime_encode[] and
   mime_encode_method[] are indexed in parallel with this table. */
3807 static const unsigned char *mime_pattern[] = {
3808 (const unsigned char *)"\075?EUC-JP?B?",
3809 (const unsigned char *)"\075?SHIFT_JIS?B?",
3810 (const unsigned char *)"\075?ISO-8859-1?Q?",
3811 (const unsigned char *)"\075?ISO-8859-1?B?",
3812 (const unsigned char *)"\075?ISO-2022-JP?B?",
3813 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3814 #if defined(UTF8_INPUT_ENABLE)
3815 (const unsigned char *)"\075?UTF-8?B?",
3816 (const unsigned char *)"\075?UTF-8?Q?",
3818 (const unsigned char *)"\075?US-ASCII?Q?",
3823 /* Marker used to raise the priority of the matching code */
3824 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3825 e_iconv, s_iconv, 0, 0, 0, 0,
3826 #if defined(UTF8_INPUT_ENABLE)
3832 static const nkf_char mime_encode[] = {
3833 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3834 #if defined(UTF8_INPUT_ENABLE)
3841 static const nkf_char mime_encode_method[] = {
3842 'B', 'B','Q', 'B', 'B', 'Q',
3843 #if defined(UTF8_INPUT_ENABLE)
3851 /* MIME preprocessor fifo */
3853 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3854 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3855 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3857 unsigned char buf[MIME_BUF_SIZE];
3859 unsigned int last; /* decoded */
3860 unsigned int input; /* undecoded */
3862 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3864 #define MAXRECOVER 20
3867 mime_input_buf_unshift(nkf_char c)
3869 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3873 mime_ungetc(nkf_char c, FILE *f)
3875 mime_input_buf_unshift(c);
3880 mime_ungetc_buf(nkf_char c, FILE *f)
3883 (*i_mungetc_buf)(c,f);
3885 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3890 mime_getc_buf(FILE *f)
3892 /* we don't keep eof of mime_input_buf, because it contains ?= as
3893 a terminator. It was checked in mime_integrity. */
3894 return ((mimebuf_f)?
3895 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3899 switch_mime_getc(void)
3901 if (i_getc!=mime_getc) {
3902 i_mgetc = i_getc; i_getc = mime_getc;
3903 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3904 if(mime_f==STRICT_MIME) {
3905 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3906 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3912 unswitch_mime_getc(void)
3914 if(mime_f==STRICT_MIME) {
3915 i_mgetc = i_mgetc_buf;
3916 i_mungetc = i_mungetc_buf;
3919 i_ungetc = i_mungetc;
3920 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3921 mime_iconv_back = NULL;
3925 mime_integrity(FILE *f, const unsigned char *p)
3929 /* In buffered mode, read until =? or NL or buffer full
3931 mime_input_state.input = mime_input_state.top;
3932 mime_input_state.last = mime_input_state.top;
3934 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3936 q = mime_input_state.input;
3937 while((c=(*i_getc)(f))!=EOF) {
3938 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3939 break; /* buffer full */
3941 if (c=='=' && d=='?') {
3942 /* checked. skip header, start decode */
3943 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3944 /* mime_last_input = mime_input_state.input; */
3945 mime_input_state.input = q;
3949 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3951 /* Should we check length mod 4? */
3952 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3955 /* In case of Incomplete MIME, no MIME decode */
3956 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3957 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3958 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3959 switch_mime_getc(); /* anyway we need buffered getc */
3964 mime_begin_strict(FILE *f)
3968 const unsigned char *p,*q;
3969 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3971 mime_decode_mode = FALSE;
3972 /* =? has been checked */
3974 p = mime_pattern[j];
3977 for(i=2;p[i]>SP;i++) { /* start at =? */
3978 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3979 /* pattern fails, try next one */
3981 while (mime_pattern[++j]) {
3982 p = mime_pattern[j];
3983 for(k=2;k<i;k++) /* assume length(p) > i */
3984 if (p[k]!=q[k]) break;
3985 if (k==i && nkf_toupper(c1)==p[k]) break;
3987 p = mime_pattern[j];
3988 if (p) continue; /* found next one, continue */
3989 /* all fails, output from recovery buffer */
3997 mime_decode_mode = p[i-2];
3999 mime_iconv_back = iconv;
4000 set_iconv(FALSE, mime_priority_func[j]);
4001 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4003 if (mime_decode_mode=='B') {
4004 mimebuf_f = unbuf_f;
4006 /* do MIME integrity check */
4007 return mime_integrity(f,mime_pattern[j]);
4021 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4022 /* re-read and convert again from mime_buffer. */
4024 /* =? has been checked */
4025 k = mime_input_state.last;
4026 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4027 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4028 /* We accept any character type even if it is broken up by new lines */
4029 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4030 if (c1==LF||c1==SP||c1==CR||
4031 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4033 /* Failed. But this could be another MIME preamble */
4035 mime_input_state.last--;
4041 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4042 if (!(++i<MAXRECOVER) || c1==EOF) break;
4043 if (c1=='b'||c1=='B') {
4044 mime_decode_mode = 'B';
4045 } else if (c1=='q'||c1=='Q') {
4046 mime_decode_mode = 'Q';
4050 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4051 if (!(++i<MAXRECOVER) || c1==EOF) break;
4053 mime_decode_mode = FALSE;
4059 if (!mime_decode_mode) {
4060 /* false MIME preamble, restart from mime_buffer */
4061 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4062 /* Since we are in MIME mode until buffer becomes empty, */
4063 /* we never go into mime_begin again for a while. */
4066 /* discard MIME preamble, and go to MIME mode */
4067 mime_input_state.last = k;
4068 /* do no MIME integrity check */
4069 return c1; /* used only for checking EOF */
4080 debug(const char *str)
4083 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Remember the name of the input encoding that has been recognized.
 * The first name passed in is stored in input_codename.  If a later call
 * passes a different name, input_codename is reset to "" (which
 * get_guessed_code reports as "BINARY").
 */
4089 set_input_codename(const char *codename)
4091 if (!input_codename) {
4092 input_codename = codename;
4093 } else if (strcmp(codename, input_codename) != 0) {
4094 input_codename = "";
/*
 * Return the name of the guessed input encoding.
 * An empty input_codename becomes "BINARY" and an unset one becomes "ASCII".
 * Otherwise the score bits of the input_code entry found for the current
 * iconv function refine a generic name to a more specific one:
 *   "Shift_JIS"   -> "CP932"
 *   "EUC-JP"      -> "EUCJP-MS" or "CP51932"
 *   "ISO-2022-JP" -> "CP50221" or "CP50220"
 * The chosen name is stored back into input_codename before it is returned.
 */
4099 get_guessed_code(void)
4101 if (input_codename && !*input_codename) {
4102 input_codename = "BINARY";
4104 struct input_code *p = find_inputcode_byfunc(iconv);
4105 if (!input_codename) {
4106 input_codename = "ASCII";
4107 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4108 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4109 input_codename = "CP932";
4110 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* SCORE_X0212 is tested first, so it wins over the DEPEND/CP932 bits */
4111 if (p->score & (SCORE_X0212))
4112 input_codename = "EUCJP-MS";
4113 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4114 input_codename = "CP51932";
4115 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
/* SCORE_KANA is tested first, so it wins over the DEPEND/CP932 bits */
4116 if (p->score & (SCORE_KANA))
4117 input_codename = "CP50221";
4118 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4119 input_codename = "CP50220";
4122 return input_codename;
4125 #if !defined(PERL_XS) && !defined(WIN32DLL)
4127 print_guessed_code(char *filename)
4129 if (filename != NULL) printf("%s: ", filename);
4130 if (input_codename && !*input_codename) {
4133 input_codename = get_guessed_code();
4135 printf("%s\n", input_codename);
4139 input_eol == CR ? " (CR)" :
4140 input_eol == LF ? " (LF)" :
4141 input_eol == CRLF ? " (CRLF)" :
4142 input_eol == EOF ? " (MIXED NL)" :
4152 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4154 nkf_char c1, c2, c3;
4160 if (!nkf_isxdigit(c2)){
4165 if (!nkf_isxdigit(c3)){
4170 return (hex2bin(c2) << 4) | hex2bin(c3);
4176 return hex_getc(':', f, i_cgetc, i_cungetc);
4180 cap_ungetc(nkf_char c, FILE *f)
4182 return (*i_cungetc)(c, f);
4188 return hex_getc('%', f, i_ugetc, i_uungetc);
4192 url_ungetc(nkf_char c, FILE *f)
4194 return (*i_uungetc)(c, f);
4198 #ifdef NUMCHAR_OPTION
4200 numchar_getc(FILE *f)
4202 nkf_char (*g)(FILE *) = i_ngetc;
4203 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4214 if (buf[i] == 'x' || buf[i] == 'X'){
4215 for (j = 0; j < 7; j++){
4217 if (!nkf_isxdigit(buf[i])){
4224 c |= hex2bin(buf[i]);
4227 for (j = 0; j < 8; j++){
4231 if (!nkf_isdigit(buf[i])){
4238 c += hex2bin(buf[i]);
4244 return nkf_char_unicode_new(c);
4254 numchar_ungetc(nkf_char c, FILE *f)
4256 return (*i_nungetc)(c, f);
4260 #ifdef UNICODE_NORMALIZATION
4265 nkf_char (*g)(FILE *f) = i_nfc_getc;
4266 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4267 nkf_buf_t *buf = nkf_state->nfc_buf;
4268 const unsigned char *array;
4269 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4270 nkf_char c = (*g)(f);
4272 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4274 nkf_buf_push(buf, c);
4276 while (lower <= upper) {
4277 int mid = (lower+upper) / 2;
4279 array = normalization_table[mid].nfd;
4280 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4281 if (len >= nkf_buf_length(buf)) {
4285 lower = 1, upper = 0;
4288 nkf_buf_push(buf, c);
4290 if (array[len] != nkf_buf_at(buf, len)) {
4291 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4292 else upper = mid - 1;
4299 array = normalization_table[mid].nfc;
4301 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4302 nkf_buf_push(buf, array[i]);
4306 } while (lower <= upper);
4308 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4309 c = nkf_buf_pop(buf);
4315 nfc_ungetc(nkf_char c, FILE *f)
4317 return (*i_nfc_ungetc)(c, f);
4319 #endif /* UNICODE_NORMALIZATION */
/*
 * Compute the Base64 value of one character: 'A'..'Z' give 0..25,
 * 'a'..'z' give 26..51 and '0'..'9' give 52..61.  Both '+' and '-' give 62,
 * and '_' gives 63 just like '/'.  The constants are written as character
 * literals ('>' is 62, '?' is 63, '4' is 52, 'G' is 'a' - 26).
 */
4323 base64decode(nkf_char c)
4328 i = c - 'A'; /* A..Z 0-25 */
4329 } else if (c == '_') {
4330 i = '?' /* 63 */ ; /* _ 63 */
4332 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4334 } else if (c > '/') {
4335 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4336 } else if (c == '+' || c == '-') {
4337 i = '>' /* 62 */ ; /* + and - 62 */
4339 i = '?' /* 63 */ ; /* / 63 */
4347 nkf_char c1, c2, c3, c4, cc;
4348 nkf_char t1, t2, t3, t4, mode, exit_mode;
4349 nkf_char lwsp_count;
4352 nkf_char lwsp_size = 128;
4354 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4355 return mime_input_buf(mime_input_state.top++);
4357 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4358 mime_decode_mode=FALSE;
4359 unswitch_mime_getc();
4360 return (*i_getc)(f);
4363 if (mimebuf_f == FIXED_MIME)
4364 exit_mode = mime_decode_mode;
4367 if (mime_decode_mode == 'Q') {
4368 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4370 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4371 if (c1<=SP || DEL<=c1) {
4372 mime_decode_mode = exit_mode; /* prepare for quit */
4375 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4379 mime_decode_mode = exit_mode; /* prepare for quit */
4380 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4381 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4382 /* end Q encoding */
4383 input_mode = exit_mode;
4385 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4386 while ((c1=(*i_getc)(f))!=EOF) {
4391 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4399 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4400 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4415 lwsp_buf[lwsp_count] = (unsigned char)c1;
4416 if (lwsp_count++>lwsp_size){
4418 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4419 lwsp_buf = lwsp_buf_new;
4425 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4427 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4428 i_ungetc(lwsp_buf[lwsp_count],f);
4431 nkf_xfree(lwsp_buf);
4434 if (c1=='='&&c2<SP) { /* this is soft wrap */
4435 while((c1 = (*i_mgetc)(f)) <=SP) {
4436 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4438 mime_decode_mode = 'Q'; /* still in MIME */
4439 goto restart_mime_q;
4442 mime_decode_mode = 'Q'; /* still in MIME */
4446 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4447 if (c2<=SP) return c2;
4448 mime_decode_mode = 'Q'; /* still in MIME */
4449 return ((hex2bin(c2)<<4) + hex2bin(c3));
4452 if (mime_decode_mode != 'B') {
4453 mime_decode_mode = FALSE;
4454 return (*i_mgetc)(f);
4458 /* Base64 encoding */
4460 MIME allows line breaks in the middle of
4461 Base64, but we are very pessimistic when decoding
4462 in unbuf mode because MIME-encoded text may be broken by
4463 less's or an editor's control sequences (such as ESC-[-K in unbuffered
4464 mode). Ignore incomplete MIME.
4466 mode = mime_decode_mode;
4467 mime_decode_mode = exit_mode; /* prepare for quit */
4469 while ((c1 = (*i_mgetc)(f))<=SP) {
4474 if ((c2 = (*i_mgetc)(f))<=SP) {
4477 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4478 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4481 if ((c1 == '?') && (c2 == '=')) {
4484 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4485 while ((c1=(*i_getc)(f))!=EOF) {
4490 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4498 if ((c1=(*i_getc)(f))!=EOF) {
4502 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4517 lwsp_buf[lwsp_count] = (unsigned char)c1;
4518 if (lwsp_count++>lwsp_size){
4520 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4521 lwsp_buf = lwsp_buf_new;
4527 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4529 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4530 i_ungetc(lwsp_buf[lwsp_count],f);
4533 nkf_xfree(lwsp_buf);
4537 if ((c3 = (*i_mgetc)(f))<=SP) {
4540 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4541 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4545 if ((c4 = (*i_mgetc)(f))<=SP) {
4548 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4549 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4553 mime_decode_mode = mode; /* still in MIME sigh... */
4555 /* BASE 64 decoding */
4557 t1 = 0x3f & base64decode(c1);
4558 t2 = 0x3f & base64decode(c2);
4559 t3 = 0x3f & base64decode(c3);
4560 t4 = 0x3f & base64decode(c4);
4561 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4563 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4564 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4566 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4567 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4569 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4574 return mime_input_buf(mime_input_state.top++);
4577 static const char basis_64[] =
4578 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4580 #define MIMEOUT_BUF_LENGTH 74
4582 char buf[MIMEOUT_BUF_LENGTH+1];
4586 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4589 open_mime(nkf_char mode)
4591 const unsigned char *p;
4594 p = mime_pattern[0];
4595 for(i=0;mime_pattern[i];i++) {
4596 if (mode == mime_encode[i]) {
4597 p = mime_pattern[i];
4601 mimeout_mode = mime_encode_method[i];
4603 if (base64_count>45) {
4604 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4605 (*o_mputc)(mimeout_state.buf[i]);
4608 PUT_NEWLINE((*o_mputc));
4611 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4615 for (;i<mimeout_state.count;i++) {
4616 if (nkf_isspace(mimeout_state.buf[i])) {
4617 (*o_mputc)(mimeout_state.buf[i]);
4627 j = mimeout_state.count;
4628 mimeout_state.count = 0;
4630 mime_putc(mimeout_state.buf[i]);
4635 mime_prechar(nkf_char c2, nkf_char c1)
4637 if (mimeout_mode > 0){
4639 if (base64_count + mimeout_state.count/3*4> 73){
4640 (*o_base64conv)(EOF,0);
4641 OCONV_NEWLINE((*o_base64conv));
4642 (*o_base64conv)(0,SP);
4646 if (base64_count + mimeout_state.count/3*4> 66) {
4647 (*o_base64conv)(EOF,0);
4648 OCONV_NEWLINE((*o_base64conv));
4649 (*o_base64conv)(0,SP);
4655 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4656 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4657 open_mime(output_mode);
4658 (*o_base64conv)(EOF,0);
4659 OCONV_NEWLINE((*o_base64conv));
4660 (*o_base64conv)(0,SP);
4679 switch(mimeout_mode) {
4684 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4690 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4695 if (mimeout_mode > 0) {
4696 if (mimeout_f!=FIXED_MIME) {
4698 } else if (mimeout_mode != 'Q')
4704 mimeout_addchar(nkf_char c)
4706 switch(mimeout_mode) {
4711 } else if(!nkf_isalnum(c)) {
4713 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4714 (*o_mputc)(bin2hex((c&0xf)));
4722 nkf_state->mimeout_state=c;
4723 (*o_mputc)(basis_64[c>>2]);
4728 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4729 nkf_state->mimeout_state=c;
4734 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4735 (*o_mputc)(basis_64[c & 0x3F]);
4747 mime_putc(nkf_char c)
4752 if (mimeout_f == FIXED_MIME){
4753 if (mimeout_mode == 'Q'){
4754 if (base64_count > 71){
4755 if (c!=CR && c!=LF) {
4757 PUT_NEWLINE((*o_mputc));
4762 if (base64_count > 71){
4764 PUT_NEWLINE((*o_mputc));
4767 if (c == EOF) { /* c==EOF */
4771 if (c != EOF) { /* c!=EOF */
4777 /* mimeout_f != FIXED_MIME */
4779 if (c == EOF) { /* c==EOF */
4780 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4781 j = mimeout_state.count;
4782 mimeout_state.count = 0;
4784 if (mimeout_mode > 0) {
4785 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4787 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4790 mimeout_addchar(mimeout_state.buf[i]);
4794 mimeout_addchar(mimeout_state.buf[i]);
4798 mimeout_addchar(mimeout_state.buf[i]);
4804 mimeout_addchar(mimeout_state.buf[i]);
4810 if (mimeout_state.count > 0){
4811 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4816 if (mimeout_mode=='Q') {
4817 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4818 if (c == CR || c == LF) {
4823 } else if (c <= SP) {
4825 if (base64_count > 70) {
4826 PUT_NEWLINE((*o_mputc));
4829 if (!nkf_isblank(c)) {
4834 if (base64_count > 70) {
4836 PUT_NEWLINE((*o_mputc));
4839 open_mime(output_mode);
4841 if (!nkf_noescape_mime(c)) {
4852 if (mimeout_mode <= 0) {
4853 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4854 if (nkf_isspace(c)) {
4856 if (mimeout_mode == -1) {
4859 if (c==CR || c==LF) {
4861 open_mime(output_mode);
4867 for (i=0;i<mimeout_state.count;i++) {
4868 (*o_mputc)(mimeout_state.buf[i]);
4869 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4880 mimeout_state.buf[0] = (char)c;
4881 mimeout_state.count = 1;
4883 if (base64_count > 1
4884 && base64_count + mimeout_state.count > 76
4885 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4886 static const char *str = "boundary=\"";
4887 static int len = 10;
4890 for (; i < mimeout_state.count - len; ++i) {
4891 if (!strncmp(mimeout_state.buf+i, str, len)) {
4897 if (i == 0 || i == mimeout_state.count - len) {
4898 PUT_NEWLINE((*o_mputc));
4900 if (!nkf_isspace(mimeout_state.buf[0])){
4907 for (j = 0; j <= i; ++j) {
4908 (*o_mputc)(mimeout_state.buf[j]);
4910 PUT_NEWLINE((*o_mputc));
4912 for (; j <= mimeout_state.count; ++j) {
4913 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4915 mimeout_state.count -= i;
4918 mimeout_state.buf[mimeout_state.count++] = (char)c;
4919 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4920 open_mime(output_mode);
4925 if (lastchar==CR || lastchar == LF){
4926 for (i=0;i<mimeout_state.count;i++) {
4927 (*o_mputc)(mimeout_state.buf[i]);
4930 mimeout_state.count = 0;
4933 for (i=0;i<mimeout_state.count-1;i++) {
4934 (*o_mputc)(mimeout_state.buf[i]);
4937 mimeout_state.buf[0] = SP;
4938 mimeout_state.count = 1;
4940 open_mime(output_mode);
4943 /* mimeout_mode == 'B', 1, 2 */
4944 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4945 if (lastchar == CR || lastchar == LF){
4946 if (nkf_isblank(c)) {
4947 for (i=0;i<mimeout_state.count;i++) {
4948 mimeout_addchar(mimeout_state.buf[i]);
4950 mimeout_state.count = 0;
4951 } else if (SP<c && c<DEL) {
4953 for (i=0;i<mimeout_state.count;i++) {
4954 (*o_mputc)(mimeout_state.buf[i]);
4957 mimeout_state.count = 0;
4959 mimeout_state.buf[mimeout_state.count++] = (char)c;
4962 if (nkf_isspace(c)) {
4963 for (i=0;i<mimeout_state.count;i++) {
4964 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4966 for (i=0;i<mimeout_state.count;i++) {
4967 (*o_mputc)(mimeout_state.buf[i]);
4970 mimeout_state.count = 0;
4973 mimeout_state.buf[mimeout_state.count++] = (char)c;
4974 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4976 for (i=0;i<mimeout_state.count;i++) {
4977 (*o_mputc)(mimeout_state.buf[i]);
4980 mimeout_state.count = 0;
4984 if (mimeout_state.count>0 && SP<c && c!='=') {
4985 mimeout_state.buf[mimeout_state.count++] = (char)c;
4986 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4987 j = mimeout_state.count;
4988 mimeout_state.count = 0;
4990 mimeout_addchar(mimeout_state.buf[i]);
4997 if (mimeout_state.count>0) {
4998 j = mimeout_state.count;
4999 mimeout_state.count = 0;
5001 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5003 mimeout_addchar(mimeout_state.buf[i]);
5009 (*o_mputc)(mimeout_state.buf[i]);
5011 open_mime(output_mode);
5018 base64_conv(nkf_char c2, nkf_char c1)
5020 mime_prechar(c2, c1);
5021 (*o_base64conv)(c2,c1);
5025 typedef struct nkf_iconv_t {
5028 size_t input_buffer_size;
5029 char *output_buffer;
5030 size_t output_buffer_size;
/*
 * Set up a converter from `fromcode` to `tocode`: an input buffer of
 * IOBUF_SIZE bytes, an output buffer twice that size, and an iconv
 * descriptor obtained from iconv_open().
 *
 * The "doesn't support" diagnostic is written with fprintf(stderr, ...):
 * fprintf() needs a stream as its first argument and returns an int, so its
 * result cannot be passed to perror().
 *
 * NOTE(review): `converter` is declared as an nkf_iconv_t object but is
 * accessed with `->`; the typedef is outside this excerpt, so the
 * declaration is left as is -- confirm and reconcile before enabling this code.
 */
5034 nkf_iconv_new(char *tocode, char *fromcode)
5036 nkf_iconv_t converter;
5038 converter->input_buffer_size = IOBUF_SIZE;
5039 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5040 converter->output_buffer_size = IOBUF_SIZE * 2;
5041 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5042 converter->cd = iconv_open(tocode, fromcode);
5043 if (converter->cd == (iconv_t)-1)
5047 fprintf(stderr, "iconv doesn't support %s to %s conversion.\n", fromcode, tocode);
5050 perror("can't iconv_open");
/*
 * Read bytes through i_getc into the converter's input buffer, run them
 * through iconv() and pass the converted bytes to o_putc.
 *
 * Corrections in this revision:
 *  - the getter is called with the `input` parameter (no `f` is declared in
 *    the visible part of this function);
 *  - filling stops once the input buffer is full; the previous `<` test left
 *    the loop after a single byte and kept storing once the buffer was full;
 *  - the output buffer is grown through converter->output_buffer, the member
 *    the struct actually has (there is no `outbuf`).
 *
 * NOTE(review): iconv() advances output_buffer and decrements output_length,
 * so the o_putc loop below looks like it reads the wrong bytes -- confirm
 * against the lines outside this excerpt before enabling this code.
 */
5056 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5058 size_t invalid = (size_t)0;
5059 char *input_buffer = converter->input_buffer;
5060 size_t input_length = (size_t)0;
5061 char *output_buffer = converter->output_buffer;
5062 size_t output_length = converter->output_buffer_size;
5067 while ((c = (*i_getc)(input)) != EOF) {
5068 input_buffer[input_length++] = c;
5069 if (input_length >= converter->input_buffer_size) break;
5073 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5074 while (output_length-- > 0) {
5075 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5077 if (ret == (size_t) - 1) {
5080 if (input_buffer != converter->input_buffer)
5081 memmove(converter->input_buffer, input_buffer, input_length);
5084 converter->output_buffer_size *= 2;
5085 output_buffer = realloc(converter->output_buffer, converter->output_buffer_size);
5086 if (output_buffer == NULL) {
5087 perror("can't realloc");
5090 converter->output_buffer = output_buffer;
5093 perror("can't iconv");
/*
 * Release the converter's input and output buffers and close its iconv
 * descriptor.  The parameter is named `converter` to match the body, and
 * the buffers are freed through the members the struct actually has
 * (input_buffer / output_buffer); `inbuf` / `outbuf` do not exist.
 */
5106 nkf_iconv_close(nkf_iconv_t *converter)
5108 nkf_xfree(converter->input_buffer);
5109 nkf_xfree(converter->output_buffer);
5110 iconv_close(converter->cd);
5119 struct input_code *p = input_code_list;
5131 mime_f = MIME_DECODE_DEFAULT;
5132 mime_decode_f = FALSE;
5137 x0201_f = X0201_DEFAULT;
5138 iso2022jp_f = FALSE;
5139 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5140 ms_ucs_map_f = UCS_MAP_ASCII;
5142 #ifdef UTF8_INPUT_ENABLE
5143 no_cp932ext_f = FALSE;
5144 no_best_fit_chars_f = FALSE;
5145 encode_fallback = NULL;
5146 unicode_subchar = '?';
5147 input_endian = ENDIAN_BIG;
5149 #ifdef UTF8_OUTPUT_ENABLE
5150 output_bom_f = FALSE;
5151 output_endian = ENDIAN_BIG;
5153 #ifdef UNICODE_NORMALIZATION
5169 #ifdef SHIFTJIS_CP932
5179 for (i = 0; i < 256; i++){
5180 prefix_table[i] = 0;
5184 mimeout_state.count = 0;
5189 fold_preserve_f = FALSE;
5192 kanji_intro = DEFAULT_J;
5193 ascii_intro = DEFAULT_R;
5194 fold_margin = FOLD_MARGIN;
5195 o_zconv = no_connection;
5196 o_fconv = no_connection;
5197 o_eol_conv = no_connection;
5198 o_rot_conv = no_connection;
5199 o_hira_conv = no_connection;
5200 o_base64conv = no_connection;
5201 o_iso2022jp_check_conv = no_connection;
5204 i_ungetc = std_ungetc;
5206 i_bungetc = std_ungetc;
5209 i_mungetc = std_ungetc;
5210 i_mgetc_buf = std_getc;
5211 i_mungetc_buf = std_ungetc;
5212 output_mode = ASCII;
5214 mime_decode_mode = FALSE;
5220 z_prev2=0,z_prev1=0;
5222 iconv_for_check = 0;
5224 input_codename = NULL;
5225 input_encoding = NULL;
5226 output_encoding = NULL;
5234 module_connection(void)
5236 if (input_encoding) set_input_encoding(input_encoding);
5237 if (!output_encoding) {
5238 output_encoding = nkf_default_encoding();
5240 if (!output_encoding) {
5241 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5244 set_output_encoding(output_encoding);
5245 oconv = nkf_enc_to_oconv(output_encoding);
5248 /* replace continuation module, from output side */
5250 /* output redirection */
5252 if (noout_f || guess_f){
5259 if (mimeout_f == TRUE) {
5260 o_base64conv = oconv; oconv = base64_conv;
5262 /* base64_count = 0; */
5265 if (eolmode_f || guess_f) {
5266 o_eol_conv = oconv; oconv = eol_conv;
5269 o_rot_conv = oconv; oconv = rot_conv;
5272 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5275 o_hira_conv = oconv; oconv = hira_conv;
5278 o_fconv = oconv; oconv = fold_conv;
5281 if (alpha_f || x0201_f) {
5282 o_zconv = oconv; oconv = z_conv;
5286 i_ungetc = std_ungetc;
5287 /* input redirection */
5290 i_cgetc = i_getc; i_getc = cap_getc;
5291 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5294 i_ugetc = i_getc; i_getc = url_getc;
5295 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5298 #ifdef NUMCHAR_OPTION
5300 i_ngetc = i_getc; i_getc = numchar_getc;
5301 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5304 #ifdef UNICODE_NORMALIZATION
5306 i_nfc_getc = i_getc; i_getc = nfc_getc;
5307 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5310 if (mime_f && mimebuf_f==FIXED_MIME) {
5311 i_mgetc = i_getc; i_getc = mime_getc;
5312 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5315 i_bgetc = i_getc; i_getc = broken_getc;
5316 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5318 if (input_encoding) {
5319 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5321 set_iconv(FALSE, e_iconv);
5325 struct input_code *p = input_code_list;
5334 Conversion main loop. Code detection only.
5337 #if !defined(PERL_XS) && !defined(WIN32DLL)
5344 module_connection();
5345 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands for the byte-reading conversion loop.
 * NOTE(review): SKIP and MORE expand to two statements (an assignment
 * followed by continue), so they are only safe inside a braced body.
 */
5352 #define NEXT continue /* no output, get next */
5353 #define SKIP c2=0;continue /* no output, clear c2, get next */
5354 #define MORE c2=c1;continue /* keep c1 as c2, need one more byte */
5355 #define SEND ; /* output c1 and c2, get next */
5356 #define LAST break /* end of loop, go closing */
/*
 * set_input_mode(mode): store mode in input_mode and record "ISO-2022-JP"
 * through set_input_codename() and debug().  Part of the macro body is not
 * visible in this excerpt.
 */
5357 #define set_input_mode(mode) do { \
5358 input_mode = mode; \
5360 set_input_codename("ISO-2022-JP"); \
5361 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop over the stream f.
 *
 * Sets output_mode to ASCII and calls module_connection(); a negative
 * result is reported as "no output encoding given" on stderr (builds
 * without PERL_XS and WIN32DLL).
 *
 * With UTF8_INPUT_ENABLE, input whose converter is w_iconv32 is read four
 * bytes at a time and passed to nkf_iconv_utf_32(); input whose converter
 * is w_iconv16 is read two bytes at a time, and two more bytes are read
 * when nkf_iconv_utf_16() returns -2 (presumably a surrogate pair; confirm
 * against nkf_iconv_utf_16).
 *
 * All other input goes through a byte-at-a-time state machine that handles
 * 8-bit bytes, JIS X 0201 katakana, MIME "=?" starts, SI/SO, ESC sequences
 * that switch input_mode, ESC sequences inside Shift_JIS, and line ends,
 * before dispatching to (*iconv) or (*oconv).
 *
 * Near the end (*iconv)(EOF, 0, 0) is called.  If no input_codename was
 * recorded, the input_code_list entry with the lowest score is chosen and
 * its name is recorded with set_input_codename().
 */
5365 kanji_convert(FILE *f)
5367 nkf_char c1=0, c2=0, c3=0, c4=0;
5368 int shift_mode = 0; /* 0, 1, 2, 3 */
5370 int is_8bit = FALSE;
5372 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5377 output_mode = ASCII;
5379 if (module_connection() < 0) {
5380 #if !defined(PERL_XS) && !defined(WIN32DLL)
5381 fprintf(stderr, "no output encoding given\n");
5387 #ifdef UTF8_INPUT_ENABLE
5388 if(iconv == w_iconv32){
5389 while ((c1 = (*i_getc)(f)) != EOF &&
5390 (c2 = (*i_getc)(f)) != EOF &&
5391 (c3 = (*i_getc)(f)) != EOF &&
5392 (c4 = (*i_getc)(f)) != EOF) {
5393 nkf_iconv_utf_32(c1, c2, c3, c4);
5395 (*i_ungetc)(EOF, f);
5397 else if (iconv == w_iconv16) {
5398 while ((c1 = (*i_getc)(f)) != EOF &&
5399 (c2 = (*i_getc)(f)) != EOF) {
5400 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5401 (c3 = (*i_getc)(f)) != EOF &&
5402 (c4 = (*i_getc)(f)) != EOF) {
5403 nkf_iconv_utf_16(c1, c2, c3, c4);
5406 (*i_ungetc)(EOF, f);
5410 while ((c1 = (*i_getc)(f)) != EOF) {
5411 #ifdef INPUT_CODE_FIX
5412 if (!input_encoding)
5418 /* in case of 8th bit is on */
5419 if (!estab_f&&!mime_decode_mode) {
5420 /* in case of not established yet */
5421 /* It is still ambiguous */
5422 if (h_conv(f, c2, c1)==EOF) {
5430 /* in case of already established */
5432 /* ignore bogus code */
5440 /* 2nd byte of 7 bit code or SJIS */
5444 else if (nkf_char_unicode_p(c1)) {
5450 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5453 }else if (input_codename && input_codename[0] == 'I' &&
5454 0xA1 <= c1 && c1 <= 0xDF) {
5455 /* JIS X 0201 Katakana in 8bit JIS */
5456 c2 = JIS_X_0201_1976_K;
5459 } else if (c1 > DEL) {
5461 if (!estab_f && !iso8859_f) {
5462 /* not established yet */
5464 } else { /* estab_f==TRUE */
5470 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5471 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5473 c2 = JIS_X_0201_1976_K;
5478 /* already established */
5482 } else if (SP < c1 && c1 < DEL) {
5483 /* in case of Roman characters */
5485 /* output 1 shifted byte */
5489 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5490 /* output 1 shifted byte */
5491 c2 = JIS_X_0201_1976_K;
5494 /* looks like bogus code */
5497 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5498 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5499 /* in case of Kanji shifted */
5501 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5502 /* Check MIME code */
5503 if ((c1 = (*i_getc)(f)) == EOF) {
5506 } else if (c1 == '?') {
5507 /* =? is mime conversion start sequence */
5508 if(mime_f == STRICT_MIME) {
5509 /* check in real detail */
5510 if (mime_begin_strict(f) == EOF)
5513 } else if (mime_begin(f) == EOF)
5522 /* normal ASCII code */
5525 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5528 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5531 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5532 if ((c1 = (*i_getc)(f)) == EOF) {
5533 /* (*oconv)(0, ESC); don't send bogus code */
5536 else if (c1 == '&') {
5538 if ((c1 = (*i_getc)(f)) == EOF) {
5544 else if (c1 == '$') {
5546 if ((c1 = (*i_getc)(f)) == EOF) {
5547 /* don't send bogus code
5549 (*oconv)(0, '$'); */
5551 } else if (c1 == '@' || c1 == 'B') {
5553 set_input_mode(JIS_X_0208);
5555 } else if (c1 == '(') {
5557 if ((c1 = (*i_getc)(f)) == EOF) {
5558 /* don't send bogus code
5564 } else if (c1 == '@'|| c1 == 'B') {
5566 set_input_mode(JIS_X_0208);
5569 } else if (c1 == 'D'){
5570 set_input_mode(JIS_X_0212);
5572 #endif /* X0212_ENABLE */
5573 } else if (c1 == 'O' || c1 == 'Q'){
5574 set_input_mode(JIS_X_0213_1);
5576 } else if (c1 == 'P'){
5577 set_input_mode(JIS_X_0213_2);
5580 /* could be some special code */
5587 } else if (broken_f&0x2) {
5588 /* accept any ESC-(-x as broken code ... */
5589 input_mode = JIS_X_0208;
5598 } else if (c1 == '(') {
5600 if ((c1 = (*i_getc)(f)) == EOF) {
5601 /* don't send bogus code
5603 (*oconv)(0, '('); */
5606 else if (c1 == 'I') {
5607 /* JIS X 0201 Katakana */
5608 set_input_mode(JIS_X_0201_1976_K);
5611 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5612 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5613 set_input_mode(ASCII);
5616 else if (broken_f&0x2) {
5617 set_input_mode(ASCII);
5626 else if (c1 == '.') {
5628 if ((c1 = (*i_getc)(f)) == EOF) {
5631 else if (c1 == 'A') {
5642 else if (c1 == 'N') {
5645 if (g2 == ISO_8859_1) {
5660 } else if (c1 == ESC && iconv == s_iconv) {
5661 /* ESC in Shift_JIS */
5662 if ((c1 = (*i_getc)(f)) == EOF) {
5663 /* (*oconv)(0, ESC); don't send bogus code */
5665 } else if (c1 == '$') {
5667 if ((c1 = (*i_getc)(f)) == EOF) {
5669 } else if (('E' <= c1 && c1 <= 'G') ||
5670 ('O' <= c1 && c1 <= 'Q')) {
5678 static const nkf_char jphone_emoji_first_table[7] =
5679 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5680 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5681 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5682 while (SP <= c1 && c1 <= 'z') {
5683 (*oconv)(0, c1 + c3);
5684 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5699 } else if (c1 == LF || c1 == CR) {
5701 input_mode = ASCII; set_iconv(FALSE, 0);
5703 } else if (mime_decode_f && !mime_decode_mode){
5705 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5713 } else { /* if (c1 == CR)*/
5714 if ((c1=(*i_getc)(f))!=EOF) {
5718 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5738 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5741 if ((c3 = (*i_getc)(f)) != EOF) {
5744 if ((c4 = (*i_getc)(f)) != EOF) {
5746 (*iconv)(c2, c1, c3|c4);
5751 /* 3 bytes EUC or UTF-8 */
5752 if ((c3 = (*i_getc)(f)) != EOF) {
5754 (*iconv)(c2, c1, c3);
5762 0x7F <= c2 && c2 <= 0x92 &&
5763 0x21 <= c1 && c1 <= 0x7E) {
5765 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5768 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5772 (*oconv)(PREFIX_EUCG3 | c2, c1);
5774 #endif /* X0212_ENABLE */
5776 (*oconv)(PREFIX_EUCG3 | c2, c1);
5779 (*oconv)(input_mode, c1); /* other special case */
5785 /* goto next_word */
5789 (*iconv)(EOF, 0, 0);
5790 if (!input_codename)
5793 struct input_code *p = input_code_list;
5794 struct input_code *result = p;
5796 if (p->score < result->score) result = p;
5799 set_input_codename(result->name);
5801 debug(result->name);
/*
 * options: parse one option string and update the global option flags.
 *
 * Parsing starts after the first '-' in cp.  A second '-' introduces a long
 * option, which is matched against long_option[].  When the matching entry
 * has a non-empty alias, parsing continues inside that alias string;
 * otherwise the long option is handled by name.  Short options are handled
 * by the case labels, and several of them read a following modifier
 * character or digits.
 *
 * In builds without PERL_XS and WIN32DLL, unknown long options, unsupported
 * long options and unknown short options are reported on stderr.
 *
 * NOTE(review): cp_back appears to hold the position to resume at after an
 * alias has been consumed, and the return value is not visible in this
 * excerpt; confirm both against the full function.
 */
5809 * int options(unsigned char *cp)
5816 options(unsigned char *cp)
5820 unsigned char *cp_back = NULL;
5825 while(*cp && *cp++!='-');
5826 while (*cp || cp_back) {
5834 case '-': /* literal options */
5835 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5839 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5840 p = (unsigned char *)long_option[i].name;
5841 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5842 if (*p == cp[j] || cp[j] == SP){
5849 #if !defined(PERL_XS) && !defined(WIN32DLL)
5850 fprintf(stderr, "unknown long option: --%s\n", cp);
5854 while(*cp && *cp != SP && cp++);
5855 if (long_option[i].alias[0]){
5857 cp = (unsigned char *)long_option[i].alias;
5860 if (strcmp(long_option[i].name, "help") == 0){
5865 if (strcmp(long_option[i].name, "ic=") == 0){
5866 enc = nkf_enc_find((char *)p);
5868 input_encoding = enc;
5871 if (strcmp(long_option[i].name, "oc=") == 0){
5872 enc = nkf_enc_find((char *)p);
5873 /* if (enc <= 0) continue; */
5875 output_encoding = enc;
5878 if (strcmp(long_option[i].name, "guess=") == 0){
5879 if (p[0] == '0' || p[0] == '1') {
5887 if (strcmp(long_option[i].name, "overwrite") == 0){
5890 preserve_time_f = TRUE;
5893 if (strcmp(long_option[i].name, "overwrite=") == 0){
5896 preserve_time_f = TRUE;
5898 backup_suffix = (char *)p;
5901 if (strcmp(long_option[i].name, "in-place") == 0){
5904 preserve_time_f = FALSE;
5907 if (strcmp(long_option[i].name, "in-place=") == 0){
5910 preserve_time_f = FALSE;
5912 backup_suffix = (char *)p;
5917 if (strcmp(long_option[i].name, "cap-input") == 0){
5921 if (strcmp(long_option[i].name, "url-input") == 0){
5926 #ifdef NUMCHAR_OPTION
5927 if (strcmp(long_option[i].name, "numchar-input") == 0){
5933 if (strcmp(long_option[i].name, "no-output") == 0){
5937 if (strcmp(long_option[i].name, "debug") == 0){
5942 if (strcmp(long_option[i].name, "cp932") == 0){
5943 #ifdef SHIFTJIS_CP932
5947 #ifdef UTF8_OUTPUT_ENABLE
5948 ms_ucs_map_f = UCS_MAP_CP932;
5952 if (strcmp(long_option[i].name, "no-cp932") == 0){
5953 #ifdef SHIFTJIS_CP932
5957 #ifdef UTF8_OUTPUT_ENABLE
5958 ms_ucs_map_f = UCS_MAP_ASCII;
5962 #ifdef SHIFTJIS_CP932
5963 if (strcmp(long_option[i].name, "cp932inv") == 0){
5970 if (strcmp(long_option[i].name, "x0212") == 0){
5977 if (strcmp(long_option[i].name, "exec-in") == 0){
5981 if (strcmp(long_option[i].name, "exec-out") == 0){
5986 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5987 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5988 no_cp932ext_f = TRUE;
5991 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5992 no_best_fit_chars_f = TRUE;
5995 if (strcmp(long_option[i].name, "fb-skip") == 0){
5996 encode_fallback = NULL;
5999 if (strcmp(long_option[i].name, "fb-html") == 0){
6000 encode_fallback = encode_fallback_html;
6003 if (strcmp(long_option[i].name, "fb-xml") == 0){
6004 encode_fallback = encode_fallback_xml;
6007 if (strcmp(long_option[i].name, "fb-java") == 0){
6008 encode_fallback = encode_fallback_java;
6011 if (strcmp(long_option[i].name, "fb-perl") == 0){
6012 encode_fallback = encode_fallback_perl;
6015 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6016 encode_fallback = encode_fallback_subchar;
6019 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6020 encode_fallback = encode_fallback_subchar;
6021 unicode_subchar = 0;
6023 /* decimal number */
6024 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6025 unicode_subchar *= 10;
6026 unicode_subchar += hex2bin(p[i]);
6028 }else if(p[1] == 'x' || p[1] == 'X'){
6029 /* hexadecimal number */
6030 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6031 unicode_subchar <<= 4;
6032 unicode_subchar |= hex2bin(p[i]);
6036 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6037 unicode_subchar *= 8;
6038 unicode_subchar += hex2bin(p[i]);
6041 w16e_conv(unicode_subchar, &i, &j);
6042 unicode_subchar = i<<8 | j;
6046 #ifdef UTF8_OUTPUT_ENABLE
6047 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6048 ms_ucs_map_f = UCS_MAP_MS;
6052 #ifdef UNICODE_NORMALIZATION
6053 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6058 if (strcmp(long_option[i].name, "prefix=") == 0){
6059 if (nkf_isgraph(p[0])){
6060 for (i = 1; nkf_isgraph(p[i]); i++){
6061 prefix_table[p[i]] = p[0];
6066 #if !defined(PERL_XS) && !defined(WIN32DLL)
6067 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6072 case 'b': /* buffered mode */
6075 case 'u': /* non-buffered mode */
6078 case 't': /* transparent mode */
6083 } else if (*cp=='2') {
6087 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6095 case 'j': /* JIS output */
6097 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6099 case 'e': /* AT&T EUC output */
6100 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6102 case 's': /* SJIS output */
6103 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6105 case 'l': /* ISO8859 Latin-1 support, no conversion */
6106 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6107 input_encoding = nkf_enc_from_index(ISO_8859_1);
6109 case 'i': /* Kanji IN ESC-$-@/B */
6110 if (*cp=='@'||*cp=='B')
6111 kanji_intro = *cp++;
6113 case 'o': /* ASCII IN ESC-(-J/B/H */
6114 /* ESC ( H was used in initial JUNET messages */
6115 if (*cp=='J'||*cp=='B'||*cp=='H')
6116 ascii_intro = *cp++;
6120 bit:1 katakana->hiragana
6121 bit:2 hiragana->katakana
6123 if ('9'>= *cp && *cp>='0')
6124 hira_f |= (*cp++ -'0');
6131 #if defined(MSDOS) || defined(__OS2__)
6138 show_configuration();
6146 #ifdef UTF8_OUTPUT_ENABLE
6147 case 'w': /* UTF-8 output */
6152 output_encoding = nkf_enc_from_index(UTF_8N);
6154 output_bom_f = TRUE;
6155 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6159 if ('1'== cp[0] && '6'==cp[1]) {
6162 } else if ('3'== cp[0] && '2'==cp[1]) {
6166 output_encoding = nkf_enc_from_index(UTF_8);
6171 output_endian = ENDIAN_LITTLE;
6172 } else if (cp[0] == 'B') {
6177 enc_idx = enc_idx == UTF_16
6178 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6179 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6181 output_bom_f = TRUE;
6182 enc_idx = enc_idx == UTF_16
6183 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6184 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6186 output_encoding = nkf_enc_from_index(enc_idx);
6190 #ifdef UTF8_INPUT_ENABLE
6191 case 'W': /* UTF input */
6194 input_encoding = nkf_enc_from_index(UTF_8);
6197 if ('1'== cp[0] && '6'==cp[1]) {
6199 input_endian = ENDIAN_BIG;
6201 } else if ('3'== cp[0] && '2'==cp[1]) {
6203 input_endian = ENDIAN_BIG;
6206 input_encoding = nkf_enc_from_index(UTF_8);
6211 input_endian = ENDIAN_LITTLE;
6212 } else if (cp[0] == 'B') {
6214 input_endian = ENDIAN_BIG;
6216 enc_idx = (enc_idx == UTF_16
6217 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6218 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6219 input_encoding = nkf_enc_from_index(enc_idx);
6223 /* Input code assumption */
6224 case 'J': /* ISO-2022-JP input */
6225 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6227 case 'E': /* EUC-JP input */
6228 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6230 case 'S': /* Shift_JIS input */
6231 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6233 case 'Z': /* Convert X0208 alphabet to ASCII */
6235 bit:0 Convert JIS X 0208 Alphabet to ASCII
6236 bit:1 Convert Kankaku to one space
6237 bit:2 Convert Kankaku to two spaces
6238 bit:3 Convert HTML Entity
6239 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6241 while ('0'<= *cp && *cp <='4') {
6242 alpha_f |= 1 << (*cp++ - '0');
6246 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6247 x0201_f = FALSE; /* No X0201->X0208 conversion */
6249 ESC-(-I in JIS, EUC, MS Kanji
6250 SI/SO in JIS, EUC, MS Kanji
6251 SS2 in EUC, JIS, not in MS Kanji
6252 MS Kanji (0xa0-0xdf)
6254 ESC-(-I in JIS (0x20-0x5f)
6255 SS2 in EUC (0xa0-0xdf)
6256 0xa0-0xd in MS Kanji (0xa0-0xdf)
6259 case 'X': /* Convert X0201 kana to X0208 */
6262 case 'F': /* preserve new lines */
6263 fold_preserve_f = TRUE;
6264 case 'f': /* folding -f60 or -f */
6267 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6269 fold_len += *cp++ - '0';
6271 if (!(0<fold_len && fold_len<BUFSIZ))
6272 fold_len = DEFAULT_FOLD;
6276 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6278 fold_margin += *cp++ - '0';
6282 case 'm': /* MIME support */
6283 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6284 if (*cp=='B'||*cp=='Q') {
6285 mime_decode_mode = *cp++;
6286 mimebuf_f = FIXED_MIME;
6287 } else if (*cp=='N') {
6288 mime_f = TRUE; cp++;
6289 } else if (*cp=='S') {
6290 mime_f = STRICT_MIME; cp++;
6291 } else if (*cp=='0') {
6292 mime_decode_f = FALSE;
6293 mime_f = FALSE; cp++;
6295 mime_f = STRICT_MIME;
6298 case 'M': /* MIME output */
6301 mimeout_f = FIXED_MIME; cp++;
6302 } else if (*cp=='Q') {
6304 mimeout_f = FIXED_MIME; cp++;
6309 case 'B': /* Broken JIS support */
6311 bit:1 allow any x on ESC-(-x or ESC-$-x
6312 bit:2 reset to ascii on NL
6314 if ('9'>= *cp && *cp>='0')
6315 broken_f |= 1<<(*cp++ -'0');
6320 case 'O':/* for Output file */
6324 case 'c':/* add cr code */
6327 case 'd':/* delete cr code */
6330 case 'I': /* ISO-2022-JP output */
6333 case 'L': /* line mode */
6334 if (*cp=='u') { /* unix */
6335 eolmode_f = LF; cp++;
6336 } else if (*cp=='m') { /* mac */
6337 eolmode_f = CR; cp++;
6338 } else if (*cp=='w') { /* windows */
6339 eolmode_f = CRLF; cp++;
6340 } else if (*cp=='0') { /* no conversion */
6341 eolmode_f = 0; cp++;
6346 if ('2' <= *cp && *cp <= '9') {
6349 } else if (*cp == '0' || *cp == '1') {
6358 /* multiple options in a string are allowed for Perl module */
6359 while(*cp && *cp++!='-');
6362 #if !defined(PERL_XS) && !defined(WIN32DLL)
6363 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6365 /* bogus option but ignored */
6373 #include "nkf32dll.c"
6374 #elif defined(PERL_XS)
6375 #else /* WIN32DLL */
/*
 * main: command-line entry point.
 *
 * Walks the leading arguments that start with '-', then either converts
 * stdin with kanji_convert() or opens each remaining argument as an input
 * file.  When guess_f is set, print_guessed_code() is called; it receives
 * the file name only when nfiles is greater than one.
 *
 * When file_out_f is TRUE, stdout is redirected.  One visible path builds
 * a temporary name from the original file name and creates it with open()
 * or mkstemp(); another uses the fixed name "nkf.out".  After conversion
 * the mode of the original file is copied to the output file with chmod(),
 * the timestamps are copied with utime() when preserve_time_f is set, the
 * original is renamed to a backup name or unlinked, and the output file is
 * renamed over the original.
 *
 * NOTE(review): which preprocessor branches select between these paths is
 * not visible in this excerpt; confirm against the full function.
 */
6377 main(int argc, char **argv)
6382 char *outfname = NULL;
6385 #ifdef EASYWIN /*Easy Win */
6386 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6388 #ifdef DEFAULT_CODE_LOCALE
6389 setlocale(LC_CTYPE, "");
6393 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6394 cp = (unsigned char *)*argv;
6399 if (pipe(fds) < 0 || (pid = fork()) < 0){
6410 execvp(argv[1], &argv[1]);
6427 int debug_f_back = debug_f;
6430 int exec_f_back = exec_f;
6433 int x0212_f_back = x0212_f;
6435 int x0213_f_back = x0213_f;
6436 int guess_f_back = guess_f;
6438 guess_f = guess_f_back;
6441 debug_f = debug_f_back;
6444 exec_f = exec_f_back;
6446 x0212_f = x0212_f_back;
6447 x0213_f = x0213_f_back;
6450 if (binmode_f == TRUE)
6451 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6452 if (freopen("","wb",stdout) == NULL)
6459 setbuf(stdout, (char *) NULL);
6461 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6464 if (binmode_f == TRUE)
6465 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6466 if (freopen("","rb",stdin) == NULL) return (-1);
6470 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6474 kanji_convert(stdin);
6475 if (guess_f) print_guessed_code(NULL);
6479 int is_argument_error = FALSE;
6481 input_codename = NULL;
6484 iconv_for_check = 0;
6486 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6488 is_argument_error = TRUE;
6496 /* reopen file for stdout */
6497 if (file_out_f == TRUE) {
6500 outfname = nkf_xmalloc(strlen(origfname)
6501 + strlen(".nkftmpXXXXXX")
6503 strcpy(outfname, origfname);
6507 for (i = strlen(outfname); i; --i){
6508 if (outfname[i - 1] == '/'
6509 || outfname[i - 1] == '\\'){
6515 strcat(outfname, "ntXXXXXX");
6517 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6518 S_IREAD | S_IWRITE);
6520 strcat(outfname, ".nkftmpXXXXXX");
6521 fd = mkstemp(outfname);
6524 || (fd_backup = dup(fileno(stdout))) < 0
6525 || dup2(fd, fileno(stdout)) < 0
6536 outfname = "nkf.out";
6539 if(freopen(outfname, "w", stdout) == NULL) {
6543 if (binmode_f == TRUE) {
6544 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6545 if (freopen("","wb",stdout) == NULL)
6552 if (binmode_f == TRUE)
6553 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6554 if (freopen("","rb",fin) == NULL)
6559 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6563 char *filename = NULL;
6565 if (nfiles > 1) filename = origfname;
6566 if (guess_f) print_guessed_code(filename);
6572 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6580 if (dup2(fd_backup, fileno(stdout)) < 0){
6583 if (stat(origfname, &sb)) {
6584 fprintf(stderr, "Can't stat %s\n", origfname);
6586 /* restore permissions */
6587 if (chmod(outfname, sb.st_mode)) {
6588 fprintf(stderr, "Can't set permission %s\n", outfname);
6591 /* restore timestamp */
6592 if(preserve_time_f){
6593 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6594 tb[0] = tb[1] = sb.st_mtime;
6595 if (utime(outfname, tb)) {
6596 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6599 tb.actime = sb.st_atime;
6600 tb.modtime = sb.st_mtime;
6601 if (utime(outfname, &tb)) {
6602 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6607 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6609 unlink(backup_filename);
6611 if (rename(origfname, backup_filename)) {
6612 perror(backup_filename);
6613 fprintf(stderr, "Can't rename %s to %s\n",
6614 origfname, backup_filename);
6616 nkf_xfree(backup_filename);
6619 if (unlink(origfname)){
6624 if (rename(outfname, origfname)) {
6626 fprintf(stderr, "Can't rename %s to %s\n",
6627 outfname, origfname);
6629 nkf_xfree(outfname);
6634 if (is_argument_error)
6637 #ifdef EASYWIN /*Easy Win */
6638 if (file_out_f == FALSE)
6639 scanf("%d",&end_check);
6642 #else /* for Other OS */
6643 if (file_out_f == TRUE)
6645 #endif /*Easy Win */
6648 #endif /* WIN32DLL */