1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.158 2007/12/23 07:25:47 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-12-22"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #if defined(DEFAULT_CODE_JIS)
44 #elif defined(DEFAULT_CODE_SJIS)
45 #elif defined(DEFAULT_CODE_EUC)
46 #elif defined(DEFAULT_CODE_UTF8)
48 #define DEFAULT_CODE_JIS 1
51 #ifndef MIME_DECODE_DEFAULT
52 #define MIME_DECODE_DEFAULT STRICT_MIME
55 #define X0201_DEFAULT TRUE
58 #if DEFAULT_NEWLINE == 0x0D0A
59 #define PUT_NEWLINE(func) do {\
63 #define OCONV_NEWLINE(func) do {\
67 #elif DEFAULT_NEWLINE == 0x0D
68 #define PUT_NEWLINE(func) func(0x0D)
69 #define OCONV_NEWLINE(func) func(0, 0x0D)
71 #define DEFAULT_NEWLINE 0x0A
72 #define PUT_NEWLINE(func) func(0x0A)
73 #define OCONV_NEWLINE(func) func(0, 0x0A)
75 #ifdef HELP_OUTPUT_STDERR
76 #define HELP_OUTPUT stderr
78 #define HELP_OUTPUT stdout
81 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
83 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
99 #if defined(MSDOS) || defined(__OS2__)
102 #if defined(_MSC_VER) || defined(__WATCOMC__)
103 #define mktemp _mktemp
109 #define setbinmode(fp) fsetbin(fp)
110 #elif defined(__DJGPP__)
111 #include <libc/dosio.h>
112 #define setbinmode(fp) djgpp_setbinmode(fp)
113 #else /* Microsoft C, Turbo C */
114 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
117 #define setbinmode(fp)
120 #if defined(__DJGPP__)
121 void djgpp_setbinmode(FILE *fp)
123 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
126 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
127 __file_handle_set(fd, m);
131 #ifdef _IOFBF /* SysV and MSDOS, Windows */
132 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
134 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
137 /*Borland C++ 4.5 EasyWin*/
138 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
147 /* added by satoru@isoternet.org */
149 #include <sys/types.h>
151 #include <sys/stat.h>
152 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
154 #if defined(__WATCOMC__)
155 #include <sys/utime.h>
159 #else /* defined(MSDOS) */
161 #ifdef __BORLANDC__ /* BCC32 */
163 #else /* !defined(__BORLANDC__) */
164 #include <sys/utime.h>
165 #endif /* (__BORLANDC__) */
166 #else /* !defined(__WIN32__) */
167 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
168 #include <sys/utime.h>
169 #elif defined(__TURBOC__) /* BCC */
171 #elif defined(LSI_C) /* LSI C */
172 #endif /* (__WIN32__) */
180 /* state of output_mode and input_mode
191 /* Input Assumption */
196 #define LATIN1_INPUT 6
197 #define UTF8_INPUT 13
198 #define UTF16_INPUT 1015
199 #define UTF32_INPUT 1017
202 #define STRICT_MIME 8
209 #define ENDIAN_BIG 1234
210 #define ENDIAN_LITTLE 4321
211 #define ENDIAN_2143 2143
212 #define ENDIAN_3412 3412
276 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
277 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
278 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
279 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
280 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
281 void j_oconv(nkf_char c2, nkf_char c1);
282 void s_oconv(nkf_char c2, nkf_char c1);
283 void e_oconv(nkf_char c2, nkf_char c1);
284 void w_oconv(nkf_char c2, nkf_char c1);
285 void w_oconv16(nkf_char c2, nkf_char c1);
286 void w_oconv32(nkf_char c2, nkf_char c1);
290 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
291 void (*oconv_func)(nkf_char c2, nkf_char c1);
292 } nkf_native_encoding;
294 nkf_native_encoding NkfEncodingASCII = { "US_ASCII", e_iconv, e_oconv };
295 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
296 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
297 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
298 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
299 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
300 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
305 nkf_native_encoding *based_encoding;
307 nkf_encoding nkf_encoding_table[] = {
308 {ASCII, "ASCII", &NkfEncodingASCII},
309 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
310 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingASCII},
311 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
312 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
313 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
314 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
315 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
316 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
317 {WINDOWS_31J, "WINDOWS-31J", &NkfEncodingShift_JIS},
318 {CP10001, "CP10001", &NkfEncodingShift_JIS},
319 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
320 {CP51932, "CP51932", &NkfEncodingEUC_JP},
321 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
322 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
323 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
324 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
325 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
326 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
327 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
328 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
329 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
330 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
331 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
332 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
333 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
334 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
335 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
336 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
337 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
338 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
339 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
340 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
341 {BINARY, "BINARY", &NkfEncodingASCII},
344 #define NKF_ENCODING_TABLE_SIZE 34
348 } encoding_name_to_id_table[] = {
350 {"ISO-2022-JP", ISO_2022_JP},
351 {"X-ISO2022JP-CP932", CP50220},
352 {"CP50220", CP50220},
353 {"CP50221", CP50221},
354 {"CP50222", CP50222},
355 {"ISO-2022-JP-1", ISO_2022_JP_1},
356 {"ISO-2022-JP-3", ISO_2022_JP_3},
357 {"SHIFT_JIS", SHIFT_JIS},
359 {"WINDOWS-31J", WINDOWS_31J},
360 {"CSWINDOWS31J", WINDOWS_31J},
361 {"CP932", WINDOWS_31J},
362 {"MS932", WINDOWS_31J},
363 {"CP10001", CP10001},
366 {"CP51932", CP51932},
367 {"EUC-JP-MS", EUCJP_MS},
368 {"EUCJP-MS", EUCJP_MS},
369 {"EUCJPMS", EUCJP_MS},
370 {"EUC-JP-ASCII", EUCJP_ASCII},
371 {"EUCJP-ASCII", EUCJP_ASCII},
372 {"SHIFT_JISX0213", SHIFT_JISX0213},
373 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
374 {"EUC-JISX0213", EUC_JISX0213},
375 {"EUC-JIS-2004", EUC_JIS_2004},
378 {"UTF-8-BOM", UTF_8_BOM},
379 {"UTF8-MAC", UTF8_MAC},
380 {"UTF-8-MAC", UTF8_MAC},
382 {"UTF-16BE", UTF_16BE},
383 {"UTF-16BE-BOM", UTF_16BE_BOM},
384 {"UTF-16LE", UTF_16LE},
385 {"UTF-16LE-BOM", UTF_16LE_BOM},
387 {"UTF-32BE", UTF_32BE},
388 {"UTF-32BE-BOM", UTF_32BE_BOM},
389 {"UTF-32LE", UTF_32LE},
390 {"UTF-32LE-BOM", UTF_32LE_BOM},
394 #if defined(DEFAULT_CODE_JIS)
395 #define DEFAULT_ENCODING ISO_2022_JP
396 #elif defined(DEFAULT_CODE_SJIS)
397 #define DEFAULT_ENCODING SHIFT_JIS
398 #elif defined(DEFAULT_CODE_EUC)
399 #define DEFAULT_ENCODING EUC_JP
400 #elif defined(DEFAULT_CODE_UTF8)
401 #define DEFAULT_ENCODING UTF_8
/*
 * ASCII-only character classification and conversion macros.
 *
 * They compare against character literals directly rather than going
 * through <ctype.h>.  Every macro may evaluate its argument more than once,
 * so never pass an expression with side effects (e.g. nkf_toupper(*p++)).
 * Each use of the argument is parenthesized so that compound expressions
 * such as `a | b` or `x ? y : z` expand with the intended precedence.
 *
 * SP, TAB, CR, LF, DEL and SS3 are not defined here; they must be defined
 * before any macro that mentions them is used.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Hexadecimal digit character -> its value 0..15; any other character gives 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Low four bits of c -> the matching upper-case hexadecimal digit character. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when c2, truncated to unsigned short and shifted right by 8, equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * True for CR, LF, or any character strictly between SP and DEL other than
 * '?', '=', '_', '(', ')', '.' and 0x22 (the double quote).
 * NOTE(review): by its name this selects characters that need no escaping
 * in MIME output -- confirm against the callers.
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/*
 * Inclusive byte ranges used by the CP932 conversion code.
 * NOTE(review): presumably these are Shift_JIS lead-byte ranges for the IBM
 * extended (0xFA-0xFC) and NEC-selected IBM extended (0xED-0xEE) rows --
 * confirm against the conversion tables that use them.
 */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * c2 is evaluated twice; it is parenthesized so compound arguments are safe.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
434 #define HOLD_SIZE 1024
435 #if defined(INT_IS_SHORT)
436 #define IOBUF_SIZE 2048
438 #define IOBUF_SIZE 16384
441 #define DEFAULT_J 'B'
442 #define DEFAULT_R 'B'
444 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
445 #define SJ6394 0x0161 /* 63 - 94 ku offset */
447 #define RANGE_NUM_MAX 18
452 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
453 #define sizeof_euc_to_utf8_1byte 94
454 #define sizeof_euc_to_utf8_2bytes 94
455 #define sizeof_utf8_to_euc_C2 64
456 #define sizeof_utf8_to_euc_E5B8 64
457 #define sizeof_utf8_to_euc_2bytes 112
458 #define sizeof_utf8_to_euc_3bytes 16
461 /* MIME preprocessor */
463 #ifdef EASYWIN /*Easy Win */
464 extern POINT _BufferSize;
473 void (*status_func)(struct input_code *, nkf_char);
474 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
478 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
479 static nkf_encoding *output_encoding;
481 #if !defined(PERL_XS) && !defined(WIN32DLL)
482 static nkf_char noconvert(FILE *f);
484 static void module_connection(void);
485 static nkf_char kanji_convert(FILE *f);
486 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
487 static nkf_char push_hold_buf(nkf_char c2);
488 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
489 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
490 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
492 * 0: Shift_JIS, eucJP-ascii
497 #define UCS_MAP_ASCII 0
499 #define UCS_MAP_CP932 2
500 #define UCS_MAP_CP10001 3
501 static int ms_ucs_map_f = UCS_MAP_ASCII;
503 #ifdef UTF8_INPUT_ENABLE
504 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
505 static int no_cp932ext_f = FALSE;
506 /* ignore ZERO WIDTH NO-BREAK SPACE */
507 static int no_best_fit_chars_f = FALSE;
508 static int input_endian = ENDIAN_BIG;
509 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
510 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
511 static void encode_fallback_html(nkf_char c);
512 static void encode_fallback_xml(nkf_char c);
513 static void encode_fallback_java(nkf_char c);
514 static void encode_fallback_perl(nkf_char c);
515 static void encode_fallback_subchar(nkf_char c);
516 static void (*encode_fallback)(nkf_char c) = NULL;
517 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
518 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
519 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
520 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
521 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
522 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
523 static void w_status(struct input_code *, nkf_char);
525 #ifdef UTF8_OUTPUT_ENABLE
526 static int output_bom_f = FALSE;
527 static int output_endian = ENDIAN_BIG;
528 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
530 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
531 static void fold_conv(nkf_char c2,nkf_char c1);
532 static void nl_conv(nkf_char c2,nkf_char c1);
533 static void z_conv(nkf_char c2,nkf_char c1);
534 static void rot_conv(nkf_char c2,nkf_char c1);
535 static void hira_conv(nkf_char c2,nkf_char c1);
536 static void base64_conv(nkf_char c2,nkf_char c1);
537 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
538 static void no_connection(nkf_char c2,nkf_char c1);
539 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
541 static void code_score(struct input_code *ptr);
542 static void code_status(nkf_char c);
544 static void std_putc(nkf_char c);
545 static nkf_char std_getc(FILE *f);
546 static nkf_char std_ungetc(nkf_char c,FILE *f);
548 static nkf_char broken_getc(FILE *f);
549 static nkf_char broken_ungetc(nkf_char c,FILE *f);
551 static nkf_char mime_begin(FILE *f);
552 static nkf_char mime_getc(FILE *f);
553 static nkf_char mime_ungetc(nkf_char c,FILE *f);
555 static void switch_mime_getc(void);
556 static void unswitch_mime_getc(void);
557 static nkf_char mime_begin_strict(FILE *f);
558 static nkf_char mime_getc_buf(FILE *f);
559 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
560 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
562 static nkf_char base64decode(nkf_char c);
563 static void mime_prechar(nkf_char c2, nkf_char c1);
564 static void mime_putc(nkf_char c);
565 static void open_mime(nkf_char c);
566 static void close_mime(void);
567 static void eof_mime(void);
568 static void mimeout_addchar(nkf_char c);
570 static void usage(void);
571 static void version(void);
572 static void show_configuration(void);
574 static void options(unsigned char *c);
575 static void reinit(void);
579 #if !defined(PERL_XS) && !defined(WIN32DLL)
580 static unsigned char stdibuf[IOBUF_SIZE];
581 static unsigned char stdobuf[IOBUF_SIZE];
583 static unsigned char hold_buf[HOLD_SIZE*2];
584 static int hold_count = 0;
586 /* MIME preprocessor fifo */
588 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
589 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
590 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
591 static unsigned char mime_buf[MIME_BUF_SIZE];
592 static unsigned int mime_top = 0;
593 static unsigned int mime_last = 0; /* decoded */
594 static unsigned int mime_input = 0; /* undecoded */
595 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
598 static int unbuf_f = FALSE;
599 static int estab_f = FALSE;
600 static int nop_f = FALSE;
601 static int binmode_f = TRUE; /* binary mode */
602 static int rot_f = FALSE; /* rot14/43 mode */
603 static int hira_f = FALSE; /* hiragana/katakana conversion */
604 static int input_f = FALSE; /* non fixed input code */
605 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
606 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
607 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
608 static int mimebuf_f = FALSE; /* MIME buffered input */
609 static int broken_f = FALSE; /* convert ESC-less broken JIS */
610 static int iso8859_f = FALSE; /* ISO8859 through */
611 static int mimeout_f = FALSE; /* base64 mode */
612 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
613 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
615 #ifdef UNICODE_NORMALIZATION
616 static int nfc_f = FALSE;
617 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
618 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
619 static nkf_char nfc_getc(FILE *f);
620 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
624 static int cap_f = FALSE;
625 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
626 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
627 static nkf_char cap_getc(FILE *f);
628 static nkf_char cap_ungetc(nkf_char c,FILE *f);
630 static int url_f = FALSE;
631 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
632 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
633 static nkf_char url_getc(FILE *f);
634 static nkf_char url_ungetc(nkf_char c,FILE *f);
637 #if defined(INT_IS_SHORT)
638 #define NKF_INT32_C(n) (n##L)
640 #define NKF_INT32_C(n) (n)
642 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
643 #define CLASS_MASK NKF_INT32_C(0xFF000000)
644 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
645 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
646 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
647 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
648 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
650 #ifdef NUMCHAR_OPTION
651 static int numchar_f = FALSE;
652 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
653 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
654 static nkf_char numchar_getc(FILE *f);
655 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
659 static int noout_f = FALSE;
660 static void no_putc(nkf_char c);
661 static int debug_f = FALSE;
662 static void debug(const char *str);
663 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
666 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
668 static void print_guessed_code(char *filename);
670 static void set_input_codename(char *codename);
673 static int exec_f = 0;
676 #ifdef SHIFTJIS_CP932
677 /* invert IBM extended characters to others */
678 static int cp51932_f = FALSE;
680 /* invert NEC-selected IBM extended characters to IBM extended characters */
681 static int cp932inv_f = TRUE;
683 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
684 #endif /* SHIFTJIS_CP932 */
687 static int x0212_f = FALSE;
688 static nkf_char x0212_shift(nkf_char c);
689 static nkf_char x0212_unshift(nkf_char c);
691 static int x0213_f = FALSE;
693 static unsigned char prefix_table[256];
695 static void set_code_score(struct input_code *ptr, nkf_char score);
696 static void clr_code_score(struct input_code *ptr, nkf_char score);
697 static void status_disable(struct input_code *ptr);
698 static void status_push_ch(struct input_code *ptr, nkf_char c);
699 static void status_clear(struct input_code *ptr);
700 static void status_reset(struct input_code *ptr);
701 static void status_reinit(struct input_code *ptr);
702 static void status_check(struct input_code *ptr, nkf_char c);
703 static void e_status(struct input_code *, nkf_char);
704 static void s_status(struct input_code *, nkf_char);
706 struct input_code input_code_list[] = {
707 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
708 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
709 #ifdef UTF8_INPUT_ENABLE
710 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
711 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
712 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
717 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
718 static int base64_count = 0;
720 /* X0208 -> ASCII converter */
723 static int f_line = 0; /* chars in line */
724 static int f_prev = 0;
725 static int fold_preserve_f = FALSE; /* preserve new lines */
726 static int fold_f = FALSE;
727 static int fold_len = 0;
730 static unsigned char kanji_intro = DEFAULT_J;
731 static unsigned char ascii_intro = DEFAULT_R;
735 #define FOLD_MARGIN 10
736 #define DEFAULT_FOLD 60
738 static int fold_margin = FOLD_MARGIN;
742 #ifdef DEFAULT_CODE_JIS
743 # define DEFAULT_CONV j_oconv
745 #ifdef DEFAULT_CODE_SJIS
746 # define DEFAULT_CONV s_oconv
748 #ifdef DEFAULT_CODE_EUC
749 # define DEFAULT_CONV e_oconv
751 #ifdef DEFAULT_CODE_UTF8
752 # define DEFAULT_CONV w_oconv
755 /* process default */
756 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
758 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
759 /* s_iconv or oconv */
760 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
762 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
763 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
764 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
765 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
766 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
767 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
768 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
770 /* static redirections */
772 static void (*o_putc)(nkf_char c) = std_putc;
774 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
775 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
777 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
778 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
780 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
782 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
783 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
785 /* for strict mime */
786 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
787 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
790 static int output_mode = ASCII, /* output kanji mode */
791 input_mode = ASCII, /* input kanji mode */
792 shift_mode = FALSE; /* TRUE shift out, or X0201 */
793 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
795 /* X0201 / X0208 conversion tables */
797 /* X0201 kana conversion table */
799 static const unsigned char cv[]= {
800 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
801 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
802 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
803 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
804 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
805 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
806 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
807 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
808 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
809 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
810 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
811 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
812 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
813 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
814 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
815 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
819 /* X0201 kana conversion table for dakuten */
821 static const unsigned char dv[]= {
822 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
823 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
824 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
825 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
826 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
827 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
828 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
829 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
830 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
831 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
832 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
833 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
834 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
835 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
836 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
837 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
840 /* X0201 kana conversion table for han-dakuten */
842 static const unsigned char ev[]= {
843 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
844 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
845 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
846 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
847 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
848 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
849 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
850 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
851 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
852 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
853 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
854 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
855 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
856 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
857 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
858 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
862 /* X0208 kigou conversion table */
863 /* 0x8140 - 0x819e */
864 static const unsigned char fv[] = {
866 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
867 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
868 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
869 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
870 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
871 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
872 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
873 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
874 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
875 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
876 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
877 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
882 static int file_out_f = FALSE;
884 static int overwrite_f = FALSE;
885 static int preserve_time_f = FALSE;
886 static int backup_f = FALSE;
887 static char *backup_suffix = "";
888 static char *get_backup_filename(const char *suffix, const char *filename);
891 static int nlmode_f = 0; /* CR, LF, CRLF */
892 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
893 static nkf_char prev_cr = 0; /* CR or 0 */
894 #ifdef EASYWIN /*Easy Win */
895 static int end_check;
898 #define STD_GC_BUFSIZE (256)
899 nkf_char std_gc_buf[STD_GC_BUFSIZE];
902 char* nkf_strcpy(const char *str)
904 char* result = malloc(strlen(str) + 1);
913 static void nkf_str_upcase(const char *str, char *res, size_t length)
916 for (; i < length && str[i]; i++) {
917 res[i] = nkf_toupper(str[i]);
922 static nkf_encoding *nkf_enc_from_index(int idx)
924 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
927 return &nkf_encoding_table[idx];
930 static int nkf_enc_find_index(const char *name)
933 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
934 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
935 return encoding_name_to_id_table[i].id;
941 static nkf_encoding *nkf_enc_find(const char *name)
944 idx = nkf_enc_find_index(name);
945 if (idx < 0) return 0;
946 return nkf_enc_from_index(idx);
949 #define nkf_enc_name(enc) (enc)->name
950 #define nkf_enc_to_index(enc) (enc)->id
951 #define nkf_enc_to_base_encoding(enc) (enc)->based_encoding
954 #include "nkf32dll.c"
955 #elif defined(PERL_XS)
957 int main(int argc, char **argv)
962 char *outfname = NULL;
965 #ifdef EASYWIN /*Easy Win */
966 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
969 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
970 cp = (unsigned char *)*argv;
974 int debug_f_back = debug_f;
977 int exec_f_back = exec_f;
980 int x0212_f_back = x0212_f;
982 int x0213_f_back = x0213_f;
983 int guess_f_back = guess_f;
985 guess_f = guess_f_back;
988 debug_f = debug_f_back;
991 exec_f = exec_f_back;
994 x0212_f = x0212_f_back;
996 x0213_f = x0213_f_back;
1001 if (pipe(fds) < 0 || (pid = fork()) < 0){
1012 execvp(argv[1], &argv[1]);
1027 if (binmode_f == TRUE)
1028 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1029 if (freopen("","wb",stdout) == NULL)
1036 setbuf(stdout, (char *) NULL);
1038 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
1041 if (binmode_f == TRUE)
1042 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1043 if (freopen("","rb",stdin) == NULL) return (-1);
1047 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
1051 kanji_convert(stdin);
1052 if (guess_f) print_guessed_code(NULL);
1056 int is_argument_error = FALSE;
1058 input_codename = NULL;
1061 iconv_for_check = 0;
1063 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
1066 is_argument_error = TRUE;
1074 /* reopen file for stdout */
1075 if (file_out_f == TRUE) {
1078 outfname = malloc(strlen(origfname)
1079 + strlen(".nkftmpXXXXXX")
1085 strcpy(outfname, origfname);
1089 for (i = strlen(outfname); i; --i){
1090 if (outfname[i - 1] == '/'
1091 || outfname[i - 1] == '\\'){
1097 strcat(outfname, "ntXXXXXX");
1099 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
1100 S_IREAD | S_IWRITE);
1102 strcat(outfname, ".nkftmpXXXXXX");
1103 fd = mkstemp(outfname);
1106 || (fd_backup = dup(fileno(stdout))) < 0
1107 || dup2(fd, fileno(stdout)) < 0
1118 outfname = "nkf.out";
1121 if(freopen(outfname, "w", stdout) == NULL) {
1125 if (binmode_f == TRUE) {
1126 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1127 if (freopen("","wb",stdout) == NULL)
1134 if (binmode_f == TRUE)
1135 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1136 if (freopen("","rb",fin) == NULL)
1141 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
1145 char *filename = NULL;
1147 if (nfiles > 1) filename = origfname;
1148 if (guess_f) print_guessed_code(filename);
1154 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1162 if (dup2(fd_backup, fileno(stdout)) < 0){
1165 if (stat(origfname, &sb)) {
1166 fprintf(stderr, "Can't stat %s\n", origfname);
1168 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
1169 if (chmod(outfname, sb.st_mode)) {
1170 fprintf(stderr, "Can't set permission %s\n", outfname);
1173 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
1174 if(preserve_time_f){
1175 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1176 tb[0] = tb[1] = sb.st_mtime;
1177 if (utime(outfname, tb)) {
1178 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1181 tb.actime = sb.st_atime;
1182 tb.modtime = sb.st_mtime;
1183 if (utime(outfname, &tb)) {
1184 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1189 char *backup_filename = get_backup_filename(backup_suffix, origfname);
1191 unlink(backup_filename);
1193 if (rename(origfname, backup_filename)) {
1194 perror(backup_filename);
1195 fprintf(stderr, "Can't rename %s to %s\n",
1196 origfname, backup_filename);
1200 if (unlink(origfname)){
1205 if (rename(outfname, origfname)) {
1207 fprintf(stderr, "Can't rename %s to %s\n",
1208 outfname, origfname);
1215 if (is_argument_error)
1218 #ifdef EASYWIN /*Easy Win */
1219 if (file_out_f == FALSE)
1220 scanf("%d",&end_check);
1223 #else /* for Other OS */
1224 if (file_out_f == TRUE)
1226 #endif /*Easy Win */
1229 #endif /* WIN32DLL */
/*
 * get_backup_filename - build the name of the backup file kept when a file
 * is overwritten in place.
 *
 * suffix:   backup pattern.  Every '*' in it is replaced by `filename`;
 *           when it contains no '*', it is simply appended to `filename`.
 * filename: name of the file being overwritten.
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller,
 * or NULL (after reporting via perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each one-byte '*' is swapped for a full copy of filename. */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /*
         * Plain suffix: result is filename followed by suffix.  The buffer
         * must hold both parts plus the terminator (the size expression used
         * to be just "+ 1", i.e. a one-byte buffer that was then overrun).
         */
        j = suffix_length + filename_length;
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        backup_filename[j] = '\0';
    }

    return backup_filename;
}
1272 static const struct {
1296 {"katakana-hiragana","h3"},
1304 #ifdef UTF8_OUTPUT_ENABLE
1314 {"fb-subchar=", ""},
1316 #ifdef UTF8_INPUT_ENABLE
1317 {"utf8-input", "W"},
1318 {"utf16-input", "W16"},
1319 {"no-cp932ext", ""},
1320 {"no-best-fit-chars",""},
1322 #ifdef UNICODE_NORMALIZATION
1323 {"utf8mac-input", ""},
1335 #ifdef NUMCHAR_OPTION
1336 {"numchar-input", ""},
1342 #ifdef SHIFTJIS_CP932
1352 static int option_mode = 0;
1354 void options(unsigned char *cp)
1358 unsigned char *cp_back = NULL;
1364 while(*cp && *cp++!='-');
1365 while (*cp || cp_back) {
1373 case '-': /* literal options */
1374 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1378 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1379 p = (unsigned char *)long_option[i].name;
1380 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1381 if (*p == cp[j] || cp[j] == SP){
1388 fprintf(stderr, "unknown long option: --%s\n", cp);
1391 while(*cp && *cp != SP && cp++);
1392 if (long_option[i].alias[0]){
1394 cp = (unsigned char *)long_option[i].alias;
1396 if (strcmp(long_option[i].name, "ic=") == 0){
1397 nkf_str_upcase(p, codeset, 32);
1398 enc = nkf_enc_find(codeset);
1399 switch (nkf_enc_to_index(enc)) {
1401 input_f = JIS_INPUT;
1406 input_f = JIS_INPUT;
1407 #ifdef SHIFTJIS_CP932
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 ms_ucs_map_f = UCS_MAP_CP932;
1415 input_f = JIS_INPUT;
1421 input_f = JIS_INPUT;
1428 input_f = SJIS_INPUT;
1431 input_f = SJIS_INPUT;
1432 #ifdef SHIFTJIS_CP932
1435 #ifdef UTF8_OUTPUT_ENABLE
1436 ms_ucs_map_f = UCS_MAP_CP932;
1440 input_f = SJIS_INPUT;
1441 #ifdef SHIFTJIS_CP932
1444 #ifdef UTF8_OUTPUT_ENABLE
1445 ms_ucs_map_f = UCS_MAP_CP10001;
1449 input_f = EUC_INPUT;
1452 input_f = EUC_INPUT;
1453 #ifdef SHIFTJIS_CP932
1456 #ifdef UTF8_OUTPUT_ENABLE
1457 ms_ucs_map_f = UCS_MAP_CP932;
1461 input_f = EUC_INPUT;
1462 #ifdef SHIFTJIS_CP932
1465 #ifdef UTF8_OUTPUT_ENABLE
1466 ms_ucs_map_f = UCS_MAP_MS;
1470 input_f = EUC_INPUT;
1471 #ifdef SHIFTJIS_CP932
1474 #ifdef UTF8_OUTPUT_ENABLE
1475 ms_ucs_map_f = UCS_MAP_ASCII;
1478 case SHIFT_JISX0213:
1479 case SHIFT_JIS_2004:
1480 input_f = SJIS_INPUT;
1482 #ifdef SHIFTJIS_CP932
1488 input_f = EUC_INPUT;
1490 #ifdef SHIFTJIS_CP932
1494 #ifdef UTF8_INPUT_ENABLE
1498 input_f = UTF8_INPUT;
1500 #ifdef UNICODE_NORMALIZATION
1502 input_f = UTF8_INPUT;
1509 input_f = UTF16_INPUT;
1510 input_endian = ENDIAN_BIG;
1514 input_f = UTF16_INPUT;
1515 input_endian = ENDIAN_LITTLE;
1520 input_f = UTF32_INPUT;
1521 input_endian = ENDIAN_BIG;
1525 input_f = UTF32_INPUT;
1526 input_endian = ENDIAN_LITTLE;
1530 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1535 if (strcmp(long_option[i].name, "oc=") == 0){
1537 nkf_str_upcase(p, codeset, 32);
1538 output_encoding = nkf_enc_find(codeset);
1539 switch (nkf_enc_to_index(output_encoding)) {
1541 output_conv = j_oconv;
1544 output_conv = j_oconv;
1546 #ifdef SHIFTJIS_CP932
1549 #ifdef UTF8_OUTPUT_ENABLE
1550 ms_ucs_map_f = UCS_MAP_CP932;
1554 output_conv = j_oconv;
1555 #ifdef SHIFTJIS_CP932
1558 #ifdef UTF8_OUTPUT_ENABLE
1559 ms_ucs_map_f = UCS_MAP_CP932;
1563 output_conv = j_oconv;
1567 #ifdef SHIFTJIS_CP932
1572 output_conv = j_oconv;
1577 #ifdef SHIFTJIS_CP932
1582 output_conv = s_oconv;
1585 output_conv = s_oconv;
1586 #ifdef UTF8_OUTPUT_ENABLE
1587 ms_ucs_map_f = UCS_MAP_CP932;
1591 output_conv = s_oconv;
1592 #ifdef UTF8_OUTPUT_ENABLE
1593 ms_ucs_map_f = UCS_MAP_CP10001;
1597 output_conv = e_oconv;
1600 output_conv = e_oconv;
1601 #ifdef SHIFTJIS_CP932
1604 #ifdef UTF8_OUTPUT_ENABLE
1605 ms_ucs_map_f = UCS_MAP_CP932;
1609 output_conv = e_oconv;
1613 #ifdef UTF8_OUTPUT_ENABLE
1614 ms_ucs_map_f = UCS_MAP_MS;
1618 output_conv = e_oconv;
1622 #ifdef UTF8_OUTPUT_ENABLE
1623 ms_ucs_map_f = UCS_MAP_ASCII;
1626 case SHIFT_JISX0213:
1627 case SHIFT_JIS_2004:
1628 output_conv = s_oconv;
1630 #ifdef SHIFTJIS_CP932
1636 output_conv = e_oconv;
1641 #ifdef SHIFTJIS_CP932
1645 #ifdef UTF8_OUTPUT_ENABLE
1648 output_conv = w_oconv;
1651 output_conv = w_oconv;
1652 output_bom_f = TRUE;
1655 output_conv = w_oconv16;
1659 output_conv = w_oconv16;
1660 output_bom_f = TRUE;
1663 output_conv = w_oconv16;
1664 output_endian = ENDIAN_LITTLE;
1667 output_conv = w_oconv16;
1668 output_endian = ENDIAN_LITTLE;
1669 output_bom_f = TRUE;
1673 output_conv = w_oconv32;
1676 output_conv = w_oconv32;
1677 output_bom_f = TRUE;
1680 output_conv = w_oconv32;
1681 output_endian = ENDIAN_LITTLE;
1684 output_conv = w_oconv32;
1685 output_endian = ENDIAN_LITTLE;
1686 output_bom_f = TRUE;
1690 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1695 if (strcmp(long_option[i].name, "guess=") == 0){
1704 if (strcmp(long_option[i].name, "overwrite") == 0){
1707 preserve_time_f = TRUE;
1710 if (strcmp(long_option[i].name, "overwrite=") == 0){
1713 preserve_time_f = TRUE;
1715 backup_suffix = malloc(strlen((char *) p) + 1);
1716 strcpy(backup_suffix, (char *) p);
1719 if (strcmp(long_option[i].name, "in-place") == 0){
1722 preserve_time_f = FALSE;
1725 if (strcmp(long_option[i].name, "in-place=") == 0){
1728 preserve_time_f = FALSE;
1730 backup_suffix = malloc(strlen((char *) p) + 1);
1731 strcpy(backup_suffix, (char *) p);
1736 if (strcmp(long_option[i].name, "cap-input") == 0){
1740 if (strcmp(long_option[i].name, "url-input") == 0){
1745 #ifdef NUMCHAR_OPTION
1746 if (strcmp(long_option[i].name, "numchar-input") == 0){
1752 if (strcmp(long_option[i].name, "no-output") == 0){
1756 if (strcmp(long_option[i].name, "debug") == 0){
1761 if (strcmp(long_option[i].name, "cp932") == 0){
1762 #ifdef SHIFTJIS_CP932
1766 #ifdef UTF8_OUTPUT_ENABLE
1767 ms_ucs_map_f = UCS_MAP_CP932;
1771 if (strcmp(long_option[i].name, "no-cp932") == 0){
1772 #ifdef SHIFTJIS_CP932
1776 #ifdef UTF8_OUTPUT_ENABLE
1777 ms_ucs_map_f = UCS_MAP_ASCII;
1781 #ifdef SHIFTJIS_CP932
1782 if (strcmp(long_option[i].name, "cp932inv") == 0){
1789 if (strcmp(long_option[i].name, "x0212") == 0){
1796 if (strcmp(long_option[i].name, "exec-in") == 0){
1800 if (strcmp(long_option[i].name, "exec-out") == 0){
1805 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1806 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1807 no_cp932ext_f = TRUE;
1810 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1811 no_best_fit_chars_f = TRUE;
1814 if (strcmp(long_option[i].name, "fb-skip") == 0){
1815 encode_fallback = NULL;
1818 if (strcmp(long_option[i].name, "fb-html") == 0){
1819 encode_fallback = encode_fallback_html;
1822 if (strcmp(long_option[i].name, "fb-xml") == 0){
1823 encode_fallback = encode_fallback_xml;
1826 if (strcmp(long_option[i].name, "fb-java") == 0){
1827 encode_fallback = encode_fallback_java;
1830 if (strcmp(long_option[i].name, "fb-perl") == 0){
1831 encode_fallback = encode_fallback_perl;
1834 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1835 encode_fallback = encode_fallback_subchar;
1838 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1839 encode_fallback = encode_fallback_subchar;
1840 unicode_subchar = 0;
1842 /* decimal number */
1843 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1844 unicode_subchar *= 10;
1845 unicode_subchar += hex2bin(p[i]);
1847 }else if(p[1] == 'x' || p[1] == 'X'){
1848 /* hexadecimal number */
1849 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1850 unicode_subchar <<= 4;
1851 unicode_subchar |= hex2bin(p[i]);
1855 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1856 unicode_subchar *= 8;
1857 unicode_subchar += hex2bin(p[i]);
1860 w16e_conv(unicode_subchar, &i, &j);
1861 unicode_subchar = i<<8 | j;
1865 #ifdef UTF8_OUTPUT_ENABLE
1866 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1867 ms_ucs_map_f = UCS_MAP_MS;
1871 #ifdef UNICODE_NORMALIZATION
1872 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1873 input_f = UTF8_INPUT;
1878 if (strcmp(long_option[i].name, "prefix=") == 0){
1879 if (nkf_isgraph(p[0])){
1880 for (i = 1; nkf_isgraph(p[i]); i++){
1881 prefix_table[p[i]] = p[0];
1888 case 'b': /* buffered mode */
1891 case 'u': /* non bufferd mode */
1894 case 't': /* transparent mode */
1899 } else if (*cp=='2') {
1903 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1911 case 'j': /* JIS output */
1913 output_conv = j_oconv;
1914 output_encoding = nkf_enc_from_index(ISO_2022_JP);
1916 case 'e': /* AT&T EUC output */
1917 output_conv = e_oconv;
1919 output_encoding = nkf_enc_from_index(EUC_JP);
1921 case 's': /* SJIS output */
1922 output_conv = s_oconv;
1923 output_encoding = nkf_enc_from_index(SHIFT_JIS);
1925 case 'l': /* ISO8859 Latin-1 support, no conversion */
1926 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1927 input_f = LATIN1_INPUT;
1929 case 'i': /* Kanji IN ESC-$-@/B */
1930 if (*cp=='@'||*cp=='B')
1931 kanji_intro = *cp++;
1933 case 'o': /* ASCII IN ESC-(-J/B */
1934 if (*cp=='J'||*cp=='B'||*cp=='H')
1935 ascii_intro = *cp++;
1939 bit:1 katakana->hiragana
1940 bit:2 hiragana->katakana
1942 if ('9'>= *cp && *cp>='0')
1943 hira_f |= (*cp++ -'0');
1950 #if defined(MSDOS) || defined(__OS2__)
1957 show_configuration();
1965 #ifdef UTF8_OUTPUT_ENABLE
1966 case 'w': /* UTF-8 output */
1968 output_conv = w_oconv; cp++;
1971 output_encoding = nkf_enc_from_index(UTF_8N);
1973 output_bom_f = TRUE;
1974 output_encoding = nkf_enc_from_index(UTF_8_BOM);
1978 if ('1'== cp[0] && '6'==cp[1]) {
1979 output_conv = w_oconv16; cp+=2;
1981 } else if ('3'== cp[0] && '2'==cp[1]) {
1982 output_conv = w_oconv32; cp+=2;
1985 output_conv = w_oconv;
1986 output_encoding = nkf_enc_from_index(UTF_8);
1991 output_endian = ENDIAN_LITTLE;
1992 } else if (cp[0] == 'B') {
1995 output_encoding = nkf_enc_from_index(enc_idx);
2000 enc_idx = enc_idx == UTF_16
2001 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
2002 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
2004 output_bom_f = TRUE;
2005 enc_idx = enc_idx == UTF_16
2006 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
2007 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
2009 output_encoding = nkf_enc_from_index(enc_idx);
2013 #ifdef UTF8_INPUT_ENABLE
2014 case 'W': /* UTF input */
2017 input_f = UTF8_INPUT;
2019 if ('1'== cp[0] && '6'==cp[1]) {
2021 input_f = UTF16_INPUT;
2022 input_endian = ENDIAN_BIG;
2023 } else if ('3'== cp[0] && '2'==cp[1]) {
2025 input_f = UTF32_INPUT;
2026 input_endian = ENDIAN_BIG;
2028 input_f = UTF8_INPUT;
2033 input_endian = ENDIAN_LITTLE;
2034 } else if (cp[0] == 'B') {
2040 /* Input code assumption */
2041 case 'J': /* JIS input */
2042 input_f = JIS_INPUT;
2044 case 'E': /* AT&T EUC input */
2045 input_f = EUC_INPUT;
2047 case 'S': /* MS Kanji input */
2048 input_f = SJIS_INPUT;
2050 case 'Z': /* Convert X0208 alphabet to asii */
2052 bit:0 Convert JIS X 0208 Alphabet to ASCII
2053 bit:1 Convert Kankaku to one space
2054 bit:2 Convert Kankaku to two spaces
2055 bit:3 Convert HTML Entity
2056 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
2058 while ('0'<= *cp && *cp <='9') {
2059 alpha_f |= 1 << (*cp++ - '0');
2061 if (!alpha_f) alpha_f = 1;
2063 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
2064 x0201_f = FALSE; /* No X0201->X0208 conversion */
2066 ESC-(-I in JIS, EUC, MS Kanji
2067 SI/SO in JIS, EUC, MS Kanji
2068 SSO in EUC, JIS, not in MS Kanji
2069 MS Kanji (0xa0-0xdf)
2071 ESC-(-I in JIS (0x20-0x5f)
2072 SSO in EUC (0xa0-0xdf)
2073 0xa0-0xd in MS Kanji (0xa0-0xdf)
2076 case 'X': /* Convert X0201 kana to X0208 */
2079 case 'F': /* prserve new lines */
2080 fold_preserve_f = TRUE;
2081 case 'f': /* folding -f60 or -f */
2084 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2086 fold_len += *cp++ - '0';
2088 if (!(0<fold_len && fold_len<BUFSIZ))
2089 fold_len = DEFAULT_FOLD;
2093 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2095 fold_margin += *cp++ - '0';
2099 case 'm': /* MIME support */
2100 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
2101 if (*cp=='B'||*cp=='Q') {
2102 mime_decode_mode = *cp++;
2103 mimebuf_f = FIXED_MIME;
2104 } else if (*cp=='N') {
2105 mime_f = TRUE; cp++;
2106 } else if (*cp=='S') {
2107 mime_f = STRICT_MIME; cp++;
2108 } else if (*cp=='0') {
2109 mime_decode_f = FALSE;
2110 mime_f = FALSE; cp++;
2113 case 'M': /* MIME output */
2116 mimeout_f = FIXED_MIME; cp++;
2117 } else if (*cp=='Q') {
2119 mimeout_f = FIXED_MIME; cp++;
2124 case 'B': /* Broken JIS support */
2126 bit:1 allow any x on ESC-(-x or ESC-$-x
2127 bit:2 reset to ascii on NL
2129 if ('9'>= *cp && *cp>='0')
2130 broken_f |= 1<<(*cp++ -'0');
2135 case 'O':/* for Output file */
2139 case 'c':/* add cr code */
2142 case 'd':/* delete cr code */
2145 case 'I': /* ISO-2022-JP output */
2148 case 'L': /* line mode */
2149 if (*cp=='u') { /* unix */
2150 nlmode_f = LF; cp++;
2151 } else if (*cp=='m') { /* mac */
2152 nlmode_f = CR; cp++;
2153 } else if (*cp=='w') { /* windows */
2154 nlmode_f = CRLF; cp++;
2155 } else if (*cp=='0') { /* no conversion */
2164 } else if (*cp == '0') {
2173 /* multiple options in a single string are allowed for the Perl module */
2174 while(*cp && *cp++!='-');
2177 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
2178 /* bogus option but ignored */
2184 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2187 struct input_code *p = input_code_list;
2189 if (iconv_func == p->iconv_func){
/*
 * set_iconv - select the input conversion routine.
 *
 * f:          establishment flag; the value -TRUE is used by callers to
 *             force the choice (see the INPUT_CODE_FIX condition below).
 * iconv_func: candidate input converter taking (c2, c1, c0).
 *
 * NOTE(review): parts of this function are not visible in this listing;
 * the comments below describe only the statements shown.
 */
2198 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2200 #ifdef INPUT_CODE_FIX
2208 #ifdef INPUT_CODE_FIX
2209 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* Once the code is established and the active converter has changed,
   look up its input_code entry and record its name. */
2215 if (estab_f && iconv_for_check != iconv){
2216 struct input_code *p = find_inputcode_byfunc(iconv);
2218 set_input_codename(p->name);
/* Remember the converter that was just checked so the lookup above is
   skipped until iconv changes again. */
2221 iconv_for_check = iconv;
/*
 * Score flags for input-encoding candidates.  Each flag is a distinct bit
 * obtained by shifting the previous one left; set_code_score() ORs them into
 * input_code.score and clr_code_score() masks them out.
 */
2226 #define SCORE_L2 (1) /* JIS level-2 kanji */
2227 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2228 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2229 #define SCORE_CP932 (SCORE_DEPEND << 1) /* substitution via CP932 (IBM extended characters) */
2230 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2231 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* non-existent characters */
2232 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2233 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score of a candidate: only the MIME bit is set. */
2235 #define SCORE_INIT (SCORE_iMIME)
2237 static const char score_table_A0[] = {
2240 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2241 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2244 static const char score_table_F0[] = {
2245 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2246 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2247 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2248 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score - turn on the given SCORE_* bit(s) in ptr->score.
 * NOTE(review): intermediate lines are not visible in this listing.
 */
2251 void set_code_score(struct input_code *ptr, nkf_char score)
2254 ptr->score |= score;
/*
 * clr_code_score - turn off the given SCORE_* bit(s) in ptr->score.
 * NOTE(review): intermediate lines are not visible in this listing.
 */
2258 void clr_code_score(struct input_code *ptr, nkf_char score)
2261 ptr->score &= ~score;
/*
 * code_score - classify the character held in ptr->buf and add the matching
 * SCORE_* flag to the candidate's score.
 *
 * The first buffered byte (c2) selects the class; when UTF-8 output is
 * compiled in, the second byte (c1) is also read so the pair can be checked
 * with e2w_conv().
 * NOTE(review): the condition guarding the first branch is not visible in
 * this listing.
 */
2265 void code_score(struct input_code *ptr)
2267 nkf_char c2 = ptr->buf[0];
2268 #ifdef UTF8_OUTPUT_ENABLE
2269 nkf_char c1 = ptr->buf[1];
2272 set_code_score(ptr, SCORE_ERROR);
/* SSO lead byte: flagged as half-width kana */
2273 }else if (c2 == SSO){
2274 set_code_score(ptr, SCORE_KANA);
/* 0x8f lead byte: flagged as JIS X 0212 */
2275 }else if (c2 == 0x8f){
2276 set_code_score(ptr, SCORE_X0212);
2277 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returning 0 for the pair: flagged as a non-existent character */
2278 }else if (!e2w_conv(c2, c1)){
2279 set_code_score(ptr, SCORE_NO_EXIST);
/* bits 4-6 of c2 equal to 0x20: per-byte lookup, indexed by the low nibble */
2281 }else if ((c2 & 0x70) == 0x20){
2282 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
/* bits 4-6 of c2 equal to 0x70: per-byte lookup, indexed by the low nibble */
2283 }else if ((c2 & 0x70) == 0x70){
2284 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* remaining bytes whose bits 4-6 are 0x50 or above: flagged as level-2 kanji */
2285 }else if ((c2 & 0x70) >= 0x50){
2286 set_code_score(ptr, SCORE_L2);
2290 void status_disable(struct input_code *ptr)
2295 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2298 void status_push_ch(struct input_code *ptr, nkf_char c)
2300 ptr->buf[ptr->index++] = c;
2303 void status_clear(struct input_code *ptr)
2309 void status_reset(struct input_code *ptr)
2312 ptr->score = SCORE_INIT;
2315 void status_reinit(struct input_code *ptr)
2318 ptr->_file_stat = 0;
2321 void status_check(struct input_code *ptr, nkf_char c)
2323 if (c <= DEL && estab_f){
2328 void s_status(struct input_code *ptr, nkf_char c)
2332 status_check(ptr, c);
2337 #ifdef NUMCHAR_OPTION
2338 }else if (is_unicode_capsule(c)){
2341 }else if (0xa1 <= c && c <= 0xdf){
2342 status_push_ch(ptr, SSO);
2343 status_push_ch(ptr, c);
2346 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2348 status_push_ch(ptr, c);
2349 }else if (0xed <= c && c <= 0xee){
2351 status_push_ch(ptr, c);
2352 #ifdef SHIFTJIS_CP932
2353 }else if (is_ibmext_in_sjis(c)){
2355 status_push_ch(ptr, c);
2356 #endif /* SHIFTJIS_CP932 */
2358 }else if (0xf0 <= c && c <= 0xfc){
2360 status_push_ch(ptr, c);
2361 #endif /* X0212_ENABLE */
2363 status_disable(ptr);
2367 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2368 status_push_ch(ptr, c);
2369 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2373 status_disable(ptr);
2377 #ifdef SHIFTJIS_CP932
2378 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2379 status_push_ch(ptr, c);
2380 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2381 set_code_score(ptr, SCORE_CP932);
2386 #endif /* SHIFTJIS_CP932 */
2387 status_disable(ptr);
2390 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2391 status_push_ch(ptr, c);
2392 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2393 set_code_score(ptr, SCORE_CP932);
2396 status_disable(ptr);
2402 void e_status(struct input_code *ptr, nkf_char c)
2406 status_check(ptr, c);
2411 #ifdef NUMCHAR_OPTION
2412 }else if (is_unicode_capsule(c)){
2415 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2417 status_push_ch(ptr, c);
2419 }else if (0x8f == c){
2421 status_push_ch(ptr, c);
2422 #endif /* X0212_ENABLE */
2424 status_disable(ptr);
2428 if (0xa1 <= c && c <= 0xfe){
2429 status_push_ch(ptr, c);
2433 status_disable(ptr);
2438 if (0xa1 <= c && c <= 0xfe){
2440 status_push_ch(ptr, c);
2442 status_disable(ptr);
2444 #endif /* X0212_ENABLE */
2448 #ifdef UTF8_INPUT_ENABLE
2449 void w_status(struct input_code *ptr, nkf_char c)
2453 status_check(ptr, c);
2458 #ifdef NUMCHAR_OPTION
2459 }else if (is_unicode_capsule(c)){
2462 }else if (0xc0 <= c && c <= 0xdf){
2464 status_push_ch(ptr, c);
2465 }else if (0xe0 <= c && c <= 0xef){
2467 status_push_ch(ptr, c);
2468 }else if (0xf0 <= c && c <= 0xf4){
2470 status_push_ch(ptr, c);
2472 status_disable(ptr);
2477 if (0x80 <= c && c <= 0xbf){
2478 status_push_ch(ptr, c);
2479 if (ptr->index > ptr->stat){
2480 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2481 && ptr->buf[2] == 0xbf);
2482 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2483 &ptr->buf[0], &ptr->buf[1]);
2490 status_disable(ptr);
2494 if (0x80 <= c && c <= 0xbf){
2495 if (ptr->index < ptr->stat){
2496 status_push_ch(ptr, c);
2501 status_disable(ptr);
2508 void code_status(nkf_char c)
2510 int action_flag = 1;
2511 struct input_code *result = 0;
2512 struct input_code *p = input_code_list;
2514 if (!p->status_func) {
2518 if (!p->status_func)
2520 (p->status_func)(p, c);
2523 }else if(p->stat == 0){
2534 if (result && !estab_f){
2535 set_iconv(TRUE, result->iconv_func);
2536 }else if (c <= DEL){
2537 struct input_code *ptr = input_code_list;
2547 nkf_char std_getc(FILE *f)
2550 return std_gc_buf[--std_gc_ndx];
2556 nkf_char std_ungetc(nkf_char c, FILE *f)
2558 if (std_gc_ndx == STD_GC_BUFSIZE){
2561 std_gc_buf[std_gc_ndx++] = c;
2566 void std_putc(nkf_char c)
2573 #if !defined(PERL_XS) && !defined(WIN32DLL)
2574 nkf_char noconvert(FILE *f)
2579 module_connection();
2580 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection - wire up the output and input filter chains according
 * to the option flags.
 *
 * Output side: starting from output_conv, each enabled filter saves the
 * current oconv in its own o_* pointer and installs itself as the new oconv.
 * Input side: each enabled reader saves the current i_getc/i_ungetc pair in
 * its own pointers and installs itself on top.
 * Finally the input converter is chosen from input_f via set_iconv().
 * NOTE(review): several guarding conditions are not visible in this listing.
 */
2587 void module_connection(void)
2589 oconv = output_conv;
2592 /* replace continuation module, from output side */
2594 /* output redirection */
2596 if (noout_f || guess_f){
2603 if (mimeout_f == TRUE) {
2604 o_base64conv = oconv; oconv = base64_conv;
2606 /* base64_count = 0; */
2609 if (nlmode_f || guess_f) {
2610 o_nlconv = oconv; oconv = nl_conv;
2613 o_rot_conv = oconv; oconv = rot_conv;
2616 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2619 o_hira_conv = oconv; oconv = hira_conv;
2622 o_fconv = oconv; oconv = fold_conv;
2625 if (alpha_f || x0201_f) {
2626 o_zconv = oconv; oconv = z_conv;
2630 i_ungetc = std_ungetc;
2631 /* input redirection */
2634 i_cgetc = i_getc; i_getc = cap_getc;
2635 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2638 i_ugetc = i_getc; i_getc = url_getc;
2639 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2642 #ifdef NUMCHAR_OPTION
2644 i_ngetc = i_getc; i_getc = numchar_getc;
2645 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2648 #ifdef UNICODE_NORMALIZATION
2649 if (nfc_f && input_f == UTF8_INPUT){
2650 i_nfc_getc = i_getc; i_getc = nfc_getc;
2651 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2654 if (mime_f && mimebuf_f==FIXED_MIME) {
2655 i_mgetc = i_getc; i_getc = mime_getc;
2656 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2659 i_bgetc = i_getc; i_getc = broken_getc;
2660 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicitly requested input encoding forces its converter (-TRUE);
   JIS, EUC and Latin-1 input all go through e_iconv. */
2662 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2663 set_iconv(-TRUE, e_iconv);
2664 } else if (input_f == SJIS_INPUT) {
2665 set_iconv(-TRUE, s_iconv);
2666 #ifdef UTF8_INPUT_ENABLE
2667 } else if (input_f == UTF8_INPUT) {
2668 set_iconv(-TRUE, w_iconv);
2669 } else if (input_f == UTF16_INPUT) {
2670 set_iconv(-TRUE, w_iconv16);
2671 } else if (input_f == UTF32_INPUT) {
2672 set_iconv(-TRUE, w_iconv32);
/* Otherwise e_iconv is installed without the force flag. */
2675 set_iconv(FALSE, e_iconv);
2679 struct input_code *p = input_code_list;
2687 * Check and Ignore BOM
2689 void check_bom(FILE *f)
2692 switch(c2 = (*i_getc)(f)){
2694 if((c2 = (*i_getc)(f)) == 0x00){
2695 if((c2 = (*i_getc)(f)) == 0xFE){
2696 if((c2 = (*i_getc)(f)) == 0xFF){
2698 set_iconv(TRUE, w_iconv32);
2700 if (iconv == w_iconv32) {
2701 input_endian = ENDIAN_BIG;
2704 (*i_ungetc)(0xFF,f);
2705 }else (*i_ungetc)(c2,f);
2706 (*i_ungetc)(0xFE,f);
2707 }else if(c2 == 0xFF){
2708 if((c2 = (*i_getc)(f)) == 0xFE){
2710 set_iconv(TRUE, w_iconv32);
2712 if (iconv == w_iconv32) {
2713 input_endian = ENDIAN_2143;
2716 (*i_ungetc)(0xFF,f);
2717 }else (*i_ungetc)(c2,f);
2718 (*i_ungetc)(0xFF,f);
2719 }else (*i_ungetc)(c2,f);
2720 (*i_ungetc)(0x00,f);
2721 }else (*i_ungetc)(c2,f);
2722 (*i_ungetc)(0x00,f);
2725 if((c2 = (*i_getc)(f)) == 0xBB){
2726 if((c2 = (*i_getc)(f)) == 0xBF){
2728 set_iconv(TRUE, w_iconv);
2730 if (iconv == w_iconv) {
2733 (*i_ungetc)(0xBF,f);
2734 }else (*i_ungetc)(c2,f);
2735 (*i_ungetc)(0xBB,f);
2736 }else (*i_ungetc)(c2,f);
2737 (*i_ungetc)(0xEF,f);
2740 if((c2 = (*i_getc)(f)) == 0xFF){
2741 if((c2 = (*i_getc)(f)) == 0x00){
2742 if((c2 = (*i_getc)(f)) == 0x00){
2744 set_iconv(TRUE, w_iconv32);
2746 if (iconv == w_iconv32) {
2747 input_endian = ENDIAN_3412;
2750 (*i_ungetc)(0x00,f);
2751 }else (*i_ungetc)(c2,f);
2752 (*i_ungetc)(0x00,f);
2753 }else (*i_ungetc)(c2,f);
2755 set_iconv(TRUE, w_iconv16);
2757 if (iconv == w_iconv16) {
2758 input_endian = ENDIAN_BIG;
2761 (*i_ungetc)(0xFF,f);
2762 }else (*i_ungetc)(c2,f);
2763 (*i_ungetc)(0xFE,f);
2766 if((c2 = (*i_getc)(f)) == 0xFE){
2767 if((c2 = (*i_getc)(f)) == 0x00){
2768 if((c2 = (*i_getc)(f)) == 0x00){
2770 set_iconv(TRUE, w_iconv32);
2772 if (iconv == w_iconv32) {
2773 input_endian = ENDIAN_LITTLE;
2776 (*i_ungetc)(0x00,f);
2777 }else (*i_ungetc)(c2,f);
2778 (*i_ungetc)(0x00,f);
2779 }else (*i_ungetc)(c2,f);
2781 set_iconv(TRUE, w_iconv16);
2783 if (iconv == w_iconv16) {
2784 input_endian = ENDIAN_LITTLE;
2787 (*i_ungetc)(0xFE,f);
2788 }else (*i_ungetc)(c2,f);
2789 (*i_ungetc)(0xFF,f);
2798 Conversion main loop. Code detection only.
2801 nkf_char kanji_convert(FILE *f)
2803 nkf_char c3, c2=0, c1, c0=0;
2804 int is_8bit = FALSE;
2806 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2807 #ifdef UTF8_INPUT_ENABLE
2808 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2815 output_mode = ASCII;
2818 #define NEXT continue /* no output, get next */
2819 #define SEND ; /* output c1 and c2, get next */
2820 #define LAST break /* end of loop, go closing */
2822 module_connection();
2825 while ((c1 = (*i_getc)(f)) != EOF) {
2826 #ifdef INPUT_CODE_FIX
2832 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2833 /* in case of 8th bit is on */
2834 if (!estab_f&&!mime_decode_mode) {
2835 /* in case of not established yet */
2836 /* It is still ambiguous */
2837 if (h_conv(f, c2, c1)==EOF)
2843 /* in case of already established */
2845 /* ignore bogus code and not CP5022x UCD */
2853 /* second byte, 7 bit code */
2854 /* it might be kanji shifted */
2855 if ((c1 == DEL) || (c1 <= SP)) {
2856 /* ignore bogus first code */
2863 #ifdef UTF8_INPUT_ENABLE
2864 if (iconv == w_iconv16) {
2865 if (input_endian == ENDIAN_BIG) {
2867 if ((c1 = (*i_getc)(f)) != EOF) {
2868 if (0xD8 <= c2 && c2 <= 0xDB) {
2869 if ((c0 = (*i_getc)(f)) != EOF) {
2871 if ((c3 = (*i_getc)(f)) != EOF) {
2878 if ((c2 = (*i_getc)(f)) != EOF) {
2879 if (0xD8 <= c2 && c2 <= 0xDB) {
2880 if ((c3 = (*i_getc)(f)) != EOF) {
2881 if ((c0 = (*i_getc)(f)) != EOF) {
2890 } else if(iconv == w_iconv32){
2892 if((c2 = (*i_getc)(f)) != EOF &&
2893 (c1 = (*i_getc)(f)) != EOF &&
2894 (c0 = (*i_getc)(f)) != EOF){
2895 switch(input_endian){
2897 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2900 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2903 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2906 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2916 #ifdef NUMCHAR_OPTION
2917 if (is_unicode_capsule(c1)){
2921 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2923 if (!estab_f && !iso8859_f) {
2924 /* not established yet */
2927 } else { /* estab_f==TRUE */
2932 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2933 /* SJIS X0201 Case... */
2934 if (iso2022jp_f && !x0201_f) {
2935 (*oconv)(GETA1, GETA2);
2942 } else if (c1==SSO && iconv != s_iconv) {
2943 /* EUC X0201 Case */
2944 c1 = (*i_getc)(f); /* skip SSO */
2946 if (SSP<=c1 && c1<0xe0) {
2947 if (iso2022jp_f && !x0201_f) {
2948 (*oconv)(GETA1, GETA2);
2955 } else { /* bogus code, skip SSO and one byte */
2958 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2959 (c1 == 0xFD || c1 == 0xFE)) {
2965 /* already established */
2970 } else if ((c1 > SP) && (c1 != DEL)) {
2971 /* in case of Roman characters */
2973 /* output 1 shifted byte */
2977 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2978 /* output 1 shifted byte */
2979 if (iso2022jp_f && !x0201_f) {
2980 (*oconv)(GETA1, GETA2);
2987 /* look like bogus code */
2990 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
2991 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
2992 /* in case of Kanji shifted */
2995 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2996 /* Check MIME code */
2997 if ((c1 = (*i_getc)(f)) == EOF) {
3000 } else if (c1 == '?') {
3001 /* =? is mime conversion start sequence */
3002 if(mime_f == STRICT_MIME) {
3003 /* check in real detail */
3004 if (mime_begin_strict(f) == EOF)
3008 } else if (mime_begin(f) == EOF)
3018 /* normal ASCII code */
3021 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
3024 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
3027 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
3028 if ((c1 = (*i_getc)(f)) == EOF) {
3029 /* (*oconv)(0, ESC); don't send bogus code */
3031 } else if (c1 == '$') {
3032 if ((c1 = (*i_getc)(f)) == EOF) {
3034 (*oconv)(0, ESC); don't send bogus code
3035 (*oconv)(0, '$'); */
3037 } else if (c1 == '@'|| c1 == 'B') {
3038 /* This is kanji introduction */
3039 input_mode = JIS_X_0208;
3041 set_input_codename("ISO-2022-JP");
3043 debug("ISO-2022-JP");
3046 } else if (c1 == '(') {
3047 if ((c1 = (*i_getc)(f)) == EOF) {
3048 /* don't send bogus code
3054 } else if (c1 == '@'|| c1 == 'B') {
3055 /* This is kanji introduction */
3056 input_mode = JIS_X_0208;
3060 } else if (c1 == 'D'){
3061 input_mode = JIS_X_0212;
3064 #endif /* X0212_ENABLE */
3065 } else if (c1 == 0x4F){
3066 input_mode = JIS_X_0213_1;
3069 } else if (c1 == 0x50){
3070 input_mode = JIS_X_0213_2;
3074 /* could be some special code */
3081 } else if (broken_f&0x2) {
3082 /* accept any ESC-(-x as broken code ... */
3083 input_mode = JIS_X_0208;
3092 } else if (c1 == '(') {
3093 if ((c1 = (*i_getc)(f)) == EOF) {
3094 /* don't send bogus code
3096 (*oconv)(0, '('); */
3100 /* This is X0201 kana introduction */
3101 input_mode = JIS_X_0201; shift_mode = JIS_X_0201;
3103 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
3104 /* This is X0208 kanji introduction */
3105 input_mode = ASCII; shift_mode = FALSE;
3107 } else if (broken_f&0x2) {
3108 input_mode = ASCII; shift_mode = FALSE;
3113 /* maintain various input_mode here */
3117 } else if ( c1 == 'N' || c1 == 'n'){
3119 c3 = (*i_getc)(f); /* skip SS2 */
3120 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
3135 } else if (c1 == ESC && iconv == s_iconv) {
3136 /* ESC in Shift_JIS */
3137 if ((c1 = (*i_getc)(f)) == EOF) {
3138 /* (*oconv)(0, ESC); don't send bogus code */
3140 } else if (c1 == '$') {
3142 if ((c1 = (*i_getc)(f)) == EOF) {
3144 (*oconv)(0, ESC); don't send bogus code
3145 (*oconv)(0, '$'); */
3148 if (('E' <= c1 && c1 <= 'G') ||
3149 ('O' <= c1 && c1 <= 'Q')) {
3157 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
3158 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
3159 while ((c1 = (*i_getc)(f)) != EOF) {
3160 if (SP <= c1 && c1 <= 'z') {
3161 (*oconv)(0, c1 + c0);
3162 } else break; /* c1 == SO */
3166 if (c1 == EOF) LAST;
3173 } else if (c1 == LF || c1 == CR) {
3175 input_mode = ASCII; set_iconv(FALSE, 0);
3177 } else if (mime_decode_f && !mime_decode_mode){
3179 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
3187 } else { /* if (c1 == CR)*/
3188 if ((c1=(*i_getc)(f))!=EOF) {
3192 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
3206 } else if (c1 == DEL && input_mode == JIS_X_0208) {
3216 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
3219 if ((c0 = (*i_getc)(f)) != EOF) {
3222 if ((c3 = (*i_getc)(f)) != EOF) {
3224 (*iconv)(c2, c1, c0|c3);
3229 /* 3 bytes EUC or UTF-8 */
3230 if ((c0 = (*i_getc)(f)) != EOF) {
3232 (*iconv)(c2, c1, c0);
3240 0x7F <= c2 && c2 <= 0x92 &&
3241 0x21 <= c1 && c1 <= 0x7E) {
3243 if(c1 == 0x7F) return 0;
3244 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
3247 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3251 (*oconv)(PREFIX_EUCG3 | c2, c1);
3253 #endif /* X0212_ENABLE */
3255 (*oconv)(PREFIX_EUCG3 | c2, c1);
3258 (*oconv)(input_mode, c1); /* other special case */
3264 /* goto next_word */
3268 (*iconv)(EOF, 0, 0);
3269 if (!input_codename)
3272 struct input_code *p = input_code_list;
3273 struct input_code *result = p;
3275 if (p->score < result->score) result = p;
3278 set_input_codename(result->name);
3280 debug(result->name);
3288 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3290 nkf_char ret, c3, c0;
3294 /** it must NOT be in the kanji shift sequence */
3295 /** it must NOT be written in JIS7 */
3296 /** and it must be after 2 byte 8bit code */
3302 while ((c1 = (*i_getc)(f)) != EOF) {
3308 if (push_hold_buf(c1) == EOF || estab_f){
3314 struct input_code *p = input_code_list;
3315 struct input_code *result = p;
3320 if (p->status_func && p->score < result->score){
3325 set_iconv(TRUE, result->iconv_func);
3330 ** 1) EOF is detected, or
3331 ** 2) Code is established, or
3332 ** 3) Buffer is FULL (but last word is pushed)
3334 ** in 1) and 3) cases, we continue to use
3335 ** Kanji codes by oconv and leave estab_f unchanged.
3340 while (hold_index < hold_count){
3341 c2 = hold_buf[hold_index++];
3343 #ifdef NUMCHAR_OPTION
3344 || is_unicode_capsule(c2)
3349 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3350 (*iconv)(JIS_X_0201, c2, 0);
3353 if (hold_index < hold_count){
3354 c1 = hold_buf[hold_index++];
3364 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3367 if (hold_index < hold_count){
3368 c0 = hold_buf[hold_index++];
3369 } else if ((c0 = (*i_getc)(f)) == EOF) {
3375 if (hold_index < hold_count){
3376 c3 = hold_buf[hold_index++];
3377 } else if ((c3 = (*i_getc)(f)) == EOF) {
3382 (*iconv)(c2, c1, c0|c3);
3387 /* 3 bytes EUC or UTF-8 */
3388 if (hold_index < hold_count){
3389 c0 = hold_buf[hold_index++];
3390 } else if ((c0 = (*i_getc)(f)) == EOF) {
3396 (*iconv)(c2, c1, c0);
3399 if (c0 == EOF) break;
3404 nkf_char push_hold_buf(nkf_char c2)
3406 if (hold_count >= HOLD_SIZE*2)
3408 hold_buf[hold_count++] = (unsigned char)c2;
3409 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3412 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3414 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3417 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3418 #ifdef SHIFTJIS_CP932
3419 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3420 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3427 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3428 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3434 #endif /* SHIFTJIS_CP932 */
3436 if (!x0213_f && is_ibmext_in_sjis(c2)){
3437 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3440 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3453 if(x0213_f && c2 >= 0xF0){
3454 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3455 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3456 }else{ /* 78<=k<=94 */
3457 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3458 if (0x9E < c1) c2++;
3461 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3462 if (0x9E < c1) c2++;
3465 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3472 c2 = x0212_unshift(c2);
3479 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3481 if (c2 == JIS_X_0201) {
3483 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3485 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3487 if(c1 == 0x7F) return 0;
3488 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3491 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3492 if (ret) return ret;
3498 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3500 if (c2 == JIS_X_0201) {
3503 }else if (c2 == 0x8f){
3507 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3508 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3509 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3512 c2 = (c2 << 8) | (c1 & 0x7f);
3514 #ifdef SHIFTJIS_CP932
3517 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3518 s2e_conv(s2, s1, &c2, &c1);
3525 #endif /* SHIFTJIS_CP932 */
3527 #endif /* X0212_ENABLE */
3528 } else if (c2 == SSO){
3531 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3534 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3535 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3536 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3541 #ifdef SHIFTJIS_CP932
3542 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3544 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3545 s2e_conv(s2, s1, &c2, &c1);
3552 #endif /* SHIFTJIS_CP932 */
3559 #ifdef UTF8_INPUT_ENABLE
3560 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3567 }else if (0xc0 <= c2 && c2 <= 0xef) {
3568 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3569 #ifdef NUMCHAR_OPTION
3572 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3580 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3583 static const char w_iconv_utf8_1st_byte[] =
3585 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3586 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3587 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3588 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3590 if (c2 < 0 || 0xff < c2) {
3591 }else if (c2 == 0) { /* 0 : 1 byte*/
3593 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3596 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3598 if (c1 < 0x80 || 0xBF < c1) return 0;
3601 if (c0 == 0) return -1;
3602 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3607 if (c0 == 0) return -1;
3608 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3612 if (c0 == 0) return -1;
3613 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3617 if (c0 == 0) return -2;
3618 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3622 if (c0 == 0) return -2;
3623 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3627 if (c0 == 0) return -2;
3628 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3636 if (c2 == 0 || c2 == EOF){
3637 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3638 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3641 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3650 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3651 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3658 }else if (val < 0x800){
3659 *p2 = 0xc0 | (val >> 6);
3660 *p1 = 0x80 | (val & 0x3f);
3662 } else if (val <= NKF_INT32_C(0xFFFF)) {
3663 *p2 = 0xe0 | (val >> 12);
3664 *p1 = 0x80 | ((val >> 6) & 0x3f);
3665 *p0 = 0x80 | (val & 0x3f);
3666 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3667 *p2 = 0xe0 | (val >> 16);
3668 *p1 = 0x80 | ((val >> 12) & 0x3f);
3669 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3678 #ifdef UTF8_INPUT_ENABLE
3679 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3684 } else if (c2 >= 0xf0){
3685 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3686 val = (c2 & 0x0f) << 18;
3687 val |= (c1 & 0x3f) << 12;
3688 val |= (c0 & 0x3f00) >> 2;
3690 }else if (c2 >= 0xe0){
3691 val = (c2 & 0x0f) << 12;
3692 val |= (c1 & 0x3f) << 6;
3694 }else if (c2 >= 0xc0){
3695 val = (c2 & 0x1f) << 6;
3703 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3705 nkf_char c2, c1, c0;
3712 w16w_conv(val, &c2, &c1, &c0);
3713 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3714 #ifdef NUMCHAR_OPTION
3717 *p1 = CLASS_UNICODE | val;
3726 #ifdef UTF8_INPUT_ENABLE
3727 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3730 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3733 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3734 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3736 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3738 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3743 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3744 if (ret) return ret;
3749 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3753 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3754 } else if (is_unicode_bmp(c1)) {
3755 ret = w16e_conv(c1, &c2, &c1);
3758 c1 = CLASS_UNICODE | c1;
3760 if (ret) return ret;
3765 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3767 const unsigned short *const *pp;
3768 const unsigned short *const *const *ppp;
3769 static const char no_best_fit_chars_table_C2[] =
3770 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3771 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3772 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3773 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3774 static const char no_best_fit_chars_table_C2_ms[] =
3775 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3776 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3777 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3778 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3779 static const char no_best_fit_chars_table_932_C2[] =
3780 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3781 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3782 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3783 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3784 static const char no_best_fit_chars_table_932_C3[] =
3785 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3786 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3787 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3788 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3794 }else if(c2 < 0xe0){
3795 if(no_best_fit_chars_f){
3796 if(ms_ucs_map_f == UCS_MAP_CP932){
3799 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3802 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3805 }else if(!cp932inv_f){
3808 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3811 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3814 }else if(ms_ucs_map_f == UCS_MAP_MS){
3815 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3816 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3834 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3835 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3836 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3838 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3839 }else if(c0 < 0xF0){
3840 if(no_best_fit_chars_f){
3841 if(ms_ucs_map_f == UCS_MAP_CP932){
3842 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3843 }else if(ms_ucs_map_f == UCS_MAP_MS){
3848 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3851 if(c0 == 0x92) return 1;
3856 if(c1 == 0x80 || c0 == 0x9C) return 1;
3859 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3864 if(c0 == 0x94) return 1;
3867 if(c0 == 0xBB) return 1;
3877 if(c0 == 0x95) return 1;
3880 if(c0 == 0xA5) return 1;
3887 if(c0 == 0x8D) return 1;
3890 if(c0 == 0x9E && !cp932inv_f) return 1;
3893 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3901 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3902 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3903 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3905 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3907 #ifdef SHIFTJIS_CP932
3908 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3910 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3911 s2e_conv(s2, s1, p2, p1);
3920 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3923 const unsigned short *p;
3926 if (pp == 0) return 1;
3929 if (c1 < 0 || psize <= c1) return 1;
3931 if (p == 0) return 1;
3934 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3936 if (val == 0) return 1;
3937 if (no_cp932ext_f && (
3938 (val>>8) == 0x2D || /* NEC special characters */
3939 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3947 if (c2 == SO) c2 = JIS_X_0201;
3954 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3961 (*f)(0, bin2hex(c>>shift));
3971 void encode_fallback_html(nkf_char c)
3976 if(c >= NKF_INT32_C(1000000))
3977 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3978 if(c >= NKF_INT32_C(100000))
3979 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3981 (*oconv)(0, 0x30+(c/10000 )%10);
3983 (*oconv)(0, 0x30+(c/1000 )%10);
3985 (*oconv)(0, 0x30+(c/100 )%10);
3987 (*oconv)(0, 0x30+(c/10 )%10);
3989 (*oconv)(0, 0x30+ c %10);
3994 void encode_fallback_xml(nkf_char c)
3999 nkf_each_char_to_hex(oconv, c);
4004 void encode_fallback_java(nkf_char c)
4008 if(!is_unicode_bmp(c)){
4012 (*oconv)(0, bin2hex(c>>20));
4013 (*oconv)(0, bin2hex(c>>16));
4017 (*oconv)(0, bin2hex(c>>12));
4018 (*oconv)(0, bin2hex(c>> 8));
4019 (*oconv)(0, bin2hex(c>> 4));
4020 (*oconv)(0, bin2hex(c ));
4024 void encode_fallback_perl(nkf_char c)
4029 nkf_each_char_to_hex(oconv, c);
4034 void encode_fallback_subchar(nkf_char c)
4036 c = unicode_subchar;
4037 (*oconv)((c>>8)&0xFF, c&0xFF);
4042 #ifdef UTF8_OUTPUT_ENABLE
4043 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
4045 const unsigned short *p;
4047 if (c2 == JIS_X_0201) {
4048 if (ms_ucs_map_f == UCS_MAP_CP10001) {
4056 p = euc_to_utf8_1byte;
4058 } else if (is_eucg3(c2)){
4059 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
4062 c2 = (c2&0x7f) - 0x21;
4063 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4064 p = x0212_to_utf8_2bytes[c2];
4070 c2 = (c2&0x7f) - 0x21;
4071 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4073 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
4074 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
4075 euc_to_utf8_2bytes_ms[c2];
4080 c1 = (c1 & 0x7f) - 0x21;
4081 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
4086 void w_oconv(nkf_char c2, nkf_char c1)
4092 output_bom_f = FALSE;
4103 #ifdef NUMCHAR_OPTION
4104 if (c2 == 0 && is_unicode_capsule(c1)){
4105 val = c1 & VALUE_MASK;
4108 }else if (val < 0x800){
4109 (*o_putc)(0xC0 | (val >> 6));
4110 (*o_putc)(0x80 | (val & 0x3f));
4111 } else if (val <= NKF_INT32_C(0xFFFF)) {
4112 (*o_putc)(0xE0 | (val >> 12));
4113 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
4114 (*o_putc)(0x80 | (val & 0x3f));
4115 } else if (val <= NKF_INT32_C(0x10FFFF)) {
4116 (*o_putc)(0xF0 | ( val>>18));
4117 (*o_putc)(0x80 | ((val>>12) & 0x3f));
4118 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
4119 (*o_putc)(0x80 | ( val & 0x3f));
4126 output_mode = ASCII;
4128 } else if (c2 == ISO_8859_1) {
4129 output_mode = UTF_8;
4130 (*o_putc)(c1 | 0x080);
4132 output_mode = UTF_8;
4133 val = e2w_conv(c2, c1);
4135 w16w_conv(val, &c2, &c1, &c0);
4139 if (c0) (*o_putc)(c0);
4145 void w_oconv16(nkf_char c2, nkf_char c1)
4148 output_bom_f = FALSE;
4149 if (output_endian == ENDIAN_LITTLE){
4150 (*o_putc)((unsigned char)'\377');
4154 (*o_putc)((unsigned char)'\377');
4163 if (c2 == ISO_8859_1) {
4166 #ifdef NUMCHAR_OPTION
4167 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4168 if (is_unicode_bmp(c1)) {
4169 c2 = (c1 >> 8) & 0xff;
4173 if (c1 <= UNICODE_MAX) {
4174 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
4175 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
4176 if (output_endian == ENDIAN_LITTLE){
4177 (*o_putc)(c2 & 0xff);
4178 (*o_putc)((c2 >> 8) & 0xff);
4179 (*o_putc)(c1 & 0xff);
4180 (*o_putc)((c1 >> 8) & 0xff);
4182 (*o_putc)((c2 >> 8) & 0xff);
4183 (*o_putc)(c2 & 0xff);
4184 (*o_putc)((c1 >> 8) & 0xff);
4185 (*o_putc)(c1 & 0xff);
4192 nkf_char val = e2w_conv(c2, c1);
4193 c2 = (val >> 8) & 0xff;
4197 if (output_endian == ENDIAN_LITTLE){
4206 void w_oconv32(nkf_char c2, nkf_char c1)
4209 output_bom_f = FALSE;
4210 if (output_endian == ENDIAN_LITTLE){
4211 (*o_putc)((unsigned char)'\377');
4219 (*o_putc)((unsigned char)'\377');
4228 if (c2 == ISO_8859_1) {
4230 #ifdef NUMCHAR_OPTION
4231 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4235 c1 = e2w_conv(c2, c1);
4238 if (output_endian == ENDIAN_LITTLE){
4239 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4240 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4241 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4245 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4246 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4247 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4252 void e_oconv(nkf_char c2, nkf_char c1)
4254 #ifdef NUMCHAR_OPTION
4255 if (c2 == 0 && is_unicode_capsule(c1)){
4256 w16e_conv(c1, &c2, &c1);
4257 if (c2 == 0 && is_unicode_capsule(c1)){
4258 c2 = c1 & VALUE_MASK;
4259 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4263 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4264 c1 = 0x21 + c1 % 94;
4267 (*o_putc)((c2 & 0x7f) | 0x080);
4268 (*o_putc)(c1 | 0x080);
4270 (*o_putc)((c2 & 0x7f) | 0x080);
4271 (*o_putc)(c1 | 0x080);
4275 if (encode_fallback) (*encode_fallback)(c1);
4284 } else if (c2 == 0) {
4285 output_mode = ASCII;
4287 } else if (c2 == JIS_X_0201) {
4288 output_mode = EUC_JP;
4289 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4290 } else if (c2 == ISO_8859_1) {
4291 output_mode = ISO_8859_1;
4292 (*o_putc)(c1 | 0x080);
4294 } else if (is_eucg3(c2)){
4295 output_mode = EUC_JP;
4296 #ifdef SHIFTJIS_CP932
4299 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4300 s2e_conv(s2, s1, &c2, &c1);
4305 output_mode = ASCII;
4307 }else if (is_eucg3(c2)){
4310 (*o_putc)((c2 & 0x7f) | 0x080);
4311 (*o_putc)(c1 | 0x080);
4314 (*o_putc)((c2 & 0x7f) | 0x080);
4315 (*o_putc)(c1 | 0x080);
4319 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4320 set_iconv(FALSE, 0);
4321 return; /* too late to rescue this char */
4323 output_mode = EUC_JP;
4324 (*o_putc)(c2 | 0x080);
4325 (*o_putc)(c1 | 0x080);
4330 nkf_char x0212_shift(nkf_char c)
4335 if (0x75 <= c && c <= 0x7f){
4336 ret = c + (0x109 - 0x75);
4339 if (0x75 <= c && c <= 0x7f){
4340 ret = c + (0x113 - 0x75);
4347 nkf_char x0212_unshift(nkf_char c)
4350 if (0x7f <= c && c <= 0x88){
4351 ret = c + (0x75 - 0x7f);
4352 }else if (0x89 <= c && c <= 0x92){
4353 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4357 #endif /* X0212_ENABLE */
4359 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4365 if((0x21 <= ndx && ndx <= 0x2F)){
4366 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4367 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4369 }else if(0x6E <= ndx && ndx <= 0x7E){
4370 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4371 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4377 else if(nkf_isgraph(ndx)){
4379 const unsigned short *ptr;
4380 ptr = x0212_shiftjis[ndx - 0x21];
4382 val = ptr[(c1 & 0x7f) - 0x21];
4391 c2 = x0212_shift(c2);
4393 #endif /* X0212_ENABLE */
4395 if(0x7F < c2) return 1;
4396 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4397 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4401 void s_oconv(nkf_char c2, nkf_char c1)
4403 #ifdef NUMCHAR_OPTION
4404 if (c2 == 0 && is_unicode_capsule(c1)){
4405 w16e_conv(c1, &c2, &c1);
4406 if (c2 == 0 && is_unicode_capsule(c1)){
4407 c2 = c1 & VALUE_MASK;
4408 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4411 c2 = c1 / 188 + 0xF0;
4413 c1 += 0x40 + (c1 > 0x3e);
4418 if(encode_fallback)(*encode_fallback)(c1);
4427 } else if (c2 == 0) {
4428 output_mode = ASCII;
4430 } else if (c2 == JIS_X_0201) {
4431 output_mode = SHIFT_JIS;
4433 } else if (c2 == ISO_8859_1) {
4434 output_mode = ISO_8859_1;
4435 (*o_putc)(c1 | 0x080);
4437 } else if (is_eucg3(c2)){
4438 output_mode = SHIFT_JIS;
4439 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4445 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4446 set_iconv(FALSE, 0);
4447 return; /* too late to rescue this char */
4449 output_mode = SHIFT_JIS;
4450 e2s_conv(c2, c1, &c2, &c1);
4452 #ifdef SHIFTJIS_CP932
4454 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4455 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4461 #endif /* SHIFTJIS_CP932 */
4464 if (prefix_table[(unsigned char)c1]){
4465 (*o_putc)(prefix_table[(unsigned char)c1]);
4471 void j_oconv(nkf_char c2, nkf_char c1)
4473 #ifdef NUMCHAR_OPTION
4474 if (c2 == 0 && is_unicode_capsule(c1)){
4475 w16e_conv(c1, &c2, &c1);
4476 if (c2 == 0 && is_unicode_capsule(c1)){
4477 c2 = c1 & VALUE_MASK;
4478 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4481 c2 = 0x7F + c1 / 94;
4482 c1 = 0x21 + c1 % 94;
4484 if (encode_fallback) (*encode_fallback)(c1);
4491 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4494 (*o_putc)(ascii_intro);
4495 output_mode = ASCII;
4499 } else if (is_eucg3(c2)){
4501 if(output_mode!=JIS_X_0213_2){
4502 output_mode = JIS_X_0213_2;
4509 if(output_mode!=JIS_X_0212){
4510 output_mode = JIS_X_0212;
4517 (*o_putc)(c2 & 0x7f);
4520 } else if (c2==JIS_X_0201) {
4521 if (output_mode!=JIS_X_0201) {
4522 output_mode = JIS_X_0201;
4528 } else if (c2==ISO_8859_1) {
4529 /* iso8859 introduction, or 8th bit on */
4530 /* Can we convert in 7bit form using ESC-'-'-A ?
4532 output_mode = ISO_8859_1;
4534 } else if (c2 == 0) {
4535 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4538 (*o_putc)(ascii_intro);
4539 output_mode = ASCII;
4544 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4545 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4547 if (output_mode!=JIS_X_0213_1) {
4548 output_mode = JIS_X_0213_1;
4554 }else if (output_mode != JIS_X_0208) {
4555 output_mode = JIS_X_0208;
4558 (*o_putc)(kanji_intro);
4565 void base64_conv(nkf_char c2, nkf_char c1)
4567 mime_prechar(c2, c1);
4568 (*o_base64conv)(c2,c1);
4572 static nkf_char broken_buf[3];
4573 static int broken_counter = 0;
4574 static int broken_last = 0;
4575 nkf_char broken_getc(FILE *f)
4579 if (broken_counter>0) {
4580 return broken_buf[--broken_counter];
4583 if (c=='$' && broken_last != ESC
4584 && (input_mode==ASCII || input_mode==JIS_X_0201)) {
4587 if (c1=='@'|| c1=='B') {
4588 broken_buf[0]=c1; broken_buf[1]=c;
4595 } else if (c=='(' && broken_last != ESC
4596 && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */
4599 if (c1=='J'|| c1=='B') {
4600 broken_buf[0]=c1; broken_buf[1]=c;
4613 nkf_char broken_ungetc(nkf_char c, FILE *f)
4615 if (broken_counter<2)
4616 broken_buf[broken_counter++]=c;
4620 void nl_conv(nkf_char c2, nkf_char c1)
4622 if (guess_f && input_newline != EOF) {
4623 if (c2 == 0 && c1 == LF) {
4624 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4625 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4626 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4628 else if (!input_newline) input_newline = CR;
4629 else if (input_newline != CR) input_newline = EOF;
4631 if (prev_cr || (c2 == 0 && c1 == LF)) {
4633 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4634 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4636 if (c2 == 0 && c1 == CR) prev_cr = CR;
4637 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4641 Return value of fold_conv()
4643 LF add newline and output char
4644 CR add newline and output nothing
4647 1 (or else) normal output
4649 fold state in prev (previous character)
4651 >0x80 Japanese (X0208/X0201)
4656 This fold algorithm does not preserve leading spaces in a line.
4657 This is the main difference from fmt.
4660 #define char_size(c2,c1) (c2?2:1)
4662 void fold_conv(nkf_char c2, nkf_char c1)
4665 nkf_char fold_state;
4667 if (c1== CR && !fold_preserve_f) {
4668 fold_state=0; /* ignore cr */
4669 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4671 fold_state=0; /* ignore cr */
4672 } else if (c1== BS) {
4673 if (f_line>0) f_line--;
4675 } else if (c2==EOF && f_line != 0) { /* close open last line */
4677 } else if ((c1==LF && !fold_preserve_f)
4678 || ((c1==CR||(c1==LF&&f_prev!=CR))
4679 && fold_preserve_f)) {
4681 if (fold_preserve_f) {
4685 } else if ((f_prev == c1 && !fold_preserve_f)
4686 || (f_prev == LF && fold_preserve_f)
4687 ) { /* duplicate newline */
4690 fold_state = LF; /* output two newline */
4696 if (f_prev&0x80) { /* Japanese? */
4698 fold_state = 0; /* ignore given single newline */
4699 } else if (f_prev==SP) {
4703 if (++f_line<=fold_len)
4707 fold_state = CR; /* fold and output nothing */
4711 } else if (c1=='\f') {
4714 fold_state = LF; /* output newline and clear */
4715 } else if ( (c2==0 && c1==SP)||
4716 (c2==0 && c1==TAB)||
4717 (c2=='!'&& c1=='!')) {
4718 /* X0208 kankaku or ascii space */
4720 fold_state = 0; /* remove duplicate spaces */
4723 if (++f_line<=fold_len)
4724 fold_state = SP; /* output ASCII space only */
4726 f_prev = SP; f_line = 0;
4727 fold_state = CR; /* fold and output nothing */
4731 prev0 = f_prev; /* we still need this one... , but almost done */
4733 if (c2 || c2==JIS_X_0201)
4734 f_prev |= 0x80; /* this is Japanese */
4735 f_line += char_size(c2,c1);
4736 if (f_line<=fold_len) { /* normal case */
4739 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4740 f_line = char_size(c2,c1);
4741 fold_state = LF; /* We can't wait, do fold now */
4742 } else if (c2==JIS_X_0201) {
4743 /* simple kinsoku rules return 1 means no folding */
4744 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4745 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4746 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4747 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4748 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4749 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4750 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4752 fold_state = LF;/* add one new f_line before this character */
4755 fold_state = LF;/* add one new f_line before this character */
4758 /* kinsoku point in ASCII */
4759 if ( c1==')'|| /* { [ ( */
4770 /* just after special */
4771 } else if (!is_alnum(prev0)) {
4772 f_line = char_size(c2,c1);
4774 } else if ((prev0==SP) || /* ignored new f_line */
4775 (prev0==LF)|| /* ignored new f_line */
4776 (prev0&0x80)) { /* X0208 - ASCII */
4777 f_line = char_size(c2,c1);
4778 fold_state = LF;/* add one new f_line before this character */
4780 fold_state = 1; /* default no fold in ASCII */
4784 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4785 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4786 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4787 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4788 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4789 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4790 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4791 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4792 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4793 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4794 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4795 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4796 /* default no fold in kinsoku */
4799 f_line = char_size(c2,c1);
4800 /* add one new f_line before this character */
4803 f_line = char_size(c2,c1);
4805 /* add one new f_line before this character */
4810 /* terminator process */
4811 switch(fold_state) {
4813 OCONV_NEWLINE((*o_fconv));
4819 OCONV_NEWLINE((*o_fconv));
4830 nkf_char z_prev2=0,z_prev1=0;
4832 void z_conv(nkf_char c2, nkf_char c1)
4835 /* if (c2) c1 &= 0x7f; assertion */
4837 if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4843 if (z_prev2 == JIS_X_0201) {
4844 if (c2 == JIS_X_0201) {
4845 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4847 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4849 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4851 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4856 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4858 if (c2 == JIS_X_0201) {
4859 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4860 /* wait for dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
4865 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4876 if (alpha_f&1 && c2 == 0x23) {
4877 /* JISX0208 Alphabet */
4879 } else if (c2 == 0x21) {
4880 /* JISX0208 Kigou */
4885 } else if (alpha_f&4) {
4890 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4896 if (alpha_f&8 && c2 == 0) {
4900 case '>': entity = "&gt;"; break;
4901 case '<': entity = "&lt;"; break;
4902 case '\"': entity = "&quot;"; break;
4903 case '&': entity = "&amp;"; break;
4906 while (*entity) (*o_zconv)(0, *entity++);
4912 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4917 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4921 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4925 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4929 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4933 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4937 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4941 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4945 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4950 (*o_zconv)(JIS_X_0201, c);
4953 } else if (c2 == 0x25) {
4954 /* JISX0208 Katakana */
4955 static const int fullwidth_to_halfwidth[] =
4957 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4958 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4959 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4960 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4961 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4962 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4963 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4964 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4965 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4966 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4967 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4968 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4970 if (fullwidth_to_halfwidth[c1-0x20]){
4971 c2 = fullwidth_to_halfwidth[c1-0x20];
4972 (*o_zconv)(JIS_X_0201, c2>>8);
4974 (*o_zconv)(JIS_X_0201, c2&0xFF);
4984 #define rot13(c) ( \
4986 (c <= 'M') ? (c + 13): \
4987 (c <= 'Z') ? (c - 13): \
4989 (c <= 'm') ? (c + 13): \
4990 (c <= 'z') ? (c - 13): \
4994 #define rot47(c) ( \
4996 ( c <= 'O') ? (c + 47) : \
4997 ( c <= '~') ? (c - 47) : \
5001 void rot_conv(nkf_char c2, nkf_char c1)
5003 if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) {
5009 (*o_rot_conv)(c2,c1);
5012 void hira_conv(nkf_char c2, nkf_char c1)
5016 if (0x20 < c1 && c1 < 0x74) {
5018 (*o_hira_conv)(c2,c1);
5020 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
5022 c1 = CLASS_UNICODE | 0x3094;
5023 (*o_hira_conv)(c2,c1);
5026 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
5028 (*o_hira_conv)(c2,c1);
5033 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
5036 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
5038 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
5042 (*o_hira_conv)(c2,c1);
5046 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
5048 static const nkf_char range[RANGE_NUM_MAX][2] = {
5069 nkf_char start, end, c;
5071 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
5075 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
5080 for (i = 0; i < RANGE_NUM_MAX; i++) {
5081 start = range[i][0];
5084 if (c >= start && c <= end) {
5089 (*o_iso2022jp_check_conv)(c2,c1);
5093 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
5095 static const unsigned char *mime_pattern[] = {
5096 (const unsigned char *)"\075?EUC-JP?B?",
5097 (const unsigned char *)"\075?SHIFT_JIS?B?",
5098 (const unsigned char *)"\075?ISO-8859-1?Q?",
5099 (const unsigned char *)"\075?ISO-8859-1?B?",
5100 (const unsigned char *)"\075?ISO-2022-JP?B?",
5101 (const unsigned char *)"\075?ISO-2022-JP?Q?",
5102 #if defined(UTF8_INPUT_ENABLE)
5103 (const unsigned char *)"\075?UTF-8?B?",
5104 (const unsigned char *)"\075?UTF-8?Q?",
5106 (const unsigned char *)"\075?US-ASCII?Q?",
5111 /* Marker used to raise the priority of the matching input code */
5112 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
5113 e_iconv, s_iconv, 0, 0, 0, 0,
5114 #if defined(UTF8_INPUT_ENABLE)
5120 static const nkf_char mime_encode[] = {
5121 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201,
5122 #if defined(UTF8_INPUT_ENABLE)
5129 static const nkf_char mime_encode_method[] = {
5130 'B', 'B','Q', 'B', 'B', 'Q',
5131 #if defined(UTF8_INPUT_ENABLE)
5139 #define MAXRECOVER 20
/*
 * switch_mime_getc: install mime_getc/mime_ungetc as the current input
 * functions (i_getc/i_ungetc), saving the previous pair in
 * i_mgetc/i_mungetc. Nothing is done when mime_getc is already installed.
 * In STRICT_MIME mode the saved pair is moved one level further down into
 * i_mgetc_buf/i_mungetc_buf and replaced by mime_getc_buf/mime_ungetc_buf.
 */
5141 void switch_mime_getc(void)
5143 if (i_getc!=mime_getc) {
5144 i_mgetc = i_getc; i_getc = mime_getc;
5145 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5146 if(mime_f==STRICT_MIME) {
5147 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
5148 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc(). In STRICT_MIME mode the
 * functions saved in i_mgetc_buf/i_mungetc_buf are restored first, then
 * i_ungetc is restored from i_mungetc. If an input converter was saved in
 * mime_iconv_back it is re-installed with set_iconv() and the saved value
 * is cleared.
 * NOTE(review): the statement restoring i_getc is not visible in this
 * listing.
 */
5153 void unswitch_mime_getc(void)
5155 if(mime_f==STRICT_MIME) {
5156 i_mgetc = i_mgetc_buf;
5157 i_mungetc = i_mungetc_buf;
5160 i_ungetc = i_mungetc;
5161 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
5162 mime_iconv_back = NULL;
/*
 * mime_begin_strict: called after the two characters =? have been seen.
 * Reads further characters from f through i_getc and matches them, case
 * insensitively (nkf_toupper), against the entries of mime_pattern[].
 * Every character read is also kept in r[] so it can be replayed when no
 * pattern matches. On a full match the transfer encoding letter is stored
 * in mime_decode_mode, the input converter is switched to
 * mime_priority_func[j], and the result of mime_integrity() is returned.
 * NOTE(review): the recovery path taken when all patterns fail is not
 * visible in this listing.
 */
5165 nkf_char mime_begin_strict(FILE *f)
5169 const unsigned char *p,*q;
5170 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
5172 mime_decode_mode = FALSE;
5173 /* =? has been checked */
5175 p = mime_pattern[j];
5178 for(i=2;p[i]>SP;i++) { /* start at =? */
5179 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
5180 /* pattern fails, try next one */
/* Look for a later pattern that shares the first i characters already
   matched and also accepts the current character c1. */
5182 while (mime_pattern[++j]) {
5183 p = mime_pattern[j];
5184 for(k=2;k<i;k++) /* assume length(p) > i */
5185 if (p[k]!=q[k]) break;
5186 if (k==i && nkf_toupper(c1)==p[k]) break;
5188 p = mime_pattern[j];
5189 if (p) continue; /* found next one, continue */
5190 /* all fails, output from recovery buffer */
/* The loop above stops at the terminator of p, so p[i-1] is the trailing ?
   and p[i-2] is the encoding letter (B or Q) of the matched pattern. */
5198 mime_decode_mode = p[i-2];
/* Remember the current converter so unswitch_mime_getc() can restore it. */
5200 mime_iconv_back = iconv;
5201 set_iconv(FALSE, mime_priority_func[j]);
5202 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
5204 if (mime_decode_mode=='B') {
5205 mimebuf_f = unbuf_f;
5207 /* do MIME integrity check */
5208 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: fetch the next raw character for the MIME decoder.
 * When mimebuf_f is set the character comes from the saved getter
 * (*i_mgetc_buf)(f); otherwise it is taken from the Fifo at mime_input,
 * which is then advanced.
 */
5216 nkf_char mime_getc_buf(FILE *f)
5218 /* we don't keep eof of Fifo, because it contains ?= as
5219 a terminator. It was checked in mime_integrity. */
5220 return ((mimebuf_f)?
5221 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: push back character c, the counterpart of
 * mime_getc_buf(). The visible statements either forward to
 * (*i_mungetc_buf)(c,f) or step mime_input back and store c in the Fifo.
 * NOTE(review): the condition choosing between the two (presumably
 * mimebuf_f, mirroring mime_getc_buf) and the return statement are not
 * visible in this listing.
 */
5224 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
5227 (*i_mungetc_buf)(c,f);
5229 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict detection of a MIME encoded word, entered after
 * =? has been seen. Every character read is appended to the Fifo so the
 * text can be re-read if it turns out not to be MIME. The charset part may
 * contain LF, SP, CR, -, _ or alphanumerics; the following character
 * selects the mode (b/B for base64, q/Q for quoted). At most MAXRECOVER
 * characters are examined. The return value is the last character read and
 * is only meant for EOF checks.
 * NOTE(review): several lines (branch bodies, closing braces) are missing
 * from this listing.
 */
5233 nkf_char mime_begin(FILE *f)
5238 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
5239 /* re-read and convert again from mime_buffer. */
5241 /* =? has been checked */
5243 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
5244 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
5245 /* We accept any character type even if it is broken by new lines */
5246 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5247 if (c1==LF||c1==SP||c1==CR||
5248 c1=='-'||c1=='_'||is_alnum(c1)) continue;
5250 /* Failed. But this could be another MIME preamble */
5258 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5259 if (!(++i<MAXRECOVER) || c1==EOF) break;
/* Encoding letter: b/B selects base64, q/Q selects quoted. */
5260 if (c1=='b'||c1=='B') {
5261 mime_decode_mode = 'B';
5262 } else if (c1=='q'||c1=='Q') {
5263 mime_decode_mode = 'Q';
5267 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5268 if (!(++i<MAXRECOVER) || c1==EOF) break;
5270 mime_decode_mode = FALSE;
5276 if (!mime_decode_mode) {
5277 /* false MIME preamble, restart from mime_buffer */
5278 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5279 /* Since we are in MIME mode until buffer becomes empty, */
5280 /* we never go into mime_begin again for a while. */
5283 /* discard mime preamble, and goto MIME mode */
5285 /* do no MIME integrity check */
5286 return c1; /* used only for checking EOF */
/*
 * no_putc: character output function.
 * NOTE(review): the body is not visible in this listing; judging by the
 * name it presumably discards c -- confirm against the full file.
 */
5290 void no_putc(nkf_char c)
/*
 * debug: write str followed by a newline to stderr; a null pointer is
 * printed as the text NULL.
 * NOTE(review): any guard around the fprintf is not visible in this
 * listing.
 */
5295 void debug(const char *str)
5298 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: record the name of the detected input encoding.
 * The first call stores codename. A later call with a different name
 * replaces the stored value with the empty string, which
 * get_guessed_code() reports as BINARY.
 * The pointer itself is stored, not a copy of the string.
 */
5303 void set_input_codename(char *codename)
5305 if (!input_codename) {
5306 input_codename = codename;
5307 } else if (strcmp(codename, input_codename) != 0) {
5308 input_codename = "";
/*
 * get_guessed_code: refine input_codename into the final guess and return
 * it. The global input_codename is updated as a side effect.
 *   - empty name (conflicting detections)  -> BINARY
 *   - no name at all                       -> ASCII
 *   - Shift_JIS, EUC-JP and ISO-2022-JP are narrowed to a CP932-family
 *     name according to the score bits of the input_code entry that
 *     belongs to the current iconv function.
 */
5312 static char* get_guessed_code(void)
5314 if (input_codename && !*input_codename) {
5315 input_codename = "BINARY";
/* Score bits recorded for the currently selected input converter. */
5317 struct input_code *p = find_inputcode_byfunc(iconv);
5318 if (!input_codename) {
5319 input_codename = "ASCII";
5320 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5321 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5322 input_codename = "CP932";
5323 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5324 if (p->score & (SCORE_X0212))
5325 input_codename = "EUCJP-MS";
5326 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5327 input_codename = "CP51932";
5328 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5329 if (p->score & (SCORE_KANA))
5330 input_codename = "CP50221";
5331 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5332 input_codename = "CP50220";
5335 return input_codename;
/*
 * print_guessed_code: print the guessed input encoding to stdout,
 * prefixed with the file name and a colon when filename is not NULL.
 * Only compiled when neither PERL_XS nor WIN32DLL is defined.
 * The visible ternary chain maps input_newline to one of the suffixes
 * (CR), (LF), (CRLF) or (MIXED NL).
 * NOTE(review): the statements around the newline suffix and the body of
 * the empty-name branch are not visible in this listing.
 */
5338 #if !defined(PERL_XS) && !defined(WIN32DLL)
5339 void print_guessed_code(char *filename)
5341 if (filename != NULL) printf("%s: ", filename);
5342 if (input_codename && !*input_codename) {
5345 input_codename = get_guessed_code();
5347 printf("%s\n", input_codename);
5351 input_newline == CR ? " (CR)" :
5352 input_newline == LF ? " (LF)" :
5353 input_newline == CRLF ? " (CRLF)" :
5354 input_newline == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared reader for two-digit hexadecimal escapes.
 *   ch - escape introducer (cap_getc passes a colon, url_getc a percent)
 *   f  - input stream
 *   g  - getter used to read characters
 *   u  - matching unget function
 * When both characters checked below are hex digits the function returns
 * the byte (hex2bin(c2) << 4) | hex2bin(c3).
 * NOTE(review): the reads of c1..c3 and the handling of a non-hex
 * character are not visible in this listing.
 */
5363 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5365 nkf_char c1, c2, c3;
5371 if (!nkf_isxdigit(c2)){
5376 if (!nkf_isxdigit(c3)){
5381 return (hex2bin(c2) << 4) | hex2bin(c3);
/*
 * cap_getc: input function that runs hex_getc() with a colon as the
 * escape introducer, reading through i_cgetc/i_cungetc.
 */
5384 nkf_char cap_getc(FILE *f)
5386 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: push c back by forwarding to the function in i_cungetc. */
5389 nkf_char cap_ungetc(nkf_char c, FILE *f)
5391 return (*i_cungetc)(c, f);
/*
 * url_getc: input function that runs hex_getc() with a percent sign as
 * the escape introducer, reading through i_ugetc/i_uungetc.
 */
5394 nkf_char url_getc(FILE *f)
5396 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: push c back by forwarding to the function in i_uungetc. */
5399 nkf_char url_ungetc(nkf_char c, FILE *f)
5401 return (*i_uungetc)(c, f);
/*
 * numchar_getc: input function compiled under NUMCHAR_OPTION; the usage
 * text describes the option as converting Unicode character references.
 * Characters are read through i_ngetc and pushed back through i_nungetc.
 * After an x or X up to 7 hexadecimal digits are examined, otherwise up to
 * 8 decimal digits. The value accumulated in c is returned tagged with
 * CLASS_UNICODE.
 * NOTE(review): reading into buf, the shift/multiply steps and the failure
 * path are not visible in this listing.
 */
5405 #ifdef NUMCHAR_OPTION
5406 nkf_char numchar_getc(FILE *f)
5408 nkf_char (*g)(FILE *) = i_ngetc;
5409 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* Hexadecimal form. */
5420 if (buf[i] == 'x' || buf[i] == 'X'){
5421 for (j = 0; j < 7; j++){
5423 if (!nkf_isxdigit(buf[i])){
5430 c |= hex2bin(buf[i]);
/* Decimal form. */
5433 for (j = 0; j < 8; j++){
5437 if (!nkf_isdigit(buf[i])){
5444 c += hex2bin(buf[i]);
5450 return CLASS_UNICODE | c;
/* numchar_ungetc: push c back by forwarding to the function in i_nungetc. */
5459 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5461 return (*i_nungetc)(c, f);
5465 #ifdef UNICODE_NORMALIZATION
5467 /* Normalization Form C */
/*
 * nfc_getc: input function that rewrites byte sequences found in the nfd
 * column of normalization_table into the corresponding nfc column.
 * Characters are read through i_nfc_getc and pushed back through
 * i_nfc_ungetc. The table is searched by bisection between lower and
 * upper, comparing the candidate nfd sequence with buf byte by byte.
 * NOTE(review): filling of buf, the match test after the comparison loop
 * and the return path are not visible in this listing.
 */
5468 nkf_char nfc_getc(FILE *f)
5470 nkf_char (*g)(FILE *f) = i_nfc_getc;
5471 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5472 int i=0, j, k=1, lower, upper;
5474 const nkf_nfchar *array;
/* Continue while buf[i] is not of the form 10xxxxxx. */
5477 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5478 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5479 while (upper >= lower) {
5480 j = (lower+upper) / 2;
5481 array = normalization_table[j].nfd;
5482 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5483 if (array[k] != buf[k]){
/* Narrow the search interval towards the side where buf sorts. */
5484 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Copy the composed (nfc) sequence over buf. */
5491 array = normalization_table[j].nfc;
5492 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5493 buf[i] = (nkf_char)(array[i]);
/* nfc_ungetc: push c back by forwarding to the function in i_nfc_ungetc. */
5504 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5506 return (*i_nfc_ungetc)(c, f);
5508 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME-decoding input function. Its signature line is not part
 * of this listing; switch_mime_getc() installs mime_getc as i_getc, so this
 * is presumably mime_getc(FILE *f) -- confirm against the full file.
 * Visible behavior:
 *  - bytes already queued in the Fifo (mime_top != mime_last) are returned
 *    first;
 *  - when mime_decode_mode is 1 or FALSE, decoding is switched off with
 *    unswitch_mime_getc() and the next character comes from (*i_getc)(f);
 *  - in mode Q an underscore is returned as SP (unless FIXED_MIME), =XX
 *    yields the byte (hex2bin(c2)<<4) + hex2bin(c3), and ?= ends the
 *    encoded word;
 *  - in mode B four characters are decoded with base64decode() and the
 *    resulting bytes are queued in the Fifo.
 * After the end of an encoded word, following white space is collected in
 * a heap buffer (lwsp_buf) that grows with realloc.
 * NOTE(review): many lines (braces, returns, labels, frees) are missing
 * from this listing, so the exact control flow between the visible
 * statements cannot be confirmed from here.
 */
5514 nkf_char c1, c2, c3, c4, cc;
5515 nkf_char t1, t2, t3, t4, mode, exit_mode;
5516 nkf_char lwsp_count;
5519 nkf_char lwsp_size = 128;
5521 if (mime_top != mime_last) { /* Something is in FIFO */
5522 return Fifo(mime_top++);
5524 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5525 mime_decode_mode=FALSE;
5526 unswitch_mime_getc();
5527 return (*i_getc)(f);
5530 if (mimebuf_f == FIXED_MIME)
5531 exit_mode = mime_decode_mode;
/* ---- Q encoding ---- */
5534 if (mime_decode_mode == 'Q') {
5535 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5537 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5538 if (c1<=SP || DEL<=c1) {
5539 mime_decode_mode = exit_mode; /* prepare for quit */
5542 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5546 mime_decode_mode = exit_mode; /* prepare for quit */
5547 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5548 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5549 /* end Q encoding */
5550 input_mode = exit_mode;
/* Buffer for white space that follows the encoded word. */
5552 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5553 if (lwsp_buf==NULL) {
5554 perror("can't malloc");
5557 while ((c1=(*i_getc)(f))!=EOF) {
5562 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5570 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5571 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5586 lwsp_buf[lwsp_count] = (unsigned char)c1;
5587 if (lwsp_count++>lwsp_size){
5589 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5590 if (lwsp_buf_new==NULL) {
5592 perror("can't realloc");
5595 lwsp_buf = lwsp_buf_new;
/* Push collected white space back onto the input, last character first;
   the loop stops before index 0. */
5601 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5603 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5604 i_ungetc(lwsp_buf[lwsp_count],f);
5610 if (c1=='='&&c2<SP) { /* this is soft wrap */
5611 while((c1 = (*i_mgetc)(f)) <=SP) {
5612 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5614 mime_decode_mode = 'Q'; /* still in MIME */
5615 goto restart_mime_q;
5618 mime_decode_mode = 'Q'; /* still in MIME */
5622 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5623 if (c2<=SP) return c2;
5624 mime_decode_mode = 'Q'; /* still in MIME */
/* =XX escape: combine the two hex digits into one byte. */
5625 return ((hex2bin(c2)<<4) + hex2bin(c3));
/* Neither Q nor B: leave MIME mode and read directly. */
5628 if (mime_decode_mode != 'B') {
5629 mime_decode_mode = FALSE;
5630 return (*i_mgetc)(f);
5634 /* Base64 encoding */
5636 MIME allows line break in the middle of
5637 Base64, but we are very pessimistic in decoding
5638 in unbuf mode because MIME encoded code may broken by
5639 less or editor's control sequence (such as ESC-[-K in unbuffered
5640 mode. ignore incomplete MIME.
5642 mode = mime_decode_mode;
5643 mime_decode_mode = exit_mode; /* prepare for quit */
5645 while ((c1 = (*i_mgetc)(f))<=SP) {
5650 if ((c2 = (*i_mgetc)(f))<=SP) {
5653 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5654 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5657 if ((c1 == '?') && (c2 == '=')) {
5660 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5661 if (lwsp_buf==NULL) {
5662 perror("can't malloc");
5665 while ((c1=(*i_getc)(f))!=EOF) {
5670 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5678 if ((c1=(*i_getc)(f))!=EOF) {
5682 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5697 lwsp_buf[lwsp_count] = (unsigned char)c1;
5698 if (lwsp_count++>lwsp_size){
5700 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5701 if (lwsp_buf_new==NULL) {
5703 perror("can't realloc");
5706 lwsp_buf = lwsp_buf_new;
5712 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5714 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5715 i_ungetc(lwsp_buf[lwsp_count],f);
5722 if ((c3 = (*i_mgetc)(f))<=SP) {
5725 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5726 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5730 if ((c4 = (*i_mgetc)(f))<=SP) {
5733 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5734 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5738 mime_decode_mode = mode; /* still in MIME sigh... */
5740 /* BASE 64 decoding */
5742 t1 = 0x3f & base64decode(c1);
5743 t2 = 0x3f & base64decode(c2);
5744 t3 = 0x3f & base64decode(c3);
5745 t4 = 0x3f & base64decode(c4);
5746 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5748 Fifo(mime_last++) = (unsigned char)cc;
5749 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5751 Fifo(mime_last++) = (unsigned char)cc;
5752 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5754 Fifo(mime_last++) = (unsigned char)cc;
5759 return Fifo(mime_top++);
/*
 * mime_ungetc: push c back into the decoded-byte Fifo by stepping mime_top
 * back one slot and storing c there, so the Fifo read at mime_top returns
 * it next.
 * NOTE(review): the return statement is not visible in this listing.
 */
5762 nkf_char mime_ungetc(nkf_char c, FILE *f)
5764 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-scan an encoded word before decoding it.
 *   f - input stream, read through i_getc
 *   p - the matched prefix from mime_pattern, copied into the Fifo first
 * Input is copied into the Fifo until the closing ?= is seen, the buffer
 * wraps (MIME_BUF_MASK), EOF is reached, or a character outside the set
 * + / = ? and alphanumerics appears. If the word is incomplete,
 * mime_decode_mode is set to 1 so that the buffered text is replayed
 * without decoding, and switch_mime_getc() is still called.
 * NOTE(review): the opening comment below has lost its closing line in
 * this listing, and the return statements are not visible.
 */
5768 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5772 /* In buffered mode, read until =? or NL or buffer full
5774 mime_input = mime_top;
5775 mime_last = mime_top;
5777 while(*p) Fifo(mime_input++) = *p++;
5780 while((c=(*i_getc)(f))!=EOF) {
5781 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5782 break; /* buffer full */
/* d holds an earlier character (its assignment is not visible here);
   ? followed by = is the encoded-word terminator. */
5784 if (c=='=' && d=='?') {
5785 /* checked. skip header, start decode */
5786 Fifo(mime_input++) = (unsigned char)c;
5787 /* mime_last_input = mime_input; */
/* Any character outside + / = ? and alphanumerics ends the scan. */
5792 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5794 /* Should we check length mod 4? */
5795 Fifo(mime_input++) = (unsigned char)c;
5798 /* In case of Incomplete MIME, no MIME decode */
5799 Fifo(mime_input++) = (unsigned char)c;
5800 mime_last = mime_input; /* point undecoded buffer */
5801 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5802 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one base64 character to its 6-bit value.
 *   A..Z -> 0..25, a..z -> 26..51, 0..9 -> 52..61,
 *   + or - -> 62, / or _ -> 63.
 * The constants are written as character arithmetic: subtracting G equals
 * subtracting a and adding 26, and adding the character 4 equals adding 52.
 * NOTE(review): some of the range tests and the return statement are not
 * visible in this listing.
 */
5806 nkf_char base64decode(nkf_char c)
5811 i = c - 'A'; /* A..Z 0-25 */
5812 } else if (c == '_') {
5813 i = '?' /* 63 */ ; /* _ 63 */
5815 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5817 } else if (c > '/') {
5818 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5819 } else if (c == '+' || c == '-') {
5820 i = '>' /* 62 */ ; /* + and - 62 */
5822 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet used by the encoder: index 0..63 -> output character. */
5827 static const char basis_64[] =
5828 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Carry byte of the base64 encoder; its low bits are combined with the
   next input byte in mimeout_addchar() and flushed by close_mime(). */
5830 static nkf_char b64c;
/* Characters held back by mime_putc() before it is decided whether they
   must be MIME encoded; mimeout_buf_count is the number currently held. */
5831 #define MIMEOUT_BUF_LENGTH (60)
5832 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5833 int mimeout_buf_count = 0;
/*
 * open_mime: start a MIME encoded word on output.
 *   mode - output encoding; matched against mime_encode[] to choose the
 *          mime_pattern prefix, with entry 0 as the fallback.
 * mimeout_mode is set from mime_encode_method[i]. White space held in
 * mimeout_buf is written through o_mputc, a line break is emitted when
 * base64_count exceeds 45, and the buffered characters are then fed back
 * through mime_putc().
 * NOTE(review): emission of the prefix itself, resets of i, and the
 * loop headers around the last statement are not visible in this listing.
 */
5835 void open_mime(nkf_char mode)
5837 const unsigned char *p;
5840 p = mime_pattern[0];
5841 for(i=0;mime_pattern[i];i++) {
5842 if (mode == mime_encode[i]) {
5843 p = mime_pattern[i];
5847 mimeout_mode = mime_encode_method[i];
/* Line already long: break before the encoded word. */
5849 if (base64_count>45) {
5850 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5851 (*o_mputc)(mimeout_buf[i]);
5854 PUT_NEWLINE((*o_mputc));
5857 if (mimeout_buf_count>0
5858 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5859 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
/* Pass buffered white space through unencoded. */
5863 for (;i<mimeout_buf_count;i++) {
5864 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5865 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5866 (*o_mputc)(mimeout_buf[i]);
/* Empty the buffer, then replay its former contents through mime_putc. */
5876 j = mimeout_buf_count;
5877 mimeout_buf_count = 0;
5879 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the current MIME encoded word. Depending on
 * mimeout_mode the pending bits kept in b64c are written as one final
 * base64 character (2 leftover bits shifted left by 4, or 4 leftover bits
 * shifted left by 2).
 * NOTE(review): the case labels, padding characters and terminator output
 * are not visible in this listing.
 */
5883 void close_mime(void)
5893 switch(mimeout_mode) {
5898 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5904 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5909 if (mimeout_mode > 0) {
5910 if (mimeout_f!=FIXED_MIME) {
5912 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write
 * the result through o_mputc.
 * Visible pieces: a non-alphanumeric byte is written as two hex digits
 * (high nibble first); in base64 the first byte of a group emits c>>2, the
 * second combines the low 2 bits of b64c with the high 4 bits of c, and
 * the third combines the low 4 bits of b64c with the high 2 bits of c and
 * then emits the low 6 bits of c.
 * NOTE(review): the case labels, state transitions and the updates of
 * b64c are not visible in this listing.
 */
5917 void mimeout_addchar(nkf_char c)
5919 switch(mimeout_mode) {
5924 } else if(!nkf_isalnum(c)) {
5926 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5927 (*o_mputc)(bin2hex((c&0xf)));
5936 (*o_mputc)(basis_64[c>>2]);
5941 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5947 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5948 (*o_mputc)(basis_64[c & 0x3F]);
5959 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * mime_prechar: called with the next character pair (c2, c1) to decide
 * whether the output line must be folded first. The projected width is
 * base64_count + mimeout_buf_count/3*4 (every 3 buffered bytes become 4
 * base64 characters). When it exceeds the threshold of the branch taken
 * (73, 66 or 60), the encoded word is closed by sending EOF through
 * o_base64conv, a newline is written and the continuation line starts
 * with SP. In the third branch a new encoded word is opened first, using Q
 * for ASCII and ISO_8859_1 output and B otherwise.
 * NOTE(review): the conditions selecting between the three thresholds are
 * not visible in this listing.
 */
5961 void mime_prechar(nkf_char c2, nkf_char c1)
5963 if (mimeout_mode > 0){
5965 if (base64_count + mimeout_buf_count/3*4> 73){
5966 (*o_base64conv)(EOF,0);
5967 OCONV_NEWLINE((*o_base64conv));
5968 (*o_base64conv)(0,SP);
5972 if (base64_count + mimeout_buf_count/3*4> 66) {
5973 (*o_base64conv)(EOF,0);
5974 OCONV_NEWLINE((*o_base64conv));
5975 (*o_base64conv)(0,SP);
5981 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5982 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5983 open_mime(output_mode);
5984 (*o_base64conv)(EOF,0);
5985 OCONV_NEWLINE((*o_base64conv));
5986 (*o_base64conv)(0,SP);
/*
 * mime_putc: output-side MIME encoder entry point; c is the next output
 * byte, or EOF to flush.
 * Two regimes are visible:
 *  - mimeout_f == FIXED_MIME: bytes are encoded unconditionally, with a
 *    newline inserted once base64_count exceeds 71;
 *  - otherwise: bytes are held in mimeout_buf so that plain ASCII words
 *    can pass through unencoded, while other runs are wrapped in an
 *    encoded word by open_mime() and fed to mimeout_addchar().
 * NOTE(review): a large share of this function (braces, else branches,
 * returns, loop headers, counter updates) is missing from this listing;
 * the comments below only label the visible tests.
 */
5993 void mime_putc(nkf_char c)
5998 if (mimeout_f == FIXED_MIME){
5999 if (mimeout_mode == 'Q'){
6000 if (base64_count > 71){
6001 if (c!=CR && c!=LF) {
6003 PUT_NEWLINE((*o_mputc));
6008 if (base64_count > 71){
6010 PUT_NEWLINE((*o_mputc));
6013 if (c == EOF) { /* c==EOF */
6017 if (c != EOF) { /* c!=EOF */
6023 /* mimeout_f != FIXED_MIME */
/* Flush request: drain whatever is still held in mimeout_buf. */
6025 if (c == EOF) { /* c==EOF */
6026 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
6027 j = mimeout_buf_count;
6028 mimeout_buf_count = 0;
6030 if (mimeout_mode > 0) {
6031 if (!nkf_isblank(mimeout_buf[j-1])) {
6033 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
6036 mimeout_addchar(mimeout_buf[i]);
6040 mimeout_addchar(mimeout_buf[i]);
6044 mimeout_addchar(mimeout_buf[i]);
6050 mimeout_addchar(mimeout_buf[i]);
/* Remember the most recently buffered character. */
6056 if (mimeout_buf_count > 0){
6057 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* ---- inside a Q encoded word ---- */
6062 if (mimeout_mode=='Q') {
6063 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6064 if (c == CR || c == LF) {
6069 } else if (c <= SP) {
6071 if (base64_count > 70) {
6072 PUT_NEWLINE((*o_mputc));
6075 if (!nkf_isblank(c)) {
6080 if (base64_count > 70) {
6082 PUT_NEWLINE((*o_mputc));
6085 open_mime(output_mode);
6087 if (!nkf_noescape_mime(c)) {
/* ---- not currently inside an encoded word ---- */
6098 if (mimeout_mode <= 0) {
6099 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6100 if (nkf_isspace(c)) {
6102 if (mimeout_mode == -1) {
6105 if (c==CR || c==LF) {
6107 open_mime(output_mode);
6113 for (i=0;i<mimeout_buf_count;i++) {
6114 (*o_mputc)(mimeout_buf[i]);
6115 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
6126 mimeout_buf[0] = (char)c;
6127 mimeout_buf_count = 1;
6129 if (base64_count > 1
6130 && base64_count + mimeout_buf_count > 76
6131 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
6132 PUT_NEWLINE((*o_mputc));
6134 if (!nkf_isspace(mimeout_buf[0])){
6139 mimeout_buf[mimeout_buf_count++] = (char)c;
/* Hold buffer is full: start an encoded word. */
6140 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6141 open_mime(output_mode);
6146 if (lastchar==CR || lastchar == LF){
6147 for (i=0;i<mimeout_buf_count;i++) {
6148 (*o_mputc)(mimeout_buf[i]);
6151 mimeout_buf_count = 0;
6154 for (i=0;i<mimeout_buf_count-1;i++) {
6155 (*o_mputc)(mimeout_buf[i]);
6158 mimeout_buf[0] = SP;
6159 mimeout_buf_count = 1;
6161 open_mime(output_mode);
6164 /* mimeout_mode == 'B', 1, 2 */
6165 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6166 if (lastchar == CR || lastchar == LF){
6167 if (nkf_isblank(c)) {
6168 for (i=0;i<mimeout_buf_count;i++) {
6169 mimeout_addchar(mimeout_buf[i]);
6171 mimeout_buf_count = 0;
6172 } else if (SP<c && c<DEL) {
6174 for (i=0;i<mimeout_buf_count;i++) {
6175 (*o_mputc)(mimeout_buf[i]);
6178 mimeout_buf_count = 0;
6180 mimeout_buf[mimeout_buf_count++] = (char)c;
6183 if (c==SP || c==TAB || c==CR || c==LF) {
6184 for (i=0;i<mimeout_buf_count;i++) {
6185 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
6187 for (i=0;i<mimeout_buf_count;i++) {
6188 (*o_mputc)(mimeout_buf[i]);
6191 mimeout_buf_count = 0;
6194 mimeout_buf[mimeout_buf_count++] = (char)c;
6195 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6197 for (i=0;i<mimeout_buf_count;i++) {
6198 (*o_mputc)(mimeout_buf[i]);
6201 mimeout_buf_count = 0;
6205 if (mimeout_buf_count>0 && SP<c && c!='=') {
6206 mimeout_buf[mimeout_buf_count++] = (char)c;
6207 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6208 j = mimeout_buf_count;
6209 mimeout_buf_count = 0;
6211 mimeout_addchar(mimeout_buf[i]);
6218 if (mimeout_buf_count>0) {
6219 j = mimeout_buf_count;
6220 mimeout_buf_count = 0;
6222 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
6224 mimeout_addchar(mimeout_buf[i]);
6230 (*o_mputc)(mimeout_buf[i]);
6232 open_mime(output_mode);
/*
 * Body of the routine that resets the converter state to its defaults.
 * The signature line is not part of this listing (presumably the reinit
 * function -- confirm against the full file).
 * Visible effects: option flags return to their compile-time defaults,
 * every output-chain hook (o_zconv ... o_iso2022jp_check_conv) is pointed
 * at no_connection, the unget hooks are pointed at std_ungetc,
 * prefix_table[] and the MIME buffers are cleared, and the detected input
 * code name is forgotten.
 * NOTE(review): many assignments and all #endif lines are missing from
 * this listing.
 */
6242 struct input_code *p = input_code_list;
6255 mime_f = MIME_DECODE_DEFAULT;
6256 mime_decode_f = FALSE;
6261 x0201_f = X0201_DEFAULT;
6262 iso2022jp_f = FALSE;
6263 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6264 ms_ucs_map_f = UCS_MAP_ASCII;
6266 #ifdef UTF8_INPUT_ENABLE
6267 no_cp932ext_f = FALSE;
6268 no_best_fit_chars_f = FALSE;
6269 encode_fallback = NULL;
6270 unicode_subchar = '?';
6271 input_endian = ENDIAN_BIG;
6273 #ifdef UTF8_OUTPUT_ENABLE
6274 output_bom_f = FALSE;
6275 output_endian = ENDIAN_BIG;
6277 #ifdef UNICODE_NORMALIZATION
6293 #ifdef SHIFTJIS_CP932
/* Clear all 256 prefix_table entries. */
6303 for (i = 0; i < 256; i++){
6304 prefix_table[i] = 0;
6308 mimeout_buf_count = 0;
6313 fold_preserve_f = FALSE;
6316 kanji_intro = DEFAULT_J;
6317 ascii_intro = DEFAULT_R;
6318 fold_margin = FOLD_MARGIN;
6319 output_conv = DEFAULT_CONV;
6320 oconv = DEFAULT_CONV;
/* Unconnected output stages report an error if they are ever called. */
6321 o_zconv = no_connection;
6322 o_fconv = no_connection;
6323 o_nlconv = no_connection;
6324 o_rot_conv = no_connection;
6325 o_hira_conv = no_connection;
6326 o_base64conv = no_connection;
6327 o_iso2022jp_check_conv = no_connection;
6330 i_ungetc = std_ungetc;
6332 i_bungetc = std_ungetc;
6335 i_mungetc = std_ungetc;
6336 i_mgetc_buf = std_getc;
6337 i_mungetc_buf = std_ungetc;
6338 output_mode = ASCII;
6341 mime_decode_mode = FALSE;
6349 z_prev2=0,z_prev1=0;
6351 iconv_for_check = 0;
6353 input_codename = NULL;
6354 output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
/*
 * no_connection: placeholder installed in unused output-chain hooks;
 * forwards to no_connection2() with a third argument of 0.
 */
6360 void no_connection(nkf_char c2, nkf_char c1)
6362 no_connection2(c2,c1,0);
/*
 * no_connection2: reports an internal wiring error on stderr. The three
 * character arguments are not used by the visible statements.
 * NOTE(review): a statement between the message and the return is not
 * visible in this listing; the return value exists only to satisfy lint.
 */
6365 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6367 fprintf(stderr,"nkf internal module connection failure.\n");
6369 return 0; /* LINT */
/*
 * Help text of the program, written line by line to HELP_OUTPUT. The
 * signature line is not part of this listing (presumably the usage
 * function -- confirm against the full file).
 * The define below redirects every fprintf in the following code to
 * dllprintf; the conditional that guards it is not visible here.
 * Option lines for the default output code, UTF-8 support, numeric
 * character references and fallback handling are selected at compile time
 * by the surrounding preprocessor tests (their #endif lines are missing
 * from this listing).
 */
6374 #define fprintf dllprintf
6378 fprintf(HELP_OUTPUT,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6379 fprintf(HELP_OUTPUT,"Flags:\n");
6380 fprintf(HELP_OUTPUT,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6381 #ifdef DEFAULT_CODE_SJIS
6382 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6384 #ifdef DEFAULT_CODE_JIS
6385 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6387 #ifdef DEFAULT_CODE_EUC
6388 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6390 #ifdef DEFAULT_CODE_UTF8
6391 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6393 #ifdef UTF8_OUTPUT_ENABLE
6394 fprintf(HELP_OUTPUT," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6396 fprintf(HELP_OUTPUT,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6397 #ifdef UTF8_INPUT_ENABLE
6398 fprintf(HELP_OUTPUT," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6400 fprintf(HELP_OUTPUT,"t no conversion\n");
6401 fprintf(HELP_OUTPUT,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6402 fprintf(HELP_OUTPUT,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6403 fprintf(HELP_OUTPUT,"r {de/en}crypt ROT13/47\n");
6404 fprintf(HELP_OUTPUT,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6405 fprintf(HELP_OUTPUT,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6406 fprintf(HELP_OUTPUT,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6407 fprintf(HELP_OUTPUT,"l ISO8859-1 (Latin-1) support\n");
6408 fprintf(HELP_OUTPUT,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6409 fprintf(HELP_OUTPUT,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6410 fprintf(HELP_OUTPUT," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6411 fprintf(HELP_OUTPUT," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6412 fprintf(HELP_OUTPUT,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6413 fprintf(HELP_OUTPUT,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6415 fprintf(HELP_OUTPUT,"T Text mode output\n");
6417 fprintf(HELP_OUTPUT,"O Output to File (DEFAULT 'nkf.out')\n");
6418 fprintf(HELP_OUTPUT,"I Convert non ISO-2022-JP charactor to GETA\n");
6419 fprintf(HELP_OUTPUT,"d,c Convert line breaks -d: LF -c: CRLF\n");
6420 fprintf(HELP_OUTPUT,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6421 fprintf(HELP_OUTPUT,"v, V Show this usage. V: show configuration\n");
6422 fprintf(HELP_OUTPUT,"\n");
6423 fprintf(HELP_OUTPUT,"Long name options\n");
6424 fprintf(HELP_OUTPUT," --ic=<input codeset> --oc=<output codeset>\n");
6425 fprintf(HELP_OUTPUT," Specify the input or output codeset\n");
6426 fprintf(HELP_OUTPUT," --fj --unix --mac --windows\n");
6427 fprintf(HELP_OUTPUT," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6428 fprintf(HELP_OUTPUT," Convert for the system or code\n");
6429 fprintf(HELP_OUTPUT," --hiragana --katakana --katakana-hiragana\n");
6430 fprintf(HELP_OUTPUT," To Hiragana/Katakana Conversion\n");
6431 fprintf(HELP_OUTPUT," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6433 fprintf(HELP_OUTPUT," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6435 #ifdef NUMCHAR_OPTION
6436 fprintf(HELP_OUTPUT," --numchar-input Convert Unicode Character Reference\n");
6438 #ifdef UTF8_INPUT_ENABLE
6439 fprintf(HELP_OUTPUT," --fb-{skip, html, xml, perl, java, subchar}\n");
6440 fprintf(HELP_OUTPUT," Specify how nkf handles unassigned characters\n");
6443 fprintf(HELP_OUTPUT," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6444 fprintf(HELP_OUTPUT," Overwrite original listed files by filtered result\n");
6445 fprintf(HELP_OUTPUT," --overwrite preserves timestamp of original files\n");
6447 fprintf(HELP_OUTPUT," -g --guess Guess the input code\n");
6448 fprintf(HELP_OUTPUT," --help --version Show this help/the version\n");
6449 fprintf(HELP_OUTPUT," For more information, see also man nkf\n");
6450 fprintf(HELP_OUTPUT,"\n");
/*
 * show_configuration: print a summary of the compile-time configuration to
 * HELP_OUTPUT, starting with NKF_VERSION and NKF_RELEASE_DATE. Each
 * following fprintf begins a label whose value string is chosen by the
 * preprocessor test that follows it (default output encoding, default
 * newline, MIME decoding default, JIS X 0201 Katakana conversion, and the
 * stream used for help output).
 * NOTE(review): the value strings, closing parentheses and #endif lines
 * are missing from this listing.
 */
6454 void show_configuration(void)
6456 fprintf(HELP_OUTPUT, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6457 fprintf(HELP_OUTPUT, " Compile-time options:\n");
6458 fprintf(HELP_OUTPUT, " Default output encoding: "
6459 #if defined(DEFAULT_CODE_JIS)
6461 #elif defined(DEFAULT_CODE_SJIS)
6463 #elif defined(DEFAULT_CODE_EUC)
6465 #elif defined(DEFAULT_CODE_UTF8)
6469 fprintf(HELP_OUTPUT, " Default output newline: "
6470 #if DEFAULT_NEWLINE == CR
6472 #elif DEFAULT_NEWLINE == CRLF
6478 fprintf(HELP_OUTPUT, " Decode MIME encoded string: "
6479 #if MIME_DECODE_DEFAULT
6485 fprintf(HELP_OUTPUT, " Convert JIS X 0201 Katakana: "
6492 fprintf(HELP_OUTPUT, " --help, --version output: "
6493 #if HELP_OUTPUT_HELP_OUTPUT
6503 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");