1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.160 2007/12/23 08:12:27 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-12-22"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #if defined(DEFAULT_CODE_JIS)
44 #elif defined(DEFAULT_CODE_SJIS)
45 #elif defined(DEFAULT_CODE_EUC)
46 #elif defined(DEFAULT_CODE_UTF8)
48 #define DEFAULT_CODE_JIS 1
51 #ifndef MIME_DECODE_DEFAULT
52 #define MIME_DECODE_DEFAULT STRICT_MIME
55 #define X0201_DEFAULT TRUE
58 #if DEFAULT_NEWLINE == 0x0D0A
59 #define PUT_NEWLINE(func) do {\
63 #define OCONV_NEWLINE(func) do {\
67 #elif DEFAULT_NEWLINE == 0x0D
68 #define PUT_NEWLINE(func) func(0x0D)
69 #define OCONV_NEWLINE(func) func(0, 0x0D)
71 #define DEFAULT_NEWLINE 0x0A
72 #define PUT_NEWLINE(func) func(0x0A)
73 #define OCONV_NEWLINE(func) func(0, 0x0A)
75 #ifdef HELP_OUTPUT_STDERR
76 #define HELP_OUTPUT stderr
78 #define HELP_OUTPUT stdout
81 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
83 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
99 #if defined(MSDOS) || defined(__OS2__)
102 #if defined(_MSC_VER) || defined(__WATCOMC__)
103 #define mktemp _mktemp
109 #define setbinmode(fp) fsetbin(fp)
110 #elif defined(__DJGPP__)
111 #include <libc/dosio.h>
112 #define setbinmode(fp) djgpp_setbinmode(fp)
113 #else /* Microsoft C, Turbo C */
114 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
117 #define setbinmode(fp)
120 #if defined(__DJGPP__)
121 void djgpp_setbinmode(FILE *fp)
123 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
126 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
127 __file_handle_set(fd, m);
131 #ifdef _IOFBF /* SysV and MSDOS, Windows */
132 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
134 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
137 /*Borland C++ 4.5 EasyWin*/
138 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
147 /* added by satoru@isoternet.org */
149 #include <sys/types.h>
151 #include <sys/stat.h>
152 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
154 #if defined(__WATCOMC__)
155 #include <sys/utime.h>
159 #else /* defined(MSDOS) */
161 #ifdef __BORLANDC__ /* BCC32 */
163 #else /* !defined(__BORLANDC__) */
164 #include <sys/utime.h>
165 #endif /* (__BORLANDC__) */
166 #else /* !defined(__WIN32__) */
167 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
168 #include <sys/utime.h>
169 #elif defined(__TURBOC__) /* BCC */
171 #elif defined(LSI_C) /* LSI C */
172 #endif /* (__WIN32__) */
180 /* state of output_mode and input_mode
191 /* Input Assumption */
196 #define LATIN1_INPUT 6
197 #define UTF8_INPUT 13
198 #define UTF16_INPUT 1015
199 #define UTF32_INPUT 1017
202 #define STRICT_MIME 8
209 #define ENDIAN_BIG 1234
210 #define ENDIAN_LITTLE 4321
211 #define ENDIAN_2143 2143
212 #define ENDIAN_3412 3412
276 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
277 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
278 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
279 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
280 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
281 void j_oconv(nkf_char c2, nkf_char c1);
282 void s_oconv(nkf_char c2, nkf_char c1);
283 void e_oconv(nkf_char c2, nkf_char c1);
284 void w_oconv(nkf_char c2, nkf_char c1);
285 void w_oconv16(nkf_char c2, nkf_char c1);
286 void w_oconv32(nkf_char c2, nkf_char c1);
290 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
291 void (*oconv_func)(nkf_char c2, nkf_char c1);
292 } nkf_native_encoding;
294 nkf_native_encoding NkfEncodingASCII = { "US_ASCII", e_iconv, e_oconv };
295 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
296 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
297 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
298 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
299 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
300 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
305 nkf_native_encoding *based_encoding;
/*
 * Table of the encodings known to nkf.  Each entry holds the encoding id,
 * its name, and a pointer to the native encoding it is based on (the
 * native encoding supplies the iconv/oconv converter pair).
 * nkf_enc_from_index() returns &nkf_encoding_table[idx], so the table is
 * addressed by index and is bounds-checked against NKF_ENCODING_TABLE_SIZE.
 */
307 nkf_encoding nkf_encoding_table[] = {
308 {ASCII, "ASCII", &NkfEncodingASCII},
309 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
/* NOTE(review): ISO_2022_JP is based on NkfEncodingASCII (oconv = e_oconv),
 * whereas every other ISO-2022-JP variant below is based on
 * NkfEncodingISO_2022_JP (oconv = j_oconv) -- confirm this is intended. */
310 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingASCII},
311 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
312 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
313 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
314 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
315 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
316 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
317 {WINDOWS_31J, "WINDOWS-31J", &NkfEncodingShift_JIS},
318 {CP10001, "CP10001", &NkfEncodingShift_JIS},
319 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
320 {CP51932, "CP51932", &NkfEncodingEUC_JP},
321 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
322 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
323 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
324 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
325 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
326 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
327 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
328 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
329 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
330 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
331 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
332 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
333 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
334 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
335 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
336 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
337 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
338 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
339 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
340 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
341 {BINARY, "BINARY", &NkfEncodingASCII},
344 #define NKF_ENCODING_TABLE_SIZE 34
348 } encoding_name_to_id_table[] = {
350 {"ISO-2022-JP", ISO_2022_JP},
351 {"X-ISO2022JP-CP932", CP50220},
352 {"CP50220", CP50220},
353 {"CP50221", CP50221},
354 {"CP50222", CP50222},
355 {"ISO-2022-JP-1", ISO_2022_JP_1},
356 {"ISO-2022-JP-3", ISO_2022_JP_3},
357 {"SHIFT_JIS", SHIFT_JIS},
359 {"WINDOWS-31J", WINDOWS_31J},
360 {"CSWINDOWS31J", WINDOWS_31J},
361 {"CP932", WINDOWS_31J},
362 {"MS932", WINDOWS_31J},
363 {"CP10001", CP10001},
366 {"CP51932", CP51932},
367 {"EUC-JP-MS", EUCJP_MS},
368 {"EUCJP-MS", EUCJP_MS},
369 {"EUCJPMS", EUCJP_MS},
370 {"EUC-JP-ASCII", EUCJP_ASCII},
371 {"EUCJP-ASCII", EUCJP_ASCII},
372 {"SHIFT_JISX0213", SHIFT_JISX0213},
373 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
374 {"EUC-JISX0213", EUC_JISX0213},
375 {"EUC-JIS-2004", EUC_JIS_2004},
378 {"UTF-8-BOM", UTF_8_BOM},
379 {"UTF8-MAC", UTF8_MAC},
380 {"UTF-8-MAC", UTF8_MAC},
382 {"UTF-16BE", UTF_16BE},
383 {"UTF-16BE-BOM", UTF_16BE_BOM},
384 {"UTF-16LE", UTF_16LE},
385 {"UTF-16LE-BOM", UTF_16LE_BOM},
387 {"UTF-32BE", UTF_32BE},
388 {"UTF-32BE-BOM", UTF_32BE_BOM},
389 {"UTF-32LE", UTF_32LE},
390 {"UTF-32LE-BOM", UTF_32LE_BOM},
394 #if defined(DEFAULT_CODE_JIS)
395 #define DEFAULT_ENCODING ISO_2022_JP
396 #elif defined(DEFAULT_CODE_SJIS)
397 #define DEFAULT_ENCODING SHIFT_JIS
398 #elif defined(DEFAULT_CODE_EUC)
399 #define DEFAULT_ENCODING EUC_JP
400 #elif defined(DEFAULT_CODE_UTF8)
401 #define DEFAULT_ENCODING UTF_8
/*
 * ASCII character classification and conversion helpers.
 *
 * Every macro parameter is parenthesised so that an expression argument
 * (for example `x & 0xff`) is classified as a whole instead of being torn
 * apart by operator precedence.  Arguments are still evaluated more than
 * once, so do not pass expressions with side effects.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP<=(c) && (c)<='~')
#define nkf_isgraph(c)  ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; 0 for any other character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when bits 8..15 of c2 equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/* Inclusive bounds used by the CP932 table range checks. */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/* True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END (inclusive).
 * The parameter is parenthesised so expression arguments are tested as a
 * whole; it is evaluated twice, so avoid side effects. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
434 #define HOLD_SIZE 1024
435 #if defined(INT_IS_SHORT)
436 #define IOBUF_SIZE 2048
438 #define IOBUF_SIZE 16384
441 #define DEFAULT_J 'B'
442 #define DEFAULT_R 'B'
444 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
445 #define SJ6394 0x0161 /* 63 - 94 ku offset */
447 #define RANGE_NUM_MAX 18
452 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
453 #define sizeof_euc_to_utf8_1byte 94
454 #define sizeof_euc_to_utf8_2bytes 94
455 #define sizeof_utf8_to_euc_C2 64
456 #define sizeof_utf8_to_euc_E5B8 64
457 #define sizeof_utf8_to_euc_2bytes 112
458 #define sizeof_utf8_to_euc_3bytes 16
461 /* MIME preprocessor */
463 #ifdef EASYWIN /*Easy Win */
464 extern POINT _BufferSize;
473 void (*status_func)(struct input_code *, nkf_char);
474 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
478 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
479 static nkf_encoding *output_encoding;
481 #if !defined(PERL_XS) && !defined(WIN32DLL)
482 static nkf_char noconvert(FILE *f);
484 static void module_connection(void);
485 static nkf_char kanji_convert(FILE *f);
486 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
487 static nkf_char push_hold_buf(nkf_char c2);
488 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
489 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
490 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
492 * 0: Shift_JIS, eucJP-ascii
497 #define UCS_MAP_ASCII 0
499 #define UCS_MAP_CP932 2
500 #define UCS_MAP_CP10001 3
501 static int ms_ucs_map_f = UCS_MAP_ASCII;
503 #ifdef UTF8_INPUT_ENABLE
504 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
505 static int no_cp932ext_f = FALSE;
506 /* ignore ZERO WIDTH NO-BREAK SPACE */
507 static int no_best_fit_chars_f = FALSE;
508 static int input_endian = ENDIAN_BIG;
509 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
510 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
511 static void encode_fallback_html(nkf_char c);
512 static void encode_fallback_xml(nkf_char c);
513 static void encode_fallback_java(nkf_char c);
514 static void encode_fallback_perl(nkf_char c);
515 static void encode_fallback_subchar(nkf_char c);
516 static void (*encode_fallback)(nkf_char c) = NULL;
517 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
518 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
519 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
520 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
521 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
522 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
523 static void w_status(struct input_code *, nkf_char);
525 #ifdef UTF8_OUTPUT_ENABLE
526 static int output_bom_f = FALSE;
527 static int output_endian = ENDIAN_BIG;
528 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
530 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
531 static void fold_conv(nkf_char c2,nkf_char c1);
532 static void nl_conv(nkf_char c2,nkf_char c1);
533 static void z_conv(nkf_char c2,nkf_char c1);
534 static void rot_conv(nkf_char c2,nkf_char c1);
535 static void hira_conv(nkf_char c2,nkf_char c1);
536 static void base64_conv(nkf_char c2,nkf_char c1);
537 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
538 static void no_connection(nkf_char c2,nkf_char c1);
539 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
541 static void code_score(struct input_code *ptr);
542 static void code_status(nkf_char c);
544 static void std_putc(nkf_char c);
545 static nkf_char std_getc(FILE *f);
546 static nkf_char std_ungetc(nkf_char c,FILE *f);
548 static nkf_char broken_getc(FILE *f);
549 static nkf_char broken_ungetc(nkf_char c,FILE *f);
551 static nkf_char mime_begin(FILE *f);
552 static nkf_char mime_getc(FILE *f);
553 static nkf_char mime_ungetc(nkf_char c,FILE *f);
555 static void switch_mime_getc(void);
556 static void unswitch_mime_getc(void);
557 static nkf_char mime_begin_strict(FILE *f);
558 static nkf_char mime_getc_buf(FILE *f);
559 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
560 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
562 static nkf_char base64decode(nkf_char c);
563 static void mime_prechar(nkf_char c2, nkf_char c1);
564 static void mime_putc(nkf_char c);
565 static void open_mime(nkf_char c);
566 static void close_mime(void);
567 static void eof_mime(void);
568 static void mimeout_addchar(nkf_char c);
570 static void usage(void);
571 static void version(void);
572 static void show_configuration(void);
574 static void options(unsigned char *c);
575 static void reinit(void);
579 #if !defined(PERL_XS) && !defined(WIN32DLL)
580 static unsigned char stdibuf[IOBUF_SIZE];
581 static unsigned char stdobuf[IOBUF_SIZE];
583 static unsigned char hold_buf[HOLD_SIZE*2];
584 static int hold_count = 0;
586 /* MIME preprocessor fifo */
588 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
589 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
590 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
591 static unsigned char mime_buf[MIME_BUF_SIZE];
592 static unsigned int mime_top = 0;
593 static unsigned int mime_last = 0; /* decoded */
594 static unsigned int mime_input = 0; /* undecoded */
595 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
598 static int unbuf_f = FALSE;
599 static int estab_f = FALSE;
600 static int nop_f = FALSE;
601 static int binmode_f = TRUE; /* binary mode */
602 static int rot_f = FALSE; /* rot14/43 mode */
603 static int hira_f = FALSE; /* hira/kata henkan */
604 static int input_f = FALSE; /* non fixed input code */
605 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
606 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
607 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
608 static int mimebuf_f = FALSE; /* MIME buffered input */
609 static int broken_f = FALSE; /* convert ESC-less broken JIS */
610 static int iso8859_f = FALSE; /* ISO8859 through */
611 static int mimeout_f = FALSE; /* base64 mode */
612 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
613 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
615 #ifdef UNICODE_NORMALIZATION
616 static int nfc_f = FALSE;
617 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
618 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
619 static nkf_char nfc_getc(FILE *f);
620 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
624 static int cap_f = FALSE;
625 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
626 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
627 static nkf_char cap_getc(FILE *f);
628 static nkf_char cap_ungetc(nkf_char c,FILE *f);
630 static int url_f = FALSE;
631 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
632 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
633 static nkf_char url_getc(FILE *f);
634 static nkf_char url_ungetc(nkf_char c,FILE *f);
/* Spell a constant that needs more than 16 bits: append the `L` suffix
 * when INT_IS_SHORT is defined, use the constant unchanged otherwise. */
#if defined(INT_IS_SHORT)
#define NKF_INT32_C(n)   (n##L)
#else
#define NKF_INT32_C(n)   (n)
#endif
#define PREFIX_EUCG3     NKF_INT32_C(0x8F00)
#define CLASS_MASK       NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE    NKF_INT32_C(0x01000000)
#define VALUE_MASK       NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX      NKF_INT32_C(0x0010FFFF)
/* True when the bits selected by CLASS_MASK equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the bits selected by VALUE_MASK are at most 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
650 #ifdef NUMCHAR_OPTION
651 static int numchar_f = FALSE;
652 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
653 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
654 static nkf_char numchar_getc(FILE *f);
655 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
659 static int noout_f = FALSE;
660 static void no_putc(nkf_char c);
661 static int debug_f = FALSE;
662 static void debug(const char *str);
663 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
666 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
668 static void print_guessed_code(char *filename);
670 static void set_input_codename(char *codename);
673 static int exec_f = 0;
676 #ifdef SHIFTJIS_CP932
677 /* invert IBM extended characters to others */
678 static int cp51932_f = FALSE;
680 /* invert NEC-selected IBM extended characters to IBM extended characters */
681 static int cp932inv_f = TRUE;
683 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
684 #endif /* SHIFTJIS_CP932 */
687 static int x0212_f = FALSE;
688 static nkf_char x0212_shift(nkf_char c);
689 static nkf_char x0212_unshift(nkf_char c);
691 static int x0213_f = FALSE;
693 static unsigned char prefix_table[256];
695 static void set_code_score(struct input_code *ptr, nkf_char score);
696 static void clr_code_score(struct input_code *ptr, nkf_char score);
697 static void status_disable(struct input_code *ptr);
698 static void status_push_ch(struct input_code *ptr, nkf_char c);
699 static void status_clear(struct input_code *ptr);
700 static void status_reset(struct input_code *ptr);
701 static void status_reinit(struct input_code *ptr);
702 static void status_check(struct input_code *ptr, nkf_char c);
703 static void e_status(struct input_code *, nkf_char);
704 static void s_status(struct input_code *, nkf_char);
706 struct input_code input_code_list[] = {
707 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
708 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
709 #ifdef UTF8_INPUT_ENABLE
710 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
711 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
712 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
717 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
718 static int base64_count = 0;
720 /* X0208 -> ASCII converter */
723 static int f_line = 0; /* chars in line */
724 static int f_prev = 0;
725 static int fold_preserve_f = FALSE; /* preserve new lines */
726 static int fold_f = FALSE;
727 static int fold_len = 0;
730 static unsigned char kanji_intro = DEFAULT_J;
731 static unsigned char ascii_intro = DEFAULT_R;
735 #define FOLD_MARGIN 10
736 #define DEFAULT_FOLD 60
738 static int fold_margin = FOLD_MARGIN;
742 #ifdef DEFAULT_CODE_JIS
743 # define DEFAULT_CONV j_oconv
745 #ifdef DEFAULT_CODE_SJIS
746 # define DEFAULT_CONV s_oconv
748 #ifdef DEFAULT_CODE_EUC
749 # define DEFAULT_CONV e_oconv
751 #ifdef DEFAULT_CODE_UTF8
752 # define DEFAULT_CONV w_oconv
755 /* process default */
756 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
758 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
759 /* s_iconv or oconv */
760 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
762 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
763 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
764 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
765 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
766 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
767 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
768 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
770 /* static redirections */
772 static void (*o_putc)(nkf_char c) = std_putc;
774 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
775 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
777 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
778 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
780 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
782 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
783 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
785 /* for strict mime */
786 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
787 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
790 static int output_mode = ASCII, /* output kanji mode */
791 input_mode = ASCII, /* input kanji mode */
792 shift_mode = FALSE; /* TRUE shift out, or X0201 */
793 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
795 /* X0201 / X0208 conversion tables */
797 /* X0201 kana conversion table */
799 static const unsigned char cv[]= {
800 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
801 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
802 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
803 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
804 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
805 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
806 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
807 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
808 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
809 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
810 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
811 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
812 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
813 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
814 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
815 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
819 /* X0201 kana conversion table for dakuten */
821 static const unsigned char dv[]= {
822 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
823 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
824 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
825 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
826 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
827 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
828 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
829 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
830 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
831 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
832 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
833 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
834 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
835 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
836 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
837 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
840 /* X0201 kana conversion table for han-dakuten */
842 static const unsigned char ev[]= {
843 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
844 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
845 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
846 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
847 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
848 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
849 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
850 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
851 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
852 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
853 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
854 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
855 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
856 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
857 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
858 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
862 /* X0208 kigou conversion table */
863 /* 0x8140 - 0x819e */
864 static const unsigned char fv[] = {
866 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
867 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
868 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
869 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
870 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
871 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
872 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
873 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
874 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
875 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
876 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
877 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
882 static int file_out_f = FALSE;
884 static int overwrite_f = FALSE;
885 static int preserve_time_f = FALSE;
886 static int backup_f = FALSE;
887 static char *backup_suffix = "";
888 static char *get_backup_filename(const char *suffix, const char *filename);
891 static int nlmode_f = 0; /* CR, LF, CRLF */
892 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
893 static nkf_char prev_cr = 0; /* CR or 0 */
894 #ifdef EASYWIN /*Easy Win */
895 static int end_check;
898 #define STD_GC_BUFSIZE (256)
899 nkf_char std_gc_buf[STD_GC_BUFSIZE];
902 char* nkf_strcpy(const char *str)
904 char* result = malloc(strlen(str) + 1);
913 static void nkf_str_upcase(const char *src, char *dest, size_t length)
916 for (; i < length && src[i]; i++) {
917 dest[i] = nkf_toupper(src[i]);
922 static nkf_encoding *nkf_enc_from_index(int idx)
924 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
927 return &nkf_encoding_table[idx];
930 static int nkf_enc_find_index(const char *name)
933 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
934 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
935 return encoding_name_to_id_table[i].id;
941 static nkf_encoding *nkf_enc_find(const char *name)
944 idx = nkf_enc_find_index(name);
945 if (idx < 0) return 0;
946 return nkf_enc_from_index(idx);
949 #define nkf_enc_name(enc) (enc)->name
950 #define nkf_enc_to_index(enc) (enc)->id
951 #define nkf_enc_to_base_encoding(enc) (enc)->based_encoding
954 #include "nkf32dll.c"
955 #elif defined(PERL_XS)
957 int main(int argc, char **argv)
962 char *outfname = NULL;
965 #ifdef EASYWIN /*Easy Win */
966 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
969 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
970 cp = (unsigned char *)*argv;
974 int debug_f_back = debug_f;
977 int exec_f_back = exec_f;
980 int x0212_f_back = x0212_f;
982 int x0213_f_back = x0213_f;
983 int guess_f_back = guess_f;
985 guess_f = guess_f_back;
988 debug_f = debug_f_back;
991 exec_f = exec_f_back;
994 x0212_f = x0212_f_back;
996 x0213_f = x0213_f_back;
1001 if (pipe(fds) < 0 || (pid = fork()) < 0){
1012 execvp(argv[1], &argv[1]);
1027 if (binmode_f == TRUE)
1028 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1029 if (freopen("","wb",stdout) == NULL)
1036 setbuf(stdout, (char *) NULL);
1038 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
1041 if (binmode_f == TRUE)
1042 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1043 if (freopen("","rb",stdin) == NULL) return (-1);
1047 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
1051 kanji_convert(stdin);
1052 if (guess_f) print_guessed_code(NULL);
1056 int is_argument_error = FALSE;
1058 input_codename = NULL;
1061 iconv_for_check = 0;
1063 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
1065 is_argument_error = TRUE;
1073 /* reopen file for stdout */
1074 if (file_out_f == TRUE) {
1077 outfname = malloc(strlen(origfname)
1078 + strlen(".nkftmpXXXXXX")
1084 strcpy(outfname, origfname);
1088 for (i = strlen(outfname); i; --i){
1089 if (outfname[i - 1] == '/'
1090 || outfname[i - 1] == '\\'){
1096 strcat(outfname, "ntXXXXXX");
1098 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
1099 S_IREAD | S_IWRITE);
1101 strcat(outfname, ".nkftmpXXXXXX");
1102 fd = mkstemp(outfname);
1105 || (fd_backup = dup(fileno(stdout))) < 0
1106 || dup2(fd, fileno(stdout)) < 0
1117 outfname = "nkf.out";
1120 if(freopen(outfname, "w", stdout) == NULL) {
1124 if (binmode_f == TRUE) {
1125 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1126 if (freopen("","wb",stdout) == NULL)
1133 if (binmode_f == TRUE)
1134 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1135 if (freopen("","rb",fin) == NULL)
1140 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
1144 char *filename = NULL;
1146 if (nfiles > 1) filename = origfname;
1147 if (guess_f) print_guessed_code(filename);
1153 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1161 if (dup2(fd_backup, fileno(stdout)) < 0){
1164 if (stat(origfname, &sb)) {
1165 fprintf(stderr, "Can't stat %s\n", origfname);
1167 /* Restore the permissions of the original file. */
1168 if (chmod(outfname, sb.st_mode)) {
1169 fprintf(stderr, "Can't set permission %s\n", outfname);
1172 /* Restore the timestamps of the original file. */
1173 if(preserve_time_f){
1174 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1175 tb[0] = tb[1] = sb.st_mtime;
1176 if (utime(outfname, tb)) {
1177 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1180 tb.actime = sb.st_atime;
1181 tb.modtime = sb.st_mtime;
1182 if (utime(outfname, &tb)) {
1183 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1188 char *backup_filename = get_backup_filename(backup_suffix, origfname);
1190 unlink(backup_filename);
1192 if (rename(origfname, backup_filename)) {
1193 perror(backup_filename);
1194 fprintf(stderr, "Can't rename %s to %s\n",
1195 origfname, backup_filename);
1199 if (unlink(origfname)){
1204 if (rename(outfname, origfname)) {
1206 fprintf(stderr, "Can't rename %s to %s\n",
1207 outfname, origfname);
1214 if (is_argument_error)
1217 #ifdef EASYWIN /*Easy Win */
1218 if (file_out_f == FALSE)
1219 scanf("%d",&end_check);
1222 #else /* for Other OS */
1223 if (file_out_f == TRUE)
1225 #endif /*Easy Win */
1228 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` according to `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied unchanged
 * (suffix "*.orig" with filename "a.c" gives "a.c.orig").
 * If `suffix` contains no '*', it is simply appended to `filename`
 * (suffix ".bak" with filename "a.c" gives "a.c.bak").
 *
 * Returns a NUL-terminated string obtained from malloc(); the caller owns it
 * and must free() it.  Returns NULL, after reporting through perror(), when
 * the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t i, j;

    for (i = 0; suffix[i]; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* Each '*' disappears from the result and is replaced by one copy
	 * of filename; the final + 1 is for the terminating NUL. */
	backup_filename = malloc(suffix_length - asterisk_count
				 + asterisk_count * filename_length + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}

	for (i = 0, j = 0; suffix[i]; i++) {
	    if (suffix[i] == '*') {
		memcpy(backup_filename + j, filename, filename_length);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
	backup_filename[j] = '\0';
    } else {
	/* The buffer must hold filename + suffix + NUL.  The size argument
	 * used to be "+ 1" only, i.e. a single byte, which the copies
	 * below then overflowed. */
	j = suffix_length + filename_length;
	backup_filename = malloc(j + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}
	memcpy(backup_filename, filename, filename_length);
	memcpy(backup_filename + filename_length, suffix, suffix_length);
	backup_filename[j] = '\0';
    }
    return backup_filename;
}
1271 static const struct {
1295 {"katakana-hiragana","h3"},
1303 #ifdef UTF8_OUTPUT_ENABLE
1313 {"fb-subchar=", ""},
1315 #ifdef UTF8_INPUT_ENABLE
1316 {"utf8-input", "W"},
1317 {"utf16-input", "W16"},
1318 {"no-cp932ext", ""},
1319 {"no-best-fit-chars",""},
1321 #ifdef UNICODE_NORMALIZATION
1322 {"utf8mac-input", ""},
1334 #ifdef NUMCHAR_OPTION
1335 {"numchar-input", ""},
1341 #ifdef SHIFTJIS_CP932
1351 static int option_mode = 0;
1353 void options(unsigned char *cp)
1357 unsigned char *cp_back = NULL;
1363 while(*cp && *cp++!='-');
1364 while (*cp || cp_back) {
1372 case '-': /* literal options */
1373 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1377 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1378 p = (unsigned char *)long_option[i].name;
1379 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1380 if (*p == cp[j] || cp[j] == SP){
1387 fprintf(stderr, "unknown long option: --%s\n", cp);
1390 while(*cp && *cp != SP && cp++);
1391 if (long_option[i].alias[0]){
1393 cp = (unsigned char *)long_option[i].alias;
1395 if (strcmp(long_option[i].name, "ic=") == 0){
1396 nkf_str_upcase((char *)p, codeset, 32);
1397 enc = nkf_enc_find(codeset);
1398 switch (nkf_enc_to_index(enc)) {
1400 input_f = JIS_INPUT;
1405 input_f = JIS_INPUT;
1406 #ifdef SHIFTJIS_CP932
1409 #ifdef UTF8_OUTPUT_ENABLE
1410 ms_ucs_map_f = UCS_MAP_CP932;
1414 input_f = JIS_INPUT;
1420 input_f = JIS_INPUT;
1427 input_f = SJIS_INPUT;
1430 input_f = SJIS_INPUT;
1431 #ifdef SHIFTJIS_CP932
1434 #ifdef UTF8_OUTPUT_ENABLE
1435 ms_ucs_map_f = UCS_MAP_CP932;
1439 input_f = SJIS_INPUT;
1440 #ifdef SHIFTJIS_CP932
1443 #ifdef UTF8_OUTPUT_ENABLE
1444 ms_ucs_map_f = UCS_MAP_CP10001;
1448 input_f = EUC_INPUT;
1451 input_f = EUC_INPUT;
1452 #ifdef SHIFTJIS_CP932
1455 #ifdef UTF8_OUTPUT_ENABLE
1456 ms_ucs_map_f = UCS_MAP_CP932;
1460 input_f = EUC_INPUT;
1461 #ifdef SHIFTJIS_CP932
1464 #ifdef UTF8_OUTPUT_ENABLE
1465 ms_ucs_map_f = UCS_MAP_MS;
1469 input_f = EUC_INPUT;
1470 #ifdef SHIFTJIS_CP932
1473 #ifdef UTF8_OUTPUT_ENABLE
1474 ms_ucs_map_f = UCS_MAP_ASCII;
1477 case SHIFT_JISX0213:
1478 case SHIFT_JIS_2004:
1479 input_f = SJIS_INPUT;
1481 #ifdef SHIFTJIS_CP932
1487 input_f = EUC_INPUT;
1489 #ifdef SHIFTJIS_CP932
1493 #ifdef UTF8_INPUT_ENABLE
1497 input_f = UTF8_INPUT;
1499 #ifdef UNICODE_NORMALIZATION
1501 input_f = UTF8_INPUT;
1508 input_f = UTF16_INPUT;
1509 input_endian = ENDIAN_BIG;
1513 input_f = UTF16_INPUT;
1514 input_endian = ENDIAN_LITTLE;
1519 input_f = UTF32_INPUT;
1520 input_endian = ENDIAN_BIG;
1524 input_f = UTF32_INPUT;
1525 input_endian = ENDIAN_LITTLE;
1529 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1534 if (strcmp(long_option[i].name, "oc=") == 0){
1536 nkf_str_upcase((char *)p, codeset, 32);
1537 output_encoding = nkf_enc_find(codeset);
1538 switch (nkf_enc_to_index(output_encoding)) {
1540 output_conv = j_oconv;
1543 output_conv = j_oconv;
1545 #ifdef SHIFTJIS_CP932
1548 #ifdef UTF8_OUTPUT_ENABLE
1549 ms_ucs_map_f = UCS_MAP_CP932;
1553 output_conv = j_oconv;
1554 #ifdef SHIFTJIS_CP932
1557 #ifdef UTF8_OUTPUT_ENABLE
1558 ms_ucs_map_f = UCS_MAP_CP932;
1562 output_conv = j_oconv;
1566 #ifdef SHIFTJIS_CP932
1571 output_conv = j_oconv;
1576 #ifdef SHIFTJIS_CP932
1581 output_conv = s_oconv;
1584 output_conv = s_oconv;
1585 #ifdef UTF8_OUTPUT_ENABLE
1586 ms_ucs_map_f = UCS_MAP_CP932;
1590 output_conv = s_oconv;
1591 #ifdef UTF8_OUTPUT_ENABLE
1592 ms_ucs_map_f = UCS_MAP_CP10001;
1596 output_conv = e_oconv;
1599 output_conv = e_oconv;
1600 #ifdef SHIFTJIS_CP932
1603 #ifdef UTF8_OUTPUT_ENABLE
1604 ms_ucs_map_f = UCS_MAP_CP932;
1608 output_conv = e_oconv;
1612 #ifdef UTF8_OUTPUT_ENABLE
1613 ms_ucs_map_f = UCS_MAP_MS;
1617 output_conv = e_oconv;
1621 #ifdef UTF8_OUTPUT_ENABLE
1622 ms_ucs_map_f = UCS_MAP_ASCII;
1625 case SHIFT_JISX0213:
1626 case SHIFT_JIS_2004:
1627 output_conv = s_oconv;
1629 #ifdef SHIFTJIS_CP932
1635 output_conv = e_oconv;
1640 #ifdef SHIFTJIS_CP932
1644 #ifdef UTF8_OUTPUT_ENABLE
1647 output_conv = w_oconv;
1650 output_conv = w_oconv;
1651 output_bom_f = TRUE;
1654 output_conv = w_oconv16;
1658 output_conv = w_oconv16;
1659 output_bom_f = TRUE;
1662 output_conv = w_oconv16;
1663 output_endian = ENDIAN_LITTLE;
1666 output_conv = w_oconv16;
1667 output_endian = ENDIAN_LITTLE;
1668 output_bom_f = TRUE;
1672 output_conv = w_oconv32;
1675 output_conv = w_oconv32;
1676 output_bom_f = TRUE;
1679 output_conv = w_oconv32;
1680 output_endian = ENDIAN_LITTLE;
1683 output_conv = w_oconv32;
1684 output_endian = ENDIAN_LITTLE;
1685 output_bom_f = TRUE;
1689 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1694 if (strcmp(long_option[i].name, "guess=") == 0){
1703 if (strcmp(long_option[i].name, "overwrite") == 0){
1706 preserve_time_f = TRUE;
1709 if (strcmp(long_option[i].name, "overwrite=") == 0){
1712 preserve_time_f = TRUE;
1714 backup_suffix = malloc(strlen((char *) p) + 1);
1715 strcpy(backup_suffix, (char *) p);
1718 if (strcmp(long_option[i].name, "in-place") == 0){
1721 preserve_time_f = FALSE;
1724 if (strcmp(long_option[i].name, "in-place=") == 0){
1727 preserve_time_f = FALSE;
1729 backup_suffix = malloc(strlen((char *) p) + 1);
1730 strcpy(backup_suffix, (char *) p);
1735 if (strcmp(long_option[i].name, "cap-input") == 0){
1739 if (strcmp(long_option[i].name, "url-input") == 0){
1744 #ifdef NUMCHAR_OPTION
1745 if (strcmp(long_option[i].name, "numchar-input") == 0){
1751 if (strcmp(long_option[i].name, "no-output") == 0){
1755 if (strcmp(long_option[i].name, "debug") == 0){
1760 if (strcmp(long_option[i].name, "cp932") == 0){
1761 #ifdef SHIFTJIS_CP932
1765 #ifdef UTF8_OUTPUT_ENABLE
1766 ms_ucs_map_f = UCS_MAP_CP932;
1770 if (strcmp(long_option[i].name, "no-cp932") == 0){
1771 #ifdef SHIFTJIS_CP932
1775 #ifdef UTF8_OUTPUT_ENABLE
1776 ms_ucs_map_f = UCS_MAP_ASCII;
1780 #ifdef SHIFTJIS_CP932
1781 if (strcmp(long_option[i].name, "cp932inv") == 0){
1788 if (strcmp(long_option[i].name, "x0212") == 0){
1795 if (strcmp(long_option[i].name, "exec-in") == 0){
1799 if (strcmp(long_option[i].name, "exec-out") == 0){
1804 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1805 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1806 no_cp932ext_f = TRUE;
1809 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1810 no_best_fit_chars_f = TRUE;
1813 if (strcmp(long_option[i].name, "fb-skip") == 0){
1814 encode_fallback = NULL;
1817 if (strcmp(long_option[i].name, "fb-html") == 0){
1818 encode_fallback = encode_fallback_html;
1821 if (strcmp(long_option[i].name, "fb-xml") == 0){
1822 encode_fallback = encode_fallback_xml;
1825 if (strcmp(long_option[i].name, "fb-java") == 0){
1826 encode_fallback = encode_fallback_java;
1829 if (strcmp(long_option[i].name, "fb-perl") == 0){
1830 encode_fallback = encode_fallback_perl;
1833 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1834 encode_fallback = encode_fallback_subchar;
1837 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1838 encode_fallback = encode_fallback_subchar;
1839 unicode_subchar = 0;
1841 /* decimal number */
1842 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1843 unicode_subchar *= 10;
1844 unicode_subchar += hex2bin(p[i]);
1846 }else if(p[1] == 'x' || p[1] == 'X'){
1847 /* hexadecimal number */
1848 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1849 unicode_subchar <<= 4;
1850 unicode_subchar |= hex2bin(p[i]);
1854 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1855 unicode_subchar *= 8;
1856 unicode_subchar += hex2bin(p[i]);
1859 w16e_conv(unicode_subchar, &i, &j);
1860 unicode_subchar = i<<8 | j;
1864 #ifdef UTF8_OUTPUT_ENABLE
1865 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1866 ms_ucs_map_f = UCS_MAP_MS;
1870 #ifdef UNICODE_NORMALIZATION
1871 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1872 input_f = UTF8_INPUT;
1877 if (strcmp(long_option[i].name, "prefix=") == 0){
1878 if (nkf_isgraph(p[0])){
1879 for (i = 1; nkf_isgraph(p[i]); i++){
1880 prefix_table[p[i]] = p[0];
1887 case 'b': /* buffered mode */
1890 case 'u': /* non bufferd mode */
1893 case 't': /* transparent mode */
1898 } else if (*cp=='2') {
1902 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1910 case 'j': /* JIS output */
1912 output_conv = j_oconv;
1913 output_encoding = nkf_enc_from_index(ISO_2022_JP);
1915 case 'e': /* AT&T EUC output */
1916 output_conv = e_oconv;
1918 output_encoding = nkf_enc_from_index(EUC_JP);
1920 case 's': /* SJIS output */
1921 output_conv = s_oconv;
1922 output_encoding = nkf_enc_from_index(SHIFT_JIS);
1924 case 'l': /* ISO8859 Latin-1 support, no conversion */
1925 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1926 input_f = LATIN1_INPUT;
1928 case 'i': /* Kanji IN ESC-$-@/B */
1929 if (*cp=='@'||*cp=='B')
1930 kanji_intro = *cp++;
1932 case 'o': /* ASCII IN ESC-(-J/B */
1933 if (*cp=='J'||*cp=='B'||*cp=='H')
1934 ascii_intro = *cp++;
1938 bit:1 katakana->hiragana
1939 bit:2 hiragana->katakana
1941 if ('9'>= *cp && *cp>='0')
1942 hira_f |= (*cp++ -'0');
1949 #if defined(MSDOS) || defined(__OS2__)
1956 show_configuration();
1964 #ifdef UTF8_OUTPUT_ENABLE
1965 case 'w': /* UTF-8 output */
1967 output_conv = w_oconv; cp++;
1970 output_encoding = nkf_enc_from_index(UTF_8N);
1972 output_bom_f = TRUE;
1973 output_encoding = nkf_enc_from_index(UTF_8_BOM);
1977 if ('1'== cp[0] && '6'==cp[1]) {
1978 output_conv = w_oconv16; cp+=2;
1980 } else if ('3'== cp[0] && '2'==cp[1]) {
1981 output_conv = w_oconv32; cp+=2;
1984 output_conv = w_oconv;
1985 output_encoding = nkf_enc_from_index(UTF_8);
1990 output_endian = ENDIAN_LITTLE;
1991 } else if (cp[0] == 'B') {
1994 output_encoding = nkf_enc_from_index(enc_idx);
1999 enc_idx = enc_idx == UTF_16
2000 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
2001 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
2003 output_bom_f = TRUE;
2004 enc_idx = enc_idx == UTF_16
2005 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
2006 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
2008 output_encoding = nkf_enc_from_index(enc_idx);
2012 #ifdef UTF8_INPUT_ENABLE
2013 case 'W': /* UTF input */
2016 input_f = UTF8_INPUT;
2018 if ('1'== cp[0] && '6'==cp[1]) {
2020 input_f = UTF16_INPUT;
2021 input_endian = ENDIAN_BIG;
2022 } else if ('3'== cp[0] && '2'==cp[1]) {
2024 input_f = UTF32_INPUT;
2025 input_endian = ENDIAN_BIG;
2027 input_f = UTF8_INPUT;
2032 input_endian = ENDIAN_LITTLE;
2033 } else if (cp[0] == 'B') {
2039 /* Input code assumption */
2040 case 'J': /* JIS input */
2041 input_f = JIS_INPUT;
2043 case 'E': /* AT&T EUC input */
2044 input_f = EUC_INPUT;
2046 case 'S': /* MS Kanji input */
2047 input_f = SJIS_INPUT;
2049 case 'Z': /* Convert X0208 alphabet to asii */
2051 bit:0 Convert JIS X 0208 Alphabet to ASCII
2052 bit:1 Convert Kankaku to one space
2053 bit:2 Convert Kankaku to two spaces
2054 bit:3 Convert HTML Entity
2055 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
2057 while ('0'<= *cp && *cp <='9') {
2058 alpha_f |= 1 << (*cp++ - '0');
2060 if (!alpha_f) alpha_f = 1;
2062 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
2063 x0201_f = FALSE; /* No X0201->X0208 conversion */
2065 ESC-(-I in JIS, EUC, MS Kanji
2066 SI/SO in JIS, EUC, MS Kanji
2067 SSO in EUC, JIS, not in MS Kanji
2068 MS Kanji (0xa0-0xdf)
2070 ESC-(-I in JIS (0x20-0x5f)
2071 SSO in EUC (0xa0-0xdf)
2072 0xa0-0xd in MS Kanji (0xa0-0xdf)
2075 case 'X': /* Convert X0201 kana to X0208 */
2078 case 'F': /* prserve new lines */
2079 fold_preserve_f = TRUE;
2080 case 'f': /* folding -f60 or -f */
2083 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2085 fold_len += *cp++ - '0';
2087 if (!(0<fold_len && fold_len<BUFSIZ))
2088 fold_len = DEFAULT_FOLD;
2092 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2094 fold_margin += *cp++ - '0';
2098 case 'm': /* MIME support */
2099 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
2100 if (*cp=='B'||*cp=='Q') {
2101 mime_decode_mode = *cp++;
2102 mimebuf_f = FIXED_MIME;
2103 } else if (*cp=='N') {
2104 mime_f = TRUE; cp++;
2105 } else if (*cp=='S') {
2106 mime_f = STRICT_MIME; cp++;
2107 } else if (*cp=='0') {
2108 mime_decode_f = FALSE;
2109 mime_f = FALSE; cp++;
2112 case 'M': /* MIME output */
2115 mimeout_f = FIXED_MIME; cp++;
2116 } else if (*cp=='Q') {
2118 mimeout_f = FIXED_MIME; cp++;
2123 case 'B': /* Broken JIS support */
2125 bit:1 allow any x on ESC-(-x or ESC-$-x
2126 bit:2 reset to ascii on NL
2128 if ('9'>= *cp && *cp>='0')
2129 broken_f |= 1<<(*cp++ -'0');
2134 case 'O':/* for Output file */
2138 case 'c':/* add cr code */
2141 case 'd':/* delete cr code */
2144 case 'I': /* ISO-2022-JP output */
2147 case 'L': /* line mode */
2148 if (*cp=='u') { /* unix */
2149 nlmode_f = LF; cp++;
2150 } else if (*cp=='m') { /* mac */
2151 nlmode_f = CR; cp++;
2152 } else if (*cp=='w') { /* windows */
2153 nlmode_f = CRLF; cp++;
2154 } else if (*cp=='0') { /* no conversion */
2163 } else if (*cp == '0') {
2172 /* multiple options in a string are allowed for the Perl module */
2173 while(*cp && *cp++!='-');
2176 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
2177 /* bogus option but ignored */
2183 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2186 struct input_code *p = input_code_list;
2188 if (iconv_func == p->iconv_func){
2197 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2199 #ifdef INPUT_CODE_FIX
2207 #ifdef INPUT_CODE_FIX
2208 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
2214 if (estab_f && iconv_for_check != iconv){
2215 struct input_code *p = find_inputcode_byfunc(iconv);
2217 set_input_codename(p->name);
2220 iconv_for_check = iconv;
/* Bit flags OR-ed into an input_code's score; each flag doubles the previous one. */
2225 #define SCORE_L2 (1) /* JIS level-2 kanji */
2226 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2227 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
2228 #define SCORE_CP932 (SCORE_DEPEND << 1) /* re-mapped by CP932 (IBM extended characters) */
2229 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2230 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* non-existent character */
2231 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2232 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
2234 #define SCORE_INIT (SCORE_iMIME)
2236 static const char score_table_A0[] = {
2239 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2240 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2243 static const char score_table_F0[] = {
2244 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2245 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2246 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2247 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2250 void set_code_score(struct input_code *ptr, nkf_char score)
2253 ptr->score |= score;
2257 void clr_code_score(struct input_code *ptr, nkf_char score)
2260 ptr->score &= ~score;
2264 void code_score(struct input_code *ptr)
2266 nkf_char c2 = ptr->buf[0];
2267 #ifdef UTF8_OUTPUT_ENABLE
2268 nkf_char c1 = ptr->buf[1];
2271 set_code_score(ptr, SCORE_ERROR);
2272 }else if (c2 == SSO){
2273 set_code_score(ptr, SCORE_KANA);
2274 }else if (c2 == 0x8f){
2275 set_code_score(ptr, SCORE_X0212);
2276 #ifdef UTF8_OUTPUT_ENABLE
2277 }else if (!e2w_conv(c2, c1)){
2278 set_code_score(ptr, SCORE_NO_EXIST);
2280 }else if ((c2 & 0x70) == 0x20){
2281 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2282 }else if ((c2 & 0x70) == 0x70){
2283 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2284 }else if ((c2 & 0x70) >= 0x50){
2285 set_code_score(ptr, SCORE_L2);
2289 void status_disable(struct input_code *ptr)
2294 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2297 void status_push_ch(struct input_code *ptr, nkf_char c)
2299 ptr->buf[ptr->index++] = c;
2302 void status_clear(struct input_code *ptr)
2308 void status_reset(struct input_code *ptr)
2311 ptr->score = SCORE_INIT;
2314 void status_reinit(struct input_code *ptr)
2317 ptr->_file_stat = 0;
2320 void status_check(struct input_code *ptr, nkf_char c)
2322 if (c <= DEL && estab_f){
2327 void s_status(struct input_code *ptr, nkf_char c)
2331 status_check(ptr, c);
2336 #ifdef NUMCHAR_OPTION
2337 }else if (is_unicode_capsule(c)){
2340 }else if (0xa1 <= c && c <= 0xdf){
2341 status_push_ch(ptr, SSO);
2342 status_push_ch(ptr, c);
2345 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2347 status_push_ch(ptr, c);
2348 }else if (0xed <= c && c <= 0xee){
2350 status_push_ch(ptr, c);
2351 #ifdef SHIFTJIS_CP932
2352 }else if (is_ibmext_in_sjis(c)){
2354 status_push_ch(ptr, c);
2355 #endif /* SHIFTJIS_CP932 */
2357 }else if (0xf0 <= c && c <= 0xfc){
2359 status_push_ch(ptr, c);
2360 #endif /* X0212_ENABLE */
2362 status_disable(ptr);
2366 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2367 status_push_ch(ptr, c);
2368 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2372 status_disable(ptr);
2376 #ifdef SHIFTJIS_CP932
2377 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2378 status_push_ch(ptr, c);
2379 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2380 set_code_score(ptr, SCORE_CP932);
2385 #endif /* SHIFTJIS_CP932 */
2386 status_disable(ptr);
2389 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2390 status_push_ch(ptr, c);
2391 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2392 set_code_score(ptr, SCORE_CP932);
2395 status_disable(ptr);
2401 void e_status(struct input_code *ptr, nkf_char c)
2405 status_check(ptr, c);
2410 #ifdef NUMCHAR_OPTION
2411 }else if (is_unicode_capsule(c)){
2414 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2416 status_push_ch(ptr, c);
2418 }else if (0x8f == c){
2420 status_push_ch(ptr, c);
2421 #endif /* X0212_ENABLE */
2423 status_disable(ptr);
2427 if (0xa1 <= c && c <= 0xfe){
2428 status_push_ch(ptr, c);
2432 status_disable(ptr);
2437 if (0xa1 <= c && c <= 0xfe){
2439 status_push_ch(ptr, c);
2441 status_disable(ptr);
2443 #endif /* X0212_ENABLE */
2447 #ifdef UTF8_INPUT_ENABLE
2448 void w_status(struct input_code *ptr, nkf_char c)
2452 status_check(ptr, c);
2457 #ifdef NUMCHAR_OPTION
2458 }else if (is_unicode_capsule(c)){
2461 }else if (0xc0 <= c && c <= 0xdf){
2463 status_push_ch(ptr, c);
2464 }else if (0xe0 <= c && c <= 0xef){
2466 status_push_ch(ptr, c);
2467 }else if (0xf0 <= c && c <= 0xf4){
2469 status_push_ch(ptr, c);
2471 status_disable(ptr);
2476 if (0x80 <= c && c <= 0xbf){
2477 status_push_ch(ptr, c);
2478 if (ptr->index > ptr->stat){
2479 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2480 && ptr->buf[2] == 0xbf);
2481 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2482 &ptr->buf[0], &ptr->buf[1]);
2489 status_disable(ptr);
2493 if (0x80 <= c && c <= 0xbf){
2494 if (ptr->index < ptr->stat){
2495 status_push_ch(ptr, c);
2500 status_disable(ptr);
2507 void code_status(nkf_char c)
2509 int action_flag = 1;
2510 struct input_code *result = 0;
2511 struct input_code *p = input_code_list;
2513 if (!p->status_func) {
2517 if (!p->status_func)
2519 (p->status_func)(p, c);
2522 }else if(p->stat == 0){
2533 if (result && !estab_f){
2534 set_iconv(TRUE, result->iconv_func);
2535 }else if (c <= DEL){
2536 struct input_code *ptr = input_code_list;
2546 nkf_char std_getc(FILE *f)
2549 return std_gc_buf[--std_gc_ndx];
2555 nkf_char std_ungetc(nkf_char c, FILE *f)
2557 if (std_gc_ndx == STD_GC_BUFSIZE){
2560 std_gc_buf[std_gc_ndx++] = c;
2565 void std_putc(nkf_char c)
2572 #if !defined(PERL_XS) && !defined(WIN32DLL)
2573 nkf_char noconvert(FILE *f)
2578 module_connection();
2579 while ((c = (*i_getc)(f)) != EOF)
2586 void module_connection(void)
2588 oconv = output_conv;
2591 /* replace continuation module, from output side */
2593 /* output redirection */
2595 if (noout_f || guess_f){
2602 if (mimeout_f == TRUE) {
2603 o_base64conv = oconv; oconv = base64_conv;
2605 /* base64_count = 0; */
2608 if (nlmode_f || guess_f) {
2609 o_nlconv = oconv; oconv = nl_conv;
2612 o_rot_conv = oconv; oconv = rot_conv;
2615 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2618 o_hira_conv = oconv; oconv = hira_conv;
2621 o_fconv = oconv; oconv = fold_conv;
2624 if (alpha_f || x0201_f) {
2625 o_zconv = oconv; oconv = z_conv;
2629 i_ungetc = std_ungetc;
2630 /* input redirection */
2633 i_cgetc = i_getc; i_getc = cap_getc;
2634 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2637 i_ugetc = i_getc; i_getc = url_getc;
2638 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2641 #ifdef NUMCHAR_OPTION
2643 i_ngetc = i_getc; i_getc = numchar_getc;
2644 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2647 #ifdef UNICODE_NORMALIZATION
2648 if (nfc_f && input_f == UTF8_INPUT){
2649 i_nfc_getc = i_getc; i_getc = nfc_getc;
2650 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2653 if (mime_f && mimebuf_f==FIXED_MIME) {
2654 i_mgetc = i_getc; i_getc = mime_getc;
2655 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2658 i_bgetc = i_getc; i_getc = broken_getc;
2659 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2661 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2662 set_iconv(-TRUE, e_iconv);
2663 } else if (input_f == SJIS_INPUT) {
2664 set_iconv(-TRUE, s_iconv);
2665 #ifdef UTF8_INPUT_ENABLE
2666 } else if (input_f == UTF8_INPUT) {
2667 set_iconv(-TRUE, w_iconv);
2668 } else if (input_f == UTF16_INPUT) {
2669 set_iconv(-TRUE, w_iconv16);
2670 } else if (input_f == UTF32_INPUT) {
2671 set_iconv(-TRUE, w_iconv32);
2674 set_iconv(FALSE, e_iconv);
2678 struct input_code *p = input_code_list;
2686 * Check and Ignore BOM
2688 void check_bom(FILE *f)
2691 switch(c2 = (*i_getc)(f)){
2693 if((c2 = (*i_getc)(f)) == 0x00){
2694 if((c2 = (*i_getc)(f)) == 0xFE){
2695 if((c2 = (*i_getc)(f)) == 0xFF){
2697 set_iconv(TRUE, w_iconv32);
2699 if (iconv == w_iconv32) {
2700 input_endian = ENDIAN_BIG;
2703 (*i_ungetc)(0xFF,f);
2704 }else (*i_ungetc)(c2,f);
2705 (*i_ungetc)(0xFE,f);
2706 }else if(c2 == 0xFF){
2707 if((c2 = (*i_getc)(f)) == 0xFE){
2709 set_iconv(TRUE, w_iconv32);
2711 if (iconv == w_iconv32) {
2712 input_endian = ENDIAN_2143;
2715 (*i_ungetc)(0xFF,f);
2716 }else (*i_ungetc)(c2,f);
2717 (*i_ungetc)(0xFF,f);
2718 }else (*i_ungetc)(c2,f);
2719 (*i_ungetc)(0x00,f);
2720 }else (*i_ungetc)(c2,f);
2721 (*i_ungetc)(0x00,f);
2724 if((c2 = (*i_getc)(f)) == 0xBB){
2725 if((c2 = (*i_getc)(f)) == 0xBF){
2727 set_iconv(TRUE, w_iconv);
2729 if (iconv == w_iconv) {
2732 (*i_ungetc)(0xBF,f);
2733 }else (*i_ungetc)(c2,f);
2734 (*i_ungetc)(0xBB,f);
2735 }else (*i_ungetc)(c2,f);
2736 (*i_ungetc)(0xEF,f);
2739 if((c2 = (*i_getc)(f)) == 0xFF){
2740 if((c2 = (*i_getc)(f)) == 0x00){
2741 if((c2 = (*i_getc)(f)) == 0x00){
2743 set_iconv(TRUE, w_iconv32);
2745 if (iconv == w_iconv32) {
2746 input_endian = ENDIAN_3412;
2749 (*i_ungetc)(0x00,f);
2750 }else (*i_ungetc)(c2,f);
2751 (*i_ungetc)(0x00,f);
2752 }else (*i_ungetc)(c2,f);
2754 set_iconv(TRUE, w_iconv16);
2756 if (iconv == w_iconv16) {
2757 input_endian = ENDIAN_BIG;
2760 (*i_ungetc)(0xFF,f);
2761 }else (*i_ungetc)(c2,f);
2762 (*i_ungetc)(0xFE,f);
2765 if((c2 = (*i_getc)(f)) == 0xFE){
2766 if((c2 = (*i_getc)(f)) == 0x00){
2767 if((c2 = (*i_getc)(f)) == 0x00){
2769 set_iconv(TRUE, w_iconv32);
2771 if (iconv == w_iconv32) {
2772 input_endian = ENDIAN_LITTLE;
2775 (*i_ungetc)(0x00,f);
2776 }else (*i_ungetc)(c2,f);
2777 (*i_ungetc)(0x00,f);
2778 }else (*i_ungetc)(c2,f);
2780 set_iconv(TRUE, w_iconv16);
2782 if (iconv == w_iconv16) {
2783 input_endian = ENDIAN_LITTLE;
2786 (*i_ungetc)(0xFE,f);
2787 }else (*i_ungetc)(c2,f);
2788 (*i_ungetc)(0xFF,f);
2797 Conversion main loop. Code detection only.
2800 nkf_char kanji_convert(FILE *f)
2802 nkf_char c3, c2=0, c1, c0=0;
2803 int is_8bit = FALSE;
2805 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2806 #ifdef UTF8_INPUT_ENABLE
2807 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2814 output_mode = ASCII;
2817 #define NEXT continue /* no output, get next */
2818 #define SEND ; /* output c1 and c2, get next */
2819 #define LAST break /* end of loop, go closing */
2821 module_connection();
2824 while ((c1 = (*i_getc)(f)) != EOF) {
2825 #ifdef INPUT_CODE_FIX
2831 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2832 /* in case of 8th bit is on */
2833 if (!estab_f&&!mime_decode_mode) {
2834 /* in case of not established yet */
2835 /* It is still ambiguous */
2836 if (h_conv(f, c2, c1)==EOF)
2842 /* in case of already established */
2844 /* ignore bogus code and not CP5022x UCD */
2852 /* second byte, 7 bit code */
2853 /* it might be kanji shifted */
2854 if ((c1 == DEL) || (c1 <= SP)) {
2855 /* ignore bogus first code */
2862 #ifdef UTF8_INPUT_ENABLE
2863 if (iconv == w_iconv16) {
2864 if (input_endian == ENDIAN_BIG) {
2866 if ((c1 = (*i_getc)(f)) != EOF) {
2867 if (0xD8 <= c2 && c2 <= 0xDB) {
2868 if ((c0 = (*i_getc)(f)) != EOF) {
2870 if ((c3 = (*i_getc)(f)) != EOF) {
2877 if ((c2 = (*i_getc)(f)) != EOF) {
2878 if (0xD8 <= c2 && c2 <= 0xDB) {
2879 if ((c3 = (*i_getc)(f)) != EOF) {
2880 if ((c0 = (*i_getc)(f)) != EOF) {
2889 } else if(iconv == w_iconv32){
2891 if((c2 = (*i_getc)(f)) != EOF &&
2892 (c1 = (*i_getc)(f)) != EOF &&
2893 (c0 = (*i_getc)(f)) != EOF){
2894 switch(input_endian){
2896 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2899 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2902 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2905 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2915 #ifdef NUMCHAR_OPTION
2916 if (is_unicode_capsule(c1)){
2920 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2922 if (!estab_f && !iso8859_f) {
2923 /* not established yet */
2926 } else { /* estab_f==TRUE */
2931 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2932 /* SJIS X0201 Case... */
2933 if (iso2022jp_f && !x0201_f) {
2934 (*oconv)(GETA1, GETA2);
2941 } else if (c1==SSO && iconv != s_iconv) {
2942 /* EUC X0201 Case */
2943 c1 = (*i_getc)(f); /* skip SSO */
2945 if (SSP<=c1 && c1<0xe0) {
2946 if (iso2022jp_f && !x0201_f) {
2947 (*oconv)(GETA1, GETA2);
2954 } else { /* bogus code, skip SSO and one byte */
2957 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2958 (c1 == 0xFD || c1 == 0xFE)) {
2964 /* already established */
2969 } else if ((c1 > SP) && (c1 != DEL)) {
2970 /* in case of Roman characters */
2972 /* output 1 shifted byte */
2976 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2977 /* output 1 shifted byte */
2978 if (iso2022jp_f && !x0201_f) {
2979 (*oconv)(GETA1, GETA2);
2986 /* look like bogus code */
2989 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
2990 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
2991 /* in case of Kanji shifted */
2994 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2995 /* Check MIME code */
2996 if ((c1 = (*i_getc)(f)) == EOF) {
2999 } else if (c1 == '?') {
3000 /* =? is mime conversion start sequence */
3001 if(mime_f == STRICT_MIME) {
3002 /* check in real detail */
3003 if (mime_begin_strict(f) == EOF)
3007 } else if (mime_begin(f) == EOF)
3017 /* normal ASCII code */
3020 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
3023 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
3026 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
3027 if ((c1 = (*i_getc)(f)) == EOF) {
3028 /* (*oconv)(0, ESC); don't send bogus code */
3030 } else if (c1 == '$') {
3031 if ((c1 = (*i_getc)(f)) == EOF) {
3033 (*oconv)(0, ESC); don't send bogus code
3034 (*oconv)(0, '$'); */
3036 } else if (c1 == '@'|| c1 == 'B') {
3037 /* This is kanji introduction */
3038 input_mode = JIS_X_0208;
3040 set_input_codename("ISO-2022-JP");
3042 debug("ISO-2022-JP");
3045 } else if (c1 == '(') {
3046 if ((c1 = (*i_getc)(f)) == EOF) {
3047 /* don't send bogus code
3053 } else if (c1 == '@'|| c1 == 'B') {
3054 /* This is kanji introduction */
3055 input_mode = JIS_X_0208;
3059 } else if (c1 == 'D'){
3060 input_mode = JIS_X_0212;
3063 #endif /* X0212_ENABLE */
3064 } else if (c1 == 0x4F){
3065 input_mode = JIS_X_0213_1;
3068 } else if (c1 == 0x50){
3069 input_mode = JIS_X_0213_2;
3073 /* could be some special code */
3080 } else if (broken_f&0x2) {
3081 /* accept any ESC-(-x as broken code ... */
3082 input_mode = JIS_X_0208;
3091 } else if (c1 == '(') {
3092 if ((c1 = (*i_getc)(f)) == EOF) {
3093 /* don't send bogus code
3095 (*oconv)(0, '('); */
3099 /* This is X0201 kana introduction */
3100 input_mode = JIS_X_0201; shift_mode = JIS_X_0201;
3102 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
3103 /* This is ASCII (or JIS Roman) introduction */
3104 input_mode = ASCII; shift_mode = FALSE;
3106 } else if (broken_f&0x2) {
3107 input_mode = ASCII; shift_mode = FALSE;
3112 /* maintain various input_mode here */
3116 } else if ( c1 == 'N' || c1 == 'n'){
3118 c3 = (*i_getc)(f); /* skip SS2 */
3119 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
3134 } else if (c1 == ESC && iconv == s_iconv) {
3135 /* ESC in Shift_JIS */
3136 if ((c1 = (*i_getc)(f)) == EOF) {
3137 /* (*oconv)(0, ESC); don't send bogus code */
3139 } else if (c1 == '$') {
3141 if ((c1 = (*i_getc)(f)) == EOF) {
3143 (*oconv)(0, ESC); don't send bogus code
3144 (*oconv)(0, '$'); */
3147 if (('E' <= c1 && c1 <= 'G') ||
3148 ('O' <= c1 && c1 <= 'Q')) {
3156 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
3157 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
3158 while ((c1 = (*i_getc)(f)) != EOF) {
3159 if (SP <= c1 && c1 <= 'z') {
3160 (*oconv)(0, c1 + c0);
3161 } else break; /* c1 == SO */
3165 if (c1 == EOF) LAST;
3172 } else if (c1 == LF || c1 == CR) {
3174 input_mode = ASCII; set_iconv(FALSE, 0);
3176 } else if (mime_decode_f && !mime_decode_mode){
3178 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
3186 } else { /* if (c1 == CR)*/
3187 if ((c1=(*i_getc)(f))!=EOF) {
3191 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
3205 } else if (c1 == DEL && input_mode == JIS_X_0208) {
3215 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
3218 if ((c0 = (*i_getc)(f)) != EOF) {
3221 if ((c3 = (*i_getc)(f)) != EOF) {
3223 (*iconv)(c2, c1, c0|c3);
3228 /* 3 bytes EUC or UTF-8 */
3229 if ((c0 = (*i_getc)(f)) != EOF) {
3231 (*iconv)(c2, c1, c0);
3239 0x7F <= c2 && c2 <= 0x92 &&
3240 0x21 <= c1 && c1 <= 0x7E) {
3242 if(c1 == 0x7F) return 0;
3243 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
3246 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3250 (*oconv)(PREFIX_EUCG3 | c2, c1);
3252 #endif /* X0212_ENABLE */
3254 (*oconv)(PREFIX_EUCG3 | c2, c1);
3257 (*oconv)(input_mode, c1); /* other special case */
3263 /* goto next_word */
3267 (*iconv)(EOF, 0, 0);
3268 if (!input_codename)
3271 struct input_code *p = input_code_list;
3272 struct input_code *result = p;
3274 if (p->score < result->score) result = p;
3277 set_input_codename(result->name);
3279 debug(result->name);
/*
 * h_conv: hold input bytes while the input encoding is still undecided.
 *
 * Visible behavior: bytes read through i_getc are pushed into hold_buf until
 * push_hold_buf() returns EOF or estab_f becomes set.  An entry of
 * input_code_list with a status_func and a lower score than the current
 * candidate replaces the candidate, and the candidate's iconv_func is
 * installed with set_iconv().  The held bytes are then replayed through
 * iconv, pulling extra bytes from hold_buf (or from i_getc once hold_buf is
 * drained) for 3- and 4-byte sequences.
 *
 * NOTE(review): many lines of this function are not visible in this listing,
 * so the exact nesting of the statements below is not shown.
 */
3287 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3289 nkf_char ret, c3, c0;
3293 /** it must NOT be in the kanji shift sequence */
3294 /** it must NOT be written in JIS7 */
3295 /** and it must be after 2 byte 8bit code */
3301 while ((c1 = (*i_getc)(f)) != EOF) {
3307 if (push_hold_buf(c1) == EOF || estab_f){
3313 struct input_code *p = input_code_list;
3314 struct input_code *result = p;
3319 if (p->status_func && p->score < result->score){
3324 set_iconv(TRUE, result->iconv_func);
3329 ** 1) EOF is detected, or
3330 ** 2) Code is established, or
3331 ** 3) Buffer is FULL (but last word is pushed)
3333 ** in 1) and 3) cases, we continue to use
3334 ** Kanji codes by oconv and leave estab_f unchanged.
3339 while (hold_index < hold_count){
3340 c2 = hold_buf[hold_index++];
3342 #ifdef NUMCHAR_OPTION
3343 || is_unicode_capsule(c2)
3348 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3349 (*iconv)(JIS_X_0201, c2, 0);
3352 if (hold_index < hold_count){
3353 c1 = hold_buf[hold_index++];
3363 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3366 if (hold_index < hold_count){
3367 c0 = hold_buf[hold_index++];
3368 } else if ((c0 = (*i_getc)(f)) == EOF) {
3374 if (hold_index < hold_count){
3375 c3 = hold_buf[hold_index++];
3376 } else if ((c3 = (*i_getc)(f)) == EOF) {
3381 (*iconv)(c2, c1, c0|c3);
3386 /* 3 bytes EUC or UTF-8 */
3387 if (hold_index < hold_count){
3388 c0 = hold_buf[hold_index++];
3389 } else if ((c0 = (*i_getc)(f)) == EOF) {
3395 (*iconv)(c2, c1, c0);
3398 if (c0 == EOF) break;
/*
 * push_hold_buf: append one byte to hold_buf.
 *
 * The value is truncated to unsigned char before being stored.  Returns EOF
 * when hold_count has reached HOLD_SIZE*2 after the store, otherwise the new
 * hold_count.
 */
3403 nkf_char push_hold_buf(nkf_char c2)
/* capacity guard; the guarded statement is not visible in this listing */
3405 if (hold_count >= HOLD_SIZE*2)
3407 hold_buf[hold_count++] = (unsigned char)c2;
3408 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert a Shift_JIS byte pair (c2 = lead, c1 = trail).
 *
 * Visible steps:
 *   - with SHIFTJIS_CP932: IBM-extension lead bytes are looked up in
 *     shiftjis_cp932[] unless cp932inv_f is set, and lead bytes inside
 *     CP932INV_TABLE_BEGIN..CP932INV_TABLE_END are looked up in cp932inv[];
 *   - IBM-extension lead bytes are looked up in shiftjis_x0212[] when
 *     x0213_f is off, producing a PREFIX_EUCG3-tagged row;
 *   - with x0213_f and c2 >= 0xF0 the row is derived from
 *     shift_jisx0213_s1a3_table[] or arithmetically and tagged PREFIX_EUCG3;
 *   - otherwise the lead/trail bytes are converted arithmetically
 *     (SJ0162 / SJ6394 offsets), followed by x0212_unshift().
 *
 * NOTE(review): the stores through p2/p1 and the return statements are not
 * visible in this listing; presumably the result is written to *p2/*p1.
 */
3411 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3413 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3416 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3417 #ifdef SHIFTJIS_CP932
3418 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3419 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3426 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3427 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3433 #endif /* SHIFTJIS_CP932 */
3435 if (!x0213_f && is_ibmext_in_sjis(c2)){
3436 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3439 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3452 if(x0213_f && c2 >= 0xF0){
3453 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3454 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3455 }else{ /* 78<=k<=94 */
3456 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3457 if (0x9E < c1) c2++;
/* generic case: two rows per lead byte; trail bytes above 0x9E select the second row */
3460 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3461 if (0x9E < c1) c2++;
3464 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3471 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter for Shift_JIS.
 *
 * Visible behavior: JIS_X_0201 and EOF/0/control lead values take their own
 * branches (bodies not visible here).  When x0213_f is off, lead bytes
 * 0xF0..0xF9 with trail bytes 0x40..0xFC are mapped to a CLASS_UNICODE value
 * starting at 0xE000, 188 cells per lead byte; a trail byte of 0x7F makes the
 * function return 0.  Other pairs are passed through s2e_conv(), whose
 * non-zero result is returned to the caller.
 */
3478 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3480 if (c2 == JIS_X_0201) {
3482 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3484 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3486 if(c1 == 0x7F) return 0;
/* (0x7E < c1) skips the unused trail byte 0x7F so cells stay contiguous */
3487 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3490 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3491 if (ret) return ret;
/*
 * e_iconv: input converter for EUC-JP.
 *
 * Visible behavior:
 *   - c2 == 0x8f (three-byte form): unless cp51932_f or x0213_f is set, rows
 *     0xF5..0xFE are mapped to CLASS_UNICODE values starting at 0xE3AC;
 *     otherwise c2 is combined with c1 as (c2 << 8) | (c1 & 0x7f), and with
 *     SHIFTJIS_CP932 the pair is round-tripped through e2s_conv()/s2e_conv();
 *   - two-byte form: when ms_ucs_map_f is set and cp51932_f is not, rows
 *     0xF5..0xFE are mapped to CLASS_UNICODE values starting at 0xE000;
 *     with cp51932_f, rows 0x79..0x7c are round-tripped through
 *     e2s_conv()/s2e_conv().
 * The JIS_X_0201, SSO and EOF/0/control branches have bodies that are not
 * visible in this listing.
 */
3497 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3499 if (c2 == JIS_X_0201) {
3502 }else if (c2 == 0x8f){
3506 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3507 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3508 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3511 c2 = (c2 << 8) | (c1 & 0x7f);
3513 #ifdef SHIFTJIS_CP932
3516 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3517 s2e_conv(s2, s1, &c2, &c1);
3524 #endif /* SHIFTJIS_CP932 */
3526 #endif /* X0212_ENABLE */
3527 } else if (c2 == SSO){
3530 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3533 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3534 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3535 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3540 #ifdef SHIFTJIS_CP932
3541 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3543 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3544 s2e_conv(s2, s1, &c2, &c1);
3551 #endif /* SHIFTJIS_CP932 */
3558 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert UTF-8 bytes (c2, c1, c0) to a (p2, p1) pair.
 *
 * Visible behavior: lead bytes 0xc0..0xef go through
 * unicode_to_jis_common(); under NUMCHAR_OPTION *p1 can instead receive the
 * decoded code point from ww16_conv() tagged with CLASS_UNICODE.
 * NOTE(review): the conditions around that fallback are not visible here.
 */
3559 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3566 }else if (0xc0 <= c2 && c2 <= 0xef) {
3567 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3568 #ifdef NUMCHAR_OPTION
3571 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input converter for UTF-8.
 *
 * The lead byte (0xC0 and up) is classified through w_iconv_utf8_1st_byte[];
 * each class validates the continuation bytes with its own range for c1.
 * Three-byte classes return -1 and four-byte classes return -2 while c0 is
 * still 0 -- presumably a request to the caller for the remaining bytes
 * (confirm against the caller).  For four-byte input c0 carries two bytes,
 * hence the 0xc0c0 / 0x8080 masks.  Valid four-byte sequences are decoded
 * with ww16_conv() and tagged CLASS_UNICODE; shorter ones go through
 * w2e_conv().
 */
3579 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* class per lead byte, indexed by (c2 - 0xC0) */
3582 static const char w_iconv_utf8_1st_byte[] =
3584 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3585 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3586 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3587 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3589 if (c2 < 0 || 0xff < c2) {
3590 }else if (c2 == 0) { /* 0 : 1 byte*/
3592 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3595 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3597 if (c1 < 0x80 || 0xBF < c1) return 0;
3600 if (c0 == 0) return -1;
3601 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3606 if (c0 == 0) return -1;
3607 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3611 if (c0 == 0) return -1;
3612 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3616 if (c0 == 0) return -2;
3617 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3621 if (c0 == 0) return -2;
3622 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3626 if (c0 == 0) return -2;
3627 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3635 if (c2 == 0 || c2 == EOF){
3636 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3637 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3640 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3649 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3650 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3657 }else if (val < 0x800){
3658 *p2 = 0xc0 | (val >> 6);
3659 *p1 = 0x80 | (val & 0x3f);
3661 } else if (val <= NKF_INT32_C(0xFFFF)) {
3662 *p2 = 0xe0 | (val >> 12);
3663 *p1 = 0x80 | ((val >> 6) & 0x3f);
3664 *p0 = 0x80 | (val & 0x3f);
3665 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3666 *p2 = 0xe0 | (val >> 16);
3667 *p1 = 0x80 | ((val >> 12) & 0x3f);
3668 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3677 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode UTF-8 bytes into a code point.
 *
 * c2 is the lead byte and c1 the second byte.  For four-byte input
 * (c2 >= 0xf0) c0 carries the third and fourth bytes packed together; for
 * three-byte input c0 is presumably the plain third byte (the line using it
 * is not visible in this listing).
 */
3678 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3683 } else if (c2 >= 0xf0){
3684 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3685 val = (c2 & 0x0f) << 18;
3686 val |= (c1 & 0x3f) << 12;
/* third byte sits in bits 8..13 of c0; shift it down to bits 6..11 */
3687 val |= (c0 & 0x3f00) >> 2;
3689 }else if (c2 >= 0xe0){
3690 val = (c2 & 0x0f) << 12;
3691 val |= (c1 & 0x3f) << 6;
3693 }else if (c2 >= 0xc0){
3694 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert a Unicode code point to a (p2, p1) pair.
 *
 * Visible behavior: the code point is re-encoded as UTF-8 with w16w_conv()
 * and looked up with unicode_to_jis_common().  Under NUMCHAR_OPTION *p1 can
 * instead be set to the code point tagged with CLASS_UNICODE
 * (NOTE(review): the condition guarding that store is not visible here).
 */
3702 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3704 nkf_char c2, c1, c0;
3711 w16w_conv(val, &c2, &c1, &c0);
3712 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3713 #ifdef NUMCHAR_OPTION
3716 *p1 = CLASS_UNICODE | val;
3725 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input converter for UTF-16.
 *
 * c2 is the high byte and c1 the low byte of the first code unit.  When c2
 * is 0xD8..0xDB (high surrogate) c0 must be a low surrogate 0xDC00..0xDFFF;
 * the pair is combined into one code point and tagged CLASS_UNICODE.  Other
 * values with (c2 >> 3) == 27 are treated as unpaired surrogates.  Everything
 * else is converted with w16e_conv(); a non-zero result is returned.
 */
3726 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3729 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3732 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3733 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000: removes both surrogate biases */
3735 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3737 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3742 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3743 if (ret) return ret;
/*
 * w_iconv32: input converter for UTF-32.
 *
 * Visible behavior: c1 holds the code point.  BMP values are converted with
 * w16e_conv(); the other visible path tags c1 with CLASS_UNICODE.  A non-zero
 * conversion result is returned to the caller.
 */
3748 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3752 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3753 } else if (is_unicode_bmp(c1)) {
3754 ret = w16e_conv(c1, &c2, &c1);
3757 c1 = CLASS_UNICODE | c1;
3759 if (ret) return ret;
/*
 * unicode_to_jis_common: look up UTF-8 bytes (c2, c1, c0) in the
 * UTF-8 -> EUC tables and store the result through p2/p1.
 *
 * Visible behavior:
 *   - two-byte input (c2 < 0xe0): with no_best_fit_chars_f set, characters
 *     flagged in the no_best_fit_chars_table_* arrays make the function
 *     return 1; otherwise a table chosen by ms_ucs_map_f is searched with
 *     w_iconv_common();
 *   - three-byte input: with no_best_fit_chars_f set, specific byte
 *     combinations return 1 depending on ms_ucs_map_f; otherwise a table
 *     chosen by ms_ucs_map_f, indexed by (c2 - 0xE0), is searched with
 *     w_iconv_common();
 *   - with SHIFTJIS_CP932, a successful PREFIX_EUCG3 result is round-tripped
 *     through e2s_conv()/s2e_conv() unless cp932inv_f is set.
 *
 * NOTE(review): many case labels and conditions are not visible in this
 * listing, so the per-character exclusions below appear without their
 * enclosing tests.
 */
3764 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3766 const unsigned short *const *pp;
3767 const unsigned short *const *const *ppp;
/* tables below are indexed by (c1 & 0x3F); a non-zero entry makes the lookup return 1 */
3768 static const char no_best_fit_chars_table_C2[] =
3769 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3770 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3771 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3772 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3773 static const char no_best_fit_chars_table_C2_ms[] =
3774 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3775 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3776 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3777 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3778 static const char no_best_fit_chars_table_932_C2[] =
3779 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3780 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3781 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3782 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3783 static const char no_best_fit_chars_table_932_C3[] =
3784 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3785 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3786 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3787 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3793 }else if(c2 < 0xe0){
3794 if(no_best_fit_chars_f){
3795 if(ms_ucs_map_f == UCS_MAP_CP932){
3798 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3801 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3804 }else if(!cp932inv_f){
3807 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3810 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3813 }else if(ms_ucs_map_f == UCS_MAP_MS){
3814 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3815 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3833 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3834 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3835 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3837 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* NOTE(review): this tests c0, not c2.  A single continuation byte is at
 * most 0xBF while a packed four-byte tail is >= 0x8080, so the test appears
 * to separate three-byte from four-byte input -- confirm the intent. */
3838 }else if(c0 < 0xF0){
3839 if(no_best_fit_chars_f){
3840 if(ms_ucs_map_f == UCS_MAP_CP932){
3841 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3842 }else if(ms_ucs_map_f == UCS_MAP_MS){
3847 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3850 if(c0 == 0x92) return 1;
3855 if(c1 == 0x80 || c0 == 0x9C) return 1;
3858 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3863 if(c0 == 0x94) return 1;
3866 if(c0 == 0xBB) return 1;
3876 if(c0 == 0x95) return 1;
3879 if(c0 == 0xA5) return 1;
3886 if(c0 == 0x8D) return 1;
3889 if(c0 == 0x9E && !cp932inv_f) return 1;
3892 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3900 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3901 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3902 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3904 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3906 #ifdef SHIFTJIS_CP932
3907 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3909 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3910 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup shared by the UTF-8 converters.
 *
 * pp is a table of psize row pointers.  The function returns 1 when pp is
 * null, when c1 is outside 0..psize-1, when the selected row is null, when
 * c0 is outside 0..sizeof_utf8_to_euc_C2-1, or when the entry is 0.  With
 * no_cp932ext_f set, NEC special characters (high byte 0x2D) and values above
 * 0xF300 are also filtered (the action taken is not visible in this listing).
 * A resulting c2 equal to SO is replaced by JIS_X_0201.
 */
3919 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3922 const unsigned short *p;
3925 if (pp == 0) return 1;
3928 if (c1 < 0 || psize <= c1) return 1;
3930 if (p == 0) return 1;
3933 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3935 if (val == 0) return 1;
3936 if (no_cp932ext_f && (
3937 (val>>8) == 0x2D || /* NEC special characters */
3938 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3946 if (c2 == SO) c2 = JIS_X_0201;
/*
 * nkf_each_char_to_hex: emit hexadecimal digits of c through the callback f.
 *
 * Each digit is produced with bin2hex(c >> shift) and passed as f(0, digit).
 * NOTE(review): the loop that drives `shift` is not visible in this listing.
 */
3953 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3960 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: fallback output that writes c as decimal digits.
 *
 * Digits are sent through oconv as ASCII (0x30 + digit), most significant
 * first; the visible guards suppress leading digits when c is too small to
 * need them.
 * NOTE(review): any prefix/suffix characters around the digits are not
 * visible in this listing.
 */
3970 void encode_fallback_html(nkf_char c)
3975 if(c >= NKF_INT32_C(1000000))
3976 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3977 if(c >= NKF_INT32_C(100000))
3978 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3980 (*oconv)(0, 0x30+(c/10000 )%10);
3982 (*oconv)(0, 0x30+(c/1000 )%10);
3984 (*oconv)(0, 0x30+(c/100 )%10);
3986 (*oconv)(0, 0x30+(c/10 )%10);
3988 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: fallback output that writes c in hexadecimal.
 *
 * The hex digits are produced by nkf_each_char_to_hex() and sent to oconv.
 * NOTE(review): surrounding prefix/suffix output is not visible here.
 */
3993 void encode_fallback_xml(nkf_char c)
3998 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: fallback output that writes c as hex digits.
 *
 * Code points outside the BMP emit two extra leading digits (bits 20.. and
 * 16..); the four low-order digits are always emitted.  Each digit goes
 * through bin2hex() and oconv.
 * NOTE(review): the escape prefix characters are not visible in this listing.
 */
4003 void encode_fallback_java(nkf_char c)
4007 if(!is_unicode_bmp(c)){
4011 (*oconv)(0, bin2hex(c>>20));
4012 (*oconv)(0, bin2hex(c>>16));
4016 (*oconv)(0, bin2hex(c>>12));
4017 (*oconv)(0, bin2hex(c>> 8));
4018 (*oconv)(0, bin2hex(c>> 4));
4019 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl: fallback output that writes c in hexadecimal.
 *
 * The hex digits are produced by nkf_each_char_to_hex() and sent to oconv.
 * NOTE(review): surrounding prefix/suffix output is not visible here.
 */
4023 void encode_fallback_perl(nkf_char c)
4028 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: fallback output that ignores the incoming value
 * and emits unicode_subchar instead, passing its bits 8..15 and 0..7 to
 * oconv as (c2, c1).
 */
4033 void encode_fallback_subchar(nkf_char c)
4035 c = unicode_subchar;
4036 (*oconv)((c>>8)&0xFF, c&0xFF);
4041 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the Unicode value for a (c2, c1) pair.
 *
 * Visible behavior:
 *   - JIS_X_0201 uses euc_to_utf8_1byte (with a UCS_MAP_CP10001 special case
 *     whose body is not visible);
 *   - PREFIX_EUCG3 rows use x0212_to_utf8_2bytes[], with a special case for
 *     0x8F22/0x43 under UCS_MAP_ASCII;
 *   - other rows use euc_to_utf8_2bytes[], euc_to_utf8_2bytes_mac[] or
 *     euc_to_utf8_2bytes_ms[] depending on ms_ucs_map_f.
 * Row and cell are reduced to zero-based indexes with (x & 0x7f) - 0x21 and
 * range-checked before use.
 * NOTE(review): the return statements are not visible in this listing.
 */
4042 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
4044 const unsigned short *p;
4046 if (c2 == JIS_X_0201) {
4047 if (ms_ucs_map_f == UCS_MAP_CP10001) {
4055 p = euc_to_utf8_1byte;
4057 } else if (is_eucg3(c2)){
4058 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
4061 c2 = (c2&0x7f) - 0x21;
4062 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4063 p = x0212_to_utf8_2bytes[c2];
4069 c2 = (c2&0x7f) - 0x21;
4070 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4072 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
4073 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
4074 euc_to_utf8_2bytes_ms[c2];
4079 c1 = (c1 & 0x7f) - 0x21;
4080 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output converter producing UTF-8.
 *
 * Visible behavior:
 *   - output_bom_f is cleared (the BOM output itself is not visible here);
 *   - under NUMCHAR_OPTION, a CLASS_UNICODE capsule in c1 (with c2 == 0) is
 *     unwrapped with VALUE_MASK and encoded directly as 2, 3 or 4 bytes;
 *   - ISO_8859_1 input emits c1 | 0x80;
 *   - other input is mapped with e2w_conv() and encoded with w16w_conv().
 * output_mode is updated to ASCII or UTF_8 along the way.
 */
4085 void w_oconv(nkf_char c2, nkf_char c1)
4091 output_bom_f = FALSE;
4102 #ifdef NUMCHAR_OPTION
4103 if (c2 == 0 && is_unicode_capsule(c1)){
4104 val = c1 & VALUE_MASK;
4107 }else if (val < 0x800){
4108 (*o_putc)(0xC0 | (val >> 6));
4109 (*o_putc)(0x80 | (val & 0x3f));
4110 } else if (val <= NKF_INT32_C(0xFFFF)) {
4111 (*o_putc)(0xE0 | (val >> 12));
4112 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
4113 (*o_putc)(0x80 | (val & 0x3f));
4114 } else if (val <= NKF_INT32_C(0x10FFFF)) {
4115 (*o_putc)(0xF0 | ( val>>18));
4116 (*o_putc)(0x80 | ((val>>12) & 0x3f));
4117 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
4118 (*o_putc)(0x80 | ( val & 0x3f));
4125 output_mode = ASCII;
4127 } else if (c2 == ISO_8859_1) {
4128 output_mode = UTF_8;
4129 (*o_putc)(c1 | 0x080);
4131 output_mode = UTF_8;
4132 val = e2w_conv(c2, c1);
4134 w16w_conv(val, &c2, &c1, &c0);
/* c0 is 0 when the encoded form has no third byte */
4138 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output converter producing UTF-16.
 *
 * Visible behavior:
 *   - output_bom_f is cleared and a 0xFF byte ('\377') is written in both
 *     endian branches (the companion BOM byte is not visible here);
 *   - under NUMCHAR_OPTION, a CLASS_UNICODE capsule inside the BMP is split
 *     into bytes, while values up to UNICODE_MAX are written as a surrogate
 *     pair in the byte order selected by output_endian;
 *   - other input is mapped with e2w_conv() and split into high/low bytes.
 */
4144 void w_oconv16(nkf_char c2, nkf_char c1)
4147 output_bom_f = FALSE;
4148 if (output_endian == ENDIAN_LITTLE){
4149 (*o_putc)((unsigned char)'\377');
4153 (*o_putc)((unsigned char)'\377');
4162 if (c2 == ISO_8859_1) {
4165 #ifdef NUMCHAR_OPTION
4166 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4167 if (is_unicode_bmp(c1)) {
4168 c2 = (c1 >> 8) & 0xff;
4172 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset into the shift */
4173 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
4174 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
4175 if (output_endian == ENDIAN_LITTLE){
4176 (*o_putc)(c2 & 0xff);
4177 (*o_putc)((c2 >> 8) & 0xff);
4178 (*o_putc)(c1 & 0xff);
4179 (*o_putc)((c1 >> 8) & 0xff);
4181 (*o_putc)((c2 >> 8) & 0xff);
4182 (*o_putc)(c2 & 0xff);
4183 (*o_putc)((c1 >> 8) & 0xff);
4184 (*o_putc)(c1 & 0xff);
4191 nkf_char val = e2w_conv(c2, c1);
4192 c2 = (val >> 8) & 0xff;
4196 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter producing UTF-32.
 *
 * Visible behavior: output_bom_f is cleared and a 0xFF byte ('\377') is
 * written in both endian branches (the remaining BOM bytes are not visible
 * here).  Input that is neither ISO_8859_1 nor a CLASS_UNICODE capsule is
 * mapped with e2w_conv().  The value in c1 is then written one byte at a
 * time in the order selected by output_endian.
 */
4205 void w_oconv32(nkf_char c2, nkf_char c1)
4208 output_bom_f = FALSE;
4209 if (output_endian == ENDIAN_LITTLE){
4210 (*o_putc)((unsigned char)'\377');
4218 (*o_putc)((unsigned char)'\377');
4227 if (c2 == ISO_8859_1) {
4229 #ifdef NUMCHAR_OPTION
4230 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4234 c1 = e2w_conv(c2, c1);
4237 if (output_endian == ENDIAN_LITTLE){
4238 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4239 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4240 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4244 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4245 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4246 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output converter producing EUC-JP.
 *
 * Visible behavior:
 *   - under NUMCHAR_OPTION, a CLASS_UNICODE capsule is first converted with
 *     w16e_conv(); if it is still a capsule, values 0xE000..0xE757 are
 *     written as two bytes when x0212_f is set, and other values go to
 *     encode_fallback when one is installed;
 *   - c2 == 0 switches output_mode to ASCII;
 *   - JIS_X_0201 is written as SSO followed by c1 | 0x80;
 *   - ISO_8859_1 is written as c1 | 0x80;
 *   - PREFIX_EUCG3 rows are (with SHIFTJIS_CP932) round-tripped through
 *     e2s_conv()/s2e_conv() and written with the high bit set;
 *   - any other pair must be graphic (nkf_isgraph) in both bytes, otherwise
 *     set_iconv(FALSE, 0) is called and the character is dropped; valid
 *     pairs are written as c2 | 0x80, c1 | 0x80.
 */
4251 void e_oconv(nkf_char c2, nkf_char c1)
4253 #ifdef NUMCHAR_OPTION
4254 if (c2 == 0 && is_unicode_capsule(c1)){
4255 w16e_conv(c1, &c2, &c1);
4256 if (c2 == 0 && is_unicode_capsule(c1)){
4257 c2 = c1 & VALUE_MASK;
4258 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4262 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4263 c1 = 0x21 + c1 % 94;
4266 (*o_putc)((c2 & 0x7f) | 0x080);
4267 (*o_putc)(c1 | 0x080);
4269 (*o_putc)((c2 & 0x7f) | 0x080);
4270 (*o_putc)(c1 | 0x080);
4274 if (encode_fallback) (*encode_fallback)(c1);
4283 } else if (c2 == 0) {
4284 output_mode = ASCII;
4286 } else if (c2 == JIS_X_0201) {
4287 output_mode = EUC_JP;
4288 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4289 } else if (c2 == ISO_8859_1) {
4290 output_mode = ISO_8859_1;
4291 (*o_putc)(c1 | 0x080);
4293 } else if (is_eucg3(c2)){
4294 output_mode = EUC_JP;
4295 #ifdef SHIFTJIS_CP932
4298 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4299 s2e_conv(s2, s1, &c2, &c1);
4304 output_mode = ASCII;
4306 }else if (is_eucg3(c2)){
4309 (*o_putc)((c2 & 0x7f) | 0x080);
4310 (*o_putc)(c1 | 0x080);
4313 (*o_putc)((c2 & 0x7f) | 0x080);
4314 (*o_putc)(c1 | 0x080);
4318 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4319 set_iconv(FALSE, 0);
4320 return; /* too late to rescue this char */
4322 output_mode = EUC_JP;
4323 (*o_putc)(c2 | 0x080);
4324 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: remap rows 0x75..0x7f into an extended row range.
 *
 * Two visible branches add different offsets: one maps 0x75 to 0x109 and the
 * other maps 0x75 to 0x113.
 * NOTE(review): the conditions selecting each branch are not visible in this
 * listing.
 */
4329 nkf_char x0212_shift(nkf_char c)
4334 if (0x75 <= c && c <= 0x7f){
4335 ret = c + (0x109 - 0x75);
4338 if (0x75 <= c && c <= 0x7f){
4339 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map extended rows back to the 0x75.. range.
 *
 * Rows 0x7f..0x88 map to 0x75 onward; rows 0x89..0x92 map to 0x75 onward
 * with PREFIX_EUCG3 | 0x80 added.  Handling of other values is not visible
 * in this listing.
 */
4346 nkf_char x0212_unshift(nkf_char c)
4349 if (0x7f <= c && c <= 0x88){
4350 ret = c + (0x75 - 0x7f);
4351 }else if (0x89 <= c && c <= 0x92){
4352 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4356 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert a row/cell pair (c2, c1) to a Shift_JIS byte pair.
 *
 * Results are stored through p2/p1 when those pointers are non-null.
 * Visible behavior:
 *   - rows 0x21..0x2F and 0x6E..0x7E (of `ndx`) are converted arithmetically
 *     with their own lead-byte offsets;
 *   - other graphic rows are looked up in x0212_shiftjis[];
 *   - the general path applies x0212_shift(), returns 1 when the row exceeds
 *     0x7F, and otherwise computes the lead byte from the row and the trail
 *     byte from the cell, with odd and even rows using different offsets.
 * NOTE(review): how `ndx` is derived from c2 and which flags guard the first
 * branches is not visible in this listing.
 */
4358 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4364 if((0x21 <= ndx && ndx <= 0x2F)){
4365 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4366 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4368 }else if(0x6E <= ndx && ndx <= 0x7E){
4369 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4370 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4376 else if(nkf_isgraph(ndx)){
4378 const unsigned short *ptr;
4379 ptr = x0212_shiftjis[ndx - 0x21];
4381 val = ptr[(c1 & 0x7f) - 0x21];
4390 c2 = x0212_shift(c2);
4392 #endif /* X0212_ENABLE */
4394 if(0x7F < c2) return 1;
4395 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4396 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output converter producing Shift_JIS.
 *
 * Visible behavior:
 *   - under NUMCHAR_OPTION, a CLASS_UNICODE capsule is first converted with
 *     w16e_conv(); if it is still a capsule, values 0xE000..0xE757 are mapped
 *     to lead bytes from 0xF0 (188 cells each) when x0213_f is off, and other
 *     values go to encode_fallback when one is installed;
 *   - c2 == 0 switches output_mode to ASCII;
 *   - JIS_X_0201 and PREFIX_EUCG3 rows set output_mode to SHIFT_JIS, the
 *     latter being converted with e2s_conv();
 *   - ISO_8859_1 is written as c1 | 0x80;
 *   - any other pair must be printable (nkf_isprint) in both bytes, otherwise
 *     set_iconv(FALSE, 0) is called and the character is dropped; valid pairs
 *     are converted with e2s_conv(), optionally remapped through cp932inv[],
 *     and a prefix byte from prefix_table[] is emitted when one is defined
 *     for the trail byte.
 */
4400 void s_oconv(nkf_char c2, nkf_char c1)
4402 #ifdef NUMCHAR_OPTION
4403 if (c2 == 0 && is_unicode_capsule(c1)){
4404 w16e_conv(c1, &c2, &c1);
4405 if (c2 == 0 && is_unicode_capsule(c1)){
4406 c2 = c1 & VALUE_MASK;
4407 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4410 c2 = c1 / 188 + 0xF0;
/* (c1 > 0x3e) skips the unused trail byte 0x7F */
4412 c1 += 0x40 + (c1 > 0x3e);
4417 if(encode_fallback)(*encode_fallback)(c1);
4426 } else if (c2 == 0) {
4427 output_mode = ASCII;
4429 } else if (c2 == JIS_X_0201) {
4430 output_mode = SHIFT_JIS;
4432 } else if (c2 == ISO_8859_1) {
4433 output_mode = ISO_8859_1;
4434 (*o_putc)(c1 | 0x080);
4436 } else if (is_eucg3(c2)){
4437 output_mode = SHIFT_JIS;
4438 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4444 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4445 set_iconv(FALSE, 0);
4446 return; /* too late to rescue this char */
4448 output_mode = SHIFT_JIS;
4449 e2s_conv(c2, c1, &c2, &c1);
4451 #ifdef SHIFTJIS_CP932
4453 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4454 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4460 #endif /* SHIFTJIS_CP932 */
4463 if (prefix_table[(unsigned char)c1]){
4464 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: output converter producing ISO-2022-JP style 7-bit output.
 *
 * Visible behavior: output_mode tracks the currently designated character
 * set (ASCII, ISO_8859_1, JIS_X_0201, JIS_X_0208, JIS_X_0212, JIS_X_0213_1,
 * JIS_X_0213_2) and is updated only when the next character needs a
 * different set; ascii_intro and kanji_intro are emitted as part of the
 * switching sequences.  Under NUMCHAR_OPTION a CLASS_UNICODE capsule is first
 * converted with w16e_conv(); values 0xE000..0xE757 are mapped to rows from
 * 0x7F when ms_ucs_map_f is set, and other values go to encode_fallback when
 * one is installed.  Pairs outside the accepted row/cell ranges cause an
 * early return.
 * NOTE(review): the escape-sequence bytes themselves are mostly not visible
 * in this listing.
 */
4470 void j_oconv(nkf_char c2, nkf_char c1)
4472 #ifdef NUMCHAR_OPTION
4473 if (c2 == 0 && is_unicode_capsule(c1)){
4474 w16e_conv(c1, &c2, &c1);
4475 if (c2 == 0 && is_unicode_capsule(c1)){
4476 c2 = c1 & VALUE_MASK;
4477 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4480 c2 = 0x7F + c1 / 94;
4481 c1 = 0x21 + c1 % 94;
4483 if (encode_fallback) (*encode_fallback)(c1);
4490 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4493 (*o_putc)(ascii_intro);
4494 output_mode = ASCII;
4498 } else if (is_eucg3(c2)){
4500 if(output_mode!=JIS_X_0213_2){
4501 output_mode = JIS_X_0213_2;
4508 if(output_mode!=JIS_X_0212){
4509 output_mode = JIS_X_0212;
4516 (*o_putc)(c2 & 0x7f);
4519 } else if (c2==JIS_X_0201) {
4520 if (output_mode!=JIS_X_0201) {
4521 output_mode = JIS_X_0201;
4527 } else if (c2==ISO_8859_1) {
4528 /* iso8859 introduction, or 8th bit on */
4529 /* Can we convert in 7bit form using ESC-'-'-A ?
4531 output_mode = ISO_8859_1;
4533 } else if (c2 == 0) {
4534 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4537 (*o_putc)(ascii_intro);
4538 output_mode = ASCII;
4543 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4544 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4546 if (output_mode!=JIS_X_0213_1) {
4547 output_mode = JIS_X_0213_1;
4553 }else if (output_mode != JIS_X_0208) {
4554 output_mode = JIS_X_0208;
4557 (*o_putc)(kanji_intro);
/*
 * base64_conv: filter stage that calls mime_prechar() for the character and
 * then forwards it unchanged to the next stage, o_base64conv.
 */
4564 void base64_conv(nkf_char c2, nkf_char c1)
4566 mime_prechar(c2, c1);
4567 (*o_base64conv)(c2,c1);
/* push-back storage for broken_getc()/broken_ungetc(); read last-in, first-out */
4571 static nkf_char broken_buf[3];
/* number of characters currently held in broken_buf */
4572 static int broken_counter = 0;
/* compared against ESC below; presumably the previously returned character */
4573 static int broken_last = 0;
/*
 * broken_getc: input filter that looks for designation sequences whose ESC
 * is missing.
 *
 * Buffered characters are returned first.  A '$' not preceded by ESC while
 * in ASCII or JIS_X_0201 mode, followed by '@' or 'B', and a '(' not preceded
 * by ESC while in JIS_X_0208 or JIS_X_0201 mode, followed by 'J' or 'B', are
 * recognized; the two characters are stored in broken_buf for replay.
 * NOTE(review): the value returned in those cases is not visible in this
 * listing (presumably ESC).
 */
4574 nkf_char broken_getc(FILE *f)
4578 if (broken_counter>0) {
4579 return broken_buf[--broken_counter];
4582 if (c=='$' && broken_last != ESC
4583 && (input_mode==ASCII || input_mode==JIS_X_0201)) {
4586 if (c1=='@'|| c1=='B') {
4587 broken_buf[0]=c1; broken_buf[1]=c;
4594 } else if (c=='(' && broken_last != ESC
4595 && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */
4598 if (c1=='J'|| c1=='B') {
4599 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push a character back for broken_getc().
 *
 * The character is stored in broken_buf only while fewer than two characters
 * are held; otherwise it is not stored.  The FILE argument is not used by
 * the visible lines.
 */
4612 nkf_char broken_ungetc(nkf_char c, FILE *f)
4614 if (broken_counter<2)
4615 broken_buf[broken_counter++]=c;
/*
 * nl_conv: newline conversion filter.
 *
 * When guess_f is set, input_newline records the newline convention seen so
 * far (LF, CRLF or CR) and is set to EOF once conventions are mixed.
 * For output, a pending CR (prev_cr) or a bare LF is replaced by the newline
 * selected by nlmode_f: CR is written unless nlmode_f is LF, and LF is
 * written unless nlmode_f is CR.  A CR is remembered in prev_cr instead of
 * being forwarded, and every character other than LF is passed on to
 * o_nlconv.
 */
4619 void nl_conv(nkf_char c2, nkf_char c1)
4621 if (guess_f && input_newline != EOF) {
4622 if (c2 == 0 && c1 == LF) {
4623 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4624 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4625 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4627 else if (!input_newline) input_newline = CR;
4628 else if (input_newline != CR) input_newline = EOF;
4630 if (prev_cr || (c2 == 0 && c1 == LF)) {
4632 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4633 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4635 if (c2 == 0 && c1 == CR) prev_cr = CR;
4636 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4640 Return value of fold_conv()
4642 LF add newline and output char
4643 CR add newline and output nothing
4646 1 (or else) normal output
4648 fold state in prev (previous character)
4650 >0x80 Japanese (X0208/X0201)
4655 This fold algorithm does not preserve leading spaces in a line.
4656 This is the main difference from fmt.
/*
 * char_size: column width of a character -- 2 when c2 is non-zero
 * (double-byte), 1 otherwise.  c1 is accepted for symmetry and not used.
 * The argument is parenthesized so expression arguments expand safely.
 */
#define char_size(c2,c1) ((c2) ? 2 : 1)
/*
 * fold_conv: line-folding filter.
 *
 * For each character a fold_state is chosen and then acted on in the final
 * switch: LF means "emit a newline, then the character", CR means "emit a
 * newline and drop the character", 0 means "drop the character", SP means
 * "emit a single ASCII space" and 1 means "emit the character unchanged".
 * f_line counts the columns used on the current line (char_size() per
 * character) and f_prev remembers the previous character, with 0x80 set for
 * Japanese characters.  Lines are folded once f_line exceeds fold_len; up to
 * fold_margin extra columns are tolerated for characters that must not start
 * a line (kinsoku).
 *
 * NOTE(review): many statements are not visible in this listing, so several
 * branches below appear without their bodies.
 */
4661 void fold_conv(nkf_char c2, nkf_char c1)
4664 nkf_char fold_state;
4666 if (c1== CR && !fold_preserve_f) {
4667 fold_state=0; /* ignore cr */
4668 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4670 fold_state=0; /* ignore cr */
4671 } else if (c1== BS) {
4672 if (f_line>0) f_line--;
4674 } else if (c2==EOF && f_line != 0) { /* close open last line */
4676 } else if ((c1==LF && !fold_preserve_f)
4677 || ((c1==CR||(c1==LF&&f_prev!=CR))
4678 && fold_preserve_f)) {
4680 if (fold_preserve_f) {
4684 } else if ((f_prev == c1 && !fold_preserve_f)
4685 || (f_prev == LF && fold_preserve_f)
4686 ) { /* duplicate newline */
4689 fold_state = LF; /* output two newline */
4695 if (f_prev&0x80) { /* Japanese? */
4697 fold_state = 0; /* ignore given single newline */
4698 } else if (f_prev==SP) {
4702 if (++f_line<=fold_len)
4706 fold_state = CR; /* fold and output nothing */
4710 } else if (c1=='\f') {
4713 fold_state = LF; /* output newline and clear */
4714 } else if ( (c2==0 && c1==SP)||
4715 (c2==0 && c1==TAB)||
4716 (c2=='!'&& c1=='!')) {
4717 /* X0208 kankaku or ascii space */
4719 fold_state = 0; /* remove duplicate spaces */
4722 if (++f_line<=fold_len)
4723 fold_state = SP; /* output ASCII space only */
4725 f_prev = SP; f_line = 0;
4726 fold_state = CR; /* fold and output nothing */
4730 prev0 = f_prev; /* we still need this one... , but almost done */
4732 if (c2 || c2==JIS_X_0201)
4733 f_prev |= 0x80; /* this is Japanese */
4734 f_line += char_size(c2,c1);
4735 if (f_line<=fold_len) { /* normal case */
4738 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4739 f_line = char_size(c2,c1);
4740 fold_state = LF; /* We can't wait, do fold now */
4741 } else if (c2==JIS_X_0201) {
4742 /* simple kinsoku rules return 1 means no folding */
4743 if (c1==(0xde&0x7f)) fold_state = 1; /* 0xDE: halfwidth voiced sound mark */
4744 else if (c1==(0xdf&0x7f)) fold_state = 1; /* 0xDF: halfwidth semi-voiced sound mark */
4745 else if (c1==(0xa4&0x7f)) fold_state = 1; /* 0xA4: halfwidth ideographic comma */
4746 else if (c1==(0xa3&0x7f)) fold_state = 1; /* 0xA3: halfwidth right corner bracket */
4747 else if (c1==(0xa1&0x7f)) fold_state = 1; /* 0xA1: halfwidth ideographic full stop */
4748 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4749 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4751 fold_state = LF;/* add one new f_line before this character */
4754 fold_state = LF;/* add one new f_line before this character */
4757 /* kinsoku point in ASCII */
4758 if ( c1==')'|| /* { [ ( */
4769 /* just after special */
4770 } else if (!is_alnum(prev0)) {
4771 f_line = char_size(c2,c1);
4773 } else if ((prev0==SP) || /* ignored new f_line */
4774 (prev0==LF)|| /* ignored new f_line */
4775 (prev0&0x80)) { /* X0208 - ASCII */
4776 f_line = char_size(c2,c1);
4777 fold_state = LF;/* add one new f_line before this character */
4779 fold_state = 1; /* default no fold in ASCII */
4783 if (c1=='"') fold_state = 1; /* ideographic comma */
4784 else if (c1=='#') fold_state = 1; /* ideographic full stop */
4785 else if (c1=='W') fold_state = 1; /* right corner bracket */
4786 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis */
4787 else if (c1=='$') fold_state = 1; /* fullwidth comma */
4788 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
4789 else if (c1=='\'') fold_state = 1; /* original comment showed a fullwidth plus sign; NOTE(review): cell 0x27 looks like the fullwidth colon - confirm */
4790 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
4791 else if (c1==')') fold_state = 1; /* fullwidth question mark */
4792 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
4793 else if (c1=='+') fold_state = 1; /* voiced sound mark */
4794 else if (c1==',') fold_state = 1; /* semi-voiced sound mark */
4795 /* default no fold in kinsoku */
4798 f_line = char_size(c2,c1);
4799 /* add one new f_line before this character */
4802 f_line = char_size(c2,c1);
4804 /* add one new f_line before this character */
4809 /* terminator process */
4810 switch(fold_state) {
4812 OCONV_NEWLINE((*o_fconv));
4818 OCONV_NEWLINE((*o_fconv));
/* character held back by z_conv() while waiting for a following sound mark */
4829 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion filter between JIS X 0201 and JIS X 0208.
 *
 * Visible behavior:
 *   - a held JIS X 0201 kana (z_prev2/z_prev1) followed by the voiced mark
 *     (0xDE) is emitted from dv[], followed by the semi-voiced mark (0xDF)
 *     from ev[] when an entry exists, and otherwise from cv[];
 *   - an incoming JIS X 0201 kana that has a dv[] or ev[] entry is held back
 *     instead of being emitted, others are emitted from cv[];
 *   - alpha_f bit 1 handles JIS X 0208 alphabet and symbols (rows 0x23 and
 *     0x21, table fv[]), bit 4 selects another symbol treatment, and bit 8
 *     replaces some ASCII characters by the strings assigned to `entity`;
 *   - a later section turns JIS X 0208 punctuation and katakana (row 0x25,
 *     table fullwidth_to_halfwidth[]) into JIS X 0201, emitting a second
 *     JIS X 0201 character when the table entry has a non-zero low byte.
 *
 * NOTE(review): the four `entity` string literals in the switch below look
 * as if the listing HTML-unescaped them; the third one is not even a valid C
 * literal.  They were presumably the entity names for '>', '<', '"' and '&'
 * -- confirm against the upstream source before building.
 */
4831 void z_conv(nkf_char c2, nkf_char c1)
4834 /* if (c2) c1 &= 0x7f; assertion */
4836 if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4842 if (z_prev2 == JIS_X_0201) {
4843 if (c2 == JIS_X_0201) {
4844 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
4846 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4848 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
4850 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4855 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4857 if (c2 == JIS_X_0201) {
4858 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4859 /* wait for dakuten or handakuten */
4864 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4875 if (alpha_f&1 && c2 == 0x23) {
4876 /* JISX0208 Alphabet */
4878 } else if (c2 == 0x21) {
4879 /* JISX0208 Kigou */
4884 } else if (alpha_f&4) {
4889 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4895 if (alpha_f&8 && c2 == 0) {
4899 case '>': entity = ">"; break;
4900 case '<': entity = "<"; break;
4901 case '\"': entity = """; break;
4902 case '&': entity = "&"; break;
4905 while (*entity) (*o_zconv)(0, *entity++);
4911 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4916 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4920 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4924 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4928 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4932 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4936 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4940 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4944 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4949 (*o_zconv)(JIS_X_0201, c);
4952 } else if (c2 == 0x25) {
4953 /* JISX0208 Katakana */
4954 static const int fullwidth_to_halfwidth[] =
4956 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4957 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4958 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4959 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4960 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4961 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4962 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4963 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4964 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4965 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4966 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4967 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4969 if (fullwidth_to_halfwidth[c1-0x20]){
4970 c2 = fullwidth_to_halfwidth[c1-0x20];
4971 (*o_zconv)(JIS_X_0201, c2>>8);
4973 (*o_zconv)(JIS_X_0201, c2&0xFF);
/*
 * rot13(c): rotate ASCII letters by 13 places -- 'A'..'M' and 'a'..'m' move
 * up by 13, 'N'..'Z' and 'n'..'z' move down by 13.
 * NOTE(review): the lines handling characters outside those ranges are not
 * visible in this listing.
 */
4983 #define rot13(c) ( \
4985 (c <= 'M') ? (c + 13): \
4986 (c <= 'Z') ? (c - 13): \
4988 (c <= 'm') ? (c + 13): \
4989 (c <= 'z') ? (c - 13): \
/*
 * rot47(c): rotate by 47 places -- values up to 'O' move up by 47 and values
 * up to '~' move down by 47.
 * NOTE(review): the lower-bound and fall-through lines are not visible in
 * this listing.
 */
4993 #define rot47(c) ( \
4995 ( c <= 'O') ? (c + 47) : \
4996 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: rotation filter.
 *
 * Single-byte input (c2 is 0, JIS_X_0201 or ISO_8859_1) is told apart from
 * double-byte input; the transformed pair is forwarded to o_rot_conv.
 * NOTE(review): the lines applying rot13/rot47 are not visible in this
 * listing.
 */
5000 void rot_conv(nkf_char c2, nkf_char c1)
5002 if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) {
5008 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: filter converting between hiragana and katakana.
 *
 * Visible behavior: cells 0x21..0x73 of a kana row are converted and
 * forwarded; cell 0x74 is emitted as CLASS_UNICODE | 0x3094 when the output
 * converter is w_oconv or w_oconv16; row 0x21 cells 0x33/0x34 and 0x35/0x36
 * are handled as counterparts in the two directions.  The result is
 * forwarded to o_hira_conv.
 * NOTE(review): the flags selecting the direction and the row assignments
 * are not visible in this listing.
 */
5011 void hira_conv(nkf_char c2, nkf_char c1)
5015 if (0x20 < c1 && c1 < 0x74) {
5017 (*o_hira_conv)(c2,c1);
5019 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
5021 c1 = CLASS_UNICODE | 0x3094;
5022 (*o_hira_conv)(c2,c1);
5025 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
5027 (*o_hira_conv)(c2,c1);
5032 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
5035 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
5037 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
5041 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: filter that screens characters before forwarding
 * them to o_iso2022jp_check_conv.
 *
 * Visible checks: c2 in 0x00..0x20 combined with c1 in 0x7f..0xff, rows
 * 0x29..0x2f and 0x75..0x7e, and any value falling inside one of the
 * [start, end] pairs of range[].
 * NOTE(review): the replacement applied when a check matches is not visible
 * in this listing.
 */
5045 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
5047 static const nkf_char range[RANGE_NUM_MAX][2] = {
5068 nkf_char start, end, c;
5070 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
5074 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
5079 for (i = 0; i < RANGE_NUM_MAX; i++) {
5080 start = range[i][0];
5083 if (c >= start && c <= end) {
5088 (*o_iso2022jp_check_conv)(c2,c1);
5092 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * The four tables below are parallel: entry j of each describes the same
 * encoded-word prefix.  mime_begin_strict() in this file indexes
 * mime_pattern[] and mime_priority_func[] with the same j.
 */
/* encoded-word prefixes, upper case; "\075" is '=' */
5094 static const unsigned char *mime_pattern[] = {
5095 (const unsigned char *)"\075?EUC-JP?B?",
5096 (const unsigned char *)"\075?SHIFT_JIS?B?",
5097 (const unsigned char *)"\075?ISO-8859-1?Q?",
5098 (const unsigned char *)"\075?ISO-8859-1?B?",
5099 (const unsigned char *)"\075?ISO-2022-JP?B?",
5100 (const unsigned char *)"\075?ISO-2022-JP?Q?",
5101 #if defined(UTF8_INPUT_ENABLE)
5102 (const unsigned char *)"\075?UTF-8?B?",
5103 (const unsigned char *)"\075?UTF-8?Q?",
5105 (const unsigned char *)"\075?US-ASCII?Q?",
5110 /* marker used to raise the priority of the matching input code */
5111 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
5112 e_iconv, s_iconv, 0, 0, 0, 0,
5113 #if defined(UTF8_INPUT_ENABLE)
/* character set associated with each pattern */
5119 static const nkf_char mime_encode[] = {
5120 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201,
5121 #if defined(UTF8_INPUT_ENABLE)
/* transfer encoding of each pattern: 'B' or 'Q' */
5128 static const nkf_char mime_encode_method[] = {
5129 'B', 'B','Q', 'B', 'B', 'Q',
5130 #if defined(UTF8_INPUT_ENABLE)
/* size of the recovery buffer r[] used by mime_begin_strict() */
5138 #define MAXRECOVER 20
/*
 * switch_mime_getc: install the MIME-decoding reader.
 *
 * Unless already installed, the current i_getc/i_ungetc are saved in
 * i_mgetc/i_mungetc and replaced by mime_getc/mime_ungetc.  In STRICT_MIME
 * mode one more layer is added: the saved pair moves to
 * i_mgetc_buf/i_mungetc_buf and i_mgetc/i_mungetc become the buffered
 * mime_getc_buf/mime_ungetc_buf.
 */
5140 void switch_mime_getc(void)
5142 if (i_getc!=mime_getc) {
5143 i_mgetc = i_getc; i_getc = mime_getc;
5144 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5145 if(mime_f==STRICT_MIME) {
5146 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
5147 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undo switch_mime_getc: with mime_f == STRICT_MIME the buffered layer is
 * removed first (i_mgetc / i_mungetc restored from the *_buf copies), then
 * i_ungetc is restored from i_mungetc.  If a converter was saved in
 * mime_iconv_back it is re-installed with set_iconv(), and the saved pointer
 * is cleared.
 * NOTE(review): the matching restore of i_getc is not visible in this listing.
 */
5152 void unswitch_mime_getc(void)
5154 if(mime_f==STRICT_MIME) {
5155 i_mgetc = i_mgetc_buf;
5156 i_mungetc = i_mungetc_buf;
5159 i_ungetc = i_mungetc;
5160 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
5161 mime_iconv_back = NULL;
/*
 * Strict recogniser for a MIME encoded-word preamble; "=?" has already been
 * consumed by the caller.
 *
 * Characters are read with (*i_getc) into the recovery buffer r[] and compared
 * case-insensitively (nkf_toupper) with mime_pattern[j].  On a mismatch j is
 * advanced to the next pattern that shares the prefix matched so far.  When a
 * pattern matches completely, mime_decode_mode becomes its method letter, the
 * current iconv is saved in mime_iconv_back and mime_priority_func[j] is
 * installed with set_iconv().
 *
 * The visible return path hands over to mime_integrity() with the matched
 * pattern.  NOTE(review): the failure path that replays r[] is not visible in
 * this listing.
 */
5164 nkf_char mime_begin_strict(FILE *f)
5168 const unsigned char *p,*q;
5169 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
5171 mime_decode_mode = FALSE;
5172 /* =? has been checked */
5174 p = mime_pattern[j];
5177 for(i=2;p[i]>SP;i++) { /* start at =? */
5178 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
5179 /* pattern fails, try next one */
5181 while (mime_pattern[++j]) {
5182 p = mime_pattern[j];
5183 for(k=2;k<i;k++) /* assume length(p) > i */
5184 if (p[k]!=q[k]) break;
5185 if (k==i && nkf_toupper(c1)==p[k]) break;
5187 p = mime_pattern[j];
5188 if (p) continue; /* found next one, continue */
5189 /* all fails, output from recovery buffer */
/* every pattern ends in "?B?" or "?Q?", so p[i-2] is the method letter */
5197 mime_decode_mode = p[i-2];
5199 mime_iconv_back = iconv;
5200 set_iconv(FALSE, mime_priority_func[j]);
5201 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
5203 if (mime_decode_mode=='B') {
5204 mimebuf_f = unbuf_f;
5206 /* do MIME integrity check */
5207 return mime_integrity(f,mime_pattern[j]);
/*
 * Reader for the buffered MIME layer: when mimebuf_f is non-zero the
 * character comes from the underlying (*i_mgetc_buf)(f), otherwise from the
 * Fifo at position mime_input, which is then advanced.
 */
5215 nkf_char mime_getc_buf(FILE *f)
5217 /* we don't keep eof of Fifo, because it contains ?= as
5218 a terminator. It was checked in mime_integrity. */
5219 return ((mimebuf_f)?
5220 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * Push-back counterpart of mime_getc_buf: c is either handed to the
 * underlying (*i_mungetc_buf) or stored back into the Fifo after stepping
 * mime_input back by one.
 * NOTE(review): the condition choosing between the two is not visible here;
 * presumably mimebuf_f, as in mime_getc_buf -- confirm.
 */
5223 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
5226 (*i_mungetc_buf)(c,f);
5228 Fifo(--mime_input) = (unsigned char)c;
/*
 * Non-strict recogniser for a MIME encoded-word preamble.
 *
 * "=?" is written to the Fifo, then up to MAXRECOVER characters are read with
 * (*i_getc) and appended to the Fifo as well.  Characters accepted while
 * scanning are LF, SP, CR, '-', '_' and alphanumerics.  A following 'b'/'B'
 * selects mime_decode_mode 'B', 'q'/'Q' selects 'Q'.  If no mode was
 * selected, mime_decode_mode is set to 1 so that the buffered characters are
 * re-read from the Fifo without decoding.
 *
 * Returns the last character read; per the comment on the return statement
 * it is only meant to be tested against EOF.
 */
5232 nkf_char mime_begin(FILE *f)
5237 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
5238 /* re-read and convert again from mime_buffer. */
5240 /* =? has been checked */
5242 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
5243 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
5244 /* We accept any character type even if it is broken up by new lines */
5245 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5246 if (c1==LF||c1==SP||c1==CR||
5247 c1=='-'||c1=='_'||is_alnum(c1)) continue;
5249 /* Failed. But this could be another MIME preamble */
5257 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5258 if (!(++i<MAXRECOVER) || c1==EOF) break;
5259 if (c1=='b'||c1=='B') {
5260 mime_decode_mode = 'B';
5261 } else if (c1=='q'||c1=='Q') {
5262 mime_decode_mode = 'Q';
5266 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5267 if (!(++i<MAXRECOVER) || c1==EOF) break;
5269 mime_decode_mode = FALSE;
5275 if (!mime_decode_mode) {
5276 /* false MIME preamble, restart from mime_buffer */
5277 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5278 /* Since we are in MIME mode until buffer becomes empty, */
5279 /* we never go into mime_begin again for a while. */
5282 /* discard mime preamble, and goto MIME mode */
5284 /* do no MIME integrity check */
5285 return c1; /* used only for checking EOF */
/*
 * Single-character output hook.
 * NOTE(review): the body is not visible in this listing; presumably it
 * discards c -- confirm.
 */
5289 void no_putc(nkf_char c)
/*
 * Print str followed by a newline on stderr; a null pointer is printed as
 * the text "NULL".
 * NOTE(review): any guard around the fprintf is not visible in this listing.
 */
5294 void debug(const char *str)
5297 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding.  The first name is stored
 * as is; a later, different name (strcmp != 0) replaces it with the empty
 * string, which get_guessed_code reports as "BINARY".
 * The pointer is stored, not copied, so codename must stay valid.
 */
5302 void set_input_codename(char *codename)
5304 if (!input_codename) {
5305 input_codename = codename;
5306 } else if (strcmp(codename, input_codename) != 0) {
5307 input_codename = "";
/*
 * Refine input_codename into the name reported to the user and return it.
 *   - empty string             -> "BINARY"
 *   - null pointer             -> "ASCII"
 *   - "Shift_JIS"              -> "CP932" if SCORE_DEPEND or SCORE_CP932 is set
 *   - "EUC-JP"                 -> "EUCJP-MS" if SCORE_X0212 is set, else
 *                                 "CP51932" if SCORE_DEPEND or SCORE_CP932 is set
 *   - "ISO-2022-JP"            -> "CP50221" if SCORE_KANA is set, else
 *                                 "CP50220" if SCORE_DEPEND or SCORE_CP932 is set
 * The score bits come from the input_code entry that
 * find_inputcode_byfunc(iconv) returns.  input_codename itself is updated.
 */
5311 static char* get_guessed_code(void)
5313 if (input_codename && !*input_codename) {
5314 input_codename = "BINARY";
5316 struct input_code *p = find_inputcode_byfunc(iconv);
5317 if (!input_codename) {
5318 input_codename = "ASCII";
5319 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5320 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5321 input_codename = "CP932";
5322 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5323 if (p->score & (SCORE_X0212))
5324 input_codename = "EUCJP-MS";
5325 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5326 input_codename = "CP51932";
5327 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5328 if (p->score & (SCORE_KANA))
5329 input_codename = "CP50221";
5330 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5331 input_codename = "CP50220";
5334 return input_codename;
5337 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding on stdout, preceded by "filename: " when
 * filename is not NULL.  The name comes from get_guessed_code(); a newline
 * label (" (CR)", " (LF)", " (CRLF)", " (MIXED NL)") is selected from
 * input_newline.
 * NOTE(review): the statements that decide between the plain and the
 * labelled output are not visible in this listing.
 */
5338 void print_guessed_code(char *filename)
5340 if (filename != NULL) printf("%s: ", filename);
5341 if (input_codename && !*input_codename) {
5344 input_codename = get_guessed_code();
5346 printf("%s\n", input_codename);
5350 input_newline == CR ? " (CR)" :
5351 input_newline == LF ? " (LF)" :
5352 input_newline == CRLF ? " (CRLF)" :
5353 input_newline == EOF ? " (MIXED NL)" :
/*
 * Shared decoder for two-digit hexadecimal escapes.
 *   ch  escape introducer passed by the caller (':' from cap_getc, '%' from
 *       url_getc)
 *   f   input stream
 *   g   underlying getc function
 *   u   underlying ungetc function
 * When the two characters c2 and c3 are both hexadecimal digits the result
 * is (hex2bin(c2) << 4) | hex2bin(c3).
 * NOTE(review): the reads and the fallback paths for non-hex input are not
 * visible in this listing; presumably the characters are pushed back with u.
 */
5362 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5364 nkf_char c1, c2, c3;
5370 if (!nkf_isxdigit(c2)){
5375 if (!nkf_isxdigit(c3)){
5380 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Reader for ':'-introduced hex escapes: hex_getc on top of i_cgetc / i_cungetc. */
5383 nkf_char cap_getc(FILE *f)
5385 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push-back for the ':' escape layer; forwards c unchanged to (*i_cungetc). */
5388 nkf_char cap_ungetc(nkf_char c, FILE *f)
5390 return (*i_cungetc)(c, f);
/* Reader for '%'-introduced hex escapes: hex_getc on top of i_ugetc / i_uungetc. */
5393 nkf_char url_getc(FILE *f)
5395 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push-back for the '%' escape layer; forwards c unchanged to (*i_uungetc). */
5398 nkf_char url_ungetc(nkf_char c, FILE *f)
5400 return (*i_uungetc)(c, f);
5404 #ifdef NUMCHAR_OPTION
/*
 * Reader that decodes numeric character references on top of i_ngetc /
 * i_nungetc.  After an 'x' or 'X' marker up to 7 hexadecimal digits are
 * accepted, otherwise up to 8 decimal digits; each digit is converted with
 * hex2bin().  A decoded value is returned tagged with CLASS_UNICODE.
 * NOTE(review): the introducer/terminator handling and the fallback for
 * malformed input are not visible in this listing.
 */
5405 nkf_char numchar_getc(FILE *f)
5407 nkf_char (*g)(FILE *) = i_ngetc;
5408 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5419 if (buf[i] == 'x' || buf[i] == 'X'){
5420 for (j = 0; j < 7; j++){
5422 if (!nkf_isxdigit(buf[i])){
5429 c |= hex2bin(buf[i]);
5432 for (j = 0; j < 8; j++){
5436 if (!nkf_isdigit(buf[i])){
5443 c += hex2bin(buf[i]);
5449 return CLASS_UNICODE | c;
/* Push-back for the numeric-reference layer; forwards c unchanged to (*i_nungetc). */
5458 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5460 return (*i_nungetc)(c, f);
5464 #ifdef UNICODE_NORMALIZATION
5466 /* Normalization Form C */
/*
 * Reader on top of i_nfc_getc / i_nfc_ungetc that replaces decomposed
 * sequences by their composed form.  The bytes collected in buf[] are looked
 * up by binary search (lower / upper / j) in normalization_table[], comparing
 * against each entry's nfd bytes; on a match the entry's nfc bytes are copied
 * into buf[].
 * NOTE(review): how buf[] is filled and which bytes are returned or pushed
 * back is not visible in this listing.
 */
5467 nkf_char nfc_getc(FILE *f)
5469 nkf_char (*g)(FILE *f) = i_nfc_getc;
5470 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5471 int i=0, j, k=1, lower, upper;
5473 const nkf_nfchar *array;
5476 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5477 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5478 while (upper >= lower) {
5479 j = (lower+upper) / 2;
5480 array = normalization_table[j].nfd;
5481 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5482 if (array[k] != buf[k]){
5483 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5490 array = normalization_table[j].nfc;
5491 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5492 buf[i] = (nkf_char)(array[i]);
/* Push-back for the normalization layer; forwards c unchanged to (*i_nfc_ungetc). */
5503 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5505 return (*i_nfc_ungetc)(c, f);
5507 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME decoding reader.  The function header line is not part of
 * this listing; presumably this is mime_getc(FILE *f), the function that
 * switch_mime_getc installs as i_getc -- TODO confirm.
 *
 * Order of work visible below:
 *   1. If the Fifo holds data (mime_top != mime_last), return its next byte.
 *   2. If mime_decode_mode is 1 or FALSE, leave MIME mode with
 *      unswitch_mime_getc() and read from the plain (*i_getc).
 *   3. Mode 'Q': '_' is returned as SP (unless mimebuf_f == FIXED_MIME),
 *      "?=" ends the encoded word, '=' followed by a control character is
 *      treated as a soft line wrap, and "=XY" yields
 *      (hex2bin(X) << 4) + hex2bin(Y).
 *   4. Mode 'B': four characters c1..c4 are read, converted with
 *      base64decode(), combined into bytes that are appended to the Fifo,
 *      and the next Fifo byte is returned.
 * After "?=" both modes collect the characters that follow into a malloc'ed
 * lwsp_buf (grown with realloc) and may push them back with i_ungetc.
 * NOTE(review): the exact conditions around the white-space handling and the
 * release of lwsp_buf are not visible in this listing.
 */
5513 nkf_char c1, c2, c3, c4, cc;
5514 nkf_char t1, t2, t3, t4, mode, exit_mode;
5515 nkf_char lwsp_count;
5518 nkf_char lwsp_size = 128;
5520 if (mime_top != mime_last) { /* Something is in FIFO */
5521 return Fifo(mime_top++);
5523 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5524 mime_decode_mode=FALSE;
5525 unswitch_mime_getc();
5526 return (*i_getc)(f);
5529 if (mimebuf_f == FIXED_MIME)
5530 exit_mode = mime_decode_mode;
5533 if (mime_decode_mode == 'Q') {
5534 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5536 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5537 if (c1<=SP || DEL<=c1) {
5538 mime_decode_mode = exit_mode; /* prepare for quit */
5541 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5545 mime_decode_mode = exit_mode; /* prepare for quit */
5546 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5547 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5548 /* end Q encoding */
5549 input_mode = exit_mode;
5551 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5552 if (lwsp_buf==NULL) {
5553 perror("can't malloc");
5556 while ((c1=(*i_getc)(f))!=EOF) {
5561 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5569 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5570 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5585 lwsp_buf[lwsp_count] = (unsigned char)c1;
5586 if (lwsp_count++>lwsp_size){
5588 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5589 if (lwsp_buf_new==NULL) {
5591 perror("can't realloc");
5594 lwsp_buf = lwsp_buf_new;
5600 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5602 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5603 i_ungetc(lwsp_buf[lwsp_count],f);
5609 if (c1=='='&&c2<SP) { /* this is soft wrap */
5610 while((c1 = (*i_mgetc)(f)) <=SP) {
5611 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5613 mime_decode_mode = 'Q'; /* still in MIME */
5614 goto restart_mime_q;
5617 mime_decode_mode = 'Q'; /* still in MIME */
5621 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5622 if (c2<=SP) return c2;
5623 mime_decode_mode = 'Q'; /* still in MIME */
5624 return ((hex2bin(c2)<<4) + hex2bin(c3));
5627 if (mime_decode_mode != 'B') {
5628 mime_decode_mode = FALSE;
5629 return (*i_mgetc)(f);
5633 /* Base64 encoding */
5635 MIME allows line break in the middle of
5636 Base64, but we are very pessimistic in decoding
5637 in unbuf mode because MIME encoded code may broken by
5638 less or editor's control sequence (such as ESC-[-K in unbuffered
5639 mode. ignore incomplete MIME.
5641 mode = mime_decode_mode;
5642 mime_decode_mode = exit_mode; /* prepare for quit */
5644 while ((c1 = (*i_mgetc)(f))<=SP) {
5649 if ((c2 = (*i_mgetc)(f))<=SP) {
5652 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5653 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5656 if ((c1 == '?') && (c2 == '=')) {
5659 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5660 if (lwsp_buf==NULL) {
5661 perror("can't malloc");
5664 while ((c1=(*i_getc)(f))!=EOF) {
5669 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5677 if ((c1=(*i_getc)(f))!=EOF) {
5681 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5696 lwsp_buf[lwsp_count] = (unsigned char)c1;
5697 if (lwsp_count++>lwsp_size){
5699 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5700 if (lwsp_buf_new==NULL) {
5702 perror("can't realloc");
5705 lwsp_buf = lwsp_buf_new;
5711 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5713 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5714 i_ungetc(lwsp_buf[lwsp_count],f);
5721 if ((c3 = (*i_mgetc)(f))<=SP) {
5724 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5725 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5729 if ((c4 = (*i_mgetc)(f))<=SP) {
5732 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5733 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5737 mime_decode_mode = mode; /* still in MIME sigh... */
5739 /* BASE 64 decoding */
5741 t1 = 0x3f & base64decode(c1);
5742 t2 = 0x3f & base64decode(c2);
5743 t3 = 0x3f & base64decode(c3);
5744 t4 = 0x3f & base64decode(c4);
5745 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5747 Fifo(mime_last++) = (unsigned char)cc;
5748 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5750 Fifo(mime_last++) = (unsigned char)cc;
5751 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5753 Fifo(mime_last++) = (unsigned char)cc;
5758 return Fifo(mime_top++);
/*
 * Push c back onto the MIME Fifo: mime_top is stepped back by one and the
 * byte is stored there, so the next Fifo read returns it.  f is not used by
 * the visible statement.
 */
5761 nkf_char mime_ungetc(nkf_char c, FILE *f)
5763 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-read an encoded word into the Fifo and decide whether it is complete.
 *   f  input stream
 *   p  the matched preamble (a mime_pattern[] entry); it is copied into the
 *      Fifo first
 * Input is then read with (*i_getc) until EOF, until the buffer is full, or
 * until the terminating "?=" is seen.  Characters accepted as part of the
 * encoded text are '+', '/', '=', '?' and alphanumerics.  For an incomplete
 * word, mime_decode_mode is set to 1 (no decoding) and switch_mime_getc() is
 * called so that the buffered characters are replayed.
 * NOTE(review): the return values are not visible in this listing.
 */
5767 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5771 /* In buffered mode, read until =? or NL or buffer full
5773 mime_input = mime_top;
5774 mime_last = mime_top;
5776 while(*p) Fifo(mime_input++) = *p++;
5779 while((c=(*i_getc)(f))!=EOF) {
5780 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5781 break; /* buffer full */
5783 if (c=='=' && d=='?') {
5784 /* checked. skip header, start decode */
5785 Fifo(mime_input++) = (unsigned char)c;
5786 /* mime_last_input = mime_input; */
5791 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5793 /* Should we check length mod 4? */
5794 Fifo(mime_input++) = (unsigned char)c;
5797 /* In case of Incomplete MIME, no MIME decode */
5798 Fifo(mime_input++) = (unsigned char)c;
5799 mime_last = mime_input; /* point undecoded buffer */
5800 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5801 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one Base64 alphabet character to its 6-bit value.  The character
 * constants used as offsets are chosen for their ASCII codes
 * ('G' = 'a' - 26, '4' = 52, '>' = 62, '?' = 63).  Besides '+' and '/',
 * the visible branches also accept '-' as 62 and '_' as 63.
 * NOTE(review): several range tests and the return statement are not visible
 * in this listing.
 */
5805 nkf_char base64decode(nkf_char c)
5810 i = c - 'A'; /* A..Z 0-25 */
5811 } else if (c == '_') {
5812 i = '?' /* 63 */ ; /* _ 63 */
5814 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5816 } else if (c > '/') {
5817 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5818 } else if (c == '+' || c == '-') {
5819 i = '>' /* 62 */ ; /* + and - 62 */
5821 i = '?' /* 63 */ ; /* / 63 */
/* Base64 output alphabet, indexed by 6-bit value. */
5826 static const char basis_64[] =
5827 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Leftover bits between Base64 output steps (read by mimeout_addchar and close_mime). */
5829 static nkf_char b64c;
/* Pending characters held back by mime_putc; one spare slot beyond the limit. */
5830 #define MIMEOUT_BUF_LENGTH (60)
5831 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5832 int mimeout_buf_count = 0;
/*
 * Start a MIME encoded word for output mode `mode`.
 * The preamble is looked up by matching mode against mime_encode[]; the
 * first pattern is the fallback.  mimeout_mode is set from
 * mime_encode_method[].  When base64_count exceeds 45 a newline is emitted
 * with PUT_NEWLINE before the word starts.  White space at the front of
 * mimeout_buf is written through (*o_mputc); the characters still pending in
 * mimeout_buf are re-submitted through mime_putc() after the buffer count
 * has been reset.
 * NOTE(review): the statement that writes the preamble itself is not visible
 * in this listing.
 */
5834 void open_mime(nkf_char mode)
5836 const unsigned char *p;
5839 p = mime_pattern[0];
5840 for(i=0;mime_pattern[i];i++) {
5841 if (mode == mime_encode[i]) {
5842 p = mime_pattern[i];
5846 mimeout_mode = mime_encode_method[i];
5848 if (base64_count>45) {
5849 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5850 (*o_mputc)(mimeout_buf[i]);
5853 PUT_NEWLINE((*o_mputc));
5856 if (mimeout_buf_count>0
5857 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5858 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5862 for (;i<mimeout_buf_count;i++) {
5863 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5864 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5865 (*o_mputc)(mimeout_buf[i]);
5875 j = mimeout_buf_count;
5876 mimeout_buf_count = 0;
5878 mime_putc(mimeout_buf[i]);
/*
 * Finish the current MIME encoded word.  Depending on mimeout_mode the bits
 * left over in b64c are flushed as a final Base64 character: two leftover
 * bits are shifted left by 4, four leftover bits by 2.
 * NOTE(review): the case labels, the padding and terminator output and the
 * mode reset are not visible in this listing.
 */
5882 void close_mime(void)
5892 switch(mimeout_mode) {
5897 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5903 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5908 if (mimeout_mode > 0) {
5909 if (mimeout_f!=FIXED_MIME) {
5911 } else if (mimeout_mode != 'Q')
/*
 * Encode one byte c in the current mimeout_mode and write the result through
 * (*o_mputc).
 *   - Quoted form: a byte that is not alphanumeric is written as two
 *     hexadecimal digits (high nibble first) via bin2hex().
 *   - Base64 form: the byte is combined with the bits kept in b64c; the
 *     visible steps emit c>>2, then the two low bits of b64c with the high
 *     nibble of c, then the four low bits of b64c with the top two bits of c
 *     followed by the low six bits of c.
 * NOTE(review): the case labels and the updates of b64c / mimeout_mode
 * between the steps are not visible in this listing.
 */
5916 void mimeout_addchar(nkf_char c)
5918 switch(mimeout_mode) {
5923 } else if(!nkf_isalnum(c)) {
5925 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5926 (*o_mputc)(bin2hex((c&0xf)));
5935 (*o_mputc)(basis_64[c>>2]);
5940 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5946 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5947 (*o_mputc)(basis_64[c & 0x3F]);
5958 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Line-length guard run before a character (c2, c1) is MIME-encoded.
 * The projected column is base64_count plus four output characters for every
 * three bytes pending in mimeout_buf.  When it exceeds the threshold of the
 * active branch (73, 66 or 60), the encoder is flushed with
 * (*o_base64conv)(EOF,0), a newline is written with OCONV_NEWLINE and the
 * continuation line starts with SP.  In the last branch, taken only when
 * c2 != EOF, a new encoded word is opened first: 'Q' for ASCII / ISO_8859_1
 * output, 'B' otherwise.
 * NOTE(review): the conditions selecting between the 73 and 66 thresholds
 * are not visible in this listing.
 */
5960 void mime_prechar(nkf_char c2, nkf_char c1)
5962 if (mimeout_mode > 0){
5964 if (base64_count + mimeout_buf_count/3*4> 73){
5965 (*o_base64conv)(EOF,0);
5966 OCONV_NEWLINE((*o_base64conv));
5967 (*o_base64conv)(0,SP);
5971 if (base64_count + mimeout_buf_count/3*4> 66) {
5972 (*o_base64conv)(EOF,0);
5973 OCONV_NEWLINE((*o_base64conv));
5974 (*o_base64conv)(0,SP);
5980 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5981 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5982 open_mime(output_mode);
5983 (*o_base64conv)(EOF,0);
5984 OCONV_NEWLINE((*o_base64conv));
5985 (*o_base64conv)(0,SP);
/*
 * Output-side MIME encoder entry point: takes one character c (or EOF to
 * flush) and decides whether it is written literally through (*o_mputc),
 * held back in mimeout_buf, or encoded through mimeout_addchar().
 *
 * Sections visible below:
 *   - mimeout_f == FIXED_MIME: fixed encoding with a line break
 *     (PUT_NEWLINE) once base64_count exceeds 71.
 *   - c == EOF otherwise: the pending mimeout_buf is flushed; an encoded word
 *     is opened first when mimeout_mode == -1 and more than one character is
 *     pending.
 *   - mimeout_mode == 'Q': handling of 7-bit characters, with line breaks
 *     once base64_count exceeds 70.
 *   - mimeout_mode <= 0 (no encoded word open): characters are collected in
 *     mimeout_buf, and open_mime() is called when encoding becomes necessary
 *     or the buffer grows past MIMEOUT_BUF_LENGTH.
 *   - remaining modes ('B', 1, 2): 7-bit characters are buffered and flushed
 *     either encoded or literally, depending on the preceding character.
 * NOTE(review): many branch bodies and closing braces are not part of this
 * listing; treat the summary above as an outline only.
 */
5992 void mime_putc(nkf_char c)
5997 if (mimeout_f == FIXED_MIME){
5998 if (mimeout_mode == 'Q'){
5999 if (base64_count > 71){
6000 if (c!=CR && c!=LF) {
6002 PUT_NEWLINE((*o_mputc));
6007 if (base64_count > 71){
6009 PUT_NEWLINE((*o_mputc));
6012 if (c == EOF) { /* c==EOF */
6016 if (c != EOF) { /* c!=EOF */
6022 /* mimeout_f != FIXED_MIME */
6024 if (c == EOF) { /* c==EOF */
6025 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
6026 j = mimeout_buf_count;
6027 mimeout_buf_count = 0;
6029 if (mimeout_mode > 0) {
6030 if (!nkf_isblank(mimeout_buf[j-1])) {
6032 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
6035 mimeout_addchar(mimeout_buf[i]);
6039 mimeout_addchar(mimeout_buf[i]);
6043 mimeout_addchar(mimeout_buf[i]);
6049 mimeout_addchar(mimeout_buf[i]);
6055 if (mimeout_buf_count > 0){
6056 lastchar = mimeout_buf[mimeout_buf_count - 1];
6061 if (mimeout_mode=='Q') {
6062 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6063 if (c == CR || c == LF) {
6068 } else if (c <= SP) {
6070 if (base64_count > 70) {
6071 PUT_NEWLINE((*o_mputc));
6074 if (!nkf_isblank(c)) {
6079 if (base64_count > 70) {
6081 PUT_NEWLINE((*o_mputc));
6084 open_mime(output_mode);
6086 if (!nkf_noescape_mime(c)) {
6097 if (mimeout_mode <= 0) {
6098 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6099 if (nkf_isspace(c)) {
6101 if (mimeout_mode == -1) {
6104 if (c==CR || c==LF) {
6106 open_mime(output_mode);
6112 for (i=0;i<mimeout_buf_count;i++) {
6113 (*o_mputc)(mimeout_buf[i]);
6114 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
6125 mimeout_buf[0] = (char)c;
6126 mimeout_buf_count = 1;
6128 if (base64_count > 1
6129 && base64_count + mimeout_buf_count > 76
6130 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
6131 PUT_NEWLINE((*o_mputc));
6133 if (!nkf_isspace(mimeout_buf[0])){
6138 mimeout_buf[mimeout_buf_count++] = (char)c;
6139 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6140 open_mime(output_mode);
6145 if (lastchar==CR || lastchar == LF){
6146 for (i=0;i<mimeout_buf_count;i++) {
6147 (*o_mputc)(mimeout_buf[i]);
6150 mimeout_buf_count = 0;
6153 for (i=0;i<mimeout_buf_count-1;i++) {
6154 (*o_mputc)(mimeout_buf[i]);
6157 mimeout_buf[0] = SP;
6158 mimeout_buf_count = 1;
6160 open_mime(output_mode);
6163 /* mimeout_mode == 'B', 1, 2 */
6164 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6165 if (lastchar == CR || lastchar == LF){
6166 if (nkf_isblank(c)) {
6167 for (i=0;i<mimeout_buf_count;i++) {
6168 mimeout_addchar(mimeout_buf[i]);
6170 mimeout_buf_count = 0;
6171 } else if (SP<c && c<DEL) {
6173 for (i=0;i<mimeout_buf_count;i++) {
6174 (*o_mputc)(mimeout_buf[i]);
6177 mimeout_buf_count = 0;
6179 mimeout_buf[mimeout_buf_count++] = (char)c;
6182 if (c==SP || c==TAB || c==CR || c==LF) {
6183 for (i=0;i<mimeout_buf_count;i++) {
6184 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
6186 for (i=0;i<mimeout_buf_count;i++) {
6187 (*o_mputc)(mimeout_buf[i]);
6190 mimeout_buf_count = 0;
6193 mimeout_buf[mimeout_buf_count++] = (char)c;
6194 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6196 for (i=0;i<mimeout_buf_count;i++) {
6197 (*o_mputc)(mimeout_buf[i]);
6200 mimeout_buf_count = 0;
6204 if (mimeout_buf_count>0 && SP<c && c!='=') {
6205 mimeout_buf[mimeout_buf_count++] = (char)c;
6206 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6207 j = mimeout_buf_count;
6208 mimeout_buf_count = 0;
6210 mimeout_addchar(mimeout_buf[i]);
6217 if (mimeout_buf_count>0) {
6218 j = mimeout_buf_count;
6219 mimeout_buf_count = 0;
6221 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
6223 mimeout_addchar(mimeout_buf[i]);
6229 (*o_mputc)(mimeout_buf[i]);
6231 open_mime(output_mode);
/*
 * Body of the routine that puts the converter's global state back to its
 * defaults (the function header line is not part of this listing;
 * presumably reinit() -- TODO confirm).
 * Visible resets: option flags back to FALSE or their *_DEFAULT macros,
 * Unicode options (only under the UTF8_* build switches), prefix_table[]
 * cleared, mimeout_buf_count zeroed, every o_* output stage pointed at
 * no_connection, the i_*ungetc hooks pointed at std_ungetc, output_mode set
 * to ASCII and the guessed input_codename cleared.
 */
6241 struct input_code *p = input_code_list;
6254 mime_f = MIME_DECODE_DEFAULT;
6255 mime_decode_f = FALSE;
6260 x0201_f = X0201_DEFAULT;
6261 iso2022jp_f = FALSE;
6262 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6263 ms_ucs_map_f = UCS_MAP_ASCII;
6265 #ifdef UTF8_INPUT_ENABLE
6266 no_cp932ext_f = FALSE;
6267 no_best_fit_chars_f = FALSE;
6268 encode_fallback = NULL;
6269 unicode_subchar = '?';
6270 input_endian = ENDIAN_BIG;
6272 #ifdef UTF8_OUTPUT_ENABLE
6273 output_bom_f = FALSE;
6274 output_endian = ENDIAN_BIG;
6276 #ifdef UNICODE_NORMALIZATION
6292 #ifdef SHIFTJIS_CP932
6302 for (i = 0; i < 256; i++){
6303 prefix_table[i] = 0;
6307 mimeout_buf_count = 0;
6312 fold_preserve_f = FALSE;
6315 kanji_intro = DEFAULT_J;
6316 ascii_intro = DEFAULT_R;
6317 fold_margin = FOLD_MARGIN;
6318 output_conv = DEFAULT_CONV;
6319 oconv = DEFAULT_CONV;
6320 o_zconv = no_connection;
6321 o_fconv = no_connection;
6322 o_nlconv = no_connection;
6323 o_rot_conv = no_connection;
6324 o_hira_conv = no_connection;
6325 o_base64conv = no_connection;
6326 o_iso2022jp_check_conv = no_connection;
6329 i_ungetc = std_ungetc;
6331 i_bungetc = std_ungetc;
6334 i_mungetc = std_ungetc;
6335 i_mgetc_buf = std_getc;
6336 i_mungetc_buf = std_ungetc;
6337 output_mode = ASCII;
6340 mime_decode_mode = FALSE;
6348 z_prev2=0,z_prev1=0;
6350 iconv_for_check = 0;
6352 input_codename = NULL;
6353 output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
/*
 * Placeholder installed in unused output-stage pointers; it forwards to
 * no_connection2 with c0 = 0, which reports an internal error.
 */
6359 void no_connection(nkf_char c2, nkf_char c1)
6361 no_connection2(c2,c1,0);
/*
 * Report on stderr that a conversion stage was called without being
 * connected.  The arguments are not used by the visible statements.
 * NOTE(review): a statement between the message and the return is not
 * visible here; per the LINT remark the return may be unreachable.
 */
6364 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6366 fprintf(stderr,"nkf internal module connection failure.\n");
6368 return 0; /* LINT */
6373 #define fprintf dllprintf
/*
 * Body of the command-line help printer (the function header line is not
 * part of this listing; presumably usage() -- TODO confirm).  Every line is
 * written to HELP_OUTPUT.  The "j,s,e,w" line is chosen by the
 * DEFAULT_CODE_* build macro, and several option lines are only compiled in
 * under their feature macros.
 */
6377 fprintf(HELP_OUTPUT,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6378 fprintf(HELP_OUTPUT,"Flags:\n");
6379 fprintf(HELP_OUTPUT,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6380 #ifdef DEFAULT_CODE_SJIS
6381 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6383 #ifdef DEFAULT_CODE_JIS
6384 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6386 #ifdef DEFAULT_CODE_EUC
6387 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6389 #ifdef DEFAULT_CODE_UTF8
6390 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6392 #ifdef UTF8_OUTPUT_ENABLE
6393 fprintf(HELP_OUTPUT," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6395 fprintf(HELP_OUTPUT,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6396 #ifdef UTF8_INPUT_ENABLE
6397 fprintf(HELP_OUTPUT," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6399 fprintf(HELP_OUTPUT,"t no conversion\n");
6400 fprintf(HELP_OUTPUT,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6401 fprintf(HELP_OUTPUT,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6402 fprintf(HELP_OUTPUT,"r {de/en}crypt ROT13/47\n");
6403 fprintf(HELP_OUTPUT,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6404 fprintf(HELP_OUTPUT,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6405 fprintf(HELP_OUTPUT,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6406 fprintf(HELP_OUTPUT,"l ISO8859-1 (Latin-1) support\n");
6407 fprintf(HELP_OUTPUT,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6408 fprintf(HELP_OUTPUT,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6409 fprintf(HELP_OUTPUT," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6410 fprintf(HELP_OUTPUT," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6411 fprintf(HELP_OUTPUT,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6412 fprintf(HELP_OUTPUT,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6414 fprintf(HELP_OUTPUT,"T Text mode output\n");
6416 fprintf(HELP_OUTPUT,"O Output to File (DEFAULT 'nkf.out')\n");
6417 fprintf(HELP_OUTPUT,"I Convert non ISO-2022-JP charactor to GETA\n");
6418 fprintf(HELP_OUTPUT,"d,c Convert line breaks -d: LF -c: CRLF\n");
6419 fprintf(HELP_OUTPUT,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6420 fprintf(HELP_OUTPUT,"v, V Show this usage. V: show configuration\n");
6421 fprintf(HELP_OUTPUT,"\n");
6422 fprintf(HELP_OUTPUT,"Long name options\n");
6423 fprintf(HELP_OUTPUT," --ic=<input codeset> --oc=<output codeset>\n");
6424 fprintf(HELP_OUTPUT," Specify the input or output codeset\n");
6425 fprintf(HELP_OUTPUT," --fj --unix --mac --windows\n");
6426 fprintf(HELP_OUTPUT," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6427 fprintf(HELP_OUTPUT," Convert for the system or code\n");
6428 fprintf(HELP_OUTPUT," --hiragana --katakana --katakana-hiragana\n");
6429 fprintf(HELP_OUTPUT," To Hiragana/Katakana Conversion\n");
6430 fprintf(HELP_OUTPUT," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6432 fprintf(HELP_OUTPUT," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6434 #ifdef NUMCHAR_OPTION
6435 fprintf(HELP_OUTPUT," --numchar-input Convert Unicode Character Reference\n");
6437 #ifdef UTF8_INPUT_ENABLE
6438 fprintf(HELP_OUTPUT," --fb-{skip, html, xml, perl, java, subchar}\n");
6439 fprintf(HELP_OUTPUT," Specify how nkf handles unassigned characters\n");
6442 fprintf(HELP_OUTPUT," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6443 fprintf(HELP_OUTPUT," Overwrite original listed files by filtered result\n");
6444 fprintf(HELP_OUTPUT," --overwrite preserves timestamp of original files\n");
6446 fprintf(HELP_OUTPUT," -g --guess Guess the input code\n");
6447 fprintf(HELP_OUTPUT," --help --version Show this help/the version\n");
6448 fprintf(HELP_OUTPUT," For more information, see also man nkf\n");
6449 fprintf(HELP_OUTPUT,"\n");
/*
 * Print a summary of the compile-time configuration to HELP_OUTPUT: version
 * and release date, default output encoding, default output newline, whether
 * MIME decoding is on by default, JIS X 0201 Katakana conversion, and where
 * --help / --version output goes.  Each value is selected by preprocessor
 * conditionals and joined to its label by string-literal concatenation.
 * NOTE(review): the value strings themselves are not visible in this listing.
 */
6453 void show_configuration(void)
6455 fprintf(HELP_OUTPUT, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6456 fprintf(HELP_OUTPUT, " Compile-time options:\n");
6457 fprintf(HELP_OUTPUT, " Default output encoding: "
6458 #if defined(DEFAULT_CODE_JIS)
6460 #elif defined(DEFAULT_CODE_SJIS)
6462 #elif defined(DEFAULT_CODE_EUC)
6464 #elif defined(DEFAULT_CODE_UTF8)
6468 fprintf(HELP_OUTPUT, " Default output newline: "
6469 #if DEFAULT_NEWLINE == CR
6471 #elif DEFAULT_NEWLINE == CRLF
6477 fprintf(HELP_OUTPUT, " Decode MIME encoded string: "
6478 #if MIME_DECODE_DEFAULT
6484 fprintf(HELP_OUTPUT, " Convert JIS X 0201 Katakana: "
6491 fprintf(HELP_OUTPUT, " --help, --version output: "
6492 #if HELP_OUTPUT_HELP_OUTPUT
6502 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");