1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
nkf is currently maintained on SourceForge.
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.165 2008/01/22 00:30:05 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2008-01-21"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
43 #ifndef MIME_DECODE_DEFAULT
44 #define MIME_DECODE_DEFAULT STRICT_MIME
47 #define X0201_DEFAULT TRUE
50 #if DEFAULT_NEWLINE == 0x0D0A
51 #define PUT_NEWLINE(func) do {\
55 #define OCONV_NEWLINE(func) do {\
59 #elif DEFAULT_NEWLINE == 0x0D
60 #define PUT_NEWLINE(func) func(0x0D)
61 #define OCONV_NEWLINE(func) func(0, 0x0D)
63 #define DEFAULT_NEWLINE 0x0A
64 #define PUT_NEWLINE(func) func(0x0A)
65 #define OCONV_NEWLINE(func) func(0, 0x0A)
67 #ifdef HELP_OUTPUT_STDERR
68 #define HELP_OUTPUT stderr
70 #define HELP_OUTPUT stdout
73 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
75 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
91 #if defined(MSDOS) || defined(__OS2__)
94 #if defined(_MSC_VER) || defined(__WATCOMC__)
95 #define mktemp _mktemp
101 #define setbinmode(fp) fsetbin(fp)
102 #elif defined(__DJGPP__)
103 #include <libc/dosio.h>
104 #define setbinmode(fp) djgpp_setbinmode(fp)
105 #else /* Microsoft C, Turbo C */
106 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
109 #define setbinmode(fp)
112 #if defined(__DJGPP__)
113 void djgpp_setbinmode(FILE *fp)
115 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
118 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
119 __file_handle_set(fd, m);
123 #ifdef _IOFBF /* SysV and MSDOS, Windows */
124 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
126 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
129 /*Borland C++ 4.5 EasyWin*/
130 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
139 /* added by satoru@isoternet.org */
141 #include <sys/types.h>
143 #include <sys/stat.h>
144 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
146 #if defined(__WATCOMC__)
147 #include <sys/utime.h>
151 #else /* defined(MSDOS) */
153 #ifdef __BORLANDC__ /* BCC32 */
155 #else /* !defined(__BORLANDC__) */
156 #include <sys/utime.h>
157 #endif /* (__BORLANDC__) */
158 #else /* !defined(__WIN32__) */
159 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
160 #include <sys/utime.h>
161 #elif defined(__TURBOC__) /* BCC */
163 #elif defined(LSI_C) /* LSI C */
164 #endif /* (__WIN32__) */
171 #define HAVE_LANGINFO_H
172 #define HAVE_LOCALE_H
175 #ifdef HAVE_LANGINFO_H
176 #include <langinfo.h>
185 /* state of output_mode and input_mode
199 #define STRICT_MIME 8
263 NKF_ENCODING_TABLE_SIZE,
272 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
273 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
274 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
275 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
276 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
277 void j_oconv(nkf_char c2, nkf_char c1);
278 void s_oconv(nkf_char c2, nkf_char c1);
279 void e_oconv(nkf_char c2, nkf_char c1);
280 void w_oconv(nkf_char c2, nkf_char c1);
281 void w_oconv16(nkf_char c2, nkf_char c1);
282 void w_oconv32(nkf_char c2, nkf_char c1);
286 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
287 void (*oconv)(nkf_char c2, nkf_char c1);
288 } nkf_native_encoding;
290 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
291 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
292 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
293 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
294 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
295 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
296 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
301 const nkf_native_encoding *base_encoding;
304 nkf_encoding nkf_encoding_table[] = {
305 {ASCII, "US-ASCII", &NkfEncodingASCII},
306 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
307 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
308 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
309 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
310 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
311 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
312 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
313 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
314 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
315 {CP10001, "CP10001", &NkfEncodingShift_JIS},
316 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
317 {CP51932, "CP51932", &NkfEncodingEUC_JP},
318 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
319 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
320 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
321 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
322 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
323 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
324 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
325 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
326 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
327 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
328 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
329 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
330 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
331 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
332 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
333 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
334 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
335 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
336 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
337 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
338 {BINARY, "BINARY", &NkfEncodingASCII},
345 } encoding_name_to_id_table[] = {
348 {"ISO-2022-JP", ISO_2022_JP},
349 {"ISO2022JP-CP932", CP50220},
350 {"CP50220", CP50220},
351 {"CP50221", CP50221},
352 {"CP50222", CP50222},
353 {"ISO-2022-JP-1", ISO_2022_JP_1},
354 {"ISO-2022-JP-3", ISO_2022_JP_3},
355 {"SHIFT_JIS", SHIFT_JIS},
357 {"WINDOWS-31J", WINDOWS_31J},
358 {"CSWINDOWS31J", WINDOWS_31J},
359 {"CP932", WINDOWS_31J},
360 {"MS932", WINDOWS_31J},
361 {"CP10001", CP10001},
364 {"CP51932", CP51932},
365 {"EUC-JP-MS", EUCJP_MS},
366 {"EUCJP-MS", EUCJP_MS},
367 {"EUCJPMS", EUCJP_MS},
368 {"EUC-JP-ASCII", EUCJP_ASCII},
369 {"EUCJP-ASCII", EUCJP_ASCII},
370 {"SHIFT_JISX0213", SHIFT_JISX0213},
371 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
372 {"EUC-JISX0213", EUC_JISX0213},
373 {"EUC-JIS-2004", EUC_JIS_2004},
376 {"UTF-8-BOM", UTF_8_BOM},
377 {"UTF8-MAC", UTF8_MAC},
378 {"UTF-8-MAC", UTF8_MAC},
380 {"UTF-16BE", UTF_16BE},
381 {"UTF-16BE-BOM", UTF_16BE_BOM},
382 {"UTF-16LE", UTF_16LE},
383 {"UTF-16LE-BOM", UTF_16LE_BOM},
385 {"UTF-32BE", UTF_32BE},
386 {"UTF-32BE-BOM", UTF_32BE_BOM},
387 {"UTF-32LE", UTF_32LE},
388 {"UTF-32LE-BOM", UTF_32LE_BOM},
393 #if defined(DEFAULT_CODE_JIS)
394 #define DEFAULT_ENCIDX ISO_2022_JP
395 #elif defined(DEFAULT_CODE_SJIS)
396 #define DEFAULT_ENCIDX SHIFT_JIS
397 #elif defined(DEFAULT_CODE_EUC)
398 #define DEFAULT_ENCIDX EUC_JP
399 #elif defined(DEFAULT_CODE_UTF8)
400 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These are plain range tests on the character value and do not depend on
 * the C locale.  Every parameter is parenthesized so that an expression
 * argument (e.g. `a | b` or `x ? y : z`) is treated as one operand.
 * Arguments are still evaluated more than once: do not pass expressions
 * with side effects.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* Value of a hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
		    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
		    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/* True when bits 8..15 of c2 (taken as unsigned short) equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * True for CR, LF, and for characters strictly between SP and DEL other
 * than  ? = _ ( ) .  and the double quote (0x22).
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/*
 * Byte ranges used for CP932 handling.  is_ibmext_in_sjis(c2) is true when
 * c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * NOTE(review): presumably these are Shift_JIS lead bytes of the IBM
 * extended rows and of the NEC-selected rows they map back from - confirm
 * against the conversion tables.
 * The macro evaluates its argument twice; avoid side effects.
 */
#define CP932_TABLE_BEGIN    0xFA
#define CP932_TABLE_END      0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
433 #define HOLD_SIZE 1024
434 #if defined(INT_IS_SHORT)
435 #define IOBUF_SIZE 2048
437 #define IOBUF_SIZE 16384
440 #define DEFAULT_J 'B'
441 #define DEFAULT_R 'B'
443 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
444 #define SJ6394 0x0161 /* 63 - 94 ku offset */
446 #define RANGE_NUM_MAX 18
451 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
452 #define sizeof_euc_to_utf8_1byte 94
453 #define sizeof_euc_to_utf8_2bytes 94
454 #define sizeof_utf8_to_euc_C2 64
455 #define sizeof_utf8_to_euc_E5B8 64
456 #define sizeof_utf8_to_euc_2bytes 112
457 #define sizeof_utf8_to_euc_3bytes 16
460 /* MIME preprocessor */
462 #ifdef EASYWIN /*Easy Win */
463 extern POINT _BufferSize;
472 void (*status_func)(struct input_code *, nkf_char);
473 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
477 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
478 static nkf_encoding *input_encoding = NULL;
479 static nkf_encoding *output_encoding = NULL;
480 static void set_output_encoding(nkf_encoding *enc);
482 #if !defined(PERL_XS) && !defined(WIN32DLL)
483 static nkf_char noconvert(FILE *f);
485 static void module_connection(void);
486 static nkf_char kanji_convert(FILE *f);
487 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
488 static nkf_char push_hold_buf(nkf_char c2);
489 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
490 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
491 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
493 * 0: Shift_JIS, eucJP-ascii
498 #define UCS_MAP_ASCII 0
500 #define UCS_MAP_CP932 2
501 #define UCS_MAP_CP10001 3
502 static int ms_ucs_map_f = UCS_MAP_ASCII;
504 #ifdef UTF8_INPUT_ENABLE
505 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
506 static int no_cp932ext_f = FALSE;
507 /* ignore ZERO WIDTH NO-BREAK SPACE */
508 static int no_best_fit_chars_f = FALSE;
509 static int input_endian = ENDIAN_BIG;
510 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
511 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
512 static void encode_fallback_html(nkf_char c);
513 static void encode_fallback_xml(nkf_char c);
514 static void encode_fallback_java(nkf_char c);
515 static void encode_fallback_perl(nkf_char c);
516 static void encode_fallback_subchar(nkf_char c);
517 static void (*encode_fallback)(nkf_char c) = NULL;
518 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
519 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
520 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
521 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
522 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
523 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
524 static void w_status(struct input_code *, nkf_char);
526 #ifdef UTF8_OUTPUT_ENABLE
527 static int output_bom_f = FALSE;
528 static int output_endian = ENDIAN_BIG;
529 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
531 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
532 static void fold_conv(nkf_char c2,nkf_char c1);
533 static void nl_conv(nkf_char c2,nkf_char c1);
534 static void z_conv(nkf_char c2,nkf_char c1);
535 static void rot_conv(nkf_char c2,nkf_char c1);
536 static void hira_conv(nkf_char c2,nkf_char c1);
537 static void base64_conv(nkf_char c2,nkf_char c1);
538 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
539 static void no_connection(nkf_char c2,nkf_char c1);
540 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
542 static void code_score(struct input_code *ptr);
543 static void code_status(nkf_char c);
545 static void std_putc(nkf_char c);
546 static nkf_char std_getc(FILE *f);
547 static nkf_char std_ungetc(nkf_char c,FILE *f);
549 static nkf_char broken_getc(FILE *f);
550 static nkf_char broken_ungetc(nkf_char c,FILE *f);
552 static nkf_char mime_begin(FILE *f);
553 static nkf_char mime_getc(FILE *f);
554 static nkf_char mime_ungetc(nkf_char c,FILE *f);
556 static void switch_mime_getc(void);
557 static void unswitch_mime_getc(void);
558 static nkf_char mime_begin_strict(FILE *f);
559 static nkf_char mime_getc_buf(FILE *f);
560 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
561 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
563 static nkf_char base64decode(nkf_char c);
564 static void mime_prechar(nkf_char c2, nkf_char c1);
565 static void mime_putc(nkf_char c);
566 static void open_mime(nkf_char c);
567 static void close_mime(void);
568 static void eof_mime(void);
569 static void mimeout_addchar(nkf_char c);
571 static void usage(void);
572 static void version(void);
573 static void show_configuration(void);
575 static void options(unsigned char *c);
576 static void reinit(void);
580 #if !defined(PERL_XS) && !defined(WIN32DLL)
581 static unsigned char stdibuf[IOBUF_SIZE];
582 static unsigned char stdobuf[IOBUF_SIZE];
584 static unsigned char hold_buf[HOLD_SIZE*2];
585 static int hold_count = 0;
587 /* MIME preprocessor fifo */
589 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
590 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
591 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
592 static unsigned char mime_buf[MIME_BUF_SIZE];
593 static unsigned int mime_top = 0;
594 static unsigned int mime_last = 0; /* decoded */
595 static unsigned int mime_input = 0; /* undecoded */
596 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Option flags; the initializers are the start-up defaults. */
599 static int unbuf_f = FALSE;
600 static int estab_f = FALSE;
601 static int nop_f = FALSE;
602 static int binmode_f = TRUE; /* binary mode */
603 static int rot_f = FALSE; /* rot13/47 mode */
604 static int hira_f = FALSE; /* hiragana/katakana conversion (henkan) */
605 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
606 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
607 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
608 static int mimebuf_f = FALSE; /* MIME buffered input */
609 static int broken_f = FALSE; /* convert ESC-less broken JIS */
610 static int iso8859_f = FALSE; /* ISO8859 through */
611 static int mimeout_f = FALSE; /* base64 mode */
612 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
613 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
615 #ifdef UNICODE_NORMALIZATION
616 static int nfc_f = FALSE;
617 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
618 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
619 static nkf_char nfc_getc(FILE *f);
620 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
624 static int cap_f = FALSE;
625 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
626 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
627 static nkf_char cap_getc(FILE *f);
628 static nkf_char cap_ungetc(nkf_char c,FILE *f);
630 static int url_f = FALSE;
631 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
632 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
633 static nkf_char url_getc(FILE *f);
634 static nkf_char url_ungetc(nkf_char c,FILE *f);
/*
 * NKF_INT32_C(n) spells a 32-bit constant: when INT_IS_SHORT is defined the
 * literal gets an L suffix, otherwise it is used as written.
 */
#if defined(INT_IS_SHORT)
#define NKF_INT32_C(n)   (n##L)
#else
#define NKF_INT32_C(n)   (n)
#endif
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
/* The top byte of a character value is its class; the low 24 bits its value. */
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* True when the class byte of c is CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the low 24 bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
650 #ifdef NUMCHAR_OPTION
651 static int numchar_f = FALSE;
652 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
653 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
654 static nkf_char numchar_getc(FILE *f);
655 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
659 static int noout_f = FALSE;
660 static void no_putc(nkf_char c);
661 static int debug_f = FALSE;
662 static void debug(const char *str);
663 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
666 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
668 static void print_guessed_code(char *filename);
670 static void set_input_codename(char *codename);
673 static int exec_f = 0;
676 #ifdef SHIFTJIS_CP932
677 /* invert IBM extended characters to others */
678 static int cp51932_f = FALSE;
680 /* invert NEC-selected IBM extended characters to IBM extended characters */
681 static int cp932inv_f = TRUE;
683 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
684 #endif /* SHIFTJIS_CP932 */
687 static int x0212_f = FALSE;
688 static nkf_char x0212_shift(nkf_char c);
689 static nkf_char x0212_unshift(nkf_char c);
691 static int x0213_f = FALSE;
693 static unsigned char prefix_table[256];
695 static void set_code_score(struct input_code *ptr, nkf_char score);
696 static void clr_code_score(struct input_code *ptr, nkf_char score);
697 static void status_disable(struct input_code *ptr);
698 static void status_push_ch(struct input_code *ptr, nkf_char c);
699 static void status_clear(struct input_code *ptr);
700 static void status_reset(struct input_code *ptr);
701 static void status_reinit(struct input_code *ptr);
702 static void status_check(struct input_code *ptr, nkf_char c);
703 static void e_status(struct input_code *, nkf_char);
704 static void s_status(struct input_code *, nkf_char);
706 struct input_code input_code_list[] = {
707 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
708 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
709 #ifdef UTF8_INPUT_ENABLE
710 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
711 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
712 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
717 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
718 static int base64_count = 0;
720 /* X0208 -> ASCII converter */
723 static int f_line = 0; /* chars in line */
724 static int f_prev = 0;
725 static int fold_preserve_f = FALSE; /* preserve new lines */
726 static int fold_f = FALSE;
727 static int fold_len = 0;
730 static unsigned char kanji_intro = DEFAULT_J;
731 static unsigned char ascii_intro = DEFAULT_R;
735 #define FOLD_MARGIN 10
736 #define DEFAULT_FOLD 60
738 static int fold_margin = FOLD_MARGIN;
740 /* process default */
741 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
742 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
744 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
745 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
746 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
747 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
748 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
749 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
750 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
752 /* static redirections */
754 static void (*o_putc)(nkf_char c) = std_putc;
756 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
757 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
759 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
760 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
762 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
764 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
765 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
767 /* for strict mime */
768 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
769 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
772 static int output_mode = ASCII, /* output kanji mode */
773 input_mode = ASCII, /* input kanji mode */
774 shift_mode = FALSE; /* TRUE shift out, or X0201 */
775 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
777 /* X0201 / X0208 conversion tables */
779 /* X0201 kana conversion table */
781 static const unsigned char cv[]= {
782 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
783 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
784 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
785 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
786 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
787 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
788 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
789 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
790 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
791 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
792 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
793 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
794 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
795 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
796 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
797 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
801 /* X0201 kana conversion table for dakuten */
803 static const unsigned char dv[]= {
804 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
805 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
806 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
807 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
808 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
809 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
810 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
811 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
812 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
813 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
814 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
815 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
816 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
817 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
818 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
819 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
822 /* X0201 kana conversion table for han-dakuten */
824 static const unsigned char ev[]= {
825 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
826 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
827 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
828 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
829 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
830 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
831 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
832 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
833 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
834 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
835 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
836 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
837 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
838 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
839 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
840 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
844 /* X0208 kigou conversion table */
845 /* 0x8140 - 0x819e */
846 static const unsigned char fv[] = {
848 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
849 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
850 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
851 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
852 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
853 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
854 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
855 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
856 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
857 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
858 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
859 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
864 static int file_out_f = FALSE;
866 static int overwrite_f = FALSE;
867 static int preserve_time_f = FALSE;
868 static int backup_f = FALSE;
869 static char *backup_suffix = "";
870 static char *get_backup_filename(const char *suffix, const char *filename);
873 static int nlmode_f = 0; /* CR, LF, CRLF */
874 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
875 static nkf_char prev_cr = 0; /* CR or 0 */
876 #ifdef EASYWIN /*Easy Win */
877 static int end_check;
880 #define STD_GC_BUFSIZE (256)
881 nkf_char std_gc_buf[STD_GC_BUFSIZE];
884 char* nkf_strcpy(const char *str)
886 char* result = malloc(strlen(str) + 1);
895 static void nkf_str_upcase(const char *src, char *dest, size_t length)
898 for (; i < length && src[i]; i++) {
899 dest[i] = nkf_toupper(src[i]);
904 static nkf_encoding *nkf_enc_from_index(int idx)
906 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
909 return &nkf_encoding_table[idx];
912 static int nkf_enc_find_index(const char *name)
915 if (*name == 'X' && *(name+1) == '-') name += 2;
916 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
917 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
918 return encoding_name_to_id_table[i].id;
924 static nkf_encoding *nkf_enc_find(const char *name)
927 idx = nkf_enc_find_index(name);
928 if (idx < 0) return 0;
929 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for an encoding descriptor pointer `enc`.
 * Each macro may evaluate `enc` more than once.
 */
/* Name string of the encoding. */
932 #define nkf_enc_name(enc) (enc)->name
/* Numeric id of the encoding. */
933 #define nkf_enc_to_index(enc) (enc)->id
/* Pointer to the native (base) encoding the encoding is built on. */
934 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input and output converter functions of the base encoding. */
935 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
936 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
937 #define nkf_enc_asciicompat(enc) (\
938 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
939 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 natives. */
940 #define nkf_enc_unicode_p(enc) (\
941 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
942 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
943 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the encoding id is CP50220, CP50221 or CP50222. */
944 #define nkf_enc_cp5022x_p(enc) (\
945 nkf_enc_to_index(enc) == CP50220 ||\
946 nkf_enc_to_index(enc) == CP50221 ||\
947 nkf_enc_to_index(enc) == CP50222)
949 #ifndef DEFAULT_ENCIDX
950 static char* nkf_locale_charmap()
952 #ifdef HAVE_LANGINFO_H
953 return nl_langinfo(CODESET);
954 #elif defined(__WIN32__)
955 return sprintf("CP%d", GetACP());
961 static nkf_encoding* nkf_locale_encoding()
963 nkf_encoding *enc = 0;
964 char *encname = nkf_locale_charmap();
966 enc = nkf_enc_find(encname);
967 if (enc < 0) enc = 0;
972 static nkf_encoding* nkf_default_encoding()
974 #ifdef DEFAULT_ENCIDX
975 return nkf_enc_from_index(DEFAULT_ENCIDX);
977 nkf_encoding *enc = nkf_locale_encoding();
978 if (enc <= 0) enc = nkf_enc_from_index(ISO_2022_JP);
984 #include "nkf32dll.c"
985 #elif defined(PERL_XS)
987 int main(int argc, char **argv)
992 char *outfname = NULL;
995 #ifdef EASYWIN /*Easy Win */
996 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
998 setlocale(LC_CTYPE, "");
1000 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
1001 cp = (unsigned char *)*argv;
1006 if (pipe(fds) < 0 || (pid = fork()) < 0){
1017 execvp(argv[1], &argv[1]);
1034 int debug_f_back = debug_f;
1037 int exec_f_back = exec_f;
1040 int x0212_f_back = x0212_f;
1042 int x0213_f_back = x0213_f;
1043 int guess_f_back = guess_f;
1045 guess_f = guess_f_back;
1048 debug_f = debug_f_back;
1051 exec_f = exec_f_back;
1054 x0212_f = x0212_f_back;
1056 x0213_f = x0213_f_back;
1059 if (binmode_f == TRUE)
1060 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1061 if (freopen("","wb",stdout) == NULL)
1068 setbuf(stdout, (char *) NULL);
1070 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
1073 if (binmode_f == TRUE)
1074 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1075 if (freopen("","rb",stdin) == NULL) return (-1);
1079 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
1083 kanji_convert(stdin);
1084 if (guess_f) print_guessed_code(NULL);
1088 int is_argument_error = FALSE;
1090 input_codename = NULL;
1093 iconv_for_check = 0;
1095 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
1097 is_argument_error = TRUE;
1105 /* reopen file for stdout */
1106 if (file_out_f == TRUE) {
1109 outfname = malloc(strlen(origfname)
1110 + strlen(".nkftmpXXXXXX")
1116 strcpy(outfname, origfname);
1120 for (i = strlen(outfname); i; --i){
1121 if (outfname[i - 1] == '/'
1122 || outfname[i - 1] == '\\'){
1128 strcat(outfname, "ntXXXXXX");
1130 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
1131 S_IREAD | S_IWRITE);
1133 strcat(outfname, ".nkftmpXXXXXX");
1134 fd = mkstemp(outfname);
1137 || (fd_backup = dup(fileno(stdout))) < 0
1138 || dup2(fd, fileno(stdout)) < 0
1149 outfname = "nkf.out";
1152 if(freopen(outfname, "w", stdout) == NULL) {
1156 if (binmode_f == TRUE) {
1157 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1158 if (freopen("","wb",stdout) == NULL)
1165 if (binmode_f == TRUE)
1166 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1167 if (freopen("","rb",fin) == NULL)
1172 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
1176 char *filename = NULL;
1178 if (nfiles > 1) filename = origfname;
1179 if (guess_f) print_guessed_code(filename);
1185 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1193 if (dup2(fd_backup, fileno(stdout)) < 0){
1196 if (stat(origfname, &sb)) {
1197 fprintf(stderr, "Can't stat %s\n", origfname);
1199 /* Restore the original file's permissions. */
1200 if (chmod(outfname, sb.st_mode)) {
1201 fprintf(stderr, "Can't set permission %s\n", outfname);
1204 /* Restore the original file's timestamp. */
1205 if(preserve_time_f){
1206 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1207 tb[0] = tb[1] = sb.st_mtime;
1208 if (utime(outfname, tb)) {
1209 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1212 tb.actime = sb.st_atime;
1213 tb.modtime = sb.st_mtime;
1214 if (utime(outfname, &tb)) {
1215 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1220 char *backup_filename = get_backup_filename(backup_suffix, origfname);
1222 unlink(backup_filename);
1224 if (rename(origfname, backup_filename)) {
1225 perror(backup_filename);
1226 fprintf(stderr, "Can't rename %s to %s\n",
1227 origfname, backup_filename);
1231 if (unlink(origfname)){
1236 if (rename(outfname, origfname)) {
1238 fprintf(stderr, "Can't rename %s to %s\n",
1239 outfname, origfname);
1246 if (is_argument_error)
1249 #ifdef EASYWIN /*Easy Win */
1250 if (file_out_f == FALSE)
1251 scanf("%d",&end_check);
1254 #else /* for Other OS */
1255 if (file_out_f == TRUE)
1257 #endif /*Easy Win */
1260 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied literally
 * (e.g. suffix "*.orig" yields "<filename>.orig").  Otherwise `suffix` is
 * appended to `filename`.
 *
 * Returns a newly malloc'ed, NUL-terminated string owned by the caller, or
 * NULL (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' expands to filename_length bytes */
        total = (suffix_length - asterisk_count)
            + asterisk_count * filename_length;
    } else {
        total = filename_length + suffix_length;
    }

    /* The size must cover the whole result plus the terminator in BOTH
     * modes; the plain-append mode previously allocated a single byte. */
    backup_filename = malloc(total + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        /* suffix_length + 1 also copies the terminating NUL */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }

    return backup_filename;
}
1303 static const struct {
1327 {"katakana-hiragana","h3"},
1335 #ifdef UTF8_OUTPUT_ENABLE
1345 {"fb-subchar=", ""},
1347 #ifdef UTF8_INPUT_ENABLE
1348 {"utf8-input", "W"},
1349 {"utf16-input", "W16"},
1350 {"no-cp932ext", ""},
1351 {"no-best-fit-chars",""},
1353 #ifdef UNICODE_NORMALIZATION
1354 {"utf8mac-input", ""},
1366 #ifdef NUMCHAR_OPTION
1367 {"numchar-input", ""},
1373 #ifdef SHIFTJIS_CP932
1383 static void set_input_encoding(nkf_encoding *enc)
1385 switch (nkf_enc_to_index(enc)) {
1389 #ifdef SHIFTJIS_CP932
1392 #ifdef UTF8_OUTPUT_ENABLE
1393 ms_ucs_map_f = UCS_MAP_CP932;
1410 #ifdef SHIFTJIS_CP932
1413 #ifdef UTF8_OUTPUT_ENABLE
1414 ms_ucs_map_f = UCS_MAP_CP932;
1420 #ifdef SHIFTJIS_CP932
1423 #ifdef UTF8_OUTPUT_ENABLE
1424 ms_ucs_map_f = UCS_MAP_CP10001;
1428 #ifdef SHIFTJIS_CP932
1431 #ifdef UTF8_OUTPUT_ENABLE
1432 ms_ucs_map_f = UCS_MAP_CP932;
1436 #ifdef SHIFTJIS_CP932
1439 #ifdef UTF8_OUTPUT_ENABLE
1440 ms_ucs_map_f = UCS_MAP_MS;
1444 #ifdef SHIFTJIS_CP932
1447 #ifdef UTF8_OUTPUT_ENABLE
1448 ms_ucs_map_f = UCS_MAP_ASCII;
1451 case SHIFT_JISX0213:
1452 case SHIFT_JIS_2004:
1454 #ifdef SHIFTJIS_CP932
1461 #ifdef SHIFTJIS_CP932
1465 #ifdef UTF8_INPUT_ENABLE
1466 #ifdef UNICODE_NORMALIZATION
1474 input_endian = ENDIAN_BIG;
1478 input_endian = ENDIAN_LITTLE;
1483 input_endian = ENDIAN_BIG;
1487 input_endian = ENDIAN_LITTLE;
1493 static void set_output_encoding(nkf_encoding *enc)
1495 switch (nkf_enc_to_index(enc)) {
1498 #ifdef SHIFTJIS_CP932
1499 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1501 #ifdef UTF8_OUTPUT_ENABLE
1502 ms_ucs_map_f = UCS_MAP_CP932;
1506 #ifdef SHIFTJIS_CP932
1507 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1509 #ifdef UTF8_OUTPUT_ENABLE
1510 ms_ucs_map_f = UCS_MAP_CP932;
1517 #ifdef SHIFTJIS_CP932
1518 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1526 #ifdef SHIFTJIS_CP932
1527 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1533 #ifdef UTF8_OUTPUT_ENABLE
1534 ms_ucs_map_f = UCS_MAP_CP932;
1538 #ifdef UTF8_OUTPUT_ENABLE
1539 ms_ucs_map_f = UCS_MAP_CP10001;
1544 #ifdef SHIFTJIS_CP932
1545 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1547 #ifdef UTF8_OUTPUT_ENABLE
1548 ms_ucs_map_f = UCS_MAP_CP932;
1552 #ifdef SHIFTJIS_CP932
1553 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1555 #ifdef UTF8_OUTPUT_ENABLE
1556 ms_ucs_map_f = UCS_MAP_CP932;
1563 #ifdef UTF8_OUTPUT_ENABLE
1564 ms_ucs_map_f = UCS_MAP_MS;
1571 #ifdef UTF8_OUTPUT_ENABLE
1572 ms_ucs_map_f = UCS_MAP_ASCII;
1575 case SHIFT_JISX0213:
1576 case SHIFT_JIS_2004:
1578 #ifdef SHIFTJIS_CP932
1579 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1588 #ifdef SHIFTJIS_CP932
1589 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1592 #ifdef UTF8_OUTPUT_ENABLE
1594 output_bom_f = TRUE;
1598 output_bom_f = TRUE;
1601 output_endian = ENDIAN_LITTLE;
1602 output_bom_f = FALSE;
1605 output_endian = ENDIAN_LITTLE;
1606 output_bom_f = TRUE;
1609 output_bom_f = TRUE;
1612 output_endian = ENDIAN_LITTLE;
1613 output_bom_f = FALSE;
1616 output_endian = ENDIAN_LITTLE;
1617 output_bom_f = TRUE;
1623 static int option_mode = 0;
1625 void options(unsigned char *cp)
1629 unsigned char *cp_back = NULL;
1635 while(*cp && *cp++!='-');
1636 while (*cp || cp_back) {
1644 case '-': /* literal options */
1645 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1649 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1650 p = (unsigned char *)long_option[i].name;
1651 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1652 if (*p == cp[j] || cp[j] == SP){
1659 fprintf(stderr, "unknown long option: --%s\n", cp);
1662 while(*cp && *cp != SP && cp++);
1663 if (long_option[i].alias[0]){
1665 cp = (unsigned char *)long_option[i].alias;
1667 if (strcmp(long_option[i].name, "ic=") == 0){
1668 nkf_str_upcase((char *)p, codeset, 32);
1669 enc = nkf_enc_find(codeset);
1671 input_encoding = enc;
1674 if (strcmp(long_option[i].name, "oc=") == 0){
1675 nkf_str_upcase((char *)p, codeset, 32);
1676 enc = nkf_enc_find(codeset);
1677 if (enc <= 0) continue;
1678 output_encoding = enc;
1681 if (strcmp(long_option[i].name, "guess=") == 0){
1682 if (p[0] == '0' || p[0] == '1') {
1690 if (strcmp(long_option[i].name, "overwrite") == 0){
1693 preserve_time_f = TRUE;
1696 if (strcmp(long_option[i].name, "overwrite=") == 0){
1699 preserve_time_f = TRUE;
1701 backup_suffix = malloc(strlen((char *) p) + 1);
1702 strcpy(backup_suffix, (char *) p);
1705 if (strcmp(long_option[i].name, "in-place") == 0){
1708 preserve_time_f = FALSE;
1711 if (strcmp(long_option[i].name, "in-place=") == 0){
1714 preserve_time_f = FALSE;
1716 backup_suffix = malloc(strlen((char *) p) + 1);
1717 strcpy(backup_suffix, (char *) p);
1722 if (strcmp(long_option[i].name, "cap-input") == 0){
1726 if (strcmp(long_option[i].name, "url-input") == 0){
1731 #ifdef NUMCHAR_OPTION
1732 if (strcmp(long_option[i].name, "numchar-input") == 0){
1738 if (strcmp(long_option[i].name, "no-output") == 0){
1742 if (strcmp(long_option[i].name, "debug") == 0){
1747 if (strcmp(long_option[i].name, "cp932") == 0){
1748 #ifdef SHIFTJIS_CP932
1752 #ifdef UTF8_OUTPUT_ENABLE
1753 ms_ucs_map_f = UCS_MAP_CP932;
1757 if (strcmp(long_option[i].name, "no-cp932") == 0){
1758 #ifdef SHIFTJIS_CP932
1762 #ifdef UTF8_OUTPUT_ENABLE
1763 ms_ucs_map_f = UCS_MAP_ASCII;
1767 #ifdef SHIFTJIS_CP932
1768 if (strcmp(long_option[i].name, "cp932inv") == 0){
1775 if (strcmp(long_option[i].name, "x0212") == 0){
1782 if (strcmp(long_option[i].name, "exec-in") == 0){
1786 if (strcmp(long_option[i].name, "exec-out") == 0){
1791 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1792 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1793 no_cp932ext_f = TRUE;
1796 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1797 no_best_fit_chars_f = TRUE;
1800 if (strcmp(long_option[i].name, "fb-skip") == 0){
1801 encode_fallback = NULL;
1804 if (strcmp(long_option[i].name, "fb-html") == 0){
1805 encode_fallback = encode_fallback_html;
1808 if (strcmp(long_option[i].name, "fb-xml") == 0){
1809 encode_fallback = encode_fallback_xml;
1812 if (strcmp(long_option[i].name, "fb-java") == 0){
1813 encode_fallback = encode_fallback_java;
1816 if (strcmp(long_option[i].name, "fb-perl") == 0){
1817 encode_fallback = encode_fallback_perl;
1820 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1821 encode_fallback = encode_fallback_subchar;
1824 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1825 encode_fallback = encode_fallback_subchar;
1826 unicode_subchar = 0;
1828 /* decimal number */
1829 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1830 unicode_subchar *= 10;
1831 unicode_subchar += hex2bin(p[i]);
1833 }else if(p[1] == 'x' || p[1] == 'X'){
1834 /* hexadecimal number */
1835 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1836 unicode_subchar <<= 4;
1837 unicode_subchar |= hex2bin(p[i]);
1841 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1842 unicode_subchar *= 8;
1843 unicode_subchar += hex2bin(p[i]);
1846 w16e_conv(unicode_subchar, &i, &j);
1847 unicode_subchar = i<<8 | j;
1851 #ifdef UTF8_OUTPUT_ENABLE
1852 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1853 ms_ucs_map_f = UCS_MAP_MS;
1857 #ifdef UNICODE_NORMALIZATION
1858 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1863 if (strcmp(long_option[i].name, "prefix=") == 0){
1864 if (nkf_isgraph(p[0])){
1865 for (i = 1; nkf_isgraph(p[i]); i++){
1866 prefix_table[p[i]] = p[0];
1873 case 'b': /* buffered mode */
1876 case 'u': /* non bufferd mode */
1879 case 't': /* transparent mode */
1884 } else if (*cp=='2') {
1888 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1896 case 'j': /* JIS output */
1898 output_encoding = nkf_enc_from_index(ISO_2022_JP);
1900 case 'e': /* AT&T EUC output */
1901 output_encoding = nkf_enc_from_index(EUC_JP);
1903 case 's': /* SJIS output */
1904 output_encoding = nkf_enc_from_index(WINDOWS_31J);
1906 case 'l': /* ISO8859 Latin-1 support, no conversion */
1907 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1908 input_encoding = nkf_enc_from_index(ISO_8859_1);
1910 case 'i': /* Kanji IN ESC-$-@/B */
1911 if (*cp=='@'||*cp=='B')
1912 kanji_intro = *cp++;
1914 case 'o': /* ASCII IN ESC-(-J/B */
1915 if (*cp=='J'||*cp=='B'||*cp=='H')
1916 ascii_intro = *cp++;
1920 bit:1 katakana->hiragana
1921 bit:2 hiragana->katakana
1923 if ('9'>= *cp && *cp>='0')
1924 hira_f |= (*cp++ -'0');
1931 #if defined(MSDOS) || defined(__OS2__)
1938 show_configuration();
1946 #ifdef UTF8_OUTPUT_ENABLE
1947 case 'w': /* UTF-8 output */
1952 output_encoding = nkf_enc_from_index(UTF_8N);
1954 output_bom_f = TRUE;
1955 output_encoding = nkf_enc_from_index(UTF_8_BOM);
1959 if ('1'== cp[0] && '6'==cp[1]) {
1962 } else if ('3'== cp[0] && '2'==cp[1]) {
1966 output_encoding = nkf_enc_from_index(UTF_8);
1971 output_endian = ENDIAN_LITTLE;
1972 } else if (cp[0] == 'B') {
1975 output_encoding = nkf_enc_from_index(enc_idx);
1980 enc_idx = enc_idx == UTF_16
1981 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
1982 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
1984 output_bom_f = TRUE;
1985 enc_idx = enc_idx == UTF_16
1986 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
1987 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
1989 output_encoding = nkf_enc_from_index(enc_idx);
1993 #ifdef UTF8_INPUT_ENABLE
1994 case 'W': /* UTF input */
1997 input_encoding = nkf_enc_from_index(UTF_8);
2000 if ('1'== cp[0] && '6'==cp[1]) {
2002 input_endian = ENDIAN_BIG;
2004 } else if ('3'== cp[0] && '2'==cp[1]) {
2006 input_endian = ENDIAN_BIG;
2009 input_encoding = nkf_enc_from_index(UTF_8);
2014 input_endian = ENDIAN_LITTLE;
2015 } else if (cp[0] == 'B') {
2017 input_endian = ENDIAN_BIG;
2019 enc_idx = enc_idx == UTF_16
2020 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
2021 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
2022 input_encoding = nkf_enc_from_index(enc_idx);
2026 /* Input code assumption */
2027 case 'J': /* ISO-2022-JP input */
2028 input_encoding = nkf_enc_from_index(ISO_2022_JP);
2030 case 'E': /* EUC-JP input */
2031 input_encoding = nkf_enc_from_index(EUC_JP);
2033 case 'S': /* Windows-31J input */
2034 input_encoding = nkf_enc_from_index(WINDOWS_31J);
2036 case 'Z': /* Convert X0208 alphabet to asii */
2038 bit:0 Convert JIS X 0208 Alphabet to ASCII
2039 bit:1 Convert Kankaku to one space
2040 bit:2 Convert Kankaku to two spaces
2041 bit:3 Convert HTML Entity
2042 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
2044 while ('0'<= *cp && *cp <='9') {
2045 alpha_f |= 1 << (*cp++ - '0');
2047 if (!alpha_f) alpha_f = 1;
2049 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
2050 x0201_f = FALSE; /* No X0201->X0208 conversion */
2052 ESC-(-I in JIS, EUC, MS Kanji
2053 SI/SO in JIS, EUC, MS Kanji
2054 SSO in EUC, JIS, not in MS Kanji
2055 MS Kanji (0xa0-0xdf)
2057 ESC-(-I in JIS (0x20-0x5f)
2058 SSO in EUC (0xa0-0xdf)
2059 0xa0-0xd in MS Kanji (0xa0-0xdf)
2062 case 'X': /* Convert X0201 kana to X0208 */
2065 case 'F': /* prserve new lines */
2066 fold_preserve_f = TRUE;
2067 case 'f': /* folding -f60 or -f */
2070 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2072 fold_len += *cp++ - '0';
2074 if (!(0<fold_len && fold_len<BUFSIZ))
2075 fold_len = DEFAULT_FOLD;
2079 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2081 fold_margin += *cp++ - '0';
2085 case 'm': /* MIME support */
2086 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
2087 if (*cp=='B'||*cp=='Q') {
2088 mime_decode_mode = *cp++;
2089 mimebuf_f = FIXED_MIME;
2090 } else if (*cp=='N') {
2091 mime_f = TRUE; cp++;
2092 } else if (*cp=='S') {
2093 mime_f = STRICT_MIME; cp++;
2094 } else if (*cp=='0') {
2095 mime_decode_f = FALSE;
2096 mime_f = FALSE; cp++;
2098 mime_f = STRICT_MIME;
2101 case 'M': /* MIME output */
2104 mimeout_f = FIXED_MIME; cp++;
2105 } else if (*cp=='Q') {
2107 mimeout_f = FIXED_MIME; cp++;
2112 case 'B': /* Broken JIS support */
2114 bit:1 allow any x on ESC-(-x or ESC-$-x
2115 bit:2 reset to ascii on NL
2117 if ('9'>= *cp && *cp>='0')
2118 broken_f |= 1<<(*cp++ -'0');
2123 case 'O':/* for Output file */
2127 case 'c':/* add cr code */
2130 case 'd':/* delete cr code */
2133 case 'I': /* ISO-2022-JP output */
2136 case 'L': /* line mode */
2137 if (*cp=='u') { /* unix */
2138 nlmode_f = LF; cp++;
2139 } else if (*cp=='m') { /* mac */
2140 nlmode_f = CR; cp++;
2141 } else if (*cp=='w') { /* windows */
2142 nlmode_f = CRLF; cp++;
2143 } else if (*cp=='0') { /* no conversion */
2149 if ('2' <= *cp && *cp <= '9') {
2152 } else if (*cp == '0' || *cp == '1') {
2161 /* multiple options in a string are allowed for the Perl module */
2162 while(*cp && *cp++!='-');
2165 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
2166 /* bogus option but ignored */
2172 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2175 struct input_code *p = input_code_list;
2177 if (iconv_func == p->iconv_func){
2186 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2188 #ifdef INPUT_CODE_FIX
2189 if (f || !input_encoding)
2196 #ifdef INPUT_CODE_FIX
2197 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
2203 if (estab_f && iconv_for_check != iconv){
2204 struct input_code *p = find_inputcode_byfunc(iconv);
2206 set_input_codename(p->name);
2209 iconv_for_check = iconv;
/* Score bit flags, OR-ed into an input_code's score via set_code_score().
   Each flag is the previous one shifted left by one bit. */
2214 #define SCORE_L2 (1) /* JIS level-2 kanji */
2215 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2216 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2217 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped by CP932 (IBM extended characters) */
2218 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2219 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* character that does not exist */
2220 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2221 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score value (assigned in status_reset()). */
2223 #define SCORE_INIT (SCORE_iMIME)
2225 static const char score_table_A0[] = {
2228 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2229 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2232 static const char score_table_F0[] = {
2233 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2234 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2235 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2236 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Set the given SCORE_* bit(s) in ptr->score (bitwise OR). */
2239 void set_code_score(struct input_code *ptr, nkf_char score)
2242 ptr->score |= score;
/* Clear the given SCORE_* bit(s) from ptr->score (AND with the complement). */
2246 void clr_code_score(struct input_code *ptr, nkf_char score)
2249 ptr->score &= ~score;
2253 void code_score(struct input_code *ptr)
2255 nkf_char c2 = ptr->buf[0];
2256 #ifdef UTF8_OUTPUT_ENABLE
2257 nkf_char c1 = ptr->buf[1];
2260 set_code_score(ptr, SCORE_ERROR);
2261 }else if (c2 == SSO){
2262 set_code_score(ptr, SCORE_KANA);
2263 }else if (c2 == 0x8f){
2264 set_code_score(ptr, SCORE_X0212);
2265 #ifdef UTF8_OUTPUT_ENABLE
2266 }else if (!e2w_conv(c2, c1)){
2267 set_code_score(ptr, SCORE_NO_EXIST);
2269 }else if ((c2 & 0x70) == 0x20){
2270 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2271 }else if ((c2 & 0x70) == 0x70){
2272 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2273 }else if ((c2 & 0x70) >= 0x50){
2274 set_code_score(ptr, SCORE_L2);
2278 void status_disable(struct input_code *ptr)
2283 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Store c at ptr->buf[ptr->index], then advance ptr->index by one.
   The statement below performs no bounds check on the index. */
2286 void status_push_ch(struct input_code *ptr, nkf_char c)
2288 ptr->buf[ptr->index++] = c;
2291 void status_clear(struct input_code *ptr)
2297 void status_reset(struct input_code *ptr)
2300 ptr->score = SCORE_INIT;
2303 void status_reinit(struct input_code *ptr)
2306 ptr->_file_stat = 0;
2309 void status_check(struct input_code *ptr, nkf_char c)
2311 if (c <= DEL && estab_f){
2316 void s_status(struct input_code *ptr, nkf_char c)
2320 status_check(ptr, c);
2325 #ifdef NUMCHAR_OPTION
2326 }else if (is_unicode_capsule(c)){
2329 }else if (0xa1 <= c && c <= 0xdf){
2330 status_push_ch(ptr, SSO);
2331 status_push_ch(ptr, c);
2334 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2336 status_push_ch(ptr, c);
2337 }else if (0xed <= c && c <= 0xee){
2339 status_push_ch(ptr, c);
2340 #ifdef SHIFTJIS_CP932
2341 }else if (is_ibmext_in_sjis(c)){
2343 status_push_ch(ptr, c);
2344 #endif /* SHIFTJIS_CP932 */
2346 }else if (0xf0 <= c && c <= 0xfc){
2348 status_push_ch(ptr, c);
2349 #endif /* X0212_ENABLE */
2351 status_disable(ptr);
2355 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2356 status_push_ch(ptr, c);
2357 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2361 status_disable(ptr);
2365 #ifdef SHIFTJIS_CP932
2366 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2367 status_push_ch(ptr, c);
2368 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2369 set_code_score(ptr, SCORE_CP932);
2374 #endif /* SHIFTJIS_CP932 */
2375 status_disable(ptr);
2378 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2379 status_push_ch(ptr, c);
2380 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2381 set_code_score(ptr, SCORE_CP932);
2384 status_disable(ptr);
2390 void e_status(struct input_code *ptr, nkf_char c)
2394 status_check(ptr, c);
2399 #ifdef NUMCHAR_OPTION
2400 }else if (is_unicode_capsule(c)){
2403 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2405 status_push_ch(ptr, c);
2407 }else if (0x8f == c){
2409 status_push_ch(ptr, c);
2410 #endif /* X0212_ENABLE */
2412 status_disable(ptr);
2416 if (0xa1 <= c && c <= 0xfe){
2417 status_push_ch(ptr, c);
2421 status_disable(ptr);
2426 if (0xa1 <= c && c <= 0xfe){
2428 status_push_ch(ptr, c);
2430 status_disable(ptr);
2432 #endif /* X0212_ENABLE */
2436 #ifdef UTF8_INPUT_ENABLE
2437 void w_status(struct input_code *ptr, nkf_char c)
2441 status_check(ptr, c);
2446 #ifdef NUMCHAR_OPTION
2447 }else if (is_unicode_capsule(c)){
2450 }else if (0xc0 <= c && c <= 0xdf){
2452 status_push_ch(ptr, c);
2453 }else if (0xe0 <= c && c <= 0xef){
2455 status_push_ch(ptr, c);
2456 }else if (0xf0 <= c && c <= 0xf4){
2458 status_push_ch(ptr, c);
2460 status_disable(ptr);
2465 if (0x80 <= c && c <= 0xbf){
2466 status_push_ch(ptr, c);
2467 if (ptr->index > ptr->stat){
2468 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2469 && ptr->buf[2] == 0xbf);
2470 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2471 &ptr->buf[0], &ptr->buf[1]);
2478 status_disable(ptr);
2482 if (0x80 <= c && c <= 0xbf){
2483 if (ptr->index < ptr->stat){
2484 status_push_ch(ptr, c);
2489 status_disable(ptr);
2496 void code_status(nkf_char c)
2498 int action_flag = 1;
2499 struct input_code *result = 0;
2500 struct input_code *p = input_code_list;
2502 if (!p->status_func) {
2506 if (!p->status_func)
2508 (p->status_func)(p, c);
2511 }else if(p->stat == 0){
2522 if (result && !estab_f){
2523 set_iconv(TRUE, result->iconv_func);
2524 }else if (c <= DEL){
2525 struct input_code *ptr = input_code_list;
2535 nkf_char std_getc(FILE *f)
2538 return std_gc_buf[--std_gc_ndx];
2544 nkf_char std_ungetc(nkf_char c, FILE *f)
2546 if (std_gc_ndx == STD_GC_BUFSIZE){
2549 std_gc_buf[std_gc_ndx++] = c;
2554 void std_putc(nkf_char c)
2561 #if !defined(PERL_XS) && !defined(WIN32DLL)
2562 nkf_char noconvert(FILE *f)
2567 module_connection();
2568 while ((c = (*i_getc)(f)) != EOF)
2575 void module_connection(void)
2577 if (input_encoding) set_input_encoding(input_encoding);
2578 if (!output_encoding) {
2579 output_encoding = nkf_default_encoding();
2581 set_output_encoding(output_encoding);
2582 oconv = nkf_enc_to_oconv(output_encoding);
2585 /* replace continuation module, from output side */
2587 /* output redirection */
2589 if (noout_f || guess_f){
2596 if (mimeout_f == TRUE) {
2597 o_base64conv = oconv; oconv = base64_conv;
2599 /* base64_count = 0; */
2602 if (nlmode_f || guess_f) {
2603 o_nlconv = oconv; oconv = nl_conv;
2606 o_rot_conv = oconv; oconv = rot_conv;
2609 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2612 o_hira_conv = oconv; oconv = hira_conv;
2615 o_fconv = oconv; oconv = fold_conv;
2618 if (alpha_f || x0201_f) {
2619 o_zconv = oconv; oconv = z_conv;
2623 i_ungetc = std_ungetc;
2624 /* input redirection */
2627 i_cgetc = i_getc; i_getc = cap_getc;
2628 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2631 i_ugetc = i_getc; i_getc = url_getc;
2632 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2635 #ifdef NUMCHAR_OPTION
2637 i_ngetc = i_getc; i_getc = numchar_getc;
2638 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2641 #ifdef UNICODE_NORMALIZATION
2643 i_nfc_getc = i_getc; i_getc = nfc_getc;
2644 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2647 if (mime_f && mimebuf_f==FIXED_MIME) {
2648 i_mgetc = i_getc; i_getc = mime_getc;
2649 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2652 i_bgetc = i_getc; i_getc = broken_getc;
2653 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2655 if (input_encoding) {
2656 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
2658 set_iconv(FALSE, e_iconv);
2662 struct input_code *p = input_code_list;
2670 * Check and Ignore BOM
2672 void check_bom(FILE *f)
2675 switch(c2 = (*i_getc)(f)){
2677 if((c2 = (*i_getc)(f)) == 0x00){
2678 if((c2 = (*i_getc)(f)) == 0xFE){
2679 if((c2 = (*i_getc)(f)) == 0xFF){
2680 if(!input_encoding){
2681 set_iconv(TRUE, w_iconv32);
2683 if (iconv == w_iconv32) {
2684 input_endian = ENDIAN_BIG;
2687 (*i_ungetc)(0xFF,f);
2688 }else (*i_ungetc)(c2,f);
2689 (*i_ungetc)(0xFE,f);
2690 }else if(c2 == 0xFF){
2691 if((c2 = (*i_getc)(f)) == 0xFE){
2692 if(!input_encoding){
2693 set_iconv(TRUE, w_iconv32);
2695 if (iconv == w_iconv32) {
2696 input_endian = ENDIAN_2143;
2699 (*i_ungetc)(0xFF,f);
2700 }else (*i_ungetc)(c2,f);
2701 (*i_ungetc)(0xFF,f);
2702 }else (*i_ungetc)(c2,f);
2703 (*i_ungetc)(0x00,f);
2704 }else (*i_ungetc)(c2,f);
2705 (*i_ungetc)(0x00,f);
2708 if((c2 = (*i_getc)(f)) == 0xBB){
2709 if((c2 = (*i_getc)(f)) == 0xBF){
2710 if(!input_encoding){
2711 set_iconv(TRUE, w_iconv);
2713 if (iconv == w_iconv) {
2716 (*i_ungetc)(0xBF,f);
2717 }else (*i_ungetc)(c2,f);
2718 (*i_ungetc)(0xBB,f);
2719 }else (*i_ungetc)(c2,f);
2720 (*i_ungetc)(0xEF,f);
2723 if((c2 = (*i_getc)(f)) == 0xFF){
2724 if((c2 = (*i_getc)(f)) == 0x00){
2725 if((c2 = (*i_getc)(f)) == 0x00){
2726 if(!input_encoding){
2727 set_iconv(TRUE, w_iconv32);
2729 if (iconv == w_iconv32) {
2730 input_endian = ENDIAN_3412;
2733 (*i_ungetc)(0x00,f);
2734 }else (*i_ungetc)(c2,f);
2735 (*i_ungetc)(0x00,f);
2736 }else (*i_ungetc)(c2,f);
2737 if(!input_encoding){
2738 set_iconv(TRUE, w_iconv16);
2740 if (iconv == w_iconv16) {
2741 input_endian = ENDIAN_BIG;
2744 (*i_ungetc)(0xFF,f);
2745 }else (*i_ungetc)(c2,f);
2746 (*i_ungetc)(0xFE,f);
2749 if((c2 = (*i_getc)(f)) == 0xFE){
2750 if((c2 = (*i_getc)(f)) == 0x00){
2751 if((c2 = (*i_getc)(f)) == 0x00){
2752 if(!input_encoding){
2753 set_iconv(TRUE, w_iconv32);
2755 if (iconv == w_iconv32) {
2756 input_endian = ENDIAN_LITTLE;
2759 (*i_ungetc)(0x00,f);
2760 }else (*i_ungetc)(c2,f);
2761 (*i_ungetc)(0x00,f);
2762 }else (*i_ungetc)(c2,f);
2763 if(!input_encoding){
2764 set_iconv(TRUE, w_iconv16);
2766 if (iconv == w_iconv16) {
2767 input_endian = ENDIAN_LITTLE;
2770 (*i_ungetc)(0xFE,f);
2771 }else (*i_ungetc)(c2,f);
2772 (*i_ungetc)(0xFF,f);
2781 Conversion main loop. Code detection only.
2784 nkf_char kanji_convert(FILE *f)
2786 nkf_char c3, c2=0, c1, c0=0;
2787 int is_8bit = FALSE;
2789 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
2794 output_mode = ASCII;
2797 #define NEXT continue /* no output, get next */
2798 #define SEND ; /* output c1 and c2, get next */
2799 #define LAST break /* end of loop, go closing */
2801 module_connection();
2804 while ((c1 = (*i_getc)(f)) != EOF) {
2805 #ifdef INPUT_CODE_FIX
2806 if (!input_encoding)
2811 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
2812 /* in case of 8th bit is on */
2813 if (!estab_f&&!mime_decode_mode) {
2814 /* in case of not established yet */
2815 /* It is still ambiguous */
2816 if (h_conv(f, c2, c1)==EOF)
2822 /* in case of already established */
2824 /* ignore bogus code and not CP5022x UCD */
2832 /* second byte, 7 bit code */
2833 /* it might be kanji shifted */
2834 if ((c1 == DEL) || (c1 <= SP)) {
2835 /* ignore bogus first code */
2842 #ifdef UTF8_INPUT_ENABLE
2843 if (iconv == w_iconv16) {
2844 if (input_endian == ENDIAN_BIG) {
2846 if ((c1 = (*i_getc)(f)) != EOF) {
2847 if (0xD8 <= c2 && c2 <= 0xDB) {
2848 if ((c0 = (*i_getc)(f)) != EOF) {
2850 if ((c3 = (*i_getc)(f)) != EOF) {
2857 if ((c2 = (*i_getc)(f)) != EOF) {
2858 if (0xD8 <= c2 && c2 <= 0xDB) {
2859 if ((c3 = (*i_getc)(f)) != EOF) {
2860 if ((c0 = (*i_getc)(f)) != EOF) {
2869 } else if(iconv == w_iconv32){
2871 if((c2 = (*i_getc)(f)) != EOF &&
2872 (c1 = (*i_getc)(f)) != EOF &&
2873 (c0 = (*i_getc)(f)) != EOF){
2874 switch(input_endian){
2876 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2879 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2882 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2885 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2895 #ifdef NUMCHAR_OPTION
2896 if (is_unicode_capsule(c1)){
2900 if (c1 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
2902 if (!estab_f && !iso8859_f) {
2903 /* not established yet */
2906 } else { /* estab_f==TRUE */
2911 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2912 /* SJIS X0201 Case... */
2913 if (iso2022jp_f && !x0201_f) {
2914 (*oconv)(GETA1, GETA2);
2921 } else if (c1==SSO && iconv != s_iconv) {
2922 /* EUC X0201 Case */
2923 c1 = (*i_getc)(f); /* skip SSO */
2925 if (SSP<=c1 && c1<0xe0) {
2926 if (iso2022jp_f && !x0201_f) {
2927 (*oconv)(GETA1, GETA2);
2934 } else { /* bogus code, skip SSO and one byte */
2937 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2938 (c1 == 0xFD || c1 == 0xFE)) {
2944 /* already established */
2949 } else if ((c1 > SP) && (c1 != DEL)) {
2950 /* in case of Roman characters */
2952 /* output 1 shifted byte */
2956 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2957 /* output 1 shifted byte */
2958 if (iso2022jp_f && !x0201_f) {
2959 (*oconv)(GETA1, GETA2);
2966 /* look like bogus code */
2969 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
2970 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
2971 /* in case of Kanji shifted */
2974 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2975 /* Check MIME code */
2976 if ((c1 = (*i_getc)(f)) == EOF) {
2979 } else if (c1 == '?') {
2980 /* =? is mime conversion start sequence */
2981 if(mime_f == STRICT_MIME) {
2982 /* check in real detail */
2983 if (mime_begin_strict(f) == EOF)
2987 } else if (mime_begin(f) == EOF)
2997 /* normal ASCII code */
3000 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
3003 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
3006 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
3007 if ((c1 = (*i_getc)(f)) == EOF) {
3008 /* (*oconv)(0, ESC); don't send bogus code */
3010 } else if (c1 == '$') {
3011 if ((c1 = (*i_getc)(f)) == EOF) {
3013 (*oconv)(0, ESC); don't send bogus code
3014 (*oconv)(0, '$'); */
3016 } else if (c1 == '@'|| c1 == 'B') {
3017 /* This is kanji introduction */
3018 input_mode = JIS_X_0208;
3020 set_input_codename("ISO-2022-JP");
3022 debug("ISO-2022-JP");
3025 } else if (c1 == '(') {
3026 if ((c1 = (*i_getc)(f)) == EOF) {
3027 /* don't send bogus code
3033 } else if (c1 == '@'|| c1 == 'B') {
3034 /* This is kanji introduction */
3035 input_mode = JIS_X_0208;
3039 } else if (c1 == 'D'){
3040 input_mode = JIS_X_0212;
3043 #endif /* X0212_ENABLE */
3044 } else if (c1 == 0x4F){
3045 input_mode = JIS_X_0213_1;
3048 } else if (c1 == 0x50){
3049 input_mode = JIS_X_0213_2;
3053 /* could be some special code */
3060 } else if (broken_f&0x2) {
3061 /* accept any ESC-(-x as broken code ... */
3062 input_mode = JIS_X_0208;
3071 } else if (c1 == '(') {
3072 if ((c1 = (*i_getc)(f)) == EOF) {
3073 /* don't send bogus code
3075 (*oconv)(0, '('); */
3079 /* This is X0201 kana introduction */
3080 input_mode = JIS_X_0201; shift_mode = JIS_X_0201;
3082 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
3083 /* This is ASCII / JIS X 0201 Roman introduction (back to single-byte mode) */
3084 input_mode = ASCII; shift_mode = FALSE;
3086 } else if (broken_f&0x2) {
3087 input_mode = ASCII; shift_mode = FALSE;
3092 /* maintain various input_mode here */
3096 } else if ( c1 == 'N' || c1 == 'n'){
3098 c3 = (*i_getc)(f); /* skip SS2 */
3099 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
3114 } else if (c1 == ESC && iconv == s_iconv) {
3115 /* ESC in Shift_JIS */
3116 if ((c1 = (*i_getc)(f)) == EOF) {
3117 /* (*oconv)(0, ESC); don't send bogus code */
3119 } else if (c1 == '$') {
3121 if ((c1 = (*i_getc)(f)) == EOF) {
3123 (*oconv)(0, ESC); don't send bogus code
3124 (*oconv)(0, '$'); */
3127 if (('E' <= c1 && c1 <= 'G') ||
3128 ('O' <= c1 && c1 <= 'Q')) {
3136 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
3137 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
3138 while ((c1 = (*i_getc)(f)) != EOF) {
3139 if (SP <= c1 && c1 <= 'z') {
3140 (*oconv)(0, c1 + c0);
3141 } else break; /* c1 == SO */
3145 if (c1 == EOF) LAST;
3152 } else if (c1 == LF || c1 == CR) {
3154 input_mode = ASCII; set_iconv(FALSE, 0);
3156 } else if (mime_decode_f && !mime_decode_mode){
3158 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
3166 } else { /* if (c1 == CR)*/
3167 if ((c1=(*i_getc)(f))!=EOF) {
3171 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
3185 } else if (c1 == DEL && input_mode == JIS_X_0208) {
3195 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
3198 if ((c0 = (*i_getc)(f)) != EOF) {
3201 if ((c3 = (*i_getc)(f)) != EOF) {
3203 (*iconv)(c2, c1, c0|c3);
3208 /* 3 bytes EUC or UTF-8 */
3209 if ((c0 = (*i_getc)(f)) != EOF) {
3211 (*iconv)(c2, c1, c0);
3219 0x7F <= c2 && c2 <= 0x92 &&
3220 0x21 <= c1 && c1 <= 0x7E) {
3222 if(c1 == 0x7F) return 0;
3223 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
3226 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3230 (*oconv)(PREFIX_EUCG3 | c2, c1);
3232 #endif /* X0212_ENABLE */
3234 (*oconv)(PREFIX_EUCG3 | c2, c1);
3237 (*oconv)(input_mode, c1); /* other special case */
3243 /* goto next_word */
3247 (*iconv)(EOF, 0, 0);
3248 if (!input_codename)
3251 struct input_code *p = input_code_list;
3252 struct input_code *result = p;
3254 if (p->score < result->score) result = p;
3257 set_input_codename(result->name);
3259 debug(result->name);
3267 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3269 nkf_char ret, c3, c0;
3273 /** it must NOT be in the kanji shift sequence */
3274 /** it must NOT be written in JIS7 */
3275 /** and it must be after 2 byte 8bit code */
3281 while ((c1 = (*i_getc)(f)) != EOF) {
3287 if (push_hold_buf(c1) == EOF || estab_f){
3293 struct input_code *p = input_code_list;
3294 struct input_code *result = p;
3299 if (p->status_func && p->score < result->score){
3304 set_iconv(TRUE, result->iconv_func);
3309 ** 1) EOF is detected, or
3310 ** 2) Code is established, or
3311 ** 3) Buffer is FULL (but last word is pushed)
3313 ** in 1) and 3) cases, we continue to use
3314 ** Kanji codes by oconv and leave estab_f unchanged.
3319 while (hold_index < hold_count){
3320 c2 = hold_buf[hold_index++];
3322 #ifdef NUMCHAR_OPTION
3323 || is_unicode_capsule(c2)
3328 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3329 (*iconv)(JIS_X_0201, c2, 0);
3332 if (hold_index < hold_count){
3333 c1 = hold_buf[hold_index++];
3343 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3346 if (hold_index < hold_count){
3347 c0 = hold_buf[hold_index++];
3348 } else if ((c0 = (*i_getc)(f)) == EOF) {
3354 if (hold_index < hold_count){
3355 c3 = hold_buf[hold_index++];
3356 } else if ((c3 = (*i_getc)(f)) == EOF) {
3361 (*iconv)(c2, c1, c0|c3);
3366 /* 3 bytes EUC or UTF-8 */
3367 if (hold_index < hold_count){
3368 c0 = hold_buf[hold_index++];
3369 } else if ((c0 = (*i_getc)(f)) == EOF) {
3375 (*iconv)(c2, c1, c0);
3378 if (c0 == EOF) break;
3383 nkf_char push_hold_buf(nkf_char c2)
3385 if (hold_count >= HOLD_SIZE*2)
3387 hold_buf[hold_count++] = (unsigned char)c2;
3388 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3391 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3393 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3396 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3397 #ifdef SHIFTJIS_CP932
3398 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3399 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3406 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3407 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3413 #endif /* SHIFTJIS_CP932 */
3415 if (!x0213_f && is_ibmext_in_sjis(c2)){
3416 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3419 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3432 if(x0213_f && c2 >= 0xF0){
3433 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3434 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3435 }else{ /* 78<=k<=94 */
3436 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3437 if (0x9E < c1) c2++;
3440 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3441 if (0x9E < c1) c2++;
3444 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3451 c2 = x0212_unshift(c2);
3458 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3460 if (c2 == JIS_X_0201) {
3462 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3464 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3466 if(c1 == 0x7F) return 0;
3467 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3470 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3471 if (ret) return ret;
3477 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3479 if (c2 == JIS_X_0201) {
3482 }else if (c2 == 0x8f){
3486 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3487 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3488 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3491 c2 = (c2 << 8) | (c1 & 0x7f);
3493 #ifdef SHIFTJIS_CP932
3496 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3497 s2e_conv(s2, s1, &c2, &c1);
3504 #endif /* SHIFTJIS_CP932 */
3506 #endif /* X0212_ENABLE */
3507 } else if (c2 == SSO){
3510 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3513 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3514 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3515 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3520 #ifdef SHIFTJIS_CP932
3521 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3523 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3524 s2e_conv(s2, s1, &c2, &c1);
3531 #endif /* SHIFTJIS_CP932 */
3538 #ifdef UTF8_INPUT_ENABLE
3539 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3546 }else if (0xc0 <= c2 && c2 <= 0xef) {
3547 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3548 #ifdef NUMCHAR_OPTION
3551 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3559 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3562 static const char w_iconv_utf8_1st_byte[] =
3564 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3565 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3566 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3567 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3569 if (c2 < 0 || 0xff < c2) {
3570 }else if (c2 == 0) { /* 0 : 1 byte*/
3572 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3575 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3577 if (c1 < 0x80 || 0xBF < c1) return 0;
3580 if (c0 == 0) return -1;
3581 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3586 if (c0 == 0) return -1;
3587 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3591 if (c0 == 0) return -1;
3592 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3596 if (c0 == 0) return -2;
3597 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3601 if (c0 == 0) return -2;
3602 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3606 if (c0 == 0) return -2;
3607 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3615 if (c2 == 0 || c2 == EOF){
3616 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3617 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3620 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3629 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3630 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3637 }else if (val < 0x800){
3638 *p2 = 0xc0 | (val >> 6);
3639 *p1 = 0x80 | (val & 0x3f);
3641 } else if (val <= NKF_INT32_C(0xFFFF)) {
3642 *p2 = 0xe0 | (val >> 12);
3643 *p1 = 0x80 | ((val >> 6) & 0x3f);
3644 *p0 = 0x80 | (val & 0x3f);
3645 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3646 *p2 = 0xf0 | (val >> 18); /* 4-byte lead byte is 11110xxx and carries bits 18-20 (was 0xe0 | val>>16, a 3-byte lead) */
3647 *p1 = 0x80 | ((val >> 12) & 0x3f); /* 2nd byte: bits 12-17 */
3648 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f); /* 3rd and 4th bytes packed as (3rd << 8) | 4th */
3657 #ifdef UTF8_INPUT_ENABLE
3658 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3663 } else if (c2 >= 0xf0){
3664 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3665 val = (c2 & 0x0f) << 18;
3666 val |= (c1 & 0x3f) << 12;
3667 val |= (c0 & 0x3f00) >> 2;
3669 }else if (c2 >= 0xe0){
3670 val = (c2 & 0x0f) << 12;
3671 val |= (c1 & 0x3f) << 6;
3673 }else if (c2 >= 0xc0){
3674 val = (c2 & 0x1f) << 6;
3682 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3684 nkf_char c2, c1, c0;
3691 w16w_conv(val, &c2, &c1, &c0);
3692 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3693 #ifdef NUMCHAR_OPTION
3696 *p1 = CLASS_UNICODE | val;
3705 #ifdef UTF8_INPUT_ENABLE
3706 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3709 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3712 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3713 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3715 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3717 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3722 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3723 if (ret) return ret;
3728 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3732 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3733 } else if (is_unicode_bmp(c1)) {
3734 ret = w16e_conv(c1, &c2, &c1);
3737 c1 = CLASS_UNICODE | c1;
3739 if (ret) return ret;
3744 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3746 const unsigned short *const *pp;
3747 const unsigned short *const *const *ppp;
3748 static const char no_best_fit_chars_table_C2[] =
3749 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3750 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3751 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3752 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3753 static const char no_best_fit_chars_table_C2_ms[] =
3754 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3755 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3756 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3757 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3758 static const char no_best_fit_chars_table_932_C2[] =
3759 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3760 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3761 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3762 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3763 static const char no_best_fit_chars_table_932_C3[] =
3764 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3765 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3766 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3767 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3773 }else if(c2 < 0xe0){
3774 if(no_best_fit_chars_f){
3775 if(ms_ucs_map_f == UCS_MAP_CP932){
3778 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3781 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3784 }else if(!cp932inv_f){
3787 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3790 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3793 }else if(ms_ucs_map_f == UCS_MAP_MS){
3794 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3795 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3813 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3814 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3815 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3817 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3818 }else if(c0 < 0xF0){
3819 if(no_best_fit_chars_f){
3820 if(ms_ucs_map_f == UCS_MAP_CP932){
3821 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3822 }else if(ms_ucs_map_f == UCS_MAP_MS){
3827 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3830 if(c0 == 0x92) return 1;
3835 if(c1 == 0x80 || c0 == 0x9C) return 1;
3838 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3843 if(c0 == 0x94) return 1;
3846 if(c0 == 0xBB) return 1;
3856 if(c0 == 0x95) return 1;
3859 if(c0 == 0xA5) return 1;
3866 if(c0 == 0x8D) return 1;
3869 if(c0 == 0x9E && !cp932inv_f) return 1;
3872 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3880 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3881 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3882 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3884 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3886 #ifdef SHIFTJIS_CP932
3887 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3889 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3890 s2e_conv(s2, s1, p2, p1);
3899 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3902 const unsigned short *p;
3905 if (pp == 0) return 1;
3908 if (c1 < 0 || psize <= c1) return 1;
3910 if (p == 0) return 1;
3913 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3915 if (val == 0) return 1;
3916 if (no_cp932ext_f && (
3917 (val>>8) == 0x2D || /* NEC special characters */
3918 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3926 if (c2 == SO) c2 = JIS_X_0201;
3933 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3940 (*f)(0, bin2hex(c>>shift));
3950 void encode_fallback_html(nkf_char c)
3955 if(c >= NKF_INT32_C(1000000))
3956 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3957 if(c >= NKF_INT32_C(100000))
3958 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3960 (*oconv)(0, 0x30+(c/10000 )%10);
3962 (*oconv)(0, 0x30+(c/1000 )%10);
3964 (*oconv)(0, 0x30+(c/100 )%10);
3966 (*oconv)(0, 0x30+(c/10 )%10);
3968 (*oconv)(0, 0x30+ c %10);
3973 void encode_fallback_xml(nkf_char c)
3978 nkf_each_char_to_hex(oconv, c);
3983 void encode_fallback_java(nkf_char c)
3987 if(!is_unicode_bmp(c)){
3991 (*oconv)(0, bin2hex(c>>20));
3992 (*oconv)(0, bin2hex(c>>16));
3996 (*oconv)(0, bin2hex(c>>12));
3997 (*oconv)(0, bin2hex(c>> 8));
3998 (*oconv)(0, bin2hex(c>> 4));
3999 (*oconv)(0, bin2hex(c ));
4003 void encode_fallback_perl(nkf_char c)
4008 nkf_each_char_to_hex(oconv, c);
4013 void encode_fallback_subchar(nkf_char c)
4015 c = unicode_subchar;
4016 (*oconv)((c>>8)&0xFF, c&0xFF);
4021 #ifdef UTF8_OUTPUT_ENABLE
4022 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
4024 const unsigned short *p;
4026 if (c2 == JIS_X_0201) {
4027 if (ms_ucs_map_f == UCS_MAP_CP10001) {
4035 p = euc_to_utf8_1byte;
4037 } else if (is_eucg3(c2)){
4038 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
4041 c2 = (c2&0x7f) - 0x21;
4042 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4043 p = x0212_to_utf8_2bytes[c2];
4049 c2 = (c2&0x7f) - 0x21;
4050 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4052 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
4053 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
4054 euc_to_utf8_2bytes_ms[c2];
4059 c1 = (c1 & 0x7f) - 0x21;
4060 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
4065 void w_oconv(nkf_char c2, nkf_char c1)
4071 output_bom_f = FALSE;
4082 #ifdef NUMCHAR_OPTION
4083 if (c2 == 0 && is_unicode_capsule(c1)){
4084 val = c1 & VALUE_MASK;
4087 }else if (val < 0x800){
4088 (*o_putc)(0xC0 | (val >> 6));
4089 (*o_putc)(0x80 | (val & 0x3f));
4090 } else if (val <= NKF_INT32_C(0xFFFF)) {
4091 (*o_putc)(0xE0 | (val >> 12));
4092 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
4093 (*o_putc)(0x80 | (val & 0x3f));
4094 } else if (val <= NKF_INT32_C(0x10FFFF)) {
4095 (*o_putc)(0xF0 | ( val>>18));
4096 (*o_putc)(0x80 | ((val>>12) & 0x3f));
4097 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
4098 (*o_putc)(0x80 | ( val & 0x3f));
4105 output_mode = ASCII;
4107 } else if (c2 == ISO_8859_1) {
4108 output_mode = UTF_8;
4109 (*o_putc)(c1 | 0x080);
4111 output_mode = UTF_8;
4112 val = e2w_conv(c2, c1);
4114 w16w_conv(val, &c2, &c1, &c0);
4118 if (c0) (*o_putc)(c0);
4124 void w_oconv16(nkf_char c2, nkf_char c1)
4127 output_bom_f = FALSE;
4128 if (output_endian == ENDIAN_LITTLE){
4129 (*o_putc)((unsigned char)'\377');
4133 (*o_putc)((unsigned char)'\377');
4142 if (c2 == ISO_8859_1) {
4145 #ifdef NUMCHAR_OPTION
4146 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4147 if (is_unicode_bmp(c1)) {
4148 c2 = (c1 >> 8) & 0xff;
4152 if (c1 <= UNICODE_MAX) {
4153 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
4154 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
4155 if (output_endian == ENDIAN_LITTLE){
4156 (*o_putc)(c2 & 0xff);
4157 (*o_putc)((c2 >> 8) & 0xff);
4158 (*o_putc)(c1 & 0xff);
4159 (*o_putc)((c1 >> 8) & 0xff);
4161 (*o_putc)((c2 >> 8) & 0xff);
4162 (*o_putc)(c2 & 0xff);
4163 (*o_putc)((c1 >> 8) & 0xff);
4164 (*o_putc)(c1 & 0xff);
4171 nkf_char val = e2w_conv(c2, c1);
4172 c2 = (val >> 8) & 0xff;
4176 if (output_endian == ENDIAN_LITTLE){
4185 void w_oconv32(nkf_char c2, nkf_char c1)
4188 output_bom_f = FALSE;
4189 if (output_endian == ENDIAN_LITTLE){
4190 (*o_putc)((unsigned char)'\377');
4198 (*o_putc)((unsigned char)'\377');
4207 if (c2 == ISO_8859_1) {
4209 #ifdef NUMCHAR_OPTION
4210 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4214 c1 = e2w_conv(c2, c1);
4217 if (output_endian == ENDIAN_LITTLE){
4218 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4219 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4220 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4224 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4225 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4226 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4231 void e_oconv(nkf_char c2, nkf_char c1)
4233 #ifdef NUMCHAR_OPTION
4234 if (c2 == 0 && is_unicode_capsule(c1)){
4235 w16e_conv(c1, &c2, &c1);
4236 if (c2 == 0 && is_unicode_capsule(c1)){
4237 c2 = c1 & VALUE_MASK;
4238 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4242 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4243 c1 = 0x21 + c1 % 94;
4246 (*o_putc)((c2 & 0x7f) | 0x080);
4247 (*o_putc)(c1 | 0x080);
4249 (*o_putc)((c2 & 0x7f) | 0x080);
4250 (*o_putc)(c1 | 0x080);
4254 if (encode_fallback) (*encode_fallback)(c1);
4263 } else if (c2 == 0) {
4264 output_mode = ASCII;
4266 } else if (c2 == JIS_X_0201) {
4267 output_mode = EUC_JP;
4268 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4269 } else if (c2 == ISO_8859_1) {
4270 output_mode = ISO_8859_1;
4271 (*o_putc)(c1 | 0x080);
4273 } else if (is_eucg3(c2)){
4274 output_mode = EUC_JP;
4275 #ifdef SHIFTJIS_CP932
4278 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4279 s2e_conv(s2, s1, &c2, &c1);
4284 output_mode = ASCII;
4286 }else if (is_eucg3(c2)){
4289 (*o_putc)((c2 & 0x7f) | 0x080);
4290 (*o_putc)(c1 | 0x080);
4293 (*o_putc)((c2 & 0x7f) | 0x080);
4294 (*o_putc)(c1 | 0x080);
4298 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4299 set_iconv(FALSE, 0);
4300 return; /* too late to rescue this char */
4302 output_mode = EUC_JP;
4303 (*o_putc)(c2 | 0x080);
4304 (*o_putc)(c1 | 0x080);
4309 nkf_char x0212_shift(nkf_char c)
4314 if (0x75 <= c && c <= 0x7f){
4315 ret = c + (0x109 - 0x75);
4318 if (0x75 <= c && c <= 0x7f){
4319 ret = c + (0x113 - 0x75);
4326 nkf_char x0212_unshift(nkf_char c)
4329 if (0x7f <= c && c <= 0x88){
4330 ret = c + (0x75 - 0x7f);
4331 }else if (0x89 <= c && c <= 0x92){
4332 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4336 #endif /* X0212_ENABLE */
4338 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4344 if((0x21 <= ndx && ndx <= 0x2F)){
4345 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4346 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4348 }else if(0x6E <= ndx && ndx <= 0x7E){
4349 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4350 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4356 else if(nkf_isgraph(ndx)){
4358 const unsigned short *ptr;
4359 ptr = x0212_shiftjis[ndx - 0x21];
4361 val = ptr[(c1 & 0x7f) - 0x21];
4370 c2 = x0212_shift(c2);
4372 #endif /* X0212_ENABLE */
4374 if(0x7F < c2) return 1;
4375 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4376 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4380 void s_oconv(nkf_char c2, nkf_char c1)
4382 #ifdef NUMCHAR_OPTION
4383 if (c2 == 0 && is_unicode_capsule(c1)){
4384 w16e_conv(c1, &c2, &c1);
4385 if (c2 == 0 && is_unicode_capsule(c1)){
4386 c2 = c1 & VALUE_MASK;
4387 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4390 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
4392 c1 += 0x40 + (c1 > 0x3e);
4397 if(encode_fallback)(*encode_fallback)(c1);
4406 } else if (c2 == 0) {
4407 output_mode = ASCII;
4409 } else if (c2 == JIS_X_0201) {
4410 output_mode = SHIFT_JIS;
4412 } else if (c2 == ISO_8859_1) {
4413 output_mode = ISO_8859_1;
4414 (*o_putc)(c1 | 0x080);
4416 } else if (is_eucg3(c2)){
4417 output_mode = SHIFT_JIS;
4418 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4424 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4425 set_iconv(FALSE, 0);
4426 return; /* too late to rescue this char */
4428 output_mode = SHIFT_JIS;
4429 e2s_conv(c2, c1, &c2, &c1);
4431 #ifdef SHIFTJIS_CP932
4433 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4434 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4440 #endif /* SHIFTJIS_CP932 */
4443 if (prefix_table[(unsigned char)c1]){
4444 (*o_putc)(prefix_table[(unsigned char)c1]);
4450 void j_oconv(nkf_char c2, nkf_char c1)
4452 #ifdef NUMCHAR_OPTION
4453 if (c2 == 0 && is_unicode_capsule(c1)){
4454 w16e_conv(c1, &c2, &c1);
4455 if (c2 == 0 && is_unicode_capsule(c1)){
4456 c2 = c1 & VALUE_MASK;
4457 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4460 c2 = 0x7F + c1 / 94;
4461 c1 = 0x21 + c1 % 94;
4463 if (encode_fallback) (*encode_fallback)(c1);
4470 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4473 (*o_putc)(ascii_intro);
4474 output_mode = ASCII;
4478 } else if (is_eucg3(c2)){
4480 if(output_mode!=JIS_X_0213_2){
4481 output_mode = JIS_X_0213_2;
4488 if(output_mode!=JIS_X_0212){
4489 output_mode = JIS_X_0212;
4496 (*o_putc)(c2 & 0x7f);
4499 } else if (c2==JIS_X_0201) {
4500 if (output_mode!=JIS_X_0201) {
4501 output_mode = JIS_X_0201;
4507 } else if (c2==ISO_8859_1) {
4508 /* iso8859 introduction, or 8th bit on */
4509 /* Can we convert in 7bit form using ESC-'-'-A ?
4511 output_mode = ISO_8859_1;
4513 } else if (c2 == 0) {
4514 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4517 (*o_putc)(ascii_intro);
4518 output_mode = ASCII;
4523 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4524 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4526 if (output_mode!=JIS_X_0213_1) {
4527 output_mode = JIS_X_0213_1;
4533 }else if (output_mode != JIS_X_0208) {
4534 output_mode = JIS_X_0208;
4537 (*o_putc)(kanji_intro);
4544 void base64_conv(nkf_char c2, nkf_char c1)
4546 mime_prechar(c2, c1);
4547 (*o_base64conv)(c2,c1);
4551 static nkf_char broken_buf[3];
4552 static int broken_counter = 0;
4553 static int broken_last = 0;
4554 nkf_char broken_getc(FILE *f)
4558 if (broken_counter>0) {
4559 return broken_buf[--broken_counter];
4562 if (c=='$' && broken_last != ESC
4563 && (input_mode==ASCII || input_mode==JIS_X_0201)) {
4566 if (c1=='@'|| c1=='B') {
4567 broken_buf[0]=c1; broken_buf[1]=c;
4574 } else if (c=='(' && broken_last != ESC
4575 && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */
4578 if (c1=='J'|| c1=='B') {
4579 broken_buf[0]=c1; broken_buf[1]=c;
4592 nkf_char broken_ungetc(nkf_char c, FILE *f)
4594 if (broken_counter<2)
4595 broken_buf[broken_counter++]=c;
4599 void nl_conv(nkf_char c2, nkf_char c1)
4601 if (guess_f && input_newline != EOF) {
4602 if (c2 == 0 && c1 == LF) {
4603 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4604 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4605 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4607 else if (!input_newline) input_newline = CR;
4608 else if (input_newline != CR) input_newline = EOF;
4610 if (prev_cr || (c2 == 0 && c1 == LF)) {
4612 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4613 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4615 if (c2 == 0 && c1 == CR) prev_cr = CR;
4616 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4620 Return value of fold_conv()
4622 LF add newline and output char
4623 CR add newline and output nothing
4626 1 (or else) normal output
4628 fold state in prev (previous character)
4630 >0x80 Japanese (X0208/X0201)
4635 This fold algorithm does not preserve leading space in a line.
4636 This is the main difference from fmt.
4639 #define char_size(c2,c1) (c2?2:1)
4641 void fold_conv(nkf_char c2, nkf_char c1)
4644 nkf_char fold_state;
4646 if (c1== CR && !fold_preserve_f) {
4647 fold_state=0; /* ignore cr */
4648 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4650 fold_state=0; /* ignore cr */
4651 } else if (c1== BS) {
4652 if (f_line>0) f_line--;
4654 } else if (c2==EOF && f_line != 0) { /* close open last line */
4656 } else if ((c1==LF && !fold_preserve_f)
4657 || ((c1==CR||(c1==LF&&f_prev!=CR))
4658 && fold_preserve_f)) {
4660 if (fold_preserve_f) {
4664 } else if ((f_prev == c1 && !fold_preserve_f)
4665 || (f_prev == LF && fold_preserve_f)
4666 ) { /* duplicate newline */
4669 fold_state = LF; /* output two newline */
4675 if (f_prev&0x80) { /* Japanese? */
4677 fold_state = 0; /* ignore given single newline */
4678 } else if (f_prev==SP) {
4682 if (++f_line<=fold_len)
4686 fold_state = CR; /* fold and output nothing */
4690 } else if (c1=='\f') {
4693 fold_state = LF; /* output newline and clear */
4694 } else if ( (c2==0 && c1==SP)||
4695 (c2==0 && c1==TAB)||
4696 (c2=='!'&& c1=='!')) {
4697 /* X0208 kankaku or ascii space */
4699 fold_state = 0; /* remove duplicate spaces */
4702 if (++f_line<=fold_len)
4703 fold_state = SP; /* output ASCII space only */
4705 f_prev = SP; f_line = 0;
4706 fold_state = CR; /* fold and output nothing */
4710 prev0 = f_prev; /* we still need this one... , but almost done */
4712 if (c2 || c2==JIS_X_0201)
4713 f_prev |= 0x80; /* this is Japanese */
4714 f_line += char_size(c2,c1);
4715 if (f_line<=fold_len) { /* normal case */
4718 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4719 f_line = char_size(c2,c1);
4720 fold_state = LF; /* We can't wait, do fold now */
4721 } else if (c2==JIS_X_0201) {
4722 /* simple kinsoku rules return 1 means no folding */
4723 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4724 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4725 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4726 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4727 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4728 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4729 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4731 fold_state = LF;/* add one new f_line before this character */
4734 fold_state = LF;/* add one new f_line before this character */
4737 /* kinsoku point in ASCII */
4738 if ( c1==')'|| /* { [ ( */
4749 /* just after special */
4750 } else if (!is_alnum(prev0)) {
4751 f_line = char_size(c2,c1);
4753 } else if ((prev0==SP) || /* ignored new f_line */
4754 (prev0==LF)|| /* ignored new f_line */
4755 (prev0&0x80)) { /* X0208 - ASCII */
4756 f_line = char_size(c2,c1);
4757 fold_state = LF;/* add one new f_line before this character */
4759 fold_state = 1; /* default no fold in ASCII */
4763 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4764 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4765 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4766 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4767 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4768 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4769 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4770 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4771 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4772 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4773 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4774 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4775 /* default no fold in kinsoku */
4778 f_line = char_size(c2,c1);
4779 /* add one new f_line before this character */
4782 f_line = char_size(c2,c1);
4784 /* add one new f_line before this character */
4789 /* terminator process */
4790 switch(fold_state) {
4792 OCONV_NEWLINE((*o_fconv));
4798 OCONV_NEWLINE((*o_fconv));
4809 nkf_char z_prev2=0,z_prev1=0;
4811 void z_conv(nkf_char c2, nkf_char c1)
4814 /* if (c2) c1 &= 0x7f; assertion */
4816 if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4822 if (z_prev2 == JIS_X_0201) {
4823 if (c2 == JIS_X_0201) {
4824 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4826 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4828 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4830 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4835 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4837 if (c2 == JIS_X_0201) {
4838 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4839 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4844 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4855 if (alpha_f&1 && c2 == 0x23) {
4856 /* JISX0208 Alphabet */
4858 } else if (c2 == 0x21) {
4859 /* JISX0208 Kigou */
4864 } else if (alpha_f&4) {
4869 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4875 if (alpha_f&8 && c2 == 0) {
4879 case '>': entity = "&gt;"; break; /* emit HTML character references, not the raw characters */
4880 case '<': entity = "&lt;"; break;
4881 case '\"': entity = "&quot;"; break;
4882 case '&': entity = "&amp;"; break;
4885 while (*entity) (*o_zconv)(0, *entity++);
4891 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4896 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4900 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4904 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4908 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4912 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4916 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4920 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4924 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4929 (*o_zconv)(JIS_X_0201, c);
4932 } else if (c2 == 0x25) {
4933 /* JISX0208 Katakana */
4934 static const int fullwidth_to_halfwidth[] =
4936 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4937 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4938 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4939 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4940 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4941 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4942 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4943 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4944 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4945 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4946 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4947 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4949 if (fullwidth_to_halfwidth[c1-0x20]){
4950 c2 = fullwidth_to_halfwidth[c1-0x20];
4951 (*o_zconv)(JIS_X_0201, c2>>8);
4953 (*o_zconv)(JIS_X_0201, c2&0xFF);
4963 #define rot13(c) ( \
4965 (c <= 'M') ? (c + 13): \
4966 (c <= 'Z') ? (c - 13): \
4968 (c <= 'm') ? (c + 13): \
4969 (c <= 'z') ? (c - 13): \
4973 #define rot47(c) ( \
4975 ( c <= 'O') ? (c + 47) : \
4976 ( c <= '~') ? (c - 47) : \
4980 void rot_conv(nkf_char c2, nkf_char c1)
4982 if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) {
4988 (*o_rot_conv)(c2,c1);
4991 void hira_conv(nkf_char c2, nkf_char c1)
4995 if (0x20 < c1 && c1 < 0x74) {
4997 (*o_hira_conv)(c2,c1);
4999 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
5001 c1 = CLASS_UNICODE | 0x3094;
5002 (*o_hira_conv)(c2,c1);
5005 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
5007 (*o_hira_conv)(c2,c1);
5012 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
5015 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
5017 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
5021 (*o_hira_conv)(c2,c1);
5025 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
5027 static const nkf_char range[RANGE_NUM_MAX][2] = {
5048 nkf_char start, end, c;
5050 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
5054 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
5059 for (i = 0; i < RANGE_NUM_MAX; i++) {
5060 start = range[i][0];
5063 if (c >= start && c <= end) {
5068 (*o_iso2022jp_check_conv)(c2,c1);
5072 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
5074 static const unsigned char *mime_pattern[] = {
5075 (const unsigned char *)"\075?EUC-JP?B?",
5076 (const unsigned char *)"\075?SHIFT_JIS?B?",
5077 (const unsigned char *)"\075?ISO-8859-1?Q?",
5078 (const unsigned char *)"\075?ISO-8859-1?B?",
5079 (const unsigned char *)"\075?ISO-2022-JP?B?",
5080 (const unsigned char *)"\075?ISO-2022-JP?Q?",
5081 #if defined(UTF8_INPUT_ENABLE)
5082 (const unsigned char *)"\075?UTF-8?B?",
5083 (const unsigned char *)"\075?UTF-8?Q?",
5085 (const unsigned char *)"\075?US-ASCII?Q?",
5090 /* marker used to raise the priority of the corresponding input code */
5091 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
5092 e_iconv, s_iconv, 0, 0, 0, 0,
5093 #if defined(UTF8_INPUT_ENABLE)
/*
 * Output mode associated with each mime_pattern entry (same index order).
 * open_mime searches this table for its mode argument to pick the pattern
 * to use.
 */
5099 static const nkf_char mime_encode[] = {
5100 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201,
5101 #if defined(UTF8_INPUT_ENABLE)
/*
 * Transfer encoding letter ('B' or 'Q') for each mime_pattern entry, in the
 * same index order; open_mime copies the selected entry into mimeout_mode.
 */
5108 static const nkf_char mime_encode_method[] = {
5109 'B', 'B','Q', 'B', 'B', 'Q',
5110 #if defined(UTF8_INPUT_ENABLE)
/*
 * Size of the recovery buffer r[] in mime_begin_strict and upper bound on
 * the number of preamble characters mime_begin examines.
 */
5118 #define MAXRECOVER 20
/*
 * switch_mime_getc: install the MIME reader in the input chain.
 * Unless i_getc is already mime_getc, the current i_getc/i_ungetc are saved
 * in i_mgetc/i_mungetc and replaced by mime_getc/mime_ungetc. In STRICT_MIME
 * mode one more layer is inserted: the saved pair moves to
 * i_mgetc_buf/i_mungetc_buf and i_mgetc/i_mungetc become
 * mime_getc_buf/mime_ungetc_buf.
 */
5120 void switch_mime_getc(void)
5122 if (i_getc!=mime_getc) {
5123 i_mgetc = i_getc; i_getc = mime_getc;
5124 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5125 if(mime_f==STRICT_MIME) {
5126 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
5127 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc.
 * In STRICT_MIME mode the buffered layer is removed first by restoring
 * i_mgetc/i_mungetc from i_mgetc_buf/i_mungetc_buf; i_ungetc is then restored
 * from i_mungetc. If mime_begin_strict saved a converter in mime_iconv_back
 * it is reinstalled with set_iconv, and mime_iconv_back is cleared.
 * NOTE(review): the matching restore of i_getc is not visible in this
 * listing.
 */
5132 void unswitch_mime_getc(void)
5134 if(mime_f==STRICT_MIME) {
5135 i_mgetc = i_mgetc_buf;
5136 i_mungetc = i_mungetc_buf;
5139 i_ungetc = i_mungetc;
5140 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
5141 mime_iconv_back = NULL;
/*
 * mime_begin_strict: match the input that follows "=?" against mime_pattern.
 * Characters are read with i_getc, stored in the recovery buffer r[] and
 * compared, upper-cased, with the current pattern from index 2 onward. On a
 * mismatch the later patterns are searched for one that agrees with what
 * has been read so far. When a pattern matches completely,
 * mime_decode_mode is set to p[i-2] (the character two positions before
 * the end of the pattern, i.e. its 'B' or 'Q'), the current converter is
 * saved in mime_iconv_back and mime_priority_func[j] is installed.
 * For 'B' mode mimebuf_f is copied from unbuf_f and the function may return
 * the result of mime_integrity.
 * NOTE(review): the failure path that replays r[] and the remaining return
 * statements are absent from this listing.
 */
5144 nkf_char mime_begin_strict(FILE *f)
5148 const unsigned char *p,*q;
5149 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
5151 mime_decode_mode = FALSE;
5152 /* =? has been checked */
5154 p = mime_pattern[j];
5157 for(i=2;p[i]>SP;i++) { /* start at =? */
5158 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
5159 /* pattern fails, try next one */
5161 while (mime_pattern[++j]) {
5162 p = mime_pattern[j];
5163 for(k=2;k<i;k++) /* assume length(p) > i */
5164 if (p[k]!=q[k]) break;
5165 if (k==i && nkf_toupper(c1)==p[k]) break;
5167 p = mime_pattern[j];
5168 if (p) continue; /* found next one, continue */
5169 /* all patterns failed, output from recovery buffer */
5177 mime_decode_mode = p[i-2];
5179 mime_iconv_back = iconv;
5180 set_iconv(FALSE, mime_priority_func[j]);
5181 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
5183 if (mime_decode_mode=='B') {
5184 mimebuf_f = unbuf_f;
5186 /* do MIME integrity check */
5187 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: character source used as i_mgetc in STRICT_MIME mode.
 * When mimebuf_f is non-zero the character comes from the saved reader
 * i_mgetc_buf; otherwise it is taken from the Fifo at mime_input, which is
 * then advanced.
 */
5195 nkf_char mime_getc_buf(FILE *f)
5197 /* we don't keep eof of Fifo, because it contains ?= as
5198 a terminator. It was checked in mime_integrity. */
5199 return ((mimebuf_f)?
5200 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: push-back counterpart of mime_getc_buf.
 * Two actions are visible: forwarding c to the saved i_mungetc_buf, and
 * storing c in the Fifo just before mime_input.
 * NOTE(review): the condition choosing between them is absent from this
 * listing; presumably it is mimebuf_f, mirroring mime_getc_buf - confirm.
 */
5203 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
5206 (*i_mungetc_buf)(c,f);
5208 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict recognition of a MIME preamble after "=?".
 * Every character read is also appended to the Fifo, starting with "=?"
 * itself, so that the text can be re-read if it turns out not to be a
 * preamble. Up to MAXRECOVER characters are scanned: LF, SP, CR, '-', '_'
 * and alphanumerics are skipped, then an encoding letter selects
 * mime_decode_mode 'B' (b/B) or 'Q' (q/Q). If no mode was selected,
 * mime_decode_mode is set to 1, meaning "do not decode, but read from the
 * buffer". Returns the last character read, which callers use only to
 * detect EOF.
 * NOTE(review): the tests between the visible reads are absent from this
 * listing.
 */
5212 nkf_char mime_begin(FILE *f)
5217 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
5218 /* re-read and convert again from mime_buffer. */
5220 /* =? has been checked */
5222 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
5223 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
5224 /* We accept any character type even if it is broken by new lines */
5225 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5226 if (c1==LF||c1==SP||c1==CR||
5227 c1=='-'||c1=='_'||is_alnum(c1)) continue;
5229 /* Failed. But this could be another MIME preamble */
5237 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5238 if (!(++i<MAXRECOVER) || c1==EOF) break;
5239 if (c1=='b'||c1=='B') {
5240 mime_decode_mode = 'B';
5241 } else if (c1=='q'||c1=='Q') {
5242 mime_decode_mode = 'Q';
5246 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5247 if (!(++i<MAXRECOVER) || c1==EOF) break;
5249 mime_decode_mode = FALSE;
5255 if (!mime_decode_mode) {
5256 /* false MIME preamble, restart from mime_buffer */
5257 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5258 /* Since we are in MIME mode until buffer becomes empty, */
5259 /* we never go into mime_begin again for a while. */
5262 /* discard mime preamble, and goto MIME mode */
5264 /* do no MIME integrity check */
5265 return c1; /* used only for checking EOF */
/*
 * no_putc: output routine taking a single character.
 * NOTE(review): the body is absent from this listing; presumably it
 * discards c - confirm.
 */
5269 void no_putc(nkf_char c)
/*
 * debug: write str followed by a newline to stderr; a null str is printed
 * as the text "NULL".
 * NOTE(review): any guard around the fprintf is absent from this listing.
 */
5274 void debug(const char *str)
5277 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: record the name of the detected input encoding.
 * The first call stores the codename pointer itself (no copy is made).
 * A later call with a different name replaces it with the empty string,
 * which get_guessed_code reports as "BINARY".
 */
5282 void set_input_codename(char *codename)
5284 if (!input_codename) {
5285 input_codename = codename;
5286 } else if (strcmp(codename, input_codename) != 0) {
5287 input_codename = "";
/*
 * get_guessed_code: refine input_codename and return it.
 *   - empty string (conflicting detections)  -> "BINARY"
 *   - NULL (nothing detected)                -> "ASCII"
 *   - "Shift_JIS"   with SCORE_DEPEND or SCORE_CP932 -> "CP932"
 *   - "EUC-JP"      with SCORE_X0212                 -> "EUCJP-MS",
 *                   else with SCORE_DEPEND or SCORE_CP932 -> "CP51932"
 *   - "ISO-2022-JP" with SCORE_KANA                  -> "CP50221",
 *                   else with SCORE_DEPEND or SCORE_CP932 -> "CP50220"
 * The score bits are read from the input_code entry that
 * find_inputcode_byfunc returns for the current iconv. The result is also
 * stored back into input_codename.
 */
5291 static char* get_guessed_code(void)
5293 if (input_codename && !*input_codename) {
5294 input_codename = "BINARY";
5296 struct input_code *p = find_inputcode_byfunc(iconv);
5297 if (!input_codename) {
5298 input_codename = "ASCII";
5299 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5300 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5301 input_codename = "CP932";
5302 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5303 if (p->score & (SCORE_X0212))
5304 input_codename = "EUCJP-MS";
5305 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5306 input_codename = "CP51932";
5307 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5308 if (p->score & (SCORE_KANA))
5309 input_codename = "CP50221";
5310 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5311 input_codename = "CP50220";
5314 return input_codename;
5317 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the guessed input encoding on stdout.
 * A non-null filename is printed first as "filename: ". The name obtained
 * from get_guessed_code is then printed on its own line, and a suffix
 * naming the detected newline style ("(CR)", "(LF)", "(CRLF)" or
 * "(MIXED NL)") is selected from input_newline.
 * NOTE(review): the handling of the empty-codename case and the statement
 * that prints the newline suffix are absent from this listing.
 */
5318 void print_guessed_code(char *filename)
5320 if (filename != NULL) printf("%s: ", filename);
5321 if (input_codename && !*input_codename) {
5324 input_codename = get_guessed_code();
5326 printf("%s\n", input_codename);
5330 input_newline == CR ? " (CR)" :
5331 input_newline == LF ? " (LF)" :
5332 input_newline == CRLF ? " (CRLF)" :
5333 input_newline == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared reader for escape sequences of the form <ch><hex><hex>.
 *   ch - introducer character (cap_getc passes ':', url_getc passes '%')
 *   g  - underlying getc, u - underlying ungetc
 * When both following characters c2 and c3 are hexadecimal digits the
 * function returns the byte (hex2bin(c2) << 4) | hex2bin(c3).
 * NOTE(review): the reads through g and the fallback paths taken when a
 * character is not a hex digit are absent from this listing; presumably the
 * characters are pushed back through u - confirm.
 */
5342 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5344 nkf_char c1, c2, c3;
5350 if (!nkf_isxdigit(c2)){
5355 if (!nkf_isxdigit(c3)){
5360 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: read through hex_getc with ':' as the escape introducer,
 * using i_cgetc/i_cungetc as the underlying reader. */
5363 nkf_char cap_getc(FILE *f)
5365 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: push c back by delegating to i_cungetc and return its result. */
5368 nkf_char cap_ungetc(nkf_char c, FILE *f)
5370 return (*i_cungetc)(c, f);
/* url_getc: read through hex_getc with '%' as the escape introducer,
 * using i_ugetc/i_uungetc as the underlying reader. */
5373 nkf_char url_getc(FILE *f)
5375 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: push c back by delegating to i_uungetc and return its result. */
5378 nkf_char url_ungetc(nkf_char c, FILE *f)
5380 return (*i_uungetc)(c, f);
5384 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: reader that decodes a numeric character reference into a
 * code point and returns it tagged as CLASS_UNICODE | c.
 * Uses i_ngetc/i_nungetc as the underlying reader. After an 'x' or 'X' up
 * to 7 hexadecimal digits are accepted; otherwise up to 8 decimal digits.
 * Each digit is folded into c with hex2bin.
 * NOTE(review): the recognition of the introducer, the shift/multiply of c
 * before each digit, the terminator check and the push-back path for
 * malformed input are absent from this listing.
 */
5385 nkf_char numchar_getc(FILE *f)
5387 nkf_char (*g)(FILE *) = i_ngetc;
5388 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5399 if (buf[i] == 'x' || buf[i] == 'X'){
5400 for (j = 0; j < 7; j++){
5402 if (!nkf_isxdigit(buf[i])){
5409 c |= hex2bin(buf[i]);
5412 for (j = 0; j < 8; j++){
5416 if (!nkf_isdigit(buf[i])){
5423 c += hex2bin(buf[i]);
5429 return CLASS_UNICODE | c;
/* numchar_ungetc: push c back by delegating to i_nungetc and return its result. */
5438 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5440 return (*i_nungetc)(c, f);
5444 #ifdef UNICODE_NORMALIZATION
5446 /* Normalization Form C */
/*
 * nfc_getc: reader that replaces a decomposed sequence in buf[] by the
 * composed form found in normalization_table.
 * Uses i_nfc_getc/i_nfc_ungetc as the underlying reader. The table is
 * binary-searched (lower/upper/j): the .nfd entry is compared with buf[]
 * element by element, and the first differing element decides which half
 * to continue in. After the search, the .nfc entry of the selected row is
 * copied into buf[].
 * NOTE(review): the reads that fill buf[], the match test guarding the
 * copy, and the return/push-back logic are absent from this listing.
 */
5447 nkf_char nfc_getc(FILE *f)
5449 nkf_char (*g)(FILE *f) = i_nfc_getc;
5450 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5451 int i=0, j, k=1, lower, upper;
5453 const nkf_nfchar *array;
5456 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5457 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5458 while (upper >= lower) {
5459 j = (lower+upper) / 2;
5460 array = normalization_table[j].nfd;
5461 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5462 if (array[k] != buf[k]){
5463 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5470 array = normalization_table[j].nfc;
5471 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5472 buf[i] = (nkf_char)(array[i]);
/* nfc_ungetc: push c back by delegating to i_nfc_ungetc and return its result. */
5483 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5485 return (*i_nfc_ungetc)(c, f);
5487 #endif /* UNICODE_NORMALIZATION */
5493 nkf_char c1, c2, c3, c4, cc;
5494 nkf_char t1, t2, t3, t4, mode, exit_mode;
5495 nkf_char lwsp_count;
5498 nkf_char lwsp_size = 128;
5500 if (mime_top != mime_last) { /* Something is in FIFO */
5501 return Fifo(mime_top++);
5503 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5504 mime_decode_mode=FALSE;
5505 unswitch_mime_getc();
5506 return (*i_getc)(f);
5509 if (mimebuf_f == FIXED_MIME)
5510 exit_mode = mime_decode_mode;
5513 if (mime_decode_mode == 'Q') {
5514 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5516 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5517 if (c1<=SP || DEL<=c1) {
5518 mime_decode_mode = exit_mode; /* prepare for quit */
5521 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5525 mime_decode_mode = exit_mode; /* prepare for quit */
5526 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5527 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5528 /* end Q encoding */
5529 input_mode = exit_mode;
5531 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5532 if (lwsp_buf==NULL) {
5533 perror("can't malloc");
5536 while ((c1=(*i_getc)(f))!=EOF) {
5541 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5549 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5550 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5565 lwsp_buf[lwsp_count] = (unsigned char)c1;
5566 if (lwsp_count++>lwsp_size){
5568 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5569 if (lwsp_buf_new==NULL) {
5571 perror("can't realloc");
5574 lwsp_buf = lwsp_buf_new;
5580 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5582 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5583 i_ungetc(lwsp_buf[lwsp_count],f);
5589 if (c1=='='&&c2<SP) { /* this is soft wrap */
5590 while((c1 = (*i_mgetc)(f)) <=SP) {
5591 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5593 mime_decode_mode = 'Q'; /* still in MIME */
5594 goto restart_mime_q;
5597 mime_decode_mode = 'Q'; /* still in MIME */
5601 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5602 if (c2<=SP) return c2;
5603 mime_decode_mode = 'Q'; /* still in MIME */
5604 return ((hex2bin(c2)<<4) + hex2bin(c3));
5607 if (mime_decode_mode != 'B') {
5608 mime_decode_mode = FALSE;
5609 return (*i_mgetc)(f);
5613 /* Base64 encoding */
5615 MIME allows line break in the middle of
5616 Base64, but we are very pessimistic in decoding
5617 in unbuf mode because MIME encoded code may broken by
5618 less or editor's control sequence (such as ESC-[-K in unbuffered
5619 mode. ignore incomplete MIME.
5621 mode = mime_decode_mode;
5622 mime_decode_mode = exit_mode; /* prepare for quit */
5624 while ((c1 = (*i_mgetc)(f))<=SP) {
5629 if ((c2 = (*i_mgetc)(f))<=SP) {
5632 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5633 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5636 if ((c1 == '?') && (c2 == '=')) {
5639 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5640 if (lwsp_buf==NULL) {
5641 perror("can't malloc");
5644 while ((c1=(*i_getc)(f))!=EOF) {
5649 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5657 if ((c1=(*i_getc)(f))!=EOF) {
5661 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5676 lwsp_buf[lwsp_count] = (unsigned char)c1;
5677 if (lwsp_count++>lwsp_size){
5679 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5680 if (lwsp_buf_new==NULL) {
5682 perror("can't realloc");
5685 lwsp_buf = lwsp_buf_new;
5691 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5693 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5694 i_ungetc(lwsp_buf[lwsp_count],f);
5701 if ((c3 = (*i_mgetc)(f))<=SP) {
5704 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5705 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5709 if ((c4 = (*i_mgetc)(f))<=SP) {
5712 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5713 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5717 mime_decode_mode = mode; /* still in MIME sigh... */
5719 /* BASE 64 decoding */
5721 t1 = 0x3f & base64decode(c1);
5722 t2 = 0x3f & base64decode(c2);
5723 t3 = 0x3f & base64decode(c3);
5724 t4 = 0x3f & base64decode(c4);
5725 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5727 Fifo(mime_last++) = (unsigned char)cc;
5728 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5730 Fifo(mime_last++) = (unsigned char)cc;
5731 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5733 Fifo(mime_last++) = (unsigned char)cc;
5738 return Fifo(mime_top++);
/*
 * mime_ungetc: push c back into the MIME Fifo by stepping mime_top back one
 * slot and storing c there as an unsigned char.
 * NOTE(review): the return statement is absent from this listing.
 */
5741 nkf_char mime_ungetc(nkf_char c, FILE *f)
5743 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-read an encoded word into the Fifo so that it is only
 * decoded when it is complete.
 *   p - the matched mime_pattern prefix, copied into the Fifo first.
 * Input is then read with i_getc until EOF, the buffer wraps around
 * (MIME_BUF_MASK), or the terminating "?=" is seen (c == '=' preceded by
 * d == '?'). Only '+', '/', '=', '?' and alphanumerics are accepted as
 * body characters. On the incomplete path the undecoded text stays in the
 * Fifo, mime_decode_mode is set to 1 (no decoding) and switch_mime_getc is
 * called so that the buffered text is re-read.
 * NOTE(review): several lines, including the terminator of the first inner
 * comment and all return statements, are absent from this listing.
 */
5747 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5751 /* In buffered mode, read until =? or NL or buffer full
5753 mime_input = mime_top;
5754 mime_last = mime_top;
5756 while(*p) Fifo(mime_input++) = *p++;
5759 while((c=(*i_getc)(f))!=EOF) {
5760 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5761 break; /* buffer full */
5763 if (c=='=' && d=='?') {
5764 /* checked. skip header, start decode */
5765 Fifo(mime_input++) = (unsigned char)c;
5766 /* mime_last_input = mime_input; */
5771 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5773 /* Should we check length mod 4? */
5774 Fifo(mime_input++) = (unsigned char)c;
5777 /* In case of Incomplete MIME, no MIME decode */
5778 Fifo(mime_input++) = (unsigned char)c;
5779 mime_last = mime_input; /* point undecoded buffer */
5780 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5781 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 character to its 6-bit value.
 * Besides the standard alphabet (A-Z, a-z, 0-9, '+', '/') the visible
 * branches also accept '_' as 63 and '-' as 62.
 * The character-literal arithmetic encodes the offsets: 'G' is 'a' - 26,
 * '4' is 52, '>' is 62 and '?' is 63 in ASCII.
 * NOTE(review): the first range tests and the return statement are absent
 * from this listing.
 */
5785 nkf_char base64decode(nkf_char c)
5790 i = c - 'A'; /* A..Z 0-25 */
5791 } else if (c == '_') {
5792 i = '?' /* 63 */ ; /* _ 63 */
5794 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5796 } else if (c > '/') {
5797 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5798 } else if (c == '+' || c == '-') {
5799 i = '>' /* 62 */ ; /* + and - 62 */
5801 i = '?' /* 63 */ ; /* / 63 */
/* Base64 output alphabet, indexed by 6-bit value. */
5806 static const char basis_64[] =
5807 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Previous input byte, whose leftover low bits are combined with the next
 * byte by mimeout_addchar and flushed by close_mime. */
5809 static nkf_char b64c;
/* Capacity of the look-ahead buffer that holds text before the encoder
 * decides whether it must be MIME-encoded. */
5810 #define MIMEOUT_BUF_LENGTH (60)
5811 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of valid characters currently held in mimeout_buf. */
5812 int mimeout_buf_count = 0;
/*
 * open_mime: start a MIME encoded word for the given output mode.
 * The mime_encode table is searched for mode to choose the prefix p
 * (defaulting to mime_pattern[0]), and mimeout_mode is set from
 * mime_encode_method at the index where the search stopped.
 * If base64_count exceeds 45 a line break is emitted first, preceded by a
 * leading blank from mimeout_buf when one is buffered. Buffered whitespace
 * (SP, TAB, CR, LF) is written out unencoded through o_mputc, and the
 * remaining buffered characters are re-submitted through mime_putc after
 * mimeout_buf_count has been reset to 0.
 * NOTE(review): the statements that write the prefix p and update
 * base64_count, and several loop headers, are absent from this listing.
 */
5814 void open_mime(nkf_char mode)
5816 const unsigned char *p;
5819 p = mime_pattern[0];
5820 for(i=0;mime_pattern[i];i++) {
5821 if (mode == mime_encode[i]) {
5822 p = mime_pattern[i];
5826 mimeout_mode = mime_encode_method[i];
5828 if (base64_count>45) {
5829 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5830 (*o_mputc)(mimeout_buf[i]);
5833 PUT_NEWLINE((*o_mputc));
5836 if (mimeout_buf_count>0
5837 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5838 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5842 for (;i<mimeout_buf_count;i++) {
5843 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5844 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5845 (*o_mputc)(mimeout_buf[i]);
5855 j = mimeout_buf_count;
5856 mimeout_buf_count = 0;
5858 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the current MIME encoded word.
 * Depending on mimeout_mode, the bits still pending in b64c are flushed as
 * one final Base64 character: the low 2 bits shifted left by 4, or the low
 * 4 bits shifted left by 2.
 * NOTE(review): the case labels, the '=' padding, the closing "?=" and the
 * mode reset under the mimeout_mode > 0 / FIXED_MIME tests are absent from
 * this listing.
 */
5862 void close_mime(void)
5872 switch(mimeout_mode) {
5877 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5883 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5888 if (mimeout_mode > 0) {
5889 if (mimeout_f!=FIXED_MIME) {
5891 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write
 * the result through o_mputc.
 * Quoted-printable style: a non-alphanumeric byte is written as two hex
 * digits (high nibble, then low nibble) produced by bin2hex.
 * Base64: three successive states are visible - emit the top 6 bits of c;
 * combine the 2 bits kept in b64c with the top 4 bits of c; combine the 4
 * bits kept in b64c with the top 2 bits of c and then emit the low 6 bits
 * of c.
 * NOTE(review): the case labels, the '=' escape prefix, the updates of
 * b64c, mimeout_mode and base64_count are absent from this listing.
 */
5896 void mimeout_addchar(nkf_char c)
5898 switch(mimeout_mode) {
5903 } else if(!nkf_isalnum(c)) {
5905 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5906 (*o_mputc)(bin2hex((c&0xf)));
5915 (*o_mputc)(basis_64[c>>2]);
5920 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5926 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5927 (*o_mputc)(basis_64[c & 0x3F]);
5938 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * mime_prechar: decide, before a character is output, whether the current
 * encoded line must be folded.
 * The projected line length is base64_count + mimeout_buf_count/3*4. While
 * an encoded word is open (mimeout_mode > 0), exceeding 73 or 66 closes the
 * word by sending (EOF,0) to o_base64conv, emits a newline and sends (0,SP)
 * to start the continuation line. In a further branch, when c2 is not EOF
 * and the length exceeds 60, mimeout_mode is chosen ('Q' for ASCII or
 * ISO_8859_1 output_mode, otherwise 'B'), open_mime is called and the same
 * close/newline/space sequence follows.
 * NOTE(review): the conditions that select between the 73 and 66 limits and
 * that enclose the last branch are absent from this listing.
 */
5940 void mime_prechar(nkf_char c2, nkf_char c1)
5942 if (mimeout_mode > 0){
5944 if (base64_count + mimeout_buf_count/3*4> 73){
5945 (*o_base64conv)(EOF,0);
5946 OCONV_NEWLINE((*o_base64conv));
5947 (*o_base64conv)(0,SP);
5951 if (base64_count + mimeout_buf_count/3*4> 66) {
5952 (*o_base64conv)(EOF,0);
5953 OCONV_NEWLINE((*o_base64conv));
5954 (*o_base64conv)(0,SP);
5960 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5961 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5962 open_mime(output_mode);
5963 (*o_base64conv)(EOF,0);
5964 OCONV_NEWLINE((*o_base64conv));
5965 (*o_base64conv)(0,SP);
/*
 * mime_putc: MIME-encoding output routine; c == EOF flushes.
 * Two regimes are visible:
 *   - mimeout_f == FIXED_MIME: a newline is inserted once base64_count
 *     exceeds 71, and EOF / non-EOF input are handled in separate branches.
 *   - otherwise: characters are collected in mimeout_buf. On EOF the buffer
 *     is flushed, opening an encoded word first when mimeout_mode is -1 and
 *     more than one character is buffered. In 'Q' mode and when no word is
 *     open (mimeout_mode <= 0), characters up to DEL in ASCII/ISO_8859_1
 *     output are examined for whitespace and line length to decide between
 *     raw output through o_mputc and opening a word with open_mime. While a
 *     word is open, buffered text is either encoded through mimeout_addchar
 *     or written raw, depending on the preceding character (lastchar) and
 *     on whether c is whitespace.
 * NOTE(review): a large part of this function (loop headers, close_mime
 * calls, returns) is absent from this listing; the summary above covers
 * only the visible statements.
 */
5972 void mime_putc(nkf_char c)
5977 if (mimeout_f == FIXED_MIME){
5978 if (mimeout_mode == 'Q'){
5979 if (base64_count > 71){
5980 if (c!=CR && c!=LF) {
5982 PUT_NEWLINE((*o_mputc));
5987 if (base64_count > 71){
5989 PUT_NEWLINE((*o_mputc));
5992 if (c == EOF) { /* c==EOF */
5996 if (c != EOF) { /* c!=EOF */
6002 /* mimeout_f != FIXED_MIME */
6004 if (c == EOF) { /* c==EOF */
6005 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
6006 j = mimeout_buf_count;
6007 mimeout_buf_count = 0;
6009 if (mimeout_mode > 0) {
6010 if (!nkf_isblank(mimeout_buf[j-1])) {
6012 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
6015 mimeout_addchar(mimeout_buf[i]);
6019 mimeout_addchar(mimeout_buf[i]);
6023 mimeout_addchar(mimeout_buf[i]);
6029 mimeout_addchar(mimeout_buf[i]);
6035 if (mimeout_buf_count > 0){
6036 lastchar = mimeout_buf[mimeout_buf_count - 1];
6041 if (mimeout_mode=='Q') {
6042 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6043 if (c == CR || c == LF) {
6048 } else if (c <= SP) {
6050 if (base64_count > 70) {
6051 PUT_NEWLINE((*o_mputc));
6054 if (!nkf_isblank(c)) {
6059 if (base64_count > 70) {
6061 PUT_NEWLINE((*o_mputc));
6064 open_mime(output_mode);
6066 if (!nkf_noescape_mime(c)) {
6077 if (mimeout_mode <= 0) {
6078 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6079 if (nkf_isspace(c)) {
6081 if (mimeout_mode == -1) {
6084 if (c==CR || c==LF) {
6086 open_mime(output_mode);
6092 for (i=0;i<mimeout_buf_count;i++) {
6093 (*o_mputc)(mimeout_buf[i]);
6094 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
6105 mimeout_buf[0] = (char)c;
6106 mimeout_buf_count = 1;
6108 if (base64_count > 1
6109 && base64_count + mimeout_buf_count > 76
6110 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
6111 PUT_NEWLINE((*o_mputc));
6113 if (!nkf_isspace(mimeout_buf[0])){
6118 mimeout_buf[mimeout_buf_count++] = (char)c;
6119 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6120 open_mime(output_mode);
6125 if (lastchar==CR || lastchar == LF){
6126 for (i=0;i<mimeout_buf_count;i++) {
6127 (*o_mputc)(mimeout_buf[i]);
6130 mimeout_buf_count = 0;
6133 for (i=0;i<mimeout_buf_count-1;i++) {
6134 (*o_mputc)(mimeout_buf[i]);
6137 mimeout_buf[0] = SP;
6138 mimeout_buf_count = 1;
6140 open_mime(output_mode);
6143 /* mimeout_mode == 'B', 1, 2 */
6144 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6145 if (lastchar == CR || lastchar == LF){
6146 if (nkf_isblank(c)) {
6147 for (i=0;i<mimeout_buf_count;i++) {
6148 mimeout_addchar(mimeout_buf[i]);
6150 mimeout_buf_count = 0;
6151 } else if (SP<c && c<DEL) {
6153 for (i=0;i<mimeout_buf_count;i++) {
6154 (*o_mputc)(mimeout_buf[i]);
6157 mimeout_buf_count = 0;
6159 mimeout_buf[mimeout_buf_count++] = (char)c;
6162 if (c==SP || c==TAB || c==CR || c==LF) {
6163 for (i=0;i<mimeout_buf_count;i++) {
6164 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
6166 for (i=0;i<mimeout_buf_count;i++) {
6167 (*o_mputc)(mimeout_buf[i]);
6170 mimeout_buf_count = 0;
6173 mimeout_buf[mimeout_buf_count++] = (char)c;
6174 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6176 for (i=0;i<mimeout_buf_count;i++) {
6177 (*o_mputc)(mimeout_buf[i]);
6180 mimeout_buf_count = 0;
6184 if (mimeout_buf_count>0 && SP<c && c!='=') {
6185 mimeout_buf[mimeout_buf_count++] = (char)c;
6186 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6187 j = mimeout_buf_count;
6188 mimeout_buf_count = 0;
6190 mimeout_addchar(mimeout_buf[i]);
6197 if (mimeout_buf_count>0) {
6198 j = mimeout_buf_count;
6199 mimeout_buf_count = 0;
6201 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
6203 mimeout_addchar(mimeout_buf[i]);
6209 (*o_mputc)(mimeout_buf[i]);
6211 open_mime(output_mode);
6221 struct input_code *p = input_code_list;
6233 mime_f = MIME_DECODE_DEFAULT;
6234 mime_decode_f = FALSE;
6239 x0201_f = X0201_DEFAULT;
6240 iso2022jp_f = FALSE;
6241 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6242 ms_ucs_map_f = UCS_MAP_ASCII;
6244 #ifdef UTF8_INPUT_ENABLE
6245 no_cp932ext_f = FALSE;
6246 no_best_fit_chars_f = FALSE;
6247 encode_fallback = NULL;
6248 unicode_subchar = '?';
6249 input_endian = ENDIAN_BIG;
6251 #ifdef UTF8_OUTPUT_ENABLE
6252 output_bom_f = FALSE;
6253 output_endian = ENDIAN_BIG;
6255 #ifdef UNICODE_NORMALIZATION
6271 #ifdef SHIFTJIS_CP932
6281 for (i = 0; i < 256; i++){
6282 prefix_table[i] = 0;
6286 mimeout_buf_count = 0;
6291 fold_preserve_f = FALSE;
6294 kanji_intro = DEFAULT_J;
6295 ascii_intro = DEFAULT_R;
6296 fold_margin = FOLD_MARGIN;
6297 o_zconv = no_connection;
6298 o_fconv = no_connection;
6299 o_nlconv = no_connection;
6300 o_rot_conv = no_connection;
6301 o_hira_conv = no_connection;
6302 o_base64conv = no_connection;
6303 o_iso2022jp_check_conv = no_connection;
6306 i_ungetc = std_ungetc;
6308 i_bungetc = std_ungetc;
6311 i_mungetc = std_ungetc;
6312 i_mgetc_buf = std_getc;
6313 i_mungetc_buf = std_ungetc;
6314 output_mode = ASCII;
6317 mime_decode_mode = FALSE;
6325 z_prev2=0,z_prev1=0;
6327 iconv_for_check = 0;
6329 input_codename = NULL;
6330 input_encoding = NULL;
6331 output_encoding = NULL;
/*
 * no_connection: two-argument form of no_connection2; forwards c2 and c1
 * with c0 = 0 and discards the result.
 */
6337 void no_connection(nkf_char c2, nkf_char c1)
6339 no_connection2(c2,c1,0);
/*
 * no_connection2: report an internal wiring error on stderr. The arguments
 * are not used by the visible statements; the return value 0 exists only
 * to satisfy lint.
 * NOTE(review): a statement between the fprintf and the return is absent
 * from this listing; presumably the program exits there - confirm.
 */
6342 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6344 fprintf(stderr,"nkf internal module connection failure.\n");
6346 return 0; /* LINT */
6351 #define fprintf dllprintf
6355 fprintf(HELP_OUTPUT,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6356 fprintf(HELP_OUTPUT,"Flags:\n");
6357 fprintf(HELP_OUTPUT,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6358 fprintf(HELP_OUTPUT,"j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n");
6359 #ifdef UTF8_OUTPUT_ENABLE
6360 fprintf(HELP_OUTPUT," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6362 fprintf(HELP_OUTPUT,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6363 #ifdef UTF8_INPUT_ENABLE
6364 fprintf(HELP_OUTPUT," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6366 fprintf(HELP_OUTPUT,"t no conversion\n");
6367 fprintf(HELP_OUTPUT,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6368 fprintf(HELP_OUTPUT,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6369 fprintf(HELP_OUTPUT,"r {de/en}crypt ROT13/47\n");
6370 fprintf(HELP_OUTPUT,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6371 fprintf(HELP_OUTPUT,"m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n");
6372 fprintf(HELP_OUTPUT,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6373 fprintf(HELP_OUTPUT,"l ISO8859-1 (Latin-1) support\n");
6374 fprintf(HELP_OUTPUT,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6375 fprintf(HELP_OUTPUT,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6376 fprintf(HELP_OUTPUT," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6377 fprintf(HELP_OUTPUT," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6378 fprintf(HELP_OUTPUT,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6379 fprintf(HELP_OUTPUT,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6381 fprintf(HELP_OUTPUT,"T Text mode output\n");
6383 fprintf(HELP_OUTPUT,"O Output to File (DEFAULT 'nkf.out')\n");
6384 fprintf(HELP_OUTPUT,"I Convert non ISO-2022-JP charactor to GETA\n");
6385 fprintf(HELP_OUTPUT,"d,c Convert line breaks -d: LF -c: CRLF\n");
6386 fprintf(HELP_OUTPUT,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6387 fprintf(HELP_OUTPUT,"v, V Show this usage. V: show configuration\n");
6388 fprintf(HELP_OUTPUT,"\n");
6389 fprintf(HELP_OUTPUT,"Long name options\n");
6390 fprintf(HELP_OUTPUT," --ic=<input codeset> --oc=<output codeset>\n");
6391 fprintf(HELP_OUTPUT," Specify the input or output codeset\n");
6392 fprintf(HELP_OUTPUT," --fj --unix --mac --windows\n");
6393 fprintf(HELP_OUTPUT," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6394 fprintf(HELP_OUTPUT," Convert for the system or code\n");
6395 fprintf(HELP_OUTPUT," --hiragana --katakana --katakana-hiragana\n");
6396 fprintf(HELP_OUTPUT," To Hiragana/Katakana Conversion\n");
6397 fprintf(HELP_OUTPUT," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6399 fprintf(HELP_OUTPUT," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6401 #ifdef NUMCHAR_OPTION
6402 fprintf(HELP_OUTPUT," --numchar-input Convert Unicode Character Reference\n");
6404 #ifdef UTF8_INPUT_ENABLE
6405 fprintf(HELP_OUTPUT," --fb-{skip, html, xml, perl, java, subchar}\n");
6406 fprintf(HELP_OUTPUT," Specify how nkf handles unassigned characters\n");
6409 fprintf(HELP_OUTPUT," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6410 fprintf(HELP_OUTPUT," Overwrite original listed files by filtered result\n");
6411 fprintf(HELP_OUTPUT," --overwrite preserves timestamp of original files\n");
6413 fprintf(HELP_OUTPUT," -g --guess Guess the input code\n");
6414 fprintf(HELP_OUTPUT," --help --version Show this help/the version\n");
6415 fprintf(HELP_OUTPUT," For more information, see also man nkf\n");
6416 fprintf(HELP_OUTPUT,"\n");
/*
 * show_configuration: print the build configuration to HELP_OUTPUT.
 * The header line contains NKF_VERSION and NKF_RELEASE_DATE; it is followed
 * by one line per compile-time option: default output encoding (from
 * nkf_default_encoding, marked "LOCALE" or "DEFAULT" depending on
 * nkf_locale_encoding when DEFAULT_ENCIDX is not defined), default output
 * newline, MIME decoding default, JIS X 0201 katakana conversion, and the
 * stream used for --help/--version.
 * NOTE(review): the value strings chosen by the preprocessor branches are
 * absent from this listing.
 */
6420 void show_configuration(void)
6422 fprintf(HELP_OUTPUT, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6423 fprintf(HELP_OUTPUT, " Compile-time options:\n");
6424 fprintf(HELP_OUTPUT, " Default output encoding: "
6425 #ifdef DEFAULT_ENCIDX
6426 "%s\n", nkf_enc_name(nkf_default_encoding())
6428 "%s (%s)\n", nkf_locale_encoding() ? "LOCALE" : "DEFAULT",
6429 nkf_enc_name(nkf_default_encoding())
6432 fprintf(HELP_OUTPUT, " Default output newline: "
6433 #if DEFAULT_NEWLINE == CR
6435 #elif DEFAULT_NEWLINE == CRLF
6441 fprintf(HELP_OUTPUT, " Decode MIME encoded string: "
6442 #if MIME_DECODE_DEFAULT
6448 fprintf(HELP_OUTPUT, " Convert JIS X 0201 Katakana: "
6455 fprintf(HELP_OUTPUT, " --help, --version output: "
6456 #if HELP_OUTPUT_HELP_OUTPUT
6466 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");