1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.162 2008/01/01 14:21:20 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-01-02"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #if defined(DEFAULT_CODE_JIS)
44 #elif defined(DEFAULT_CODE_SJIS)
45 #elif defined(DEFAULT_CODE_EUC)
46 #elif defined(DEFAULT_CODE_UTF8)
48 #define DEFAULT_CODE_JIS 1
51 #ifndef MIME_DECODE_DEFAULT
52 #define MIME_DECODE_DEFAULT STRICT_MIME
55 #define X0201_DEFAULT TRUE
58 #if DEFAULT_NEWLINE == 0x0D0A
59 #define PUT_NEWLINE(func) do {\
63 #define OCONV_NEWLINE(func) do {\
67 #elif DEFAULT_NEWLINE == 0x0D
68 #define PUT_NEWLINE(func) func(0x0D)
69 #define OCONV_NEWLINE(func) func(0, 0x0D)
71 #define DEFAULT_NEWLINE 0x0A
72 #define PUT_NEWLINE(func) func(0x0A)
73 #define OCONV_NEWLINE(func) func(0, 0x0A)
75 #ifdef HELP_OUTPUT_STDERR
76 #define HELP_OUTPUT stderr
78 #define HELP_OUTPUT stdout
81 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
83 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
99 #if defined(MSDOS) || defined(__OS2__)
102 #if defined(_MSC_VER) || defined(__WATCOMC__)
103 #define mktemp _mktemp
109 #define setbinmode(fp) fsetbin(fp)
110 #elif defined(__DJGPP__)
111 #include <libc/dosio.h>
112 #define setbinmode(fp) djgpp_setbinmode(fp)
113 #else /* Microsoft C, Turbo C */
114 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
117 #define setbinmode(fp)
120 #if defined(__DJGPP__)
121 void djgpp_setbinmode(FILE *fp)
123 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
126 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
127 __file_handle_set(fd, m);
131 #ifdef _IOFBF /* SysV and MSDOS, Windows */
132 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
134 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
137 /*Borland C++ 4.5 EasyWin*/
138 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
147 /* added by satoru@isoternet.org */
149 #include <sys/types.h>
151 #include <sys/stat.h>
152 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
154 #if defined(__WATCOMC__)
155 #include <sys/utime.h>
159 #else /* defined(MSDOS) */
161 #ifdef __BORLANDC__ /* BCC32 */
163 #else /* !defined(__BORLANDC__) */
164 #include <sys/utime.h>
165 #endif /* (__BORLANDC__) */
166 #else /* !defined(__WIN32__) */
167 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
168 #include <sys/utime.h>
169 #elif defined(__TURBOC__) /* BCC */
171 #elif defined(LSI_C) /* LSI C */
172 #endif /* (__WIN32__) */
180 /* state of output_mode and input_mode
191 /* Input Assumption */
196 #define LATIN1_INPUT 6
197 #define UTF8_INPUT 13
198 #define UTF16_INPUT 1015
199 #define UTF32_INPUT 1017
202 #define STRICT_MIME 8
277 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
278 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
279 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
280 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
281 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
282 void j_oconv(nkf_char c2, nkf_char c1);
283 void s_oconv(nkf_char c2, nkf_char c1);
284 void e_oconv(nkf_char c2, nkf_char c1);
285 void w_oconv(nkf_char c2, nkf_char c1);
286 void w_oconv16(nkf_char c2, nkf_char c1);
287 void w_oconv32(nkf_char c2, nkf_char c1);
291 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
292 void (*oconv)(nkf_char c2, nkf_char c1);
293 } nkf_native_encoding;
295 nkf_native_encoding NkfEncodingASCII = { "US_ASCII", e_iconv, e_oconv };
296 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
297 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
298 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
299 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
300 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
301 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
306 const nkf_native_encoding *base_encoding;
308 nkf_encoding nkf_encoding_table[] = {
309 {ASCII, "ASCII", &NkfEncodingASCII},
310 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
311 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
312 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
313 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
314 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
315 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
316 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
317 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
318 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
319 {CP10001, "CP10001", &NkfEncodingShift_JIS},
320 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
321 {CP51932, "CP51932", &NkfEncodingEUC_JP},
322 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
323 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
324 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
325 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
326 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
327 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
328 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
329 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
330 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
331 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
332 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
333 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
334 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
335 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
336 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
337 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
338 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
339 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
340 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
341 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
342 {BINARY, "BINARY", &NkfEncodingASCII},
345 #define NKF_ENCODING_TABLE_SIZE 34
349 } encoding_name_to_id_table[] = {
351 {"ISO-2022-JP", ISO_2022_JP},
352 {"X-ISO2022JP-CP932", CP50220},
353 {"CP50220", CP50220},
354 {"CP50221", CP50221},
355 {"CP50222", CP50222},
356 {"ISO-2022-JP-1", ISO_2022_JP_1},
357 {"ISO-2022-JP-3", ISO_2022_JP_3},
358 {"SHIFT_JIS", SHIFT_JIS},
360 {"WINDOWS-31J", WINDOWS_31J},
361 {"CSWINDOWS31J", WINDOWS_31J},
362 {"CP932", WINDOWS_31J},
363 {"MS932", WINDOWS_31J},
364 {"CP10001", CP10001},
367 {"CP51932", CP51932},
368 {"EUC-JP-MS", EUCJP_MS},
369 {"EUCJP-MS", EUCJP_MS},
370 {"EUCJPMS", EUCJP_MS},
371 {"EUC-JP-ASCII", EUCJP_ASCII},
372 {"EUCJP-ASCII", EUCJP_ASCII},
373 {"SHIFT_JISX0213", SHIFT_JISX0213},
374 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
375 {"EUC-JISX0213", EUC_JISX0213},
376 {"EUC-JIS-2004", EUC_JIS_2004},
379 {"UTF-8-BOM", UTF_8_BOM},
380 {"UTF8-MAC", UTF8_MAC},
381 {"UTF-8-MAC", UTF8_MAC},
383 {"UTF-16BE", UTF_16BE},
384 {"UTF-16BE-BOM", UTF_16BE_BOM},
385 {"UTF-16LE", UTF_16LE},
386 {"UTF-16LE-BOM", UTF_16LE_BOM},
388 {"UTF-32BE", UTF_32BE},
389 {"UTF-32BE-BOM", UTF_32BE_BOM},
390 {"UTF-32LE", UTF_32LE},
391 {"UTF-32LE-BOM", UTF_32LE_BOM},
395 #if defined(DEFAULT_CODE_JIS)
396 #define DEFAULT_ENCODING ISO_2022_JP
397 #elif defined(DEFAULT_CODE_SJIS)
398 #define DEFAULT_ENCODING SHIFT_JIS
399 #elif defined(DEFAULT_CODE_EUC)
400 #define DEFAULT_ENCODING EUC_JP
401 #elif defined(DEFAULT_CODE_UTF8)
402 #define DEFAULT_ENCODING UTF_8
/*
 * ASCII-only character classification and conversion helpers.
 *
 * NOTE(review): every helper here is a function-like macro that expands its
 * argument several times and without parentheses, so callers should pass a
 * simple operand with no side effects.
 */
/* True when c is an ASCII letter or an ASCII decimal digit. */
406 #define is_alnum(c) \
407 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9'))
409 /* I don't trust the portability of toupper */
/* Maps 'a'..'z' to 'A'..'Z'; any other value is returned unchanged. */
410 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
/* True for the octal digits '0'..'7'. */
411 #define nkf_isoctal(c) ('0'<=c && c<='7')
/* True for the decimal digits '0'..'9'. */
412 #define nkf_isdigit(c) ('0'<=c && c<='9')
/* True for decimal digits and for 'a'..'f' / 'A'..'F'. */
413 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
/* True when c equals SP or TAB (both constants are defined elsewhere in the file). */
414 #define nkf_isblank(c) (c == SP || c == TAB)
/* True for blanks (see nkf_isblank) and for CR or LF. */
415 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == LF)
/* True for the ASCII letters 'a'..'z' and 'A'..'Z'. */
416 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
/* True for ASCII letters and decimal digits (same set as is_alnum). */
417 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* True for values from SP up to and including '~'. */
418 #define nkf_isprint(c) (SP<=c && c<='~')
/* True for values from '!' up to and including '~'. */
419 #define nkf_isgraph(c) ('!'<=c && c<='~')
/*
 * Value (0..15) of the hexadecimal digit c, accepting both letter cases;
 * yields 0 when c is not a hexadecimal digit.
 */
420 #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \
421 ('A'<=c&&c<='F') ? (c-'A'+10) : \
422 ('a'<=c&&c<='f') ? (c-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
423 #define bin2hex(c) ("0123456789ABCDEF"[c&15])
/*
 * True when c2, truncated to unsigned short and shifted right by eight bits,
 * equals SS3 (defined elsewhere in the file).
 */
424 #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3)
/*
 * True when c is CR or LF, or lies strictly between SP and DEL and is none of
 * '?', '=', '_', '(', ')', '.' or 0x22 (the double quote).
 * NOTE(review): the name suggests these are the characters MIME output may
 * emit without escaping - confirm against the MIME encoder.
 */
425 #define nkf_noescape_mime(c) ((c == CR) || (c == LF) || \
426 ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \
427 && (c != '(') && (c != ')') && (c != '.') && (c != 0x22)))
/*
 * Inclusive byte ranges used with the CP932 tables.
 * NOTE(review): presumably Shift_JIS lead bytes - confirm where they are used.
 */
429 #define CP932_TABLE_BEGIN 0xFA
430 #define CP932_TABLE_END 0xFC
431 #define CP932INV_TABLE_BEGIN 0xED
432 #define CP932INV_TABLE_END 0xEE
/* True when c2 lies within CP932_TABLE_BEGIN..CP932_TABLE_END, inclusive. */
433 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
435 #define HOLD_SIZE 1024
436 #if defined(INT_IS_SHORT)
437 #define IOBUF_SIZE 2048
439 #define IOBUF_SIZE 16384
442 #define DEFAULT_J 'B'
443 #define DEFAULT_R 'B'
445 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
446 #define SJ6394 0x0161 /* 63 - 94 ku offset */
448 #define RANGE_NUM_MAX 18
453 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
454 #define sizeof_euc_to_utf8_1byte 94
455 #define sizeof_euc_to_utf8_2bytes 94
456 #define sizeof_utf8_to_euc_C2 64
457 #define sizeof_utf8_to_euc_E5B8 64
458 #define sizeof_utf8_to_euc_2bytes 112
459 #define sizeof_utf8_to_euc_3bytes 16
462 /* MIME preprocessor */
464 #ifdef EASYWIN /*Easy Win */
465 extern POINT _BufferSize;
474 void (*status_func)(struct input_code *, nkf_char);
475 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
479 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
480 static nkf_encoding *input_encoding = NULL;
481 static nkf_encoding *output_encoding = NULL;
483 #if !defined(PERL_XS) && !defined(WIN32DLL)
484 static nkf_char noconvert(FILE *f);
486 static void module_connection(void);
487 static nkf_char kanji_convert(FILE *f);
488 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
489 static nkf_char push_hold_buf(nkf_char c2);
490 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
491 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
492 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
494 * 0: Shift_JIS, eucJP-ascii
499 #define UCS_MAP_ASCII 0
501 #define UCS_MAP_CP932 2
502 #define UCS_MAP_CP10001 3
503 static int ms_ucs_map_f = UCS_MAP_ASCII;
505 #ifdef UTF8_INPUT_ENABLE
506 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
507 static int no_cp932ext_f = FALSE;
508 /* ignore ZERO WIDTH NO-BREAK SPACE */
509 static int no_best_fit_chars_f = FALSE;
510 static int input_endian = ENDIAN_BIG;
511 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
512 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
513 static void encode_fallback_html(nkf_char c);
514 static void encode_fallback_xml(nkf_char c);
515 static void encode_fallback_java(nkf_char c);
516 static void encode_fallback_perl(nkf_char c);
517 static void encode_fallback_subchar(nkf_char c);
518 static void (*encode_fallback)(nkf_char c) = NULL;
519 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
520 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
521 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
522 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
523 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
524 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
525 static void w_status(struct input_code *, nkf_char);
527 #ifdef UTF8_OUTPUT_ENABLE
528 static int output_bom_f = FALSE;
529 static int output_endian = ENDIAN_BIG;
530 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
532 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
533 static void fold_conv(nkf_char c2,nkf_char c1);
534 static void nl_conv(nkf_char c2,nkf_char c1);
535 static void z_conv(nkf_char c2,nkf_char c1);
536 static void rot_conv(nkf_char c2,nkf_char c1);
537 static void hira_conv(nkf_char c2,nkf_char c1);
538 static void base64_conv(nkf_char c2,nkf_char c1);
539 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
540 static void no_connection(nkf_char c2,nkf_char c1);
541 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
543 static void code_score(struct input_code *ptr);
544 static void code_status(nkf_char c);
546 static void std_putc(nkf_char c);
547 static nkf_char std_getc(FILE *f);
548 static nkf_char std_ungetc(nkf_char c,FILE *f);
550 static nkf_char broken_getc(FILE *f);
551 static nkf_char broken_ungetc(nkf_char c,FILE *f);
553 static nkf_char mime_begin(FILE *f);
554 static nkf_char mime_getc(FILE *f);
555 static nkf_char mime_ungetc(nkf_char c,FILE *f);
557 static void switch_mime_getc(void);
558 static void unswitch_mime_getc(void);
559 static nkf_char mime_begin_strict(FILE *f);
560 static nkf_char mime_getc_buf(FILE *f);
561 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
562 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
564 static nkf_char base64decode(nkf_char c);
565 static void mime_prechar(nkf_char c2, nkf_char c1);
566 static void mime_putc(nkf_char c);
567 static void open_mime(nkf_char c);
568 static void close_mime(void);
569 static void eof_mime(void);
570 static void mimeout_addchar(nkf_char c);
572 static void usage(void);
573 static void version(void);
574 static void show_configuration(void);
576 static void options(unsigned char *c);
577 static void reinit(void);
581 #if !defined(PERL_XS) && !defined(WIN32DLL)
582 static unsigned char stdibuf[IOBUF_SIZE];
583 static unsigned char stdobuf[IOBUF_SIZE];
585 static unsigned char hold_buf[HOLD_SIZE*2];
586 static int hold_count = 0;
588 /* MIME preprocessor fifo */
/*
 * Storage for the MIME preprocessor FIFO: a fixed-size byte buffer that is
 * always accessed through Fifo(), which wraps any index into the buffer.
 */
590 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
/* Index mask; correct only while MIME_BUF_SIZE stays a power of two. */
591 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
/* Element of mime_buf at index n, reduced modulo the buffer size by masking. */
592 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
593 static unsigned char mime_buf[MIME_BUF_SIZE];
/* Unsigned cursors into the FIFO; all start at 0. */
594 static unsigned int mime_top = 0;
595 static unsigned int mime_last = 0; /* decoded */
596 static unsigned int mime_input = 0; /* undecoded */
/*
 * Input-converter function pointer, initially NULL.
 * NOTE(review): presumably holds the converter to restore once MIME decoding
 * ends - confirm against the code that assigns it.
 */
597 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* File-scope option and state flags, each shown with its initial value. */
600 static int unbuf_f = FALSE;
601 static int estab_f = FALSE;
602 static int nop_f = FALSE;
603 static int binmode_f = TRUE; /* binary mode */
604 static int rot_f = FALSE; /* rot14/43 mode */
605 static int hira_f = FALSE; /* hiragana/katakana conversion */
606 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
607 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
608 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
609 static int mimebuf_f = FALSE; /* MIME buffered input */
610 static int broken_f = FALSE; /* convert ESC-less broken JIS */
611 static int iso8859_f = FALSE; /* ISO8859 through */
612 static int mimeout_f = FALSE; /* base64 mode */
613 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
614 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
616 #ifdef UNICODE_NORMALIZATION
617 static int nfc_f = FALSE;
618 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
619 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
620 static nkf_char nfc_getc(FILE *f);
621 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
625 static int cap_f = FALSE;
626 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
627 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
628 static nkf_char cap_getc(FILE *f);
629 static nkf_char cap_ungetc(nkf_char c,FILE *f);
631 static int url_f = FALSE;
632 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
633 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
634 static nkf_char url_getc(FILE *f);
635 static nkf_char url_ungetc(nkf_char c,FILE *f);
638 #if defined(INT_IS_SHORT)
639 #define NKF_INT32_C(n) (n##L)
641 #define NKF_INT32_C(n) (n)
/*
 * Bit-layout constants for nkf_char values, wrapped in NKF_INT32_C so the
 * literals get an L suffix when INT_IS_SHORT is defined.
 */
643 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
/* The top eight bits of a 32-bit value form the class field. */
644 #define CLASS_MASK NKF_INT32_C(0xFF000000)
/* Class-field value tested by is_unicode_capsule(). */
645 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
/* The low 24 bits form the value field. */
646 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
647 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* True when the class field of c equals CLASS_UNICODE. */
648 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
/* True when the value field of c is at most 0xFFFF. */
649 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
651 #ifdef NUMCHAR_OPTION
652 static int numchar_f = FALSE;
653 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
654 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
655 static nkf_char numchar_getc(FILE *f);
656 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
660 static int noout_f = FALSE;
661 static void no_putc(nkf_char c);
662 static int debug_f = FALSE;
663 static void debug(const char *str);
664 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
667 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
669 static void print_guessed_code(char *filename);
671 static void set_input_codename(char *codename);
674 static int exec_f = 0;
677 #ifdef SHIFTJIS_CP932
678 /* invert IBM extended characters to others */
679 static int cp51932_f = FALSE;
681 /* invert NEC-selected IBM extended characters to IBM extended characters */
682 static int cp932inv_f = TRUE;
684 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
685 #endif /* SHIFTJIS_CP932 */
688 static int x0212_f = FALSE;
689 static nkf_char x0212_shift(nkf_char c);
690 static nkf_char x0212_unshift(nkf_char c);
692 static int x0213_f = FALSE;
694 static unsigned char prefix_table[256];
696 static void set_code_score(struct input_code *ptr, nkf_char score);
697 static void clr_code_score(struct input_code *ptr, nkf_char score);
698 static void status_disable(struct input_code *ptr);
699 static void status_push_ch(struct input_code *ptr, nkf_char c);
700 static void status_clear(struct input_code *ptr);
701 static void status_reset(struct input_code *ptr);
702 static void status_reinit(struct input_code *ptr);
703 static void status_check(struct input_code *ptr, nkf_char c);
704 static void e_status(struct input_code *, nkf_char);
705 static void s_status(struct input_code *, nkf_char);
707 struct input_code input_code_list[] = {
708 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
709 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
710 #ifdef UTF8_INPUT_ENABLE
711 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
712 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
713 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
718 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
719 static int base64_count = 0;
721 /* X0208 -> ASCII converter */
724 static int f_line = 0; /* chars in line */
725 static int f_prev = 0;
726 static int fold_preserve_f = FALSE; /* preserve new lines */
727 static int fold_f = FALSE;
728 static int fold_len = 0;
731 static unsigned char kanji_intro = DEFAULT_J;
732 static unsigned char ascii_intro = DEFAULT_R;
736 #define FOLD_MARGIN 10
737 #define DEFAULT_FOLD 60
739 static int fold_margin = FOLD_MARGIN;
743 #ifdef DEFAULT_CODE_JIS
744 # define DEFAULT_CONV j_oconv
746 #ifdef DEFAULT_CODE_SJIS
747 # define DEFAULT_CONV s_oconv
749 #ifdef DEFAULT_CODE_EUC
750 # define DEFAULT_CONV e_oconv
752 #ifdef DEFAULT_CODE_UTF8
753 # define DEFAULT_CONV w_oconv
756 /* process default */
757 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
758 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
760 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
761 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
762 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
763 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
764 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
765 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
766 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
768 /* static redirections */
770 static void (*o_putc)(nkf_char c) = std_putc;
772 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
773 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
775 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
776 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
778 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
780 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
781 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
783 /* for strict mime */
784 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
785 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
788 static int output_mode = ASCII, /* output kanji mode */
789 input_mode = ASCII, /* input kanji mode */
790 shift_mode = FALSE; /* TRUE shift out, or X0201 */
791 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
793 /* X0201 / X0208 conversion tables */
795 /* X0201 kana conversion table */
797 static const unsigned char cv[]= {
798 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
799 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
800 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
801 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
802 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
803 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
804 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
805 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
806 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
807 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
808 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
809 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
810 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
811 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
812 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
813 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
817 /* X0201 kana conversion table for dakuten */
819 static const unsigned char dv[]= {
820 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
821 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
822 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
823 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
824 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
825 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
826 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
827 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
828 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
829 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
830 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
831 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
832 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
833 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
834 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
835 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
838 /* X0201 kana conversion table for han-dakuten */
840 static const unsigned char ev[]= {
841 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
842 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
843 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
844 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
845 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
846 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
847 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
848 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
849 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
850 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
851 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
852 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
853 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
854 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
855 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
856 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
860 /* X0208 kigou conversion table */
861 /* 0x8140 - 0x819e */
862 static const unsigned char fv[] = {
864 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
865 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
866 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
867 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
868 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
869 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
870 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
871 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
872 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
873 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
874 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
875 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
880 static int file_out_f = FALSE;
882 static int overwrite_f = FALSE;
883 static int preserve_time_f = FALSE;
884 static int backup_f = FALSE;
885 static char *backup_suffix = "";
886 static char *get_backup_filename(const char *suffix, const char *filename);
889 static int nlmode_f = 0; /* CR, LF, CRLF */
890 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
891 static nkf_char prev_cr = 0; /* CR or 0 */
892 #ifdef EASYWIN /*Easy Win */
893 static int end_check;
896 #define STD_GC_BUFSIZE (256)
897 nkf_char std_gc_buf[STD_GC_BUFSIZE];
900 char* nkf_strcpy(const char *str)
902 char* result = malloc(strlen(str) + 1);
911 static void nkf_str_upcase(const char *src, char *dest, size_t length)
914 for (; i < length && src[i]; i++) {
915 dest[i] = nkf_toupper(src[i]);
920 static nkf_encoding *nkf_enc_from_index(int idx)
922 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
925 return &nkf_encoding_table[idx];
928 static int nkf_enc_find_index(const char *name)
931 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
932 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
933 return encoding_name_to_id_table[i].id;
939 static nkf_encoding *nkf_enc_find(const char *name)
942 idx = nkf_enc_find_index(name);
943 if (idx < 0) return 0;
944 return nkf_enc_from_index(idx);
/*
 * Accessors for an nkf_encoding pointer `enc`. All are macros and perform no
 * NULL check on enc.
 */
/* The name field of the encoding. */
947 #define nkf_enc_name(enc) (enc)->name
/* The id field of the encoding. */
948 #define nkf_enc_to_index(enc) (enc)->id
/* Pointer to the native (base) encoding that this encoding is built on. */
949 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input and output converter functions of the base encoding. */
950 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
951 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
952 #define nkf_enc_asciicompat(enc) (\
953 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
954 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8, UTF-16 or UTF-32 encodings. */
955 #define nkf_enc_unicode_p(enc) (\
956 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
957 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
958 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the encoding id is CP50220, CP50221 or CP50222. */
959 #define nkf_enc_cp5022x_p(enc) (\
960 nkf_enc_to_index(enc) == CP50220 ||\
961 nkf_enc_to_index(enc) == CP50221 ||\
962 nkf_enc_to_index(enc) == CP50222)
965 #include "nkf32dll.c"
966 #elif defined(PERL_XS)
968 int main(int argc, char **argv)
973 char *outfname = NULL;
976 #ifdef EASYWIN /*Easy Win */
977 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
980 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
981 cp = (unsigned char *)*argv;
985 int debug_f_back = debug_f;
988 int exec_f_back = exec_f;
991 int x0212_f_back = x0212_f;
993 int x0213_f_back = x0213_f;
994 int guess_f_back = guess_f;
996 guess_f = guess_f_back;
999 debug_f = debug_f_back;
1002 exec_f = exec_f_back;
1005 x0212_f = x0212_f_back;
1007 x0213_f = x0213_f_back;
1012 if (pipe(fds) < 0 || (pid = fork()) < 0){
1023 execvp(argv[1], &argv[1]);
1038 if (binmode_f == TRUE)
1039 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1040 if (freopen("","wb",stdout) == NULL)
1047 setbuf(stdout, (char *) NULL);
1049 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
1052 if (binmode_f == TRUE)
1053 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1054 if (freopen("","rb",stdin) == NULL) return (-1);
1058 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
1062 kanji_convert(stdin);
1063 if (guess_f) print_guessed_code(NULL);
1067 int is_argument_error = FALSE;
1069 input_codename = NULL;
1072 iconv_for_check = 0;
1074 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
1076 is_argument_error = TRUE;
1084 /* reopen file for stdout */
1085 if (file_out_f == TRUE) {
1088 outfname = malloc(strlen(origfname)
1089 + strlen(".nkftmpXXXXXX")
1095 strcpy(outfname, origfname);
1099 for (i = strlen(outfname); i; --i){
1100 if (outfname[i - 1] == '/'
1101 || outfname[i - 1] == '\\'){
1107 strcat(outfname, "ntXXXXXX");
1109 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
1110 S_IREAD | S_IWRITE);
1112 strcat(outfname, ".nkftmpXXXXXX");
1113 fd = mkstemp(outfname);
1116 || (fd_backup = dup(fileno(stdout))) < 0
1117 || dup2(fd, fileno(stdout)) < 0
1128 outfname = "nkf.out";
1131 if(freopen(outfname, "w", stdout) == NULL) {
1135 if (binmode_f == TRUE) {
1136 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1137 if (freopen("","wb",stdout) == NULL)
1144 if (binmode_f == TRUE)
1145 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1146 if (freopen("","rb",fin) == NULL)
1151 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
1155 char *filename = NULL;
1157 if (nfiles > 1) filename = origfname;
1158 if (guess_f) print_guessed_code(filename);
1164 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1172 if (dup2(fd_backup, fileno(stdout)) < 0){
1175 if (stat(origfname, &sb)) {
1176 fprintf(stderr, "Can't stat %s\n", origfname);
1178 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
1179 if (chmod(outfname, sb.st_mode)) {
1180 fprintf(stderr, "Can't set permission %s\n", outfname);
1183 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
1184 if(preserve_time_f){
1185 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1186 tb[0] = tb[1] = sb.st_mtime;
1187 if (utime(outfname, tb)) {
1188 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1191 tb.actime = sb.st_atime;
1192 tb.modtime = sb.st_mtime;
1193 if (utime(outfname, &tb)) {
1194 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1199 char *backup_filename = get_backup_filename(backup_suffix, origfname);
1201 unlink(backup_filename);
1203 if (rename(origfname, backup_filename)) {
1204 perror(backup_filename);
1205 fprintf(stderr, "Can't rename %s to %s\n",
1206 origfname, backup_filename);
1210 if (unlink(origfname)){
1215 if (rename(outfname, origfname)) {
1217 fprintf(stderr, "Can't rename %s to %s\n",
1218 outfname, origfname);
1225 if (is_argument_error)
1228 #ifdef EASYWIN /*Easy Win */
1229 if (file_out_f == FALSE)
1230 scanf("%d",&end_check);
1233 #else /* for Other OS */
1234 if (file_out_f == TRUE)
1236 #endif /*Easy Win */
1239 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters it is treated as a
 * template: every '*' is replaced by `filename` and all other characters
 * are copied verbatim (e.g. suffix "old_*" and filename "a.txt" give
 * "old_a.txt").  Otherwise `suffix` is simply appended to `filename`.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller owns,
 * or NULL (after reporting through perror) if the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t result_length;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* every '*' (one byte) expands to filename_length bytes */
        result_length = suffix_length - asterisk_count
            + asterisk_count * filename_length;
    } else {
        result_length = filename_length + suffix_length;
    }

    /*
     * The non-template path used to call malloc( + 1), i.e. it allocated a
     * single byte and then overflowed it; size both paths explicitly.
     */
    backup_filename = malloc(result_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        /* + 1 copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }

    return backup_filename;
}
1282 static const struct {
1306 {"katakana-hiragana","h3"},
1314 #ifdef UTF8_OUTPUT_ENABLE
1324 {"fb-subchar=", ""},
1326 #ifdef UTF8_INPUT_ENABLE
1327 {"utf8-input", "W"},
1328 {"utf16-input", "W16"},
1329 {"no-cp932ext", ""},
1330 {"no-best-fit-chars",""},
1332 #ifdef UNICODE_NORMALIZATION
1333 {"utf8mac-input", ""},
1345 #ifdef NUMCHAR_OPTION
1346 {"numchar-input", ""},
1352 #ifdef SHIFTJIS_CP932
1362 static int option_mode = 0;
1364 void options(unsigned char *cp)
1368 unsigned char *cp_back = NULL;
1374 while(*cp && *cp++!='-');
1375 while (*cp || cp_back) {
1383 case '-': /* literal options */
1384 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1388 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1389 p = (unsigned char *)long_option[i].name;
1390 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1391 if (*p == cp[j] || cp[j] == SP){
1398 fprintf(stderr, "unknown long option: --%s\n", cp);
1401 while(*cp && *cp != SP && cp++);
1402 if (long_option[i].alias[0]){
1404 cp = (unsigned char *)long_option[i].alias;
1406 if (strcmp(long_option[i].name, "ic=") == 0){
1407 nkf_str_upcase((char *)p, codeset, 32);
1408 enc = nkf_enc_find(codeset);
1410 input_encoding = enc;
1411 switch (nkf_enc_to_index(input_encoding)) {
1415 #ifdef SHIFTJIS_CP932
1418 #ifdef UTF8_OUTPUT_ENABLE
1419 ms_ucs_map_f = UCS_MAP_CP932;
1434 #ifdef SHIFTJIS_CP932
1437 #ifdef UTF8_OUTPUT_ENABLE
1438 ms_ucs_map_f = UCS_MAP_CP932;
1442 #ifdef SHIFTJIS_CP932
1445 #ifdef UTF8_OUTPUT_ENABLE
1446 ms_ucs_map_f = UCS_MAP_CP10001;
1450 #ifdef SHIFTJIS_CP932
1453 #ifdef UTF8_OUTPUT_ENABLE
1454 ms_ucs_map_f = UCS_MAP_CP932;
1458 #ifdef SHIFTJIS_CP932
1461 #ifdef UTF8_OUTPUT_ENABLE
1462 ms_ucs_map_f = UCS_MAP_MS;
1466 #ifdef SHIFTJIS_CP932
1469 #ifdef UTF8_OUTPUT_ENABLE
1470 ms_ucs_map_f = UCS_MAP_ASCII;
1473 case SHIFT_JISX0213:
1474 case SHIFT_JIS_2004:
1476 #ifdef SHIFTJIS_CP932
1483 #ifdef SHIFTJIS_CP932
1487 #ifdef UTF8_INPUT_ENABLE
1488 #ifdef UNICODE_NORMALIZATION
1496 input_endian = ENDIAN_BIG;
1500 input_endian = ENDIAN_LITTLE;
1505 input_endian = ENDIAN_BIG;
1509 input_endian = ENDIAN_LITTLE;
1515 if (strcmp(long_option[i].name, "oc=") == 0){
1517 nkf_str_upcase((char *)p, codeset, 32);
1518 enc = nkf_enc_find(codeset);
1519 if (enc <= 0) continue;
1520 output_encoding = enc;
1521 switch (nkf_enc_to_index(output_encoding)) {
1524 #ifdef SHIFTJIS_CP932
1527 #ifdef UTF8_OUTPUT_ENABLE
1528 ms_ucs_map_f = UCS_MAP_CP932;
1532 #ifdef SHIFTJIS_CP932
1535 #ifdef UTF8_OUTPUT_ENABLE
1536 ms_ucs_map_f = UCS_MAP_CP932;
1543 #ifdef SHIFTJIS_CP932
1552 #ifdef SHIFTJIS_CP932
1557 #ifdef UTF8_OUTPUT_ENABLE
1558 ms_ucs_map_f = UCS_MAP_CP932;
1562 #ifdef UTF8_OUTPUT_ENABLE
1563 ms_ucs_map_f = UCS_MAP_CP10001;
1567 #ifdef SHIFTJIS_CP932
1570 #ifdef UTF8_OUTPUT_ENABLE
1571 ms_ucs_map_f = UCS_MAP_CP932;
1578 #ifdef UTF8_OUTPUT_ENABLE
1579 ms_ucs_map_f = UCS_MAP_MS;
1586 #ifdef UTF8_OUTPUT_ENABLE
1587 ms_ucs_map_f = UCS_MAP_ASCII;
1590 case SHIFT_JISX0213:
1591 case SHIFT_JIS_2004:
1593 #ifdef SHIFTJIS_CP932
1603 #ifdef SHIFTJIS_CP932
1607 #ifdef UTF8_OUTPUT_ENABLE
1609 output_bom_f = TRUE;
1613 output_bom_f = TRUE;
1616 output_endian = ENDIAN_LITTLE;
1617 output_bom_f = FALSE;
1620 output_endian = ENDIAN_LITTLE;
1621 output_bom_f = TRUE;
1624 output_bom_f = TRUE;
1627 output_endian = ENDIAN_LITTLE;
1628 output_bom_f = FALSE;
1631 output_endian = ENDIAN_LITTLE;
1632 output_bom_f = TRUE;
1638 if (strcmp(long_option[i].name, "guess=") == 0){
1639 if (p[0] == '0' || p[0] == '1') {
1647 if (strcmp(long_option[i].name, "overwrite") == 0){
1650 preserve_time_f = TRUE;
1653 if (strcmp(long_option[i].name, "overwrite=") == 0){
1656 preserve_time_f = TRUE;
1658 backup_suffix = malloc(strlen((char *) p) + 1);
1659 strcpy(backup_suffix, (char *) p);
1662 if (strcmp(long_option[i].name, "in-place") == 0){
1665 preserve_time_f = FALSE;
1668 if (strcmp(long_option[i].name, "in-place=") == 0){
1671 preserve_time_f = FALSE;
1673 backup_suffix = malloc(strlen((char *) p) + 1);
1674 strcpy(backup_suffix, (char *) p);
1679 if (strcmp(long_option[i].name, "cap-input") == 0){
1683 if (strcmp(long_option[i].name, "url-input") == 0){
1688 #ifdef NUMCHAR_OPTION
1689 if (strcmp(long_option[i].name, "numchar-input") == 0){
1695 if (strcmp(long_option[i].name, "no-output") == 0){
1699 if (strcmp(long_option[i].name, "debug") == 0){
1704 if (strcmp(long_option[i].name, "cp932") == 0){
1705 #ifdef SHIFTJIS_CP932
1709 #ifdef UTF8_OUTPUT_ENABLE
1710 ms_ucs_map_f = UCS_MAP_CP932;
1714 if (strcmp(long_option[i].name, "no-cp932") == 0){
1715 #ifdef SHIFTJIS_CP932
1719 #ifdef UTF8_OUTPUT_ENABLE
1720 ms_ucs_map_f = UCS_MAP_ASCII;
1724 #ifdef SHIFTJIS_CP932
1725 if (strcmp(long_option[i].name, "cp932inv") == 0){
1732 if (strcmp(long_option[i].name, "x0212") == 0){
1739 if (strcmp(long_option[i].name, "exec-in") == 0){
1743 if (strcmp(long_option[i].name, "exec-out") == 0){
1748 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1749 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1750 no_cp932ext_f = TRUE;
1753 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1754 no_best_fit_chars_f = TRUE;
1757 if (strcmp(long_option[i].name, "fb-skip") == 0){
1758 encode_fallback = NULL;
1761 if (strcmp(long_option[i].name, "fb-html") == 0){
1762 encode_fallback = encode_fallback_html;
1765 if (strcmp(long_option[i].name, "fb-xml") == 0){
1766 encode_fallback = encode_fallback_xml;
1769 if (strcmp(long_option[i].name, "fb-java") == 0){
1770 encode_fallback = encode_fallback_java;
1773 if (strcmp(long_option[i].name, "fb-perl") == 0){
1774 encode_fallback = encode_fallback_perl;
1777 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1778 encode_fallback = encode_fallback_subchar;
1781 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1782 encode_fallback = encode_fallback_subchar;
1783 unicode_subchar = 0;
1785 /* decimal number */
1786 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1787 unicode_subchar *= 10;
1788 unicode_subchar += hex2bin(p[i]);
1790 }else if(p[1] == 'x' || p[1] == 'X'){
1791 /* hexadecimal number */
1792 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1793 unicode_subchar <<= 4;
1794 unicode_subchar |= hex2bin(p[i]);
1798 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1799 unicode_subchar *= 8;
1800 unicode_subchar += hex2bin(p[i]);
1803 w16e_conv(unicode_subchar, &i, &j);
1804 unicode_subchar = i<<8 | j;
1808 #ifdef UTF8_OUTPUT_ENABLE
1809 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1810 ms_ucs_map_f = UCS_MAP_MS;
1814 #ifdef UNICODE_NORMALIZATION
1815 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1820 if (strcmp(long_option[i].name, "prefix=") == 0){
1821 if (nkf_isgraph(p[0])){
1822 for (i = 1; nkf_isgraph(p[i]); i++){
1823 prefix_table[p[i]] = p[0];
1830 case 'b': /* buffered mode */
1833 case 'u': /* non-buffered mode */
1836 case 't': /* transparent mode */
1841 } else if (*cp=='2') {
1845 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1853 case 'j': /* JIS output */
1855 output_encoding = nkf_enc_from_index(ISO_2022_JP);
1857 case 'e': /* AT&T EUC output */
1859 output_encoding = nkf_enc_from_index(EUC_JP);
1861 case 's': /* SJIS output */
1862 output_encoding = nkf_enc_from_index(WINDOWS_31J);
1864 case 'l': /* ISO8859 Latin-1 support, no conversion */
1865 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1866 input_encoding = nkf_enc_from_index(ISO_8859_1);
1868 case 'i': /* Kanji IN ESC-$-@/B */
1869 if (*cp=='@'||*cp=='B')
1870 kanji_intro = *cp++;
1872 case 'o': /* ASCII IN ESC-(-J/B */
1873 if (*cp=='J'||*cp=='B'||*cp=='H')
1874 ascii_intro = *cp++;
1878 bit:1 katakana->hiragana
1879 bit:2 hiragana->katakana
1881 if ('9'>= *cp && *cp>='0')
1882 hira_f |= (*cp++ -'0');
1889 #if defined(MSDOS) || defined(__OS2__)
1896 show_configuration();
1904 #ifdef UTF8_OUTPUT_ENABLE
1905 case 'w': /* UTF-8 output */
1910 output_encoding = nkf_enc_from_index(UTF_8N);
1912 output_bom_f = TRUE;
1913 output_encoding = nkf_enc_from_index(UTF_8_BOM);
1917 if ('1'== cp[0] && '6'==cp[1]) {
1920 } else if ('3'== cp[0] && '2'==cp[1]) {
1924 output_encoding = nkf_enc_from_index(UTF_8);
1929 output_endian = ENDIAN_LITTLE;
1930 } else if (cp[0] == 'B') {
1933 output_encoding = nkf_enc_from_index(enc_idx);
1938 enc_idx = enc_idx == UTF_16
1939 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
1940 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
1942 output_bom_f = TRUE;
1943 enc_idx = enc_idx == UTF_16
1944 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
1945 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
1947 output_encoding = nkf_enc_from_index(enc_idx);
1951 #ifdef UTF8_INPUT_ENABLE
1952 case 'W': /* UTF input */
1955 input_encoding = nkf_enc_from_index(UTF_8);
1958 if ('1'== cp[0] && '6'==cp[1]) {
1960 input_endian = ENDIAN_BIG;
1962 } else if ('3'== cp[0] && '2'==cp[1]) {
1964 input_endian = ENDIAN_BIG;
1967 input_encoding = nkf_enc_from_index(UTF_8);
1972 input_endian = ENDIAN_LITTLE;
1973 } else if (cp[0] == 'B') {
1975 input_endian = ENDIAN_BIG;
/* -W16/-W32 describe the INPUT stream, so the L/B suffix parsed just above
   is stored in input_endian; select the encoding from that, not from
   output_endian. */
1977 enc_idx = enc_idx == UTF_16
1978 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
1979 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
1980 input_encoding = nkf_enc_from_index(enc_idx);
1984 /* Input code assumption */
1985 case 'J': /* ISO-2022-JP input */
1986 input_encoding = nkf_enc_from_index(ISO_2022_JP);
1988 case 'E': /* EUC-JP input */
1989 input_encoding = nkf_enc_from_index(EUC_JP);
1991 case 'S': /* Windows-31J input */
1992 input_encoding = nkf_enc_from_index(WINDOWS_31J);
1994 case 'Z': /* Convert X0208 alphabet to ASCII */
1996 bit:0 Convert JIS X 0208 Alphabet to ASCII
1997 bit:1 Convert Kankaku to one space
1998 bit:2 Convert Kankaku to two spaces
1999 bit:3 Convert HTML Entity
2000 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
2002 while ('0'<= *cp && *cp <='9') {
2003 alpha_f |= 1 << (*cp++ - '0');
2005 if (!alpha_f) alpha_f = 1;
2007 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
2008 x0201_f = FALSE; /* No X0201->X0208 conversion */
2010 ESC-(-I in JIS, EUC, MS Kanji
2011 SI/SO in JIS, EUC, MS Kanji
2012 SSO in EUC, JIS, not in MS Kanji
2013 MS Kanji (0xa0-0xdf)
2015 ESC-(-I in JIS (0x20-0x5f)
2016 SSO in EUC (0xa0-0xdf)
2017 0xa0-0xd in MS Kanji (0xa0-0xdf)
2020 case 'X': /* Convert X0201 kana to X0208 */
2023 case 'F': /* preserve new lines */
2024 fold_preserve_f = TRUE;
/* fall through: -F continues into the -f folding case below */
2025 case 'f': /* folding -f60 or -f */
2028 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2030 fold_len += *cp++ - '0';
2032 if (!(0<fold_len && fold_len<BUFSIZ))
2033 fold_len = DEFAULT_FOLD;
2037 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2039 fold_margin += *cp++ - '0';
2043 case 'm': /* MIME support */
2044 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
2045 if (*cp=='B'||*cp=='Q') {
2046 mime_decode_mode = *cp++;
2047 mimebuf_f = FIXED_MIME;
2048 } else if (*cp=='N') {
2049 mime_f = TRUE; cp++;
2050 } else if (*cp=='S') {
2051 mime_f = STRICT_MIME; cp++;
2052 } else if (*cp=='0') {
2053 mime_decode_f = FALSE;
2054 mime_f = FALSE; cp++;
2057 case 'M': /* MIME output */
2060 mimeout_f = FIXED_MIME; cp++;
2061 } else if (*cp=='Q') {
2063 mimeout_f = FIXED_MIME; cp++;
2068 case 'B': /* Broken JIS support */
2070 bit:1 allow any x on ESC-(-x or ESC-$-x
2071 bit:2 reset to ascii on NL
2073 if ('9'>= *cp && *cp>='0')
2074 broken_f |= 1<<(*cp++ -'0');
2079 case 'O':/* for Output file */
2083 case 'c':/* add cr code */
2086 case 'd':/* delete cr code */
2089 case 'I': /* ISO-2022-JP output */
2092 case 'L': /* line mode */
2093 if (*cp=='u') { /* unix */
2094 nlmode_f = LF; cp++;
2095 } else if (*cp=='m') { /* mac */
2096 nlmode_f = CR; cp++;
2097 } else if (*cp=='w') { /* windows */
2098 nlmode_f = CRLF; cp++;
2099 } else if (*cp=='0') { /* no conversion */
2105 if ('2' <= *cp && *cp <= '9') {
2108 } else if (*cp == '0' || *cp == '1') {
2117 /* multiple options in one string are allowed for the Perl module */
2118 while(*cp && *cp++!='-');
2121 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
2122 /* bogus option but ignored */
2128 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2131 struct input_code *p = input_code_list;
2133 if (iconv_func == p->iconv_func){
2142 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2144 #ifdef INPUT_CODE_FIX
2145 if (f || !input_encoding)
2152 #ifdef INPUT_CODE_FIX
2153 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
2159 if (estab_f && iconv_for_check != iconv){
2160 struct input_code *p = find_inputcode_byfunc(iconv);
2162 set_input_codename(p->name);
2165 iconv_for_check = iconv;
/*
 * Flag bits accumulated in an input_code's score; each constant is the
 * next higher bit of the previous one.
 */
2170 #define SCORE_L2 (1) /* JIS level-2 kanji */
2171 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width katakana */
2172 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2173 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped via CP932 (IBM extended characters) */
2174 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2175 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* characters that do not exist */
2176 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2177 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Score value assigned by status_reset(). */
2179 #define SCORE_INIT (SCORE_iMIME)
2181 static const char score_table_A0[] = {
2184 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2185 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2188 static const char score_table_F0[] = {
2189 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2190 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2191 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2192 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Set (OR in) the given flag bits in ptr->score. */
2195 void set_code_score(struct input_code *ptr, nkf_char score)
2198 ptr->score |= score;
/* Clear the given flag bits from ptr->score. */
2202 void clr_code_score(struct input_code *ptr, nkf_char score)
2205 ptr->score &= ~score;
2209 void code_score(struct input_code *ptr)
2211 nkf_char c2 = ptr->buf[0];
2212 #ifdef UTF8_OUTPUT_ENABLE
2213 nkf_char c1 = ptr->buf[1];
2216 set_code_score(ptr, SCORE_ERROR);
2217 }else if (c2 == SSO){
2218 set_code_score(ptr, SCORE_KANA);
2219 }else if (c2 == 0x8f){
2220 set_code_score(ptr, SCORE_X0212);
2221 #ifdef UTF8_OUTPUT_ENABLE
2222 }else if (!e2w_conv(c2, c1)){
2223 set_code_score(ptr, SCORE_NO_EXIST);
2225 }else if ((c2 & 0x70) == 0x20){
2226 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2227 }else if ((c2 & 0x70) == 0x70){
2228 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2229 }else if ((c2 & 0x70) >= 0x50){
2230 set_code_score(ptr, SCORE_L2);
2234 void status_disable(struct input_code *ptr)
2239 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2242 void status_push_ch(struct input_code *ptr, nkf_char c)
2244 ptr->buf[ptr->index++] = c;
2247 void status_clear(struct input_code *ptr)
2253 void status_reset(struct input_code *ptr)
2256 ptr->score = SCORE_INIT;
2259 void status_reinit(struct input_code *ptr)
2262 ptr->_file_stat = 0;
2265 void status_check(struct input_code *ptr, nkf_char c)
2267 if (c <= DEL && estab_f){
2272 void s_status(struct input_code *ptr, nkf_char c)
2276 status_check(ptr, c);
2281 #ifdef NUMCHAR_OPTION
2282 }else if (is_unicode_capsule(c)){
2285 }else if (0xa1 <= c && c <= 0xdf){
2286 status_push_ch(ptr, SSO);
2287 status_push_ch(ptr, c);
2290 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2292 status_push_ch(ptr, c);
2293 }else if (0xed <= c && c <= 0xee){
2295 status_push_ch(ptr, c);
2296 #ifdef SHIFTJIS_CP932
2297 }else if (is_ibmext_in_sjis(c)){
2299 status_push_ch(ptr, c);
2300 #endif /* SHIFTJIS_CP932 */
2302 }else if (0xf0 <= c && c <= 0xfc){
2304 status_push_ch(ptr, c);
2305 #endif /* X0212_ENABLE */
2307 status_disable(ptr);
2311 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2312 status_push_ch(ptr, c);
2313 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2317 status_disable(ptr);
2321 #ifdef SHIFTJIS_CP932
2322 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2323 status_push_ch(ptr, c);
2324 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2325 set_code_score(ptr, SCORE_CP932);
2330 #endif /* SHIFTJIS_CP932 */
2331 status_disable(ptr);
2334 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2335 status_push_ch(ptr, c);
2336 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2337 set_code_score(ptr, SCORE_CP932);
2340 status_disable(ptr);
2346 void e_status(struct input_code *ptr, nkf_char c)
2350 status_check(ptr, c);
2355 #ifdef NUMCHAR_OPTION
2356 }else if (is_unicode_capsule(c)){
2359 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2361 status_push_ch(ptr, c);
2363 }else if (0x8f == c){
2365 status_push_ch(ptr, c);
2366 #endif /* X0212_ENABLE */
2368 status_disable(ptr);
2372 if (0xa1 <= c && c <= 0xfe){
2373 status_push_ch(ptr, c);
2377 status_disable(ptr);
2382 if (0xa1 <= c && c <= 0xfe){
2384 status_push_ch(ptr, c);
2386 status_disable(ptr);
2388 #endif /* X0212_ENABLE */
2392 #ifdef UTF8_INPUT_ENABLE
2393 void w_status(struct input_code *ptr, nkf_char c)
2397 status_check(ptr, c);
2402 #ifdef NUMCHAR_OPTION
2403 }else if (is_unicode_capsule(c)){
2406 }else if (0xc0 <= c && c <= 0xdf){
2408 status_push_ch(ptr, c);
2409 }else if (0xe0 <= c && c <= 0xef){
2411 status_push_ch(ptr, c);
2412 }else if (0xf0 <= c && c <= 0xf4){
2414 status_push_ch(ptr, c);
2416 status_disable(ptr);
2421 if (0x80 <= c && c <= 0xbf){
2422 status_push_ch(ptr, c);
2423 if (ptr->index > ptr->stat){
2424 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2425 && ptr->buf[2] == 0xbf);
2426 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2427 &ptr->buf[0], &ptr->buf[1]);
2434 status_disable(ptr);
2438 if (0x80 <= c && c <= 0xbf){
2439 if (ptr->index < ptr->stat){
2440 status_push_ch(ptr, c);
2445 status_disable(ptr);
2452 void code_status(nkf_char c)
2454 int action_flag = 1;
2455 struct input_code *result = 0;
2456 struct input_code *p = input_code_list;
2458 if (!p->status_func) {
2462 if (!p->status_func)
2464 (p->status_func)(p, c);
2467 }else if(p->stat == 0){
2478 if (result && !estab_f){
2479 set_iconv(TRUE, result->iconv_func);
2480 }else if (c <= DEL){
2481 struct input_code *ptr = input_code_list;
2491 nkf_char std_getc(FILE *f)
2494 return std_gc_buf[--std_gc_ndx];
2500 nkf_char std_ungetc(nkf_char c, FILE *f)
2502 if (std_gc_ndx == STD_GC_BUFSIZE){
2505 std_gc_buf[std_gc_ndx++] = c;
2510 void std_putc(nkf_char c)
2517 #if !defined(PERL_XS) && !defined(WIN32DLL)
2518 nkf_char noconvert(FILE *f)
2523 module_connection();
2524 while ((c = (*i_getc)(f)) != EOF)
2531 void module_connection(void)
2533 if (!output_encoding) output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
2534 oconv = nkf_enc_to_oconv(output_encoding);
2537 /* replace continuation module, from output side */
2539 /* output redirection */
2541 if (noout_f || guess_f){
2548 if (mimeout_f == TRUE) {
2549 o_base64conv = oconv; oconv = base64_conv;
2551 /* base64_count = 0; */
2554 if (nlmode_f || guess_f) {
2555 o_nlconv = oconv; oconv = nl_conv;
2558 o_rot_conv = oconv; oconv = rot_conv;
2561 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2564 o_hira_conv = oconv; oconv = hira_conv;
2567 o_fconv = oconv; oconv = fold_conv;
2570 if (alpha_f || x0201_f) {
2571 o_zconv = oconv; oconv = z_conv;
2575 i_ungetc = std_ungetc;
2576 /* input redirection */
2579 i_cgetc = i_getc; i_getc = cap_getc;
2580 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2583 i_ugetc = i_getc; i_getc = url_getc;
2584 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2587 #ifdef NUMCHAR_OPTION
2589 i_ngetc = i_getc; i_getc = numchar_getc;
2590 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2593 #ifdef UNICODE_NORMALIZATION
2595 i_nfc_getc = i_getc; i_getc = nfc_getc;
2596 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2599 if (mime_f && mimebuf_f==FIXED_MIME) {
2600 i_mgetc = i_getc; i_getc = mime_getc;
2601 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2604 i_bgetc = i_getc; i_getc = broken_getc;
2605 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2607 if (input_encoding) {
2608 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
2610 set_iconv(FALSE, e_iconv);
2614 struct input_code *p = input_code_list;
2622 * Check and Ignore BOM
2624 void check_bom(FILE *f)
2627 switch(c2 = (*i_getc)(f)){
2629 if((c2 = (*i_getc)(f)) == 0x00){
2630 if((c2 = (*i_getc)(f)) == 0xFE){
2631 if((c2 = (*i_getc)(f)) == 0xFF){
2632 if(!input_encoding){
2633 set_iconv(TRUE, w_iconv32);
2635 if (iconv == w_iconv32) {
2636 input_endian = ENDIAN_BIG;
2639 (*i_ungetc)(0xFF,f);
2640 }else (*i_ungetc)(c2,f);
2641 (*i_ungetc)(0xFE,f);
2642 }else if(c2 == 0xFF){
2643 if((c2 = (*i_getc)(f)) == 0xFE){
2644 if(!input_encoding){
2645 set_iconv(TRUE, w_iconv32);
2647 if (iconv == w_iconv32) {
2648 input_endian = ENDIAN_2143;
2651 (*i_ungetc)(0xFF,f);
2652 }else (*i_ungetc)(c2,f);
2653 (*i_ungetc)(0xFF,f);
2654 }else (*i_ungetc)(c2,f);
2655 (*i_ungetc)(0x00,f);
2656 }else (*i_ungetc)(c2,f);
2657 (*i_ungetc)(0x00,f);
2660 if((c2 = (*i_getc)(f)) == 0xBB){
2661 if((c2 = (*i_getc)(f)) == 0xBF){
2662 if(!input_encoding){
2663 set_iconv(TRUE, w_iconv);
2665 if (iconv == w_iconv) {
2668 (*i_ungetc)(0xBF,f);
2669 }else (*i_ungetc)(c2,f);
2670 (*i_ungetc)(0xBB,f);
2671 }else (*i_ungetc)(c2,f);
2672 (*i_ungetc)(0xEF,f);
2675 if((c2 = (*i_getc)(f)) == 0xFF){
2676 if((c2 = (*i_getc)(f)) == 0x00){
2677 if((c2 = (*i_getc)(f)) == 0x00){
2678 if(!input_encoding){
2679 set_iconv(TRUE, w_iconv32);
2681 if (iconv == w_iconv32) {
2682 input_endian = ENDIAN_3412;
2685 (*i_ungetc)(0x00,f);
2686 }else (*i_ungetc)(c2,f);
2687 (*i_ungetc)(0x00,f);
2688 }else (*i_ungetc)(c2,f);
2689 if(!input_encoding){
2690 set_iconv(TRUE, w_iconv16);
2692 if (iconv == w_iconv16) {
2693 input_endian = ENDIAN_BIG;
2696 (*i_ungetc)(0xFF,f);
2697 }else (*i_ungetc)(c2,f);
2698 (*i_ungetc)(0xFE,f);
2701 if((c2 = (*i_getc)(f)) == 0xFE){
2702 if((c2 = (*i_getc)(f)) == 0x00){
2703 if((c2 = (*i_getc)(f)) == 0x00){
2704 if(!input_encoding){
2705 set_iconv(TRUE, w_iconv32);
2707 if (iconv == w_iconv32) {
2708 input_endian = ENDIAN_LITTLE;
2711 (*i_ungetc)(0x00,f);
2712 }else (*i_ungetc)(c2,f);
2713 (*i_ungetc)(0x00,f);
2714 }else (*i_ungetc)(c2,f);
2715 if(!input_encoding){
2716 set_iconv(TRUE, w_iconv16);
2718 if (iconv == w_iconv16) {
2719 input_endian = ENDIAN_LITTLE;
2722 (*i_ungetc)(0xFE,f);
2723 }else (*i_ungetc)(c2,f);
2724 (*i_ungetc)(0xFF,f);
2733 Conversion main loop. Code detection only.
2736 nkf_char kanji_convert(FILE *f)
2738 nkf_char c3, c2=0, c1, c0=0;
2739 int is_8bit = FALSE;
2741 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
2746 output_mode = ASCII;
2749 #define NEXT continue /* no output, get next */
2750 #define SEND ; /* output c1 and c2, get next */
2751 #define LAST break /* end of loop, go closing */
2753 module_connection();
2756 while ((c1 = (*i_getc)(f)) != EOF) {
2757 #ifdef INPUT_CODE_FIX
2758 if (!input_encoding)
2763 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
2764 /* in case of 8th bit is on */
2765 if (!estab_f&&!mime_decode_mode) {
2766 /* in case of not established yet */
2767 /* It is still ambiguous */
2768 if (h_conv(f, c2, c1)==EOF)
2774 /* in case of already established */
2776 /* ignore bogus code and not CP5022x UCD */
2784 /* second byte, 7 bit code */
2785 /* it might be kanji shifted */
2786 if ((c1 == DEL) || (c1 <= SP)) {
2787 /* ignore bogus first code */
2794 #ifdef UTF8_INPUT_ENABLE
2795 if (iconv == w_iconv16) {
2796 if (input_endian == ENDIAN_BIG) {
2798 if ((c1 = (*i_getc)(f)) != EOF) {
2799 if (0xD8 <= c2 && c2 <= 0xDB) {
2800 if ((c0 = (*i_getc)(f)) != EOF) {
2802 if ((c3 = (*i_getc)(f)) != EOF) {
2809 if ((c2 = (*i_getc)(f)) != EOF) {
2810 if (0xD8 <= c2 && c2 <= 0xDB) {
2811 if ((c3 = (*i_getc)(f)) != EOF) {
2812 if ((c0 = (*i_getc)(f)) != EOF) {
2821 } else if(iconv == w_iconv32){
2823 if((c2 = (*i_getc)(f)) != EOF &&
2824 (c1 = (*i_getc)(f)) != EOF &&
2825 (c0 = (*i_getc)(f)) != EOF){
2826 switch(input_endian){
2828 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2831 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2834 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2837 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2847 #ifdef NUMCHAR_OPTION
2848 if (is_unicode_capsule(c1)){
2852 if (c1 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
2854 if (!estab_f && !iso8859_f) {
2855 /* not established yet */
2858 } else { /* estab_f==TRUE */
2863 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2864 /* SJIS X0201 Case... */
2865 if (iso2022jp_f && !x0201_f) {
2866 (*oconv)(GETA1, GETA2);
2873 } else if (c1==SSO && iconv != s_iconv) {
2874 /* EUC X0201 Case */
2875 c1 = (*i_getc)(f); /* skip SSO */
2877 if (SSP<=c1 && c1<0xe0) {
2878 if (iso2022jp_f && !x0201_f) {
2879 (*oconv)(GETA1, GETA2);
2886 } else { /* bogus code, skip SSO and one byte */
2889 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2890 (c1 == 0xFD || c1 == 0xFE)) {
2896 /* already established */
2901 } else if ((c1 > SP) && (c1 != DEL)) {
2902 /* in case of Roman characters */
2904 /* output 1 shifted byte */
2908 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2909 /* output 1 shifted byte */
2910 if (iso2022jp_f && !x0201_f) {
2911 (*oconv)(GETA1, GETA2);
2918 /* look like bogus code */
2921 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
2922 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
2923 /* in case of Kanji shifted */
2926 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2927 /* Check MIME code */
2928 if ((c1 = (*i_getc)(f)) == EOF) {
2931 } else if (c1 == '?') {
2932 /* =? is mime conversion start sequence */
2933 if(mime_f == STRICT_MIME) {
2934 /* check in real detail */
2935 if (mime_begin_strict(f) == EOF)
2939 } else if (mime_begin(f) == EOF)
2949 /* normal ASCII code */
2952 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2955 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2958 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2959 if ((c1 = (*i_getc)(f)) == EOF) {
2960 /* (*oconv)(0, ESC); don't send bogus code */
2962 } else if (c1 == '$') {
2963 if ((c1 = (*i_getc)(f)) == EOF) {
2965 (*oconv)(0, ESC); don't send bogus code
2966 (*oconv)(0, '$'); */
2968 } else if (c1 == '@'|| c1 == 'B') {
2969 /* This is kanji introduction */
2970 input_mode = JIS_X_0208;
2972 set_input_codename("ISO-2022-JP");
2974 debug("ISO-2022-JP");
2977 } else if (c1 == '(') {
2978 if ((c1 = (*i_getc)(f)) == EOF) {
2979 /* don't send bogus code
2985 } else if (c1 == '@'|| c1 == 'B') {
2986 /* This is kanji introduction */
2987 input_mode = JIS_X_0208;
2991 } else if (c1 == 'D'){
2992 input_mode = JIS_X_0212;
2995 #endif /* X0212_ENABLE */
2996 } else if (c1 == 0x4F){
2997 input_mode = JIS_X_0213_1;
3000 } else if (c1 == 0x50){
3001 input_mode = JIS_X_0213_2;
3005 /* could be some special code */
3012 } else if (broken_f&0x2) {
3013 /* accept any ESC-(-x as broken code ... */
3014 input_mode = JIS_X_0208;
3023 } else if (c1 == '(') {
3024 if ((c1 = (*i_getc)(f)) == EOF) {
3025 /* don't send bogus code
3027 (*oconv)(0, '('); */
3031 /* This is X0201 kana introduction */
3032 input_mode = JIS_X_0201; shift_mode = JIS_X_0201;
3034 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
3035 /* ESC ( B/J/H: back to ASCII mode (not a kanji introduction) */
3036 input_mode = ASCII; shift_mode = FALSE;
3038 } else if (broken_f&0x2) {
3039 input_mode = ASCII; shift_mode = FALSE;
3044 /* maintain various input_mode here */
3048 } else if ( c1 == 'N' || c1 == 'n'){
3050 c3 = (*i_getc)(f); /* skip SS2 */
3051 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
3066 } else if (c1 == ESC && iconv == s_iconv) {
3067 /* ESC in Shift_JIS */
3068 if ((c1 = (*i_getc)(f)) == EOF) {
3069 /* (*oconv)(0, ESC); don't send bogus code */
3071 } else if (c1 == '$') {
3073 if ((c1 = (*i_getc)(f)) == EOF) {
3075 (*oconv)(0, ESC); don't send bogus code
3076 (*oconv)(0, '$'); */
3079 if (('E' <= c1 && c1 <= 'G') ||
3080 ('O' <= c1 && c1 <= 'Q')) {
3088 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
3089 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
3090 while ((c1 = (*i_getc)(f)) != EOF) {
3091 if (SP <= c1 && c1 <= 'z') {
3092 (*oconv)(0, c1 + c0);
3093 } else break; /* c1 == SO */
3097 if (c1 == EOF) LAST;
3104 } else if (c1 == LF || c1 == CR) {
3106 input_mode = ASCII; set_iconv(FALSE, 0);
3108 } else if (mime_decode_f && !mime_decode_mode){
3110 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
3118 } else { /* if (c1 == CR)*/
3119 if ((c1=(*i_getc)(f))!=EOF) {
3123 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
3137 } else if (c1 == DEL && input_mode == JIS_X_0208) {
3147 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
3150 if ((c0 = (*i_getc)(f)) != EOF) {
3153 if ((c3 = (*i_getc)(f)) != EOF) {
3155 (*iconv)(c2, c1, c0|c3);
3160 /* 3 bytes EUC or UTF-8 */
3161 if ((c0 = (*i_getc)(f)) != EOF) {
3163 (*iconv)(c2, c1, c0);
3171 0x7F <= c2 && c2 <= 0x92 &&
3172 0x21 <= c1 && c1 <= 0x7E) {
3174 if(c1 == 0x7F) return 0;
3175 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
3178 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3182 (*oconv)(PREFIX_EUCG3 | c2, c1);
3184 #endif /* X0212_ENABLE */
3186 (*oconv)(PREFIX_EUCG3 | c2, c1);
3189 (*oconv)(input_mode, c1); /* other special case */
3195 /* goto next_word */
3199 (*iconv)(EOF, 0, 0);
3200 if (!input_codename)
3203 struct input_code *p = input_code_list;
3204 struct input_code *result = p;
3206 if (p->score < result->score) result = p;
3209 set_input_codename(result->name);
3211 debug(result->name);
3219 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3221 nkf_char ret, c3, c0;
3225 /** it must NOT be in the kanji shifted sequence */
3226 /** it must NOT be written in JIS7 */
3227 /** and it must be after 2 byte 8bit code */
3233 while ((c1 = (*i_getc)(f)) != EOF) {
3239 if (push_hold_buf(c1) == EOF || estab_f){
3245 struct input_code *p = input_code_list;
3246 struct input_code *result = p;
3251 if (p->status_func && p->score < result->score){
3256 set_iconv(TRUE, result->iconv_func);
3261 ** 1) EOF is detected, or
3262 ** 2) Code is established, or
3263 ** 3) Buffer is FULL (but last word is pushed)
3265 ** in 1) and 3) cases, we continue to use
3266 ** Kanji codes by oconv and leave estab_f unchanged.
3271 while (hold_index < hold_count){
3272 c2 = hold_buf[hold_index++];
3274 #ifdef NUMCHAR_OPTION
3275 || is_unicode_capsule(c2)
3280 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3281 (*iconv)(JIS_X_0201, c2, 0);
3284 if (hold_index < hold_count){
3285 c1 = hold_buf[hold_index++];
3295 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3298 if (hold_index < hold_count){
3299 c0 = hold_buf[hold_index++];
3300 } else if ((c0 = (*i_getc)(f)) == EOF) {
3306 if (hold_index < hold_count){
3307 c3 = hold_buf[hold_index++];
3308 } else if ((c3 = (*i_getc)(f)) == EOF) {
3313 (*iconv)(c2, c1, c0|c3);
3318 /* 3 bytes EUC or UTF-8 */
3319 if (hold_index < hold_count){
3320 c0 = hold_buf[hold_index++];
3321 } else if ((c0 = (*i_getc)(f)) == EOF) {
3327 (*iconv)(c2, c1, c0);
3330 if (c0 == EOF) break;
/*
 * push_hold_buf: append one byte to hold_buf.
 *
 * c2 is truncated to unsigned char when it is stored.  The return value is
 * EOF once hold_count has reached HOLD_SIZE*2, otherwise the new hold_count.
 */
3335 nkf_char push_hold_buf(nkf_char c2)
/* Capacity guard; the statement it controls is not part of this listing. */
3337 if (hold_count >= HOLD_SIZE*2)
3339 hold_buf[hold_count++] = (unsigned char)c2;
/* EOF here means "the buffer just became full", even though c2 was stored. */
3340 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3343 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3345 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3348 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3349 #ifdef SHIFTJIS_CP932
3350 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3351 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3358 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3359 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3365 #endif /* SHIFTJIS_CP932 */
3367 if (!x0213_f && is_ibmext_in_sjis(c2)){
3368 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3371 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3384 if(x0213_f && c2 >= 0xF0){
3385 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3386 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3387 }else{ /* 78<=k<=94 */
3388 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3389 if (0x9E < c1) c2++;
3392 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3393 if (0x9E < c1) c2++;
3396 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3403 c2 = x0212_unshift(c2);
3410 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3412 if (c2 == JIS_X_0201) {
3414 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3416 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3418 if(c1 == 0x7F) return 0;
3419 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3422 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3423 if (ret) return ret;
3429 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3431 if (c2 == JIS_X_0201) {
3434 }else if (c2 == 0x8f){
3438 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3439 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3440 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3443 c2 = (c2 << 8) | (c1 & 0x7f);
3445 #ifdef SHIFTJIS_CP932
3448 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3449 s2e_conv(s2, s1, &c2, &c1);
3456 #endif /* SHIFTJIS_CP932 */
3458 #endif /* X0212_ENABLE */
3459 } else if (c2 == SSO){
3462 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3465 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3466 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3467 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3472 #ifdef SHIFTJIS_CP932
3473 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3475 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3476 s2e_conv(s2, s1, &c2, &c1);
3483 #endif /* SHIFTJIS_CP932 */
3490 #ifdef UTF8_INPUT_ENABLE
3491 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3498 }else if (0xc0 <= c2 && c2 <= 0xef) {
3499 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3500 #ifdef NUMCHAR_OPTION
3503 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3511 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3514 static const char w_iconv_utf8_1st_byte[] =
3516 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3517 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3518 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3519 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3521 if (c2 < 0 || 0xff < c2) {
3522 }else if (c2 == 0) { /* 0 : 1 byte*/
3524 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3527 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3529 if (c1 < 0x80 || 0xBF < c1) return 0;
3532 if (c0 == 0) return -1;
3533 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3538 if (c0 == 0) return -1;
3539 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3543 if (c0 == 0) return -1;
3544 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3548 if (c0 == 0) return -2;
3549 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3553 if (c0 == 0) return -2;
3554 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3558 if (c0 == 0) return -2;
3559 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3567 if (c2 == 0 || c2 == EOF){
3568 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3569 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3572 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3581 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the Unicode value `val` as UTF-8 bytes.
 *
 *   *p2  receives the lead byte.
 *   *p1  receives the second byte.
 *   *p0  receives the third byte for 3-byte forms; for 4-byte forms it
 *        carries the third and fourth bytes packed as (third << 8) | fourth.
 *
 * Only the 2-, 3- and 4-byte branches are visible in this listing.
 */
3582 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
/* 2-byte form: 110xxxxx 10xxxxxx */
3589 }else if (val < 0x800){
3590 *p2 = 0xc0 | (val >> 6);
3591 *p1 = 0x80 | (val & 0x3f);
/* 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx */
3593 } else if (val <= NKF_INT32_C(0xFFFF)) {
3594 *p2 = 0xe0 | (val >> 12);
3595 *p1 = 0x80 | ((val >> 6) & 0x3f);
3596 *p0 = 0x80 | (val & 0x3f);
/*
 * 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
 * The lead byte carries bits 18..20 of val under the 0xF0 marker.  This
 * matches the 0xF0 | (val>>18) that w_oconv emits for the same range and
 * the (c2 & 0x0f) << 18 that ww16_conv uses when decoding.
 */
3597 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3598 *p2 = 0xf0 | (val >> 18);
3599 *p1 = 0x80 | ((val >> 12) & 0x3f);
/* (val << 2) & 0x3f00 places bits 6..11 of val in the upper byte of *p0. */
3600 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3609 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: combine UTF-8 bytes into a single integer value.
 *
 * c2 is the lead byte and selects the form (>= 0xf0, >= 0xe0, >= 0xc0).
 * c1 is the second byte.  c0 holds the remaining byte(s); for the 4-byte
 * form the third byte is taken from bits 8..13 of c0.
 * Only part of each branch is visible in this listing.
 */
3610 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
/* 4-byte form */
3615 } else if (c2 >= 0xf0){
3616 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3617 val = (c2 & 0x0f) << 18;
3618 val |= (c1 & 0x3f) << 12;
/* third byte sits in the upper byte of c0; shift it down to bits 6..11 */
3619 val |= (c0 & 0x3f00) >> 2;
/* 3-byte form */
3621 }else if (c2 >= 0xe0){
3622 val = (c2 & 0x0f) << 12;
3623 val |= (c1 & 0x3f) << 6;
/* 2-byte form */
3625 }else if (c2 >= 0xc0){
3626 val = (c2 & 0x1f) << 6;
3634 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3636 nkf_char c2, c1, c0;
3643 w16w_conv(val, &c2, &c1, &c0);
3644 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3645 #ifdef NUMCHAR_OPTION
3648 *p1 = CLASS_UNICODE | val;
3657 #ifdef UTF8_INPUT_ENABLE
3658 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3661 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3664 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3665 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3667 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3669 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3674 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3675 if (ret) return ret;
3680 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3684 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3685 } else if (is_unicode_bmp(c1)) {
3686 ret = w16e_conv(c1, &c2, &c1);
3689 c1 = CLASS_UNICODE | c1;
3691 if (ret) return ret;
3696 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3698 const unsigned short *const *pp;
3699 const unsigned short *const *const *ppp;
3700 static const char no_best_fit_chars_table_C2[] =
3701 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3702 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3703 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3704 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3705 static const char no_best_fit_chars_table_C2_ms[] =
3706 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3707 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3708 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3709 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3710 static const char no_best_fit_chars_table_932_C2[] =
3711 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3712 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3713 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3714 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3715 static const char no_best_fit_chars_table_932_C3[] =
3716 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3717 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3719 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3725 }else if(c2 < 0xe0){
3726 if(no_best_fit_chars_f){
3727 if(ms_ucs_map_f == UCS_MAP_CP932){
3730 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3733 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3736 }else if(!cp932inv_f){
3739 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3742 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3745 }else if(ms_ucs_map_f == UCS_MAP_MS){
3746 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3747 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3765 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3766 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3767 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3769 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3770 }else if(c0 < 0xF0){
3771 if(no_best_fit_chars_f){
3772 if(ms_ucs_map_f == UCS_MAP_CP932){
3773 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3774 }else if(ms_ucs_map_f == UCS_MAP_MS){
3779 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3782 if(c0 == 0x92) return 1;
3787 if(c1 == 0x80 || c0 == 0x9C) return 1;
3790 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3795 if(c0 == 0x94) return 1;
3798 if(c0 == 0xBB) return 1;
3808 if(c0 == 0x95) return 1;
3811 if(c0 == 0xA5) return 1;
3818 if(c0 == 0x8D) return 1;
3821 if(c0 == 0x9E && !cp932inv_f) return 1;
3824 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3832 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3833 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3834 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3836 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3838 #ifdef SHIFTJIS_CP932
3839 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3841 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3842 s2e_conv(s2, s1, p2, p1);
3851 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3854 const unsigned short *p;
3857 if (pp == 0) return 1;
3860 if (c1 < 0 || psize <= c1) return 1;
3862 if (p == 0) return 1;
3865 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3867 if (val == 0) return 1;
3868 if (no_cp932ext_f && (
3869 (val>>8) == 0x2D || /* NEC special characters */
3870 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3878 if (c2 == SO) c2 = JIS_X_0201;
3885 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3892 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: send the decimal digits of c to oconv, most
 * significant digit first, each as an ASCII character (0x30 + digit) with
 * a first argument of 0.
 * Presumably these digits form the body of an HTML numeric character
 * reference; the surrounding output is not part of this listing -- TODO confirm.
 */
3902 void encode_fallback_html(nkf_char c)
/* The visible guards suppress leading zeros for the two highest digits. */
3907 if(c >= NKF_INT32_C(1000000))
3908 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3909 if(c >= NKF_INT32_C(100000))
3910 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
/* Statements between the following lines are missing from this listing. */
3912 (*oconv)(0, 0x30+(c/10000 )%10);
3914 (*oconv)(0, 0x30+(c/1000 )%10);
3916 (*oconv)(0, 0x30+(c/100 )%10);
3918 (*oconv)(0, 0x30+(c/10 )%10);
3920 (*oconv)(0, 0x30+ c %10);
3925 void encode_fallback_xml(nkf_char c)
3930 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: send c to oconv as hexadecimal digits, highest
 * nibble first.  Each digit comes from bin2hex applied to c shifted right
 * by a multiple of four bits.
 * Any prefix characters emitted before the digits are not part of this listing.
 */
3935 void encode_fallback_java(nkf_char c)
/* Values outside the BMP get two extra leading digits (bits 20..23, 16..19). */
3939 if(!is_unicode_bmp(c)){
3943 (*oconv)(0, bin2hex(c>>20));
3944 (*oconv)(0, bin2hex(c>>16));
/* The low sixteen bits follow as four digits. */
3948 (*oconv)(0, bin2hex(c>>12));
3949 (*oconv)(0, bin2hex(c>> 8));
3950 (*oconv)(0, bin2hex(c>> 4));
3951 (*oconv)(0, bin2hex(c ));
3955 void encode_fallback_perl(nkf_char c)
3960 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: output the configured substitute character.
 * The incoming value of c is discarded and replaced by unicode_subchar.
 */
3965 void encode_fallback_subchar(nkf_char c)
3967 c = unicode_subchar;
/* Pass the substitute to oconv as (high byte, low byte). */
3968 (*oconv)((c>>8)&0xFF, c&0xFF);
3973 #ifdef UTF8_OUTPUT_ENABLE
3974 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3976 const unsigned short *p;
3978 if (c2 == JIS_X_0201) {
3979 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3987 p = euc_to_utf8_1byte;
3989 } else if (is_eucg3(c2)){
3990 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3993 c2 = (c2&0x7f) - 0x21;
3994 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3995 p = x0212_to_utf8_2bytes[c2];
4001 c2 = (c2&0x7f) - 0x21;
4002 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4004 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
4005 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
4006 euc_to_utf8_2bytes_ms[c2];
4011 c1 = (c1 & 0x7f) - 0x21;
4012 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
4017 void w_oconv(nkf_char c2, nkf_char c1)
4023 output_bom_f = FALSE;
4034 #ifdef NUMCHAR_OPTION
4035 if (c2 == 0 && is_unicode_capsule(c1)){
4036 val = c1 & VALUE_MASK;
4039 }else if (val < 0x800){
4040 (*o_putc)(0xC0 | (val >> 6));
4041 (*o_putc)(0x80 | (val & 0x3f));
4042 } else if (val <= NKF_INT32_C(0xFFFF)) {
4043 (*o_putc)(0xE0 | (val >> 12));
4044 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
4045 (*o_putc)(0x80 | (val & 0x3f));
4046 } else if (val <= NKF_INT32_C(0x10FFFF)) {
4047 (*o_putc)(0xF0 | ( val>>18));
4048 (*o_putc)(0x80 | ((val>>12) & 0x3f));
4049 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
4050 (*o_putc)(0x80 | ( val & 0x3f));
4057 output_mode = ASCII;
4059 } else if (c2 == ISO_8859_1) {
4060 output_mode = UTF_8;
4061 (*o_putc)(c1 | 0x080);
4063 output_mode = UTF_8;
4064 val = e2w_conv(c2, c1);
4066 w16w_conv(val, &c2, &c1, &c0);
4070 if (c0) (*o_putc)(c0);
4076 void w_oconv16(nkf_char c2, nkf_char c1)
4079 output_bom_f = FALSE;
4080 if (output_endian == ENDIAN_LITTLE){
4081 (*o_putc)((unsigned char)'\377');
4085 (*o_putc)((unsigned char)'\377');
4094 if (c2 == ISO_8859_1) {
4097 #ifdef NUMCHAR_OPTION
4098 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4099 if (is_unicode_bmp(c1)) {
4100 c2 = (c1 >> 8) & 0xff;
4104 if (c1 <= UNICODE_MAX) {
4105 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
4106 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
4107 if (output_endian == ENDIAN_LITTLE){
4108 (*o_putc)(c2 & 0xff);
4109 (*o_putc)((c2 >> 8) & 0xff);
4110 (*o_putc)(c1 & 0xff);
4111 (*o_putc)((c1 >> 8) & 0xff);
4113 (*o_putc)((c2 >> 8) & 0xff);
4114 (*o_putc)(c2 & 0xff);
4115 (*o_putc)((c1 >> 8) & 0xff);
4116 (*o_putc)(c1 & 0xff);
4123 nkf_char val = e2w_conv(c2, c1);
4124 c2 = (val >> 8) & 0xff;
4128 if (output_endian == ENDIAN_LITTLE){
4137 void w_oconv32(nkf_char c2, nkf_char c1)
4140 output_bom_f = FALSE;
4141 if (output_endian == ENDIAN_LITTLE){
4142 (*o_putc)((unsigned char)'\377');
4150 (*o_putc)((unsigned char)'\377');
4159 if (c2 == ISO_8859_1) {
4161 #ifdef NUMCHAR_OPTION
4162 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4166 c1 = e2w_conv(c2, c1);
4169 if (output_endian == ENDIAN_LITTLE){
4170 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4171 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4172 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4176 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4177 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4178 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4183 void e_oconv(nkf_char c2, nkf_char c1)
4185 #ifdef NUMCHAR_OPTION
4186 if (c2 == 0 && is_unicode_capsule(c1)){
4187 w16e_conv(c1, &c2, &c1);
4188 if (c2 == 0 && is_unicode_capsule(c1)){
4189 c2 = c1 & VALUE_MASK;
4190 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4194 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4195 c1 = 0x21 + c1 % 94;
4198 (*o_putc)((c2 & 0x7f) | 0x080);
4199 (*o_putc)(c1 | 0x080);
4201 (*o_putc)((c2 & 0x7f) | 0x080);
4202 (*o_putc)(c1 | 0x080);
4206 if (encode_fallback) (*encode_fallback)(c1);
4215 } else if (c2 == 0) {
4216 output_mode = ASCII;
4218 } else if (c2 == JIS_X_0201) {
4219 output_mode = EUC_JP;
4220 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4221 } else if (c2 == ISO_8859_1) {
4222 output_mode = ISO_8859_1;
4223 (*o_putc)(c1 | 0x080);
4225 } else if (is_eucg3(c2)){
4226 output_mode = EUC_JP;
4227 #ifdef SHIFTJIS_CP932
4230 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4231 s2e_conv(s2, s1, &c2, &c1);
4236 output_mode = ASCII;
4238 }else if (is_eucg3(c2)){
4241 (*o_putc)((c2 & 0x7f) | 0x080);
4242 (*o_putc)(c1 | 0x080);
4245 (*o_putc)((c2 & 0x7f) | 0x080);
4246 (*o_putc)(c1 | 0x080);
4250 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4251 set_iconv(FALSE, 0);
4252 return; /* too late to rescue this char */
4254 output_mode = EUC_JP;
4255 (*o_putc)(c2 | 0x080);
4256 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: relocate a row value in 0x75..0x7f to a higher range.
 * Two cases are visible: one maps 0x75 to 0x109, the other maps 0x75 to
 * 0x113.  The conditions that choose between them are not part of this
 * listing.
 */
4261 nkf_char x0212_shift(nkf_char c)
/* first case: 0x75..0x7f -> 0x109.. */
4266 if (0x75 <= c && c <= 0x7f){
4267 ret = c + (0x109 - 0x75);
/* second case: 0x75..0x7f -> 0x113.. */
4270 if (0x75 <= c && c <= 0x7f){
4271 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map a value in 0x7f..0x92 back down to the 0x75.. range.
 *   0x7f..0x88 -> 0x75..0x7e
 *   0x89..0x92 -> 0x75..0x7e, OR-ed with PREFIX_EUCG3 | 0x80
 */
4278 nkf_char x0212_unshift(nkf_char c)
4281 if (0x7f <= c && c <= 0x88){
4282 ret = c + (0x75 - 0x7f);
4283 }else if (0x89 <= c && c <= 0x92){
/* the second range is additionally tagged with PREFIX_EUCG3 | 0x80 */
4284 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4288 #endif /* X0212_ENABLE */
4290 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4296 if((0x21 <= ndx && ndx <= 0x2F)){
4297 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4298 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4300 }else if(0x6E <= ndx && ndx <= 0x7E){
4301 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4302 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4308 else if(nkf_isgraph(ndx)){
4310 const unsigned short *ptr;
4311 ptr = x0212_shiftjis[ndx - 0x21];
4313 val = ptr[(c1 & 0x7f) - 0x21];
4322 c2 = x0212_shift(c2);
4324 #endif /* X0212_ENABLE */
4326 if(0x7F < c2) return 1;
4327 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4328 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4332 void s_oconv(nkf_char c2, nkf_char c1)
4334 #ifdef NUMCHAR_OPTION
4335 if (c2 == 0 && is_unicode_capsule(c1)){
4336 w16e_conv(c1, &c2, &c1);
4337 if (c2 == 0 && is_unicode_capsule(c1)){
4338 c2 = c1 & VALUE_MASK;
4339 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4342 c2 = c1 / 188 + 0xF0;
4344 c1 += 0x40 + (c1 > 0x3e);
4349 if(encode_fallback)(*encode_fallback)(c1);
4358 } else if (c2 == 0) {
4359 output_mode = ASCII;
4361 } else if (c2 == JIS_X_0201) {
4362 output_mode = SHIFT_JIS;
4364 } else if (c2 == ISO_8859_1) {
4365 output_mode = ISO_8859_1;
4366 (*o_putc)(c1 | 0x080);
4368 } else if (is_eucg3(c2)){
4369 output_mode = SHIFT_JIS;
4370 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4376 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4377 set_iconv(FALSE, 0);
4378 return; /* too late to rescue this char */
4380 output_mode = SHIFT_JIS;
4381 e2s_conv(c2, c1, &c2, &c1);
4383 #ifdef SHIFTJIS_CP932
4385 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4386 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4392 #endif /* SHIFTJIS_CP932 */
4395 if (prefix_table[(unsigned char)c1]){
4396 (*o_putc)(prefix_table[(unsigned char)c1]);
4402 void j_oconv(nkf_char c2, nkf_char c1)
4404 #ifdef NUMCHAR_OPTION
4405 if (c2 == 0 && is_unicode_capsule(c1)){
4406 w16e_conv(c1, &c2, &c1);
4407 if (c2 == 0 && is_unicode_capsule(c1)){
4408 c2 = c1 & VALUE_MASK;
4409 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4412 c2 = 0x7F + c1 / 94;
4413 c1 = 0x21 + c1 % 94;
4415 if (encode_fallback) (*encode_fallback)(c1);
4422 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4425 (*o_putc)(ascii_intro);
4426 output_mode = ASCII;
4430 } else if (is_eucg3(c2)){
4432 if(output_mode!=JIS_X_0213_2){
4433 output_mode = JIS_X_0213_2;
4440 if(output_mode!=JIS_X_0212){
4441 output_mode = JIS_X_0212;
4448 (*o_putc)(c2 & 0x7f);
4451 } else if (c2==JIS_X_0201) {
4452 if (output_mode!=JIS_X_0201) {
4453 output_mode = JIS_X_0201;
4459 } else if (c2==ISO_8859_1) {
4460 /* iso8859 introduction, or 8th bit on */
4461 /* Can we convert in 7bit form using ESC-'-'-A ?
4463 output_mode = ISO_8859_1;
4465 } else if (c2 == 0) {
4466 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4469 (*o_putc)(ascii_intro);
4470 output_mode = ASCII;
4475 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4476 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4478 if (output_mode!=JIS_X_0213_1) {
4479 output_mode = JIS_X_0213_1;
4485 }else if (output_mode != JIS_X_0208) {
4486 output_mode = JIS_X_0208;
4489 (*o_putc)(kanji_intro);
4496 void base64_conv(nkf_char c2, nkf_char c1)
4498 mime_prechar(c2, c1);
4499 (*o_base64conv)(c2,c1);
/* Pending characters for broken_getc; consumed from the top (index counter-1). */
4503 static nkf_char broken_buf[3];
/* Number of characters currently held in broken_buf. */
4504 static int broken_counter = 0;
/* NOTE(review): presumably the previously returned character; its update is not in this listing. */
4505 static int broken_last = 0;
/*
 * broken_getc: input getter that watches for "$@" / "$B" and "(J" / "(B"
 * pairs that are NOT preceded by ESC, and queues both characters in
 * broken_buf when it finds one.
 * NOTE(review): this looks like recovery of JIS escape sequences whose ESC
 * byte was lost; the value returned in those cases is not visible here.
 */
4506 nkf_char broken_getc(FILE *f)
/* Serve queued characters first, last queued first. */
4510 if (broken_counter>0) {
4511 return broken_buf[--broken_counter];
/* '$' without a preceding ESC, while in ASCII or JIS X 0201 mode */
4514 if (c=='$' && broken_last != ESC
4515 && (input_mode==ASCII || input_mode==JIS_X_0201)) {
4518 if (c1=='@'|| c1=='B') {
/* queued in reverse so that c is delivered before c1 */
4519 broken_buf[0]=c1; broken_buf[1]=c;
/* '(' without a preceding ESC, while in JIS X 0208 or JIS X 0201 mode */
4526 } else if (c=='(' && broken_last != ESC
4527 && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */
4530 if (c1=='J'|| c1=='B') {
4531 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto broken_buf so that broken_getc returns it
 * next.  The push is skipped when two characters are already queued.
 * The parameter f is not used on the visible lines.
 */
4544 nkf_char broken_ungetc(nkf_char c, FILE *f)
4546 if (broken_counter<2)
4547 broken_buf[broken_counter++]=c;
/*
 * nl_conv: newline handling stage in front of o_nlconv.
 *
 * While guess_f is set it records the newline style seen on input in
 * input_newline (LF, CRLF or CR), switching it to EOF once two different
 * styles have been seen.  A pending CR or an LF is then written out
 * according to nlmode_f; every other character is forwarded to o_nlconv
 * unchanged.
 */
4551 void nl_conv(nkf_char c2, nkf_char c1)
/* input_newline == EOF means the input already mixed styles; stop tracking. */
4553 if (guess_f && input_newline != EOF) {
4554 if (c2 == 0 && c1 == LF) {
/* an LF counts as CRLF when the previous character was CR */
4555 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4556 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4557 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
/* NOTE(review): the condition guarding the next two lines is not in this listing. */
4559 else if (!input_newline) input_newline = CR;
4560 else if (input_newline != CR) input_newline = EOF;
/* a held CR or an incoming LF triggers output of one newline */
4562 if (prev_cr || (c2 == 0 && c1 == LF)) {
/* nlmode_f == LF suppresses the CR, nlmode_f == CR suppresses the LF */
4564 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4565 if (nlmode_f != CR) (*o_nlconv)(0, LF);
/* hold a CR until the next character shows whether it starts a CRLF pair */
4567 if (c2 == 0 && c1 == CR) prev_cr = CR;
4568 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4572 Return value of fold_conv()
4574 LF add newline and output char
4575 CR add newline and output nothing
4578 1 (or else) normal output
4580 fold state in prev (previous character)
4582 >0x80 Japanese (X0208/X0201)
4587 This fold algorithm does not preserve leading spaces in a line.
4588 This is the main difference from fmt.
/* Number of columns fold_conv adds to f_line for one character: 2 when c2
   is non-zero, otherwise 1.  c1 is accepted for symmetry but not used.
   The argument is parenthesized so that any expression can be passed. */
#define char_size(c2,c1) ((c2) ? 2 : 1)
4593 void fold_conv(nkf_char c2, nkf_char c1)
4596 nkf_char fold_state;
4598 if (c1== CR && !fold_preserve_f) {
4599 fold_state=0; /* ignore cr */
4600 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4602 fold_state=0; /* ignore cr */
4603 } else if (c1== BS) {
4604 if (f_line>0) f_line--;
4606 } else if (c2==EOF && f_line != 0) { /* close open last line */
4608 } else if ((c1==LF && !fold_preserve_f)
4609 || ((c1==CR||(c1==LF&&f_prev!=CR))
4610 && fold_preserve_f)) {
4612 if (fold_preserve_f) {
4616 } else if ((f_prev == c1 && !fold_preserve_f)
4617 || (f_prev == LF && fold_preserve_f)
4618 ) { /* duplicate newline */
4621 fold_state = LF; /* output two newline */
4627 if (f_prev&0x80) { /* Japanese? */
4629 fold_state = 0; /* ignore given single newline */
4630 } else if (f_prev==SP) {
4634 if (++f_line<=fold_len)
4638 fold_state = CR; /* fold and output nothing */
4642 } else if (c1=='\f') {
4645 fold_state = LF; /* output newline and clear */
4646 } else if ( (c2==0 && c1==SP)||
4647 (c2==0 && c1==TAB)||
4648 (c2=='!'&& c1=='!')) {
4649 /* X0208 kankaku or ascii space */
4651 fold_state = 0; /* remove duplicate spaces */
4654 if (++f_line<=fold_len)
4655 fold_state = SP; /* output ASCII space only */
4657 f_prev = SP; f_line = 0;
4658 fold_state = CR; /* fold and output nothing */
4662 prev0 = f_prev; /* we still need this one... , but almost done */
4664 if (c2 || c2==JIS_X_0201)
4665 f_prev |= 0x80; /* this is Japanese */
4666 f_line += char_size(c2,c1);
4667 if (f_line<=fold_len) { /* normal case */
4670 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4671 f_line = char_size(c2,c1);
4672 fold_state = LF; /* We can't wait, do fold now */
4673 } else if (c2==JIS_X_0201) {
4674 /* simple kinsoku rules return 1 means no folding */
4675 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth voiced sound mark (dakuten) */
4676 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth semi-voiced sound mark (handakuten) */
4677 else if (c1==(0xa4&0x7f)) fold_state = 1; /* halfwidth ideographic comma */
4678 else if (c1==(0xa3&0x7f)) fold_state = 1; /* halfwidth right corner bracket */
4679 else if (c1==(0xa1&0x7f)) fold_state = 1; /* halfwidth ideographic full stop */
4680 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4681 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4683 fold_state = LF;/* add one new f_line before this character */
4686 fold_state = LF;/* add one new f_line before this character */
4689 /* kinsoku point in ASCII */
4690 if ( c1==')'|| /* { [ ( */
4701 /* just after special */
4702 } else if (!is_alnum(prev0)) {
4703 f_line = char_size(c2,c1);
4705 } else if ((prev0==SP) || /* ignored new f_line */
4706 (prev0==LF)|| /* ignored new f_line */
4707 (prev0&0x80)) { /* X0208 - ASCII */
4708 f_line = char_size(c2,c1);
4709 fold_state = LF;/* add one new f_line before this character */
4711 fold_state = 1; /* default no fold in ASCII */
4715 if (c1=='"') fold_state = 1; /* ideographic comma */
4716 else if (c1=='#') fold_state = 1; /* ideographic full stop */
4717 else if (c1=='W') fold_state = 1; /* right corner bracket */
4718 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis */
4719 else if (c1=='$') fold_state = 1; /* fullwidth comma */
4720 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
4721 else if (c1=='\'') fold_state = 1; /* fullwidth plus sign per the original comment -- NOTE(review): confirm against the code value */
4722 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
4723 else if (c1==')') fold_state = 1; /* fullwidth question mark */
4724 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
4725 else if (c1=='+') fold_state = 1; /* voiced sound mark (dakuten) */
4726 else if (c1==',') fold_state = 1; /* semi-voiced sound mark (handakuten) */
4727 /* default no fold in kinsoku */
4730 f_line = char_size(c2,c1);
4731 /* add one new f_line before this character */
4734 f_line = char_size(c2,c1);
4736 /* add one new f_line before this character */
4741 /* terminator process */
4742 switch(fold_state) {
4744 OCONV_NEWLINE((*o_fconv));
4750 OCONV_NEWLINE((*o_fconv));
4761 nkf_char z_prev2=0,z_prev1=0;
4763 void z_conv(nkf_char c2, nkf_char c1)
4766 /* if (c2) c1 &= 0x7f; assertion */
4768 if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4774 if (z_prev2 == JIS_X_0201) {
4775 if (c2 == JIS_X_0201) {
4776 if (c1 == (0xde&0x7f)) { /* voiced sound mark (dakuten) */
4778 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4780 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* semi-voiced sound mark (handakuten) */
4782 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4787 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4789 if (c2 == JIS_X_0201) {
4790 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4791 /* wait for a following voiced (dakuten) or semi-voiced (handakuten) sound mark */
4796 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4807 if (alpha_f&1 && c2 == 0x23) {
4808 /* JISX0208 Alphabet */
4810 } else if (c2 == 0x21) {
4811 /* JISX0208 Kigou */
4816 } else if (alpha_f&4) {
4821 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4827 if (alpha_f&8 && c2 == 0) {
4831 case '>': entity = ">"; break;
4832 case '<': entity = "<"; break;
4833 case '\"': entity = """; break;
4834 case '&': entity = "&"; break;
4837 while (*entity) (*o_zconv)(0, *entity++);
4843 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4848 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4852 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4856 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4860 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4864 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4868 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4872 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4876 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4881 (*o_zconv)(JIS_X_0201, c);
4884 } else if (c2 == 0x25) {
4885 /* JISX0208 Katakana */
4886 static const int fullwidth_to_halfwidth[] =
4888 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4889 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4890 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4891 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4892 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4893 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4894 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4895 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4896 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4897 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4898 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4899 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4901 if (fullwidth_to_halfwidth[c1-0x20]){
4902 c2 = fullwidth_to_halfwidth[c1-0x20];
4903 (*o_zconv)(JIS_X_0201, c2>>8);
4905 (*o_zconv)(JIS_X_0201, c2&0xFF);
4915 #define rot13(c) ( \
4917 (c <= 'M') ? (c + 13): \
4918 (c <= 'Z') ? (c - 13): \
4920 (c <= 'm') ? (c + 13): \
4921 (c <= 'z') ? (c - 13): \
4925 #define rot47(c) ( \
4927 ( c <= 'O') ? (c + 47) : \
4928 ( c <= '~') ? (c - 47) : \
4932 void rot_conv(nkf_char c2, nkf_char c1)
4934 if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) {
4940 (*o_rot_conv)(c2,c1);
4943 void hira_conv(nkf_char c2, nkf_char c1)
4947 if (0x20 < c1 && c1 < 0x74) {
4949 (*o_hira_conv)(c2,c1);
4951 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4953 c1 = CLASS_UNICODE | 0x3094;
4954 (*o_hira_conv)(c2,c1);
4957 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4959 (*o_hira_conv)(c2,c1);
4964 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4967 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4969 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4973 (*o_hira_conv)(c2,c1);
4977 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4979 static const nkf_char range[RANGE_NUM_MAX][2] = {
5000 nkf_char start, end, c;
5002 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
5006 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
5011 for (i = 0; i < RANGE_NUM_MAX; i++) {
5012 start = range[i][0];
5015 if (c >= start && c <= end) {
5020 (*o_iso2022jp_check_conv)(c2,c1);
5024 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * mime_pattern: the encoded-word prefixes ("=?charset?encoding?") that
 * mime_begin_strict recognises.
 *
 * - "\075" is the octal escape for '='.
 * - Entries are upper case because mime_begin_strict compares each input
 *   character through nkf_toupper against the pattern.
 * - The index of an entry is also used to index mime_priority_func, so the
 *   order here has to stay in step with that table.
 * - mime_begin_strict walks the table until it reaches a null entry; the
 *   terminator itself is not part of this listing.
 */
5026 static const unsigned char *mime_pattern[] = {
5027 (const unsigned char *)"\075?EUC-JP?B?",
5028 (const unsigned char *)"\075?SHIFT_JIS?B?",
5029 (const unsigned char *)"\075?ISO-8859-1?Q?",
5030 (const unsigned char *)"\075?ISO-8859-1?B?",
5031 (const unsigned char *)"\075?ISO-2022-JP?B?",
5032 (const unsigned char *)"\075?ISO-2022-JP?Q?",
/* UTF-8 patterns exist only when UTF-8 input support is compiled in */
5033 #if defined(UTF8_INPUT_ENABLE)
5034 (const unsigned char *)"\075?UTF-8?B?",
5035 (const unsigned char *)"\075?UTF-8?Q?",
5037 (const unsigned char *)"\075?US-ASCII?Q?",
5042 /* Markers used to raise the priority of the corresponding input code */
5043 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
5044 e_iconv, s_iconv, 0, 0, 0, 0,
5045 #if defined(UTF8_INPUT_ENABLE)
5051 static const nkf_char mime_encode[] = {
5052 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201,
5053 #if defined(UTF8_INPUT_ENABLE)
5060 static const nkf_char mime_encode_method[] = {
5061 'B', 'B','Q', 'B', 'B', 'Q',
5062 #if defined(UTF8_INPUT_ENABLE)
5070 #define MAXRECOVER 20
/*
 * switch_mime_getc: install mime_getc / mime_ungetc as the active input
 * functions, saving the previous pair in i_mgetc / i_mungetc.
 * Under STRICT_MIME a second layer is inserted: the saved pair moves to
 * i_mgetc_buf / i_mungetc_buf and mime_getc_buf / mime_ungetc_buf take
 * their place.
 */
5072 void switch_mime_getc(void)
/* do nothing if the MIME getter is already installed */
5074 if (i_getc!=mime_getc) {
5075 i_mgetc = i_getc; i_getc = mime_getc;
5076 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5077 if(mime_f==STRICT_MIME) {
5078 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
5079 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc.
 * Restores the saved input functions and, if an input converter was saved
 * in mime_iconv_back, reinstates it through set_iconv and clears the
 * saved pointer.
 */
5084 void unswitch_mime_getc(void)
/* peel off the STRICT_MIME buffering layer first */
5086 if(mime_f==STRICT_MIME) {
5087 i_mgetc = i_mgetc_buf;
5088 i_mungetc = i_mungetc_buf;
5091 i_ungetc = i_mungetc;
5092 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
5093 mime_iconv_back = NULL;
/*
 * Strict recogniser for the start of a MIME encoded-word.
 *
 * The leading "=?" has already been consumed by the caller.  Characters are
 * read through i_getc, remembered in r[], and compared case-insensitively
 * (nkf_toupper) with the current mime_pattern[] entry from index 2 onward.
 * When a character does not match, later patterns that share the prefix
 * matched so far are tried.  On a full match mime_decode_mode is set to the
 * pattern's encoding letter (p[i-2]), the current iconv is saved in
 * mime_iconv_back, mime_priority_func[j] is installed, and the result of
 * mime_integrity() is returned.
 *
 * f: input stream handed to the i_getc hook.
 */
5096 nkf_char mime_begin_strict(FILE *f)
5100 const unsigned char *p,*q;
5101 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
5103 mime_decode_mode = FALSE;
5104 /* =? has been checked */
5106 p = mime_pattern[j];
5109 for(i=2;p[i]>SP;i++) { /* start at =? */
5110 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
5111 /* pattern fails, try next one */
5113 while (mime_pattern[++j]) {
5114 p = mime_pattern[j];
5115 for(k=2;k<i;k++) /* assume length(p) > i */
5116 if (p[k]!=q[k]) break;
5117 if (k==i && nkf_toupper(c1)==p[k]) break;
5119 p = mime_pattern[j];
5120 if (p) continue; /* found next one, continue */
5121 /* all fails, output from recovery buffer */
/* Every pattern ends in "<letter>?", so p[i-2] is the 'B' or 'Q' letter. */
5129 mime_decode_mode = p[i-2];
5131 mime_iconv_back = iconv;
5132 set_iconv(FALSE, mime_priority_func[j]);
5133 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
5135 if (mime_decode_mode=='B') {
5136 mimebuf_f = unbuf_f;
5138 /* do MIME integrity check */
5139 return mime_integrity(f,mime_pattern[j]);
/*
 * Lower-layer getc used in STRICT_MIME mode (see switch_mime_getc()).
 * When mimebuf_f is set the character comes from the saved hook
 * i_mgetc_buf; otherwise it is taken from the Fifo at mime_input, which is
 * then advanced.
 */
5147 nkf_char mime_getc_buf(FILE *f)
5149 /* we don't keep eof of Fifo, because it contains ?= as
5150 a terminator. It was checked in mime_integrity. */
5151 return ((mimebuf_f)?
5152 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * Counterpart of mime_getc_buf(): pushes c back either through the saved
 * hook i_mungetc_buf or by stepping mime_input back and storing c in the
 * Fifo.  NOTE(review): the condition choosing between the two paths is not
 * visible in this excerpt; presumably it tests mimebuf_f -- confirm.
 */
5155 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
5158 (*i_mungetc_buf)(c,f);
5160 Fifo(--mime_input) = (unsigned char)c;
/*
 * Non-strict recogniser for the start of a MIME encoded-word.
 *
 * Every character read is also appended to the Fifo so that, if the text
 * turns out not to be an encoded-word, it can be re-read from the buffer
 * (mime_decode_mode is then set to 1).  At most MAXRECOVER characters are
 * examined.  The encoding letter selects mime_decode_mode 'B' or 'Q'.
 *
 * Returns the last character read; the caller uses it only to detect EOF.
 */
5164 nkf_char mime_begin(FILE *f)
5169 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
5170 /* re-read and convert again from mime_buffer. */
5172 /* =? has been checked */
5174 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
5175 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
5176 /* We accept any character type even if it is broken by new lines */
5177 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5178 if (c1==LF||c1==SP||c1==CR||
5179 c1=='-'||c1=='_'||is_alnum(c1)) continue;
5181 /* Failed. But this could be another MIME preamble */
5189 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5190 if (!(++i<MAXRECOVER) || c1==EOF) break;
/* Encoding letter, accepted in either case. */
5191 if (c1=='b'||c1=='B') {
5192 mime_decode_mode = 'B';
5193 } else if (c1=='q'||c1=='Q') {
5194 mime_decode_mode = 'Q';
5198 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5199 if (!(++i<MAXRECOVER) || c1==EOF) break;
5201 mime_decode_mode = FALSE;
5207 if (!mime_decode_mode) {
5208 /* false MIME preamble, restart from mime_buffer */
5209 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5210 /* Since we are in MIME mode until buffer becomes empty, */
5211 /* we never go into mime_begin again for a while. */
5214 /* discard mime preamble, and goto MIME mode */
5216 /* do no MIME integrity check */
5217 return c1; /* used only for checking EOF */
/* Output hook taking one character.  NOTE(review): the body is not visible
 * in this excerpt; presumably it discards c -- confirm. */
5221 void no_putc(nkf_char c)
/*
 * Write a diagnostic line to stderr.  A NULL str is printed as the text
 * "NULL" rather than being passed to fprintf.
 * NOTE(review): any enabling condition around the fprintf is not visible in
 * this excerpt.
 */
5226 void debug(const char *str)
5229 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding.
 *
 * The first call stores codename.  A later call with a different name
 * replaces the stored value with the empty string, which
 * get_guessed_code() reports as "BINARY".  The pointer is stored as-is, not
 * copied.
 */
5234 void set_input_codename(char *codename)
5236 if (!input_codename) {
5237 input_codename = codename;
5238 } else if (strcmp(codename, input_codename) != 0) {
5239 input_codename = "";
/*
 * Refine input_codename into the name reported to the user and return it.
 *
 *   empty string  -> "BINARY"
 *   NULL          -> "ASCII"
 *   "Shift_JIS"   -> "CP932" when the score has SCORE_DEPEND or SCORE_CP932
 *   "EUC-JP"      -> "EUCJP-MS" when the score has SCORE_X0212, otherwise
 *                    "CP51932" when it has SCORE_DEPEND or SCORE_CP932
 *   "ISO-2022-JP" -> "CP50221" when the score has SCORE_KANA, otherwise
 *                    "CP50220" when it has SCORE_DEPEND or SCORE_CP932
 *
 * The score is taken from the input_code entry found for the current iconv.
 * NOTE(review): p is dereferenced without a visible NULL check; confirm that
 * find_inputcode_byfunc() cannot return NULL on these paths.
 */
5243 static char* get_guessed_code(void)
5245 if (input_codename && !*input_codename) {
5246 input_codename = "BINARY";
5248 struct input_code *p = find_inputcode_byfunc(iconv);
5249 if (!input_codename) {
5250 input_codename = "ASCII";
5251 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5252 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5253 input_codename = "CP932";
5254 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5255 if (p->score & (SCORE_X0212))
5256 input_codename = "EUCJP-MS";
5257 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5258 input_codename = "CP51932";
5259 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5260 if (p->score & (SCORE_KANA))
5261 input_codename = "CP50221";
5262 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5263 input_codename = "CP50220";
5266 return input_codename;
5269 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding on stdout, preceded by "filename: " when
 * filename is not NULL.  The visible conditional chain maps input_newline
 * to a suffix: " (CR)", " (LF)", " (CRLF)", or " (MIXED NL)" for EOF.
 * Only compiled when neither PERL_XS nor WIN32DLL is defined.
 * NOTE(review): several lines of the body are not visible in this excerpt.
 */
5270 void print_guessed_code(char *filename)
5272 if (filename != NULL) printf("%s: ", filename);
5273 if (input_codename && !*input_codename) {
5276 input_codename = get_guessed_code();
5278 printf("%s\n", input_codename);
5282 input_newline == CR ? " (CR)" :
5283 input_newline == LF ? " (LF)" :
5284 input_newline == CRLF ? " (CRLF)" :
5285 input_newline == EOF ? " (MIXED NL)" :
/*
 * Shared helper for cap_getc() and url_getc(), which pass ':' and '%'
 * respectively as ch.
 *
 * ch: character passed by the caller (presumably the escape introducer;
 *     its use is not visible in this excerpt).
 * f:  input stream.
 * g:  underlying getc hook.
 * u:  underlying ungetc hook.
 *
 * When both c2 and c3 are hexadecimal digits the byte
 * (hex2bin(c2) << 4) | hex2bin(c3) is returned.
 * NOTE(review): the handling of non-matching input is not visible in this
 * excerpt.
 */
5294 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5296 nkf_char c1, c2, c3;
5302 if (!nkf_isxdigit(c2)){
5307 if (!nkf_isxdigit(c3)){
5312 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Input hook for ':'-prefixed hex escapes: delegates to hex_getc() using
 * the saved hooks i_cgetc / i_cungetc. */
5315 nkf_char cap_getc(FILE *f)
5317 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Unget hook paired with cap_getc(): forwards c to the saved hook
 * i_cungetc and returns its result. */
5320 nkf_char cap_ungetc(nkf_char c, FILE *f)
5322 return (*i_cungetc)(c, f);
/* Input hook for '%'-prefixed hex escapes: delegates to hex_getc() using
 * the saved hooks i_ugetc / i_uungetc. */
5325 nkf_char url_getc(FILE *f)
5327 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Unget hook paired with url_getc(): forwards c to the saved hook
 * i_uungetc and returns its result. */
5330 nkf_char url_ungetc(nkf_char c, FILE *f)
5332 return (*i_uungetc)(c, f);
5336 #ifdef NUMCHAR_OPTION
/*
 * Input hook that decodes numeric character references, reading through
 * the saved hooks i_ngetc / i_nungetc.
 *
 * After an 'x' or 'X' up to 7 hexadecimal digits are accumulated; otherwise
 * up to 8 decimal digits are accumulated.  A decoded value is returned
 * tagged as CLASS_UNICODE | c.
 * NOTE(review): the recognition of the reference's opening and closing
 * characters and the fallback path are not visible in this excerpt.
 */
5337 nkf_char numchar_getc(FILE *f)
5339 nkf_char (*g)(FILE *) = i_ngetc;
5340 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5351 if (buf[i] == 'x' || buf[i] == 'X'){
5352 for (j = 0; j < 7; j++){
5354 if (!nkf_isxdigit(buf[i])){
5361 c |= hex2bin(buf[i]);
5364 for (j = 0; j < 8; j++){
5368 if (!nkf_isdigit(buf[i])){
5375 c += hex2bin(buf[i]);
5381 return CLASS_UNICODE | c;
/* Unget hook paired with numchar_getc(): forwards c to the saved hook
 * i_nungetc and returns its result. */
5390 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5392 return (*i_nungetc)(c, f);
5396 #ifdef UNICODE_NORMALIZATION
5398 /* Normalization Form C */
/*
 * Input hook that rewrites decomposed sequences to their composed form,
 * reading through the saved hooks i_nfc_getc / i_nfc_ungetc.
 *
 * The bytes collected in buf[] are looked up in normalization_table[] with
 * a binary search keyed on each entry's .nfd byte string; on a hit buf[] is
 * overwritten with the entry's .nfc byte string.  The outer loop only
 * proceeds while buf[i] is not a 10xxxxxx byte ((buf[i] & 0xc0) != 0x80).
 * NOTE(review): how buf[] is filled and how the result is returned are not
 * visible in this excerpt.
 */
5399 nkf_char nfc_getc(FILE *f)
5401 nkf_char (*g)(FILE *f) = i_nfc_getc;
5402 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5403 int i=0, j, k=1, lower, upper;
5405 const nkf_nfchar *array;
5408 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5409 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5410 while (upper >= lower) {
5411 j = (lower+upper) / 2;
5412 array = normalization_table[j].nfd;
5413 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5414 if (array[k] != buf[k]){
/* Mismatch: narrow the search to the half that can still hold buf. */
5415 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5422 array = normalization_table[j].nfc;
5423 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5424 buf[i] = (nkf_char)(array[i]);
/* Unget hook paired with nfc_getc(): forwards c to the saved hook
 * i_nfc_ungetc and returns its result. */
5435 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5437 return (*i_nfc_ungetc)(c, f);
5439 #endif /* UNICODE_NORMALIZATION */
/*
 * NOTE(review): the function header is not visible in this excerpt.  The
 * hooks installed by switch_mime_getc() and the comment in mime_integrity()
 * suggest this is the body of mime_getc(FILE *f) -- confirm.
 *
 * Visible behaviour: bytes already queued in the Fifo are returned first.
 * When mime_decode_mode is 1 or FALSE the MIME hooks are removed and the
 * character comes straight from i_getc.  Otherwise the 'Q'
 * (quoted-printable) or 'B' (Base64) section decodes input obtained through
 * i_mgetc.  Base64 output bytes are appended to the Fifo and the first
 * queued byte is returned.
 */
5445 nkf_char c1, c2, c3, c4, cc;
5446 nkf_char t1, t2, t3, t4, mode, exit_mode;
5447 nkf_char lwsp_count;
5450 nkf_char lwsp_size = 128;
5452 if (mime_top != mime_last) { /* Something is in FIFO */
5453 return Fifo(mime_top++);
5455 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5456 mime_decode_mode=FALSE;
5457 unswitch_mime_getc();
5458 return (*i_getc)(f);
5461 if (mimebuf_f == FIXED_MIME)
5462 exit_mode = mime_decode_mode;
/* ---- Quoted-printable ('Q') section ---- */
5465 if (mime_decode_mode == 'Q') {
5466 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
/* '_' stands for a space unless the input is FIXED_MIME. */
5468 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5469 if (c1<=SP || DEL<=c1) {
5470 mime_decode_mode = exit_mode; /* prepare for quit */
5473 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5477 mime_decode_mode = exit_mode; /* prepare for quit */
5478 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5479 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5480 /* end Q encoding */
5481 input_mode = exit_mode;
/* Scratch buffer for the white space read after the "?=" terminator. */
5483 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5484 if (lwsp_buf==NULL) {
5485 perror("can't malloc");
5488 while ((c1=(*i_getc)(f))!=EOF) {
5493 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5501 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5502 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5517 lwsp_buf[lwsp_count] = (unsigned char)c1;
5518 if (lwsp_count++>lwsp_size){
5520 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5521 if (lwsp_buf_new==NULL) {
5523 perror("can't realloc");
5526 lwsp_buf = lwsp_buf_new;
/* White space not followed by another '=' is pushed back to the input. */
5532 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5534 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5535 i_ungetc(lwsp_buf[lwsp_count],f);
5541 if (c1=='='&&c2<SP) { /* this is soft wrap */
/* NOTE(review): the loop condition and the loop body each read a character,
 * so two characters are consumed per iteration -- confirm this is intended. */
5542 while((c1 = (*i_mgetc)(f)) <=SP) {
5543 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5545 mime_decode_mode = 'Q'; /* still in MIME */
5546 goto restart_mime_q;
5549 mime_decode_mode = 'Q'; /* still in MIME */
5553 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5554 if (c2<=SP) return c2;
5555 mime_decode_mode = 'Q'; /* still in MIME */
/* "=XY": combine the two hexadecimal digits into one byte. */
5556 return ((hex2bin(c2)<<4) + hex2bin(c3));
5559 if (mime_decode_mode != 'B') {
5560 mime_decode_mode = FALSE;
5561 return (*i_mgetc)(f);
5565 /* Base64 encoding */
5567 MIME allows line break in the middle of
5568 Base64, but we are very pessimistic in decoding
5569 in unbuf mode because MIME encoded code may broken by
5570 less or editor's control sequence (such as ESC-[-K in unbuffered
5571 mode. ignore incomplete MIME.
5573 mode = mime_decode_mode;
5574 mime_decode_mode = exit_mode; /* prepare for quit */
/* Read the four characters c1..c4 of one Base64 group. */
5576 while ((c1 = (*i_mgetc)(f))<=SP) {
5581 if ((c2 = (*i_mgetc)(f))<=SP) {
5584 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5585 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5588 if ((c1 == '?') && (c2 == '=')) {
5591 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5592 if (lwsp_buf==NULL) {
5593 perror("can't malloc");
5596 while ((c1=(*i_getc)(f))!=EOF) {
5601 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5609 if ((c1=(*i_getc)(f))!=EOF) {
5613 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5628 lwsp_buf[lwsp_count] = (unsigned char)c1;
5629 if (lwsp_count++>lwsp_size){
5631 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5632 if (lwsp_buf_new==NULL) {
5634 perror("can't realloc");
5637 lwsp_buf = lwsp_buf_new;
5643 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5645 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5646 i_ungetc(lwsp_buf[lwsp_count],f);
5653 if ((c3 = (*i_mgetc)(f))<=SP) {
5656 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5657 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5661 if ((c4 = (*i_mgetc)(f))<=SP) {
5664 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5665 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5669 mime_decode_mode = mode; /* still in MIME sigh... */
5671 /* BASE 64 decoding */
/* Four 6-bit values t1..t4 are repacked into up to three bytes, which are
 * queued in the Fifo. */
5673 t1 = 0x3f & base64decode(c1);
5674 t2 = 0x3f & base64decode(c2);
5675 t3 = 0x3f & base64decode(c3);
5676 t4 = 0x3f & base64decode(c4);
5677 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5679 Fifo(mime_last++) = (unsigned char)cc;
5680 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5682 Fifo(mime_last++) = (unsigned char)cc;
5683 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5685 Fifo(mime_last++) = (unsigned char)cc;
5690 return Fifo(mime_top++);
/* Unget hook paired with the MIME getc: steps mime_top back by one and
 * stores c (truncated to unsigned char) at that Fifo position.  The stream
 * f is not used by the visible statement. */
5693 nkf_char mime_ungetc(nkf_char c, FILE *f)
5695 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-scan an encoded-word into the Fifo before decoding it.
 *
 * f: input stream read through i_getc.
 * p: the matched mime_pattern[] entry; its bytes are copied into the Fifo
 *    first so the original text can be replayed if the check fails.
 *
 * Input is appended to the Fifo until the "?=" terminator is seen (c is '='
 * and the previous character d is '?'), the buffer is full, or a character
 * outside the accepted set ('+', '/', '=', '?', alphanumerics) appears.
 * If the word is incomplete, mime_last is moved to mime_input and
 * mime_decode_mode is set to 1 so the buffered text is passed through
 * undecoded; switch_mime_getc() is called in that case.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
5699 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5703 /* In buffered mode, read until =? or NL or buffer full
5705 mime_input = mime_top;
5706 mime_last = mime_top;
5708 while(*p) Fifo(mime_input++) = *p++;
5711 while((c=(*i_getc)(f))!=EOF) {
5712 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5713 break; /* buffer full */
5715 if (c=='=' && d=='?') {
5716 /* checked. skip header, start decode */
5717 Fifo(mime_input++) = (unsigned char)c;
5718 /* mime_last_input = mime_input; */
5723 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5725 /* Should we check length mod 4? */
5726 Fifo(mime_input++) = (unsigned char)c;
5729 /* In case of Incomplete MIME, no MIME decode */
5730 Fifo(mime_input++) = (unsigned char)c;
5731 mime_last = mime_input; /* point undecoded buffer */
5732 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5733 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one Base64 alphabet character to its 6-bit value.
 *
 * Besides the standard alphabet, '-' is accepted as 62 and '_' as 63.
 * The arithmetic uses character constants as numbers and therefore assumes
 * an ASCII execution character set: 'G' is 'a' - 26, '4' is 52, '>' is 62
 * and the question-mark character is 63.
 * NOTE(review): the range tests guarding the first and third assignments
 * and the return statement are not visible in this excerpt.
 */
5737 nkf_char base64decode(nkf_char c)
5742 i = c - 'A'; /* A..Z 0-25 */
5743 } else if (c == '_') {
5744 i = '?' /* 63 */ ; /* _ 63 */
5746 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5748 } else if (c > '/') {
5749 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5750 } else if (c == '+' || c == '-') {
5751 i = '>' /* 62 */ ; /* + and - 62 */
5753 i = '?' /* 63 */ ; /* / 63 */
/* Base64 encoding alphabet, indexed by 6-bit value. */
5758 static const char basis_64[] =
5759 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Input byte carried between mimeout_addchar() calls; its low bits are
 * combined with the next byte when emitting Base64 digits. */
5761 static nkf_char b64c;
/* Holding buffer for characters not yet emitted by the MIME encoder, and
 * the number of characters currently held. */
5762 #define MIMEOUT_BUF_LENGTH (60)
5763 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5764 int mimeout_buf_count = 0;
/*
 * Begin a MIME encoded-word for the given output mode.
 *
 * mode: output mode; mime_encode[] is searched for it to choose the
 *       matching mime_pattern[] entry (entry 0 is the fallback) and
 *       mimeout_mode is set from mime_encode_method[].
 *
 * When base64_count exceeds 45 a line break is emitted through o_mputc
 * first.  White-space characters held in mimeout_buf are written through
 * o_mputc; the buffer is then emptied and characters from it are passed to
 * mime_putc().
 * NOTE(review): the statements that write the pattern itself and several
 * loop headers are not visible in this excerpt.
 */
5766 void open_mime(nkf_char mode)
5768 const unsigned char *p;
5771 p = mime_pattern[0];
5772 for(i=0;mime_pattern[i];i++) {
5773 if (mode == mime_encode[i]) {
5774 p = mime_pattern[i];
5778 mimeout_mode = mime_encode_method[i];
5780 if (base64_count>45) {
5781 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5782 (*o_mputc)(mimeout_buf[i]);
5785 PUT_NEWLINE((*o_mputc));
5788 if (mimeout_buf_count>0
5789 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5790 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5794 for (;i<mimeout_buf_count;i++) {
5795 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5796 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5797 (*o_mputc)(mimeout_buf[i]);
/* Take a snapshot of the count and reset it before replaying the buffer. */
5807 j = mimeout_buf_count;
5808 mimeout_buf_count = 0;
5810 mime_putc(mimeout_buf[i]);
/*
 * End the current MIME encoded-word.
 * NOTE(review): the embedded numbering jumps from 5814 to 5824, so the
 * statements below may belong to a separate function whose header is not
 * visible (for example an end-of-input flush) -- confirm.
 *
 * The visible switch on mimeout_mode emits a final Base64 digit built from
 * the leftover bits in b64c: two low bits shifted left by 4, or four low
 * bits shifted left by 2.
 */
5814 void close_mime(void)
5824 switch(mimeout_mode) {
5829 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5835 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5840 if (mimeout_mode > 0) {
5841 if (mimeout_f!=FIXED_MIME) {
5843 } else if (mimeout_mode != 'Q')
/*
 * Encode one character into the open encoded-word, according to
 * mimeout_mode.
 *
 * A non-alphanumeric character has its two hexadecimal digits written
 * through o_mputc (bin2hex of the high and low nibbles).  For Base64,
 * digits are written from basis_64[] by combining bits of c with the bits
 * kept in b64c from the previous byte.
 * NOTE(review): the case labels and the state transitions between the
 * Base64 steps are not visible in this excerpt.
 */
5848 void mimeout_addchar(nkf_char c)
5850 switch(mimeout_mode) {
5855 } else if(!nkf_isalnum(c)) {
5857 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5858 (*o_mputc)(bin2hex((c&0xf)));
5867 (*o_mputc)(basis_64[c>>2]);
5872 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5878 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5879 (*o_mputc)(basis_64[c & 0x3F]);
5890 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Called before a character (c2, c1) is encoded, to fold the output line
 * when it is about to become too long.
 *
 * The projected length is base64_count plus mimeout_buf_count/3*4 (the
 * Base64 size of the buffered bytes).  When it exceeds the visible
 * thresholds (73, 66 or 60, chosen by conditions not visible in this
 * excerpt), the current word is flushed with (*o_base64conv)(EOF,0), a
 * newline is emitted and a space starts the continuation line.  In the
 * last visible case a new encoded-word is opened first, with 'Q' for
 * ASCII / ISO_8859_1 output and 'B' otherwise.
 */
5892 void mime_prechar(nkf_char c2, nkf_char c1)
5894 if (mimeout_mode > 0){
5896 if (base64_count + mimeout_buf_count/3*4> 73){
5897 (*o_base64conv)(EOF,0);
5898 OCONV_NEWLINE((*o_base64conv));
5899 (*o_base64conv)(0,SP);
5903 if (base64_count + mimeout_buf_count/3*4> 66) {
5904 (*o_base64conv)(EOF,0);
5905 OCONV_NEWLINE((*o_base64conv));
5906 (*o_base64conv)(0,SP);
5912 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5913 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5914 open_mime(output_mode);
5915 (*o_base64conv)(EOF,0);
5916 OCONV_NEWLINE((*o_base64conv));
5917 (*o_base64conv)(0,SP);
/*
 * Output hook of the MIME encoder: accepts one character (or EOF to flush)
 * and decides whether it is written literally through o_mputc, held in
 * mimeout_buf, or encoded through mimeout_addchar().
 *
 * Two regimes are visible:
 *   - mimeout_f == FIXED_MIME: lines are broken with PUT_NEWLINE once
 *     base64_count exceeds 71.
 *   - otherwise: characters are collected in mimeout_buf (capacity
 *     MIMEOUT_BUF_LENGTH) and an encoded-word is opened with open_mime()
 *     when required; the buffer is written literally or encoded depending
 *     on mimeout_mode and on the white space / line breaks it contains.
 * NOTE(review): many statements, including loop headers and returns, are
 * not visible in this excerpt; the comments below describe only the visible
 * lines.
 */
5924 void mime_putc(nkf_char c)
5929 if (mimeout_f == FIXED_MIME){
5930 if (mimeout_mode == 'Q'){
5931 if (base64_count > 71){
5932 if (c!=CR && c!=LF) {
5934 PUT_NEWLINE((*o_mputc));
5939 if (base64_count > 71){
5941 PUT_NEWLINE((*o_mputc));
5944 if (c == EOF) { /* c==EOF */
/* NOTE(review): the trailing comment on the next line contradicts its
 * condition, which tests c != EOF. */
5948 if (c != EOF) { /* c==EOF */
5954 /* mimeout_f != FIXED_MIME */
5956 if (c == EOF) { /* c==EOF */
5957 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
5958 j = mimeout_buf_count;
5959 mimeout_buf_count = 0;
5961 if (mimeout_mode > 0) {
5962 if (!nkf_isblank(mimeout_buf[j-1])) {
5964 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5967 mimeout_addchar(mimeout_buf[i]);
5971 mimeout_addchar(mimeout_buf[i]);
5975 mimeout_addchar(mimeout_buf[i]);
5981 mimeout_addchar(mimeout_buf[i]);
/* Remember the most recently buffered character. */
5987 if (mimeout_buf_count > 0){
5988 lastchar = mimeout_buf[mimeout_buf_count - 1];
5993 if (mimeout_mode=='Q') {
5994 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5995 if (c == CR || c == LF) {
6000 } else if (c <= SP) {
6002 if (base64_count > 70) {
6003 PUT_NEWLINE((*o_mputc));
6006 if (!nkf_isblank(c)) {
6011 if (base64_count > 70) {
6013 PUT_NEWLINE((*o_mputc));
6016 open_mime(output_mode);
6018 if (!nkf_noescape_mime(c)) {
/* No encoded-word is open (mimeout_mode is 0 or negative). */
6029 if (mimeout_mode <= 0) {
6030 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6031 if (nkf_isspace(c)) {
6033 if (mimeout_mode == -1) {
6036 if (c==CR || c==LF) {
6038 open_mime(output_mode);
6044 for (i=0;i<mimeout_buf_count;i++) {
6045 (*o_mputc)(mimeout_buf[i]);
6046 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
6057 mimeout_buf[0] = (char)c;
6058 mimeout_buf_count = 1;
6060 if (base64_count > 1
6061 && base64_count + mimeout_buf_count > 76
6062 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
6063 PUT_NEWLINE((*o_mputc));
6065 if (!nkf_isspace(mimeout_buf[0])){
6070 mimeout_buf[mimeout_buf_count++] = (char)c;
6071 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6072 open_mime(output_mode);
6077 if (lastchar==CR || lastchar == LF){
6078 for (i=0;i<mimeout_buf_count;i++) {
6079 (*o_mputc)(mimeout_buf[i]);
6082 mimeout_buf_count = 0;
6085 for (i=0;i<mimeout_buf_count-1;i++) {
6086 (*o_mputc)(mimeout_buf[i]);
6089 mimeout_buf[0] = SP;
6090 mimeout_buf_count = 1;
6092 open_mime(output_mode);
6095 /* mimeout_mode == 'B', 1, 2 */
6096 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6097 if (lastchar == CR || lastchar == LF){
6098 if (nkf_isblank(c)) {
6099 for (i=0;i<mimeout_buf_count;i++) {
6100 mimeout_addchar(mimeout_buf[i]);
6102 mimeout_buf_count = 0;
6103 } else if (SP<c && c<DEL) {
6105 for (i=0;i<mimeout_buf_count;i++) {
6106 (*o_mputc)(mimeout_buf[i]);
6109 mimeout_buf_count = 0;
6111 mimeout_buf[mimeout_buf_count++] = (char)c;
6114 if (c==SP || c==TAB || c==CR || c==LF) {
6115 for (i=0;i<mimeout_buf_count;i++) {
6116 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
6118 for (i=0;i<mimeout_buf_count;i++) {
6119 (*o_mputc)(mimeout_buf[i]);
6122 mimeout_buf_count = 0;
6125 mimeout_buf[mimeout_buf_count++] = (char)c;
6126 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6128 for (i=0;i<mimeout_buf_count;i++) {
6129 (*o_mputc)(mimeout_buf[i]);
6132 mimeout_buf_count = 0;
6136 if (mimeout_buf_count>0 && SP<c && c!='=') {
6137 mimeout_buf[mimeout_buf_count++] = (char)c;
6138 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6139 j = mimeout_buf_count;
6140 mimeout_buf_count = 0;
6142 mimeout_addchar(mimeout_buf[i]);
6149 if (mimeout_buf_count>0) {
6150 j = mimeout_buf_count;
6151 mimeout_buf_count = 0;
6153 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
6155 mimeout_addchar(mimeout_buf[i]);
6161 (*o_mputc)(mimeout_buf[i]);
6163 open_mime(output_mode);
/*
 * NOTE(review): the function header is not visible in this excerpt;
 * presumably this is the body of the global re-initialisation routine
 * (reinit) -- confirm.
 *
 * The visible statements reset option flags, the output converter hooks
 * (all pointed at no_connection) and the input hooks (std_getc /
 * std_ungetc) to their defaults, clear prefix_table[] and the MIME output
 * buffer count, and reset the input/output encoding state.
 */
6173 struct input_code *p = input_code_list;
6185 mime_f = MIME_DECODE_DEFAULT;
6186 mime_decode_f = FALSE;
6191 x0201_f = X0201_DEFAULT;
6192 iso2022jp_f = FALSE;
6193 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6194 ms_ucs_map_f = UCS_MAP_ASCII;
6196 #ifdef UTF8_INPUT_ENABLE
6197 no_cp932ext_f = FALSE;
6198 no_best_fit_chars_f = FALSE;
6199 encode_fallback = NULL;
6200 unicode_subchar = '?';
6201 input_endian = ENDIAN_BIG;
6203 #ifdef UTF8_OUTPUT_ENABLE
6204 output_bom_f = FALSE;
6205 output_endian = ENDIAN_BIG;
6207 #ifdef UNICODE_NORMALIZATION
6223 #ifdef SHIFTJIS_CP932
6233 for (i = 0; i < 256; i++){
6234 prefix_table[i] = 0;
6238 mimeout_buf_count = 0;
6243 fold_preserve_f = FALSE;
6246 kanji_intro = DEFAULT_J;
6247 ascii_intro = DEFAULT_R;
6248 fold_margin = FOLD_MARGIN;
/* Until real converters are connected, every output stage points at
 * no_connection, which reports an internal error when called. */
6249 oconv = DEFAULT_CONV;
6250 o_zconv = no_connection;
6251 o_fconv = no_connection;
6252 o_nlconv = no_connection;
6253 o_rot_conv = no_connection;
6254 o_hira_conv = no_connection;
6255 o_base64conv = no_connection;
6256 o_iso2022jp_check_conv = no_connection;
6259 i_ungetc = std_ungetc;
6261 i_bungetc = std_ungetc;
6264 i_mungetc = std_ungetc;
6265 i_mgetc_buf = std_getc;
6266 i_mungetc_buf = std_ungetc;
6267 output_mode = ASCII;
6270 mime_decode_mode = FALSE;
6278 z_prev2=0,z_prev1=0;
6280 iconv_for_check = 0;
6282 input_codename = NULL;
6283 input_encoding = NULL;
6284 output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
/* Placeholder output converter for stages that have not been connected:
 * forwards to no_connection2() with c0 set to 0. */
6290 void no_connection(nkf_char c2, nkf_char c1)
6292 no_connection2(c2,c1,0);
/*
 * Three-argument placeholder converter: reports an internal module
 * connection failure on stderr.  The arguments are not used by the visible
 * statements.  The return value exists only to satisfy lint.
 * NOTE(review): a statement between the fprintf and the return is not
 * visible in this excerpt.
 */
6295 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6297 fprintf(stderr,"nkf internal module connection failure.\n");
6299 return 0; /* LINT */
/* Redirects fprintf to dllprintf for the code that follows.
 * NOTE(review): the preprocessor condition guarding this define is not
 * visible in this excerpt. */
6304 #define fprintf dllprintf
/*
 * NOTE(review): the function header is not visible in this excerpt;
 * presumably this is the body of usage() -- confirm.
 *
 * Writes the command-line help text to HELP_OUTPUT.  The line describing
 * the j,s,e,w output options is selected by the DEFAULT_CODE_* macro in
 * effect, and some option descriptions are only printed when the matching
 * feature macro is defined.
 */
6308 fprintf(HELP_OUTPUT,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6309 fprintf(HELP_OUTPUT,"Flags:\n");
6310 fprintf(HELP_OUTPUT,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6311 #ifdef DEFAULT_CODE_SJIS
6312 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6314 #ifdef DEFAULT_CODE_JIS
6315 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6317 #ifdef DEFAULT_CODE_EUC
6318 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6320 #ifdef DEFAULT_CODE_UTF8
6321 fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6323 #ifdef UTF8_OUTPUT_ENABLE
6324 fprintf(HELP_OUTPUT," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6326 fprintf(HELP_OUTPUT,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6327 #ifdef UTF8_INPUT_ENABLE
6328 fprintf(HELP_OUTPUT," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6330 fprintf(HELP_OUTPUT,"t no conversion\n");
6331 fprintf(HELP_OUTPUT,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6332 fprintf(HELP_OUTPUT,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6333 fprintf(HELP_OUTPUT,"r {de/en}crypt ROT13/47\n");
6334 fprintf(HELP_OUTPUT,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6335 fprintf(HELP_OUTPUT,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6336 fprintf(HELP_OUTPUT,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6337 fprintf(HELP_OUTPUT,"l ISO8859-1 (Latin-1) support\n");
6338 fprintf(HELP_OUTPUT,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6339 fprintf(HELP_OUTPUT,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6340 fprintf(HELP_OUTPUT," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6341 fprintf(HELP_OUTPUT," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6342 fprintf(HELP_OUTPUT,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6343 fprintf(HELP_OUTPUT,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6345 fprintf(HELP_OUTPUT,"T Text mode output\n");
6347 fprintf(HELP_OUTPUT,"O Output to File (DEFAULT 'nkf.out')\n");
/* NOTE(review): "charactor" in the next message is a typo for "character";
 * it is user-visible text and is left unchanged here. */
6348 fprintf(HELP_OUTPUT,"I Convert non ISO-2022-JP charactor to GETA\n");
6349 fprintf(HELP_OUTPUT,"d,c Convert line breaks -d: LF -c: CRLF\n");
6350 fprintf(HELP_OUTPUT,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6351 fprintf(HELP_OUTPUT,"v, V Show this usage. V: show configuration\n");
6352 fprintf(HELP_OUTPUT,"\n");
6353 fprintf(HELP_OUTPUT,"Long name options\n");
6354 fprintf(HELP_OUTPUT," --ic=<input codeset> --oc=<output codeset>\n");
6355 fprintf(HELP_OUTPUT," Specify the input or output codeset\n");
6356 fprintf(HELP_OUTPUT," --fj --unix --mac --windows\n");
6357 fprintf(HELP_OUTPUT," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6358 fprintf(HELP_OUTPUT," Convert for the system or code\n");
6359 fprintf(HELP_OUTPUT," --hiragana --katakana --katakana-hiragana\n");
6360 fprintf(HELP_OUTPUT," To Hiragana/Katakana Conversion\n");
6361 fprintf(HELP_OUTPUT," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6363 fprintf(HELP_OUTPUT," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6365 #ifdef NUMCHAR_OPTION
6366 fprintf(HELP_OUTPUT," --numchar-input Convert Unicode Character Reference\n");
6368 #ifdef UTF8_INPUT_ENABLE
6369 fprintf(HELP_OUTPUT," --fb-{skip, html, xml, perl, java, subchar}\n");
6370 fprintf(HELP_OUTPUT," Specify how nkf handles unassigned characters\n");
6373 fprintf(HELP_OUTPUT," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6374 fprintf(HELP_OUTPUT," Overwrite original listed files by filtered result\n");
6375 fprintf(HELP_OUTPUT," --overwrite preserves timestamp of original files\n");
6377 fprintf(HELP_OUTPUT," -g --guess Guess the input code\n");
6378 fprintf(HELP_OUTPUT," --help --version Show this help/the version\n");
6379 fprintf(HELP_OUTPUT," For more information, see also man nkf\n");
6380 fprintf(HELP_OUTPUT,"\n");
/*
 * Print a summary of the compile-time configuration to HELP_OUTPUT: the
 * version and release date, the default output encoding, the default
 * newline, whether MIME decoding is on by default, the JIS X 0201 Katakana
 * setting, and where --help / --version output goes.  Each value is chosen
 * by preprocessor conditionals; the string literals they select are not
 * visible in this excerpt.
 */
6384 void show_configuration(void)
6386 fprintf(HELP_OUTPUT, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6387 fprintf(HELP_OUTPUT, " Compile-time options:\n");
6388 fprintf(HELP_OUTPUT, " Default output encoding: "
6389 #if defined(DEFAULT_CODE_JIS)
6391 #elif defined(DEFAULT_CODE_SJIS)
6393 #elif defined(DEFAULT_CODE_EUC)
6395 #elif defined(DEFAULT_CODE_UTF8)
6399 fprintf(HELP_OUTPUT, " Default output newline: "
6400 #if DEFAULT_NEWLINE == CR
6402 #elif DEFAULT_NEWLINE == CRLF
6408 fprintf(HELP_OUTPUT, " Decode MIME encoded string: "
6409 #if MIME_DECODE_DEFAULT
6415 fprintf(HELP_OUTPUT, " Convert JIS X 0201 Katakana: "
6422 fprintf(HELP_OUTPUT, " --help, --version output: "
/* NOTE(review): HELP_OUTPUT_HELP_OUTPUT looks like an accidental macro
 * name; an undefined identifier evaluates to 0 in #if -- confirm the
 * intended test. */
6423 #if HELP_OUTPUT_HELP_OUTPUT
/* Prints the program name, NKF_VERSION, NKF_RELEASE_DATE and the
 * COPY_RIGHT text to HELP_OUTPUT.
 * NOTE(review): the enclosing function header is not visible in this
 * excerpt; presumably version() -- confirm. */
6433 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");