1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.94 2006/03/24 06:14:32 naruse Exp $ */
43 #define NKF_VERSION "2.0.6"
44 #define NKF_RELEASE_DATE "2006-03-24"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
49 " 2002-2006 Kono, Furukawa, Naruse, mastodon"
56 ** USAGE: nkf [flags] [file]
59 ** b Output is buffered (DEFAULT)
60 ** u Output is unbuffered
64 ** j Output code is JIS 7 bit (DEFAULT SELECT)
65 ** s Output code is MS Kanji (DEFAULT SELECT)
66 ** e Output code is AT&T JIS (DEFAULT SELECT)
67 ** w Output code is UTF-8 (DEFAULT SELECT)
68 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
70 ** m MIME conversion for ISO-2022-JP
71 ** I Convert non ISO-2022-JP characters to GETA by Pekoe <pekoe@lair.net>
72 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
73 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
74 ** M MIME output conversion
76 ** r {de/en}crypt ROT13/47
80 ** T Text mode output (for MS-DOS)
82 ** x Do not convert X0201 kana into X0208
83 ** Z Convert X0208 alphabet to ASCII
88 ** B try to fix broken JIS, missing Escape
89 ** B[1-9] broken level
91 ** O Output to 'nkf.out' file or last file name
92 ** d Delete \r in line feed
93 ** c Add \r in line feed
94 ** -- other long option
95 ** -- ignore following option (don't use with -O )
99 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
101 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
117 #if defined(MSDOS) || defined(__OS2__)
124 #define setbinmode(fp) fsetbin(fp)
125 #else /* Microsoft C, Turbo C */
126 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
128 #else /* UNIX,OS/2 */
129 #define setbinmode(fp)
132 #ifdef _IOFBF /* SysV and MSDOS, Windows */
133 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
135 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
138 /*Borland C++ 4.5 EasyWin*/
139 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
148 /* added by satoru@isoternet.org */
149 #include <sys/stat.h>
150 #ifndef MSDOS /* UNIX, OS/2 */
153 #else /* defined(MSDOS) */
155 #ifdef __BORLANDC__ /* BCC32 */
157 #else /* !defined(__BORLANDC__) */
158 #include <sys/utime.h>
159 #endif /* (__BORLANDC__) */
160 #else /* !defined(__WIN32__) */
161 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
162 #include <sys/utime.h>
163 #elif defined(__TURBOC__) /* BCC */
165 #elif defined(LSI_C) /* LSI C */
166 #endif /* (__WIN32__) */
178 /* state of output_mode and input_mode
195 #define X0213_1 0x284F
196 #define X0213_2 0x2850
198 /* Input Assumption */
202 #define LATIN1_INPUT 6
204 #define STRICT_MIME 8
209 #define JAPANESE_EUC 10
213 #define UTF8_INPUT 13
214 #define UTF16BE_INPUT 14
215 #define UTF16LE_INPUT 15
/*
 * ASCII-only character classification helpers.
 *
 * Every use of the macro argument is parenthesised so that an expression
 * argument (e.g. `flags | 1`) is classified as a whole instead of being torn
 * apart by operator precedence.
 * NOTE: arguments are still evaluated more than once; do not pass
 * expressions with side effects (c++, getc(), ...).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
/* Value of one hex digit; the argument must already satisfy nkf_isxdigit(). */
#define hex2bin(x)      ( nkf_isdigit(x) ? (x) - '0' : nkf_toupper(x) - 'A' + 10)
249 #define HOLD_SIZE 1024 /* hold_buf is declared with HOLD_SIZE*2 bytes */
250 #define IOBUF_SIZE 16384 /* size of the stdibuf/stdobuf stream buffers */
252 #define DEFAULT_J 'B' /* initial value of kanji_intro */
253 #define DEFAULT_R 'B' /* initial value of ascii_intro */
255 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
256 #define SJ6394 0x0161 /* 63 - 94 ku offset */
258 #define RANGE_NUM_MAX 18 /* NOTE(review): use is not visible in this excerpt - confirm meaning */
263 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
264 #define sizeof_euc_to_utf8_1byte 94
265 #define sizeof_euc_to_utf8_2bytes 94
266 #define sizeof_utf8_to_euc_C2 64
267 #define sizeof_utf8_to_euc_E5B8 64
268 #define sizeof_utf8_to_euc_2bytes 112
269 #define sizeof_utf8_to_euc_3bytes 16
272 /* MIME preprocessor */
274 #ifdef EASYWIN /*Easy Win */
275 extern POINT _BufferSize;
278 /* function prototype */
280 #ifdef ANSI_C_PROTOTYPE
282 #define STATIC static
296 void (*status_func)PROTO((struct input_code *, int));
297 int (*iconv_func)PROTO((int c2, int c1, int c0));
301 STATIC char *input_codename = "";
304 STATIC const char *CopyRight = COPY_RIGHT;
306 #if !defined(PERL_XS) && !defined(WIN32DLL)
307 STATIC int noconvert PROTO((FILE *f));
309 STATIC int kanji_convert PROTO((FILE *f));
310 STATIC int h_conv PROTO((FILE *f,int c2,int c1));
311 STATIC int push_hold_buf PROTO((int c2));
312 STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
313 STATIC int s_iconv PROTO((int c2,int c1,int c0));
314 STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
315 STATIC int e_iconv PROTO((int c2,int c1,int c0));
316 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
318 * 0: Shift_JIS, eucJP-ascii
322 #define UCS_MAP_ASCII 0
324 #define UCS_MAP_CP932 2
325 STATIC int ms_ucs_map_f = UCS_MAP_ASCII;
327 #ifdef UTF8_INPUT_ENABLE
328 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
329 STATIC int no_cp932ext_f = FALSE;
330 /* ignore ZERO WIDTH NO-BREAK SPACE */
331 STATIC int ignore_zwnbsp_f = TRUE;
332 STATIC int no_best_fit_chars_f = FALSE;
333 STATIC int unicode_subchar = '?'; /* the regular substitution character */
334 STATIC void encode_fallback_html PROTO((int c));
335 STATIC void encode_fallback_xml PROTO((int c));
336 STATIC void encode_fallback_java PROTO((int c));
337 STATIC void encode_fallback_perl PROTO((int c));
338 STATIC void encode_fallback_subchar PROTO((int c));
339 STATIC void (*encode_fallback)PROTO((int c)) = NULL;
340 STATIC int w2e_conv PROTO((int c2,int c1,int c0,int *p2,int *p1));
341 STATIC int w_iconv PROTO((int c2,int c1,int c0));
342 STATIC int w_iconv16 PROTO((int c2,int c1,int c0));
343 STATIC int unicode_to_jis_common PROTO((int c2,int c1,int c0,int *p2,int *p1));
344 STATIC int w_iconv_common PROTO((int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1));
345 STATIC int ww16_conv PROTO((int c2, int c1, int c0));
346 STATIC int w16e_conv PROTO((unsigned short val,int *p2,int *p1));
348 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
349 STATIC int internal_unicode_f = FALSE; /* Internal Unicode Processing */
351 #ifdef UTF8_OUTPUT_ENABLE
352 STATIC int unicode_bom_f= 0; /* Output Unicode BOM */
353 STATIC int w_oconv16_LE = 0; /* utf-16 little endian */
354 STATIC int e2w_conv PROTO((int c2,int c1));
355 STATIC void w_oconv PROTO((int c2,int c1));
356 STATIC void w_oconv16 PROTO((int c2,int c1));
358 STATIC void e_oconv PROTO((int c2,int c1));
359 STATIC int e2s_conv PROTO((int c2, int c1, int *p2, int *p1));
360 STATIC void s_oconv PROTO((int c2,int c1));
361 STATIC void j_oconv PROTO((int c2,int c1));
362 STATIC void fold_conv PROTO((int c2,int c1));
363 STATIC void cr_conv PROTO((int c2,int c1));
364 STATIC void z_conv PROTO((int c2,int c1));
365 STATIC void rot_conv PROTO((int c2,int c1));
366 STATIC void hira_conv PROTO((int c2,int c1));
367 STATIC void base64_conv PROTO((int c2,int c1));
368 STATIC void iso2022jp_check_conv PROTO((int c2,int c1));
369 STATIC void no_connection PROTO((int c2,int c1));
370 STATIC int no_connection2 PROTO((int c2,int c1,int c0));
372 STATIC void code_score PROTO((struct input_code *ptr));
373 STATIC void code_status PROTO((int c));
375 STATIC void std_putc PROTO((int c));
376 STATIC int std_getc PROTO((FILE *f));
377 STATIC int std_ungetc PROTO((int c,FILE *f));
379 STATIC int broken_getc PROTO((FILE *f));
380 STATIC int broken_ungetc PROTO((int c,FILE *f));
382 STATIC int mime_begin PROTO((FILE *f));
383 STATIC int mime_getc PROTO((FILE *f));
384 STATIC int mime_ungetc PROTO((int c,FILE *f));
386 STATIC int mime_begin_strict PROTO((FILE *f));
387 STATIC int mime_getc_buf PROTO((FILE *f));
388 STATIC int mime_ungetc_buf PROTO((int c,FILE *f));
389 STATIC int mime_integrity PROTO((FILE *f,const unsigned char *p));
391 STATIC int base64decode PROTO((int c));
392 STATIC void mime_prechar PROTO((int c2, int c1));
393 STATIC void mime_putc PROTO((int c));
394 STATIC void open_mime PROTO((int c));
395 STATIC void close_mime PROTO(());
397 STATIC void usage PROTO(());
398 STATIC void version PROTO(());
400 STATIC void options PROTO((unsigned char *c));
401 #if defined(PERL_XS) || defined(WIN32DLL)
402 STATIC void reinit PROTO(());
407 #if !defined(PERL_XS) && !defined(WIN32DLL)
408 STATIC unsigned char stdibuf[IOBUF_SIZE];
409 STATIC unsigned char stdobuf[IOBUF_SIZE];
411 STATIC unsigned char hold_buf[HOLD_SIZE*2];
412 STATIC int hold_count;
414 /* MIME preprocessor fifo */
416 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
417 #define MIME_BUF_MASK (MIME_BUF_SIZE-1) /* index mask; relies on MIME_BUF_SIZE being a power of two */
418 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK] /* ring-buffer slot for position n; the index wraps through the mask */
419 STATIC unsigned char mime_buf[MIME_BUF_SIZE];
420 STATIC unsigned int mime_top = 0; /* NOTE(review): presumably the current read position - confirm */
421 STATIC unsigned int mime_last = 0; /* decoded */
422 STATIC unsigned int mime_input = 0; /* undecoded */
423 STATIC int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL; /* NOTE(review): presumably the saved input converter - confirm */
/* Option flags, each initialised here to its default value. */
426 STATIC int unbuf_f = FALSE; /* NOTE(review): presumably unbuffered output (-u) - confirm */
427 STATIC int estab_f = FALSE;
428 STATIC int nop_f = FALSE;
429 STATIC int binmode_f = TRUE; /* binary mode */
430 STATIC int rot_f = FALSE; /* ROT13/47 mode */
431 STATIC int hira_f = FALSE; /* hiragana/katakana conversion */
432 STATIC int input_f = FALSE; /* non fixed input code */
433 STATIC int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
434 STATIC int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
435 STATIC int mime_decode_f = FALSE; /* mime decode is explicitly on */
436 STATIC int mimebuf_f = FALSE; /* MIME buffered input */
437 STATIC int broken_f = FALSE; /* convert ESC-less broken JIS */
438 STATIC int iso8859_f = FALSE; /* ISO8859 through */
439 STATIC int mimeout_f = FALSE; /* base64 mode */
440 #if defined(MSDOS) || defined(__OS2__)
441 STATIC int x0201_f = TRUE; /* Assume JISX0201 kana */
443 STATIC int x0201_f = NO_X0201; /* Assume NO JISX0201 */
445 STATIC int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
447 #ifdef UNICODE_NORMALIZATION
448 STATIC int nfc_f = FALSE;
449 STATIC int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of ugetc */
450 STATIC int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
451 STATIC int nfc_getc PROTO((FILE *f));
452 STATIC int nfc_ungetc PROTO((int c,FILE *f));
456 STATIC int cap_f = FALSE;
457 STATIC int (*i_cgetc)PROTO((FILE *)) = std_getc; /* input of cgetc */
458 STATIC int (*i_cungetc)PROTO((int c ,FILE *f)) = std_ungetc;
459 STATIC int cap_getc PROTO((FILE *f));
460 STATIC int cap_ungetc PROTO((int c,FILE *f));
462 STATIC int url_f = FALSE;
463 STATIC int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
464 STATIC int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
465 STATIC int url_getc PROTO((FILE *f));
466 STATIC int url_ungetc PROTO((int c,FILE *f));
469 #ifdef NUMCHAR_OPTION
470 #define CLASS_MASK 0x0f000000
471 #define CLASS_UTF16 0x01000000
472 STATIC int numchar_f = FALSE;
473 STATIC int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
474 STATIC int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
475 STATIC int numchar_getc PROTO((FILE *f));
476 STATIC int numchar_ungetc PROTO((int c,FILE *f));
480 STATIC int noout_f = FALSE;
481 STATIC void no_putc PROTO((int c));
482 STATIC int debug_f = FALSE;
483 STATIC void debug PROTO((const char *str));
484 STATIC int (*iconv_for_check)() = 0;
487 STATIC int guess_f = FALSE;
489 STATIC void print_guessed_code PROTO((char *filename));
491 STATIC void set_input_codename PROTO((char *codename));
492 STATIC int is_inputcode_mixed = FALSE;
493 STATIC int is_inputcode_set = FALSE;
496 STATIC int exec_f = 0;
499 #ifdef SHIFTJIS_CP932
500 /* invert IBM extended characters to others */
501 STATIC int cp51932_f = TRUE;
502 #define CP932_TABLE_BEGIN (0xfa)
503 #define CP932_TABLE_END (0xfc)
505 /* invert NEC-selected IBM extended characters to IBM extended characters */
506 STATIC int cp932inv_f = TRUE;
507 #define CP932INV_TABLE_BEGIN (0xed)
508 #define CP932INV_TABLE_END (0xee)
510 /* STATIC int cp932_conv PROTO((int c2, int c1)); */
511 #endif /* SHIFTJIS_CP932 */
514 STATIC int x0212_f = FALSE;
515 STATIC int x0212_shift PROTO((int c));
516 STATIC int x0212_unshift PROTO((int c));
518 STATIC int x0213_f = FALSE;
520 STATIC unsigned char prefix_table[256];
522 STATIC void e_status PROTO((struct input_code *, int));
523 STATIC void s_status PROTO((struct input_code *, int));
525 #ifdef UTF8_INPUT_ENABLE
526 STATIC void w_status PROTO((struct input_code *, int));
527 STATIC void w16_status PROTO((struct input_code *, int));
528 STATIC int utf16_mode = UTF16BE_INPUT;
531 struct input_code input_code_list[] = {
532 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
533 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
534 #ifdef UTF8_INPUT_ENABLE
535 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
536 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
541 STATIC int mimeout_mode = 0;
542 STATIC int base64_count = 0;
544 /* X0208 -> ASCII converter */
547 STATIC int f_line = 0; /* chars in line */
548 STATIC int f_prev = 0;
549 STATIC int fold_preserve_f = FALSE; /* preserve new lines */
550 STATIC int fold_f = FALSE;
551 STATIC int fold_len = 0;
554 STATIC unsigned char kanji_intro = DEFAULT_J;
555 STATIC unsigned char ascii_intro = DEFAULT_R;
559 #define FOLD_MARGIN 10
560 #define DEFAULT_FOLD 60
562 STATIC int fold_margin = FOLD_MARGIN;
566 #ifdef DEFAULT_CODE_JIS
567 # define DEFAULT_CONV j_oconv
569 #ifdef DEFAULT_CODE_SJIS
570 # define DEFAULT_CONV s_oconv
572 #ifdef DEFAULT_CODE_EUC
573 # define DEFAULT_CONV e_oconv
575 #ifdef DEFAULT_CODE_UTF8
576 # define DEFAULT_CONV w_oconv
579 /* process default */
580 STATIC void (*output_conv)PROTO((int c2,int c1)) = DEFAULT_CONV;
582 STATIC void (*oconv)PROTO((int c2,int c1)) = no_connection;
583 /* s_iconv or oconv */
584 STATIC int (*iconv)PROTO((int c2,int c1,int c0)) = no_connection2;
586 STATIC void (*o_zconv)PROTO((int c2,int c1)) = no_connection;
587 STATIC void (*o_fconv)PROTO((int c2,int c1)) = no_connection;
588 STATIC void (*o_crconv)PROTO((int c2,int c1)) = no_connection;
589 STATIC void (*o_rot_conv)PROTO((int c2,int c1)) = no_connection;
590 STATIC void (*o_hira_conv)PROTO((int c2,int c1)) = no_connection;
591 STATIC void (*o_base64conv)PROTO((int c2,int c1)) = no_connection;
592 STATIC void (*o_iso2022jp_check_conv)PROTO((int c2,int c1)) = no_connection;
594 /* STATIC redirections */
596 STATIC void (*o_putc)PROTO((int c)) = std_putc;
598 STATIC int (*i_getc)PROTO((FILE *f)) = std_getc; /* general input */
599 STATIC int (*i_ungetc)PROTO((int c,FILE *f)) =std_ungetc;
601 STATIC int (*i_bgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
602 STATIC int (*i_bungetc)PROTO((int c ,FILE *f)) = std_ungetc;
604 STATIC void (*o_mputc)PROTO((int c)) = std_putc ; /* output of mputc */
606 STATIC int (*i_mgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
607 STATIC int (*i_mungetc)PROTO((int c ,FILE *f)) = std_ungetc;
609 /* for strict mime */
610 STATIC int (*i_mgetc_buf)PROTO((FILE *)) = std_getc; /* input of mgetc_buf */
611 STATIC int (*i_mungetc_buf)PROTO((int c,FILE *f)) = std_ungetc;
614 STATIC int output_mode = ASCII, /* output kanji mode */
615 input_mode = ASCII, /* input kanji mode */
616 shift_mode = FALSE; /* TRUE shift out, or X0201 */
617 STATIC int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
619 /* X0201 / X0208 conversion tables */
621 /* X0201 kana conversion table */
624 unsigned char cv[]= {
625 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
626 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
627 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
628 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
629 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
630 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
631 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
632 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
633 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
634 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
635 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
636 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
637 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
638 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
639 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
640 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
644 /* X0201 kana conversion table for dakuten */
647 unsigned char dv[]= {
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
653 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
654 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
655 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
656 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
657 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
659 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 /* X0201 kana conversion table for han-dakuten */
669 unsigned char ev[]= {
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
681 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
682 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
683 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
684 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
685 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
689 /* X0208 kigou conversion table */
690 /* 0x8140 - 0x819e */
692 unsigned char fv[] = {
694 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
695 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
696 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
698 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
699 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
700 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
702 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
703 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
704 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
705 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
711 STATIC int file_out_f = FALSE;
713 STATIC int overwrite_f = FALSE;
714 STATIC int preserve_time_f = FALSE;
715 STATIC int backup_f = FALSE;
716 STATIC char *backup_suffix = "";
717 STATIC char *get_backup_filename PROTO((const char *suffix, const char *filename));
720 STATIC int crmode_f = 0; /* CR, NL, CRLF */
721 #ifdef EASYWIN /*Easy Win */
722 STATIC int end_check;
725 #define STD_GC_BUFSIZE (256)
726 int std_gc_buf[STD_GC_BUFSIZE];
730 #include "nkf32dll.c"
731 #elif defined(PERL_XS)
741 char *outfname = NULL;
744 #ifdef EASYWIN /*Easy Win */
745 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
748 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
749 cp = (unsigned char *)*argv;
754 if (pipe(fds) < 0 || (pid = fork()) < 0){
765 execvp(argv[1], &argv[1]);
779 if(x0201_f == WISH_TRUE)
780 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
782 if (binmode_f == TRUE)
784 if (freopen("","wb",stdout) == NULL)
791 setbuf(stdout, (char *) NULL);
793 setvbuffer(stdout, stdobuf, IOBUF_SIZE);
796 if (binmode_f == TRUE)
798 if (freopen("","rb",stdin) == NULL) return (-1);
802 setvbuffer(stdin, stdibuf, IOBUF_SIZE);
806 kanji_convert(stdin);
807 if (guess_f) print_guessed_code(NULL);
812 is_inputcode_mixed = FALSE;
813 is_inputcode_set = FALSE;
818 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
827 /* reopen file for stdout */
828 if (file_out_f == TRUE) {
831 outfname = malloc(strlen(origfname)
832 + strlen(".nkftmpXXXXXX")
838 strcpy(outfname, origfname);
842 for (i = strlen(outfname); i; --i){
843 if (outfname[i - 1] == '/'
844 || outfname[i - 1] == '\\'){
850 strcat(outfname, "ntXXXXXX");
852 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
855 strcat(outfname, ".nkftmpXXXXXX");
856 fd = mkstemp(outfname);
859 || (fd_backup = dup(fileno(stdout))) < 0
860 || dup2(fd, fileno(stdout)) < 0
871 outfname = "nkf.out";
874 if(freopen(outfname, "w", stdout) == NULL) {
878 if (binmode_f == TRUE) {
880 if (freopen("","wb",stdout) == NULL)
887 if (binmode_f == TRUE)
889 if (freopen("","rb",fin) == NULL)
894 setvbuffer(fin, stdibuf, IOBUF_SIZE);
898 char *filename = NULL;
900 if (nfiles > 1) filename = origfname;
901 if (guess_f) print_guessed_code(filename);
907 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
915 if (dup2(fd_backup, fileno(stdout)) < 0){
918 if (stat(origfname, &sb)) {
919 fprintf(stderr, "Can't stat %s\n", origfname);
921 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
922 if (chmod(outfname, sb.st_mode)) {
923 fprintf(stderr, "Can't set permission %s\n", outfname);
926 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
928 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
929 tb[0] = tb[1] = sb.st_mtime;
930 if (utime(outfname, tb)) {
931 fprintf(stderr, "Can't set timestamp %s\n", outfname);
934 tb.actime = sb.st_atime;
935 tb.modtime = sb.st_mtime;
936 if (utime(outfname, &tb)) {
937 fprintf(stderr, "Can't set timestamp %s\n", outfname);
942 char *backup_filename = get_backup_filename(backup_suffix, origfname);
944 unlink(backup_filename);
946 if (rename(origfname, backup_filename)) {
947 perror(backup_filename);
948 fprintf(stderr, "Can't rename %s to %s\n",
949 origfname, backup_filename);
953 if (unlink(origfname)){
958 if (rename(outfname, origfname)) {
960 fprintf(stderr, "Can't rename %s to %s\n",
961 outfname, origfname);
969 #ifdef EASYWIN /*Easy Win */
970 if (file_out_f == FALSE)
971 scanf("%d",&end_check);
974 #else /* for Other OS */
975 if (file_out_f == TRUE)
980 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters are copied literally
 * (suffix "old_*" and filename "a.txt" give "old_a.txt").
 * Otherwise `suffix` is appended to `filename` ("a.txt" + ".bak").
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or NULL
 * (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(suffix, filename)
    const char *suffix;
    const char *filename;
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' gives up its own byte and takes filename_length bytes */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /*
         * The buffer must hold filename + suffix + NUL.  The previous code
         * requested a single byte here and then overflowed it.
         */
        backup_filename = malloc(filename_length + suffix_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        /* + 1 copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }

    return backup_filename;
}
1050 {"katakana-hiragana","h3"},
1057 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1058 {"internal-unicode", ""},
1060 #ifdef UTF8_OUTPUT_ENABLE
1070 {"fb-subchar=", ""},
1072 #ifdef UTF8_INPUT_ENABLE
1073 {"utf8-input", "W"},
1074 {"utf16-input", "W16"},
1075 {"no-cp932ext", ""},
1076 {"no-best-fit-chars",""},
1078 #ifdef UNICODE_NORMALIZATION
1079 {"utf8mac-input", ""},
1091 #ifdef NUMCHAR_OPTION
1092 {"numchar-input", ""},
1098 #ifdef SHIFTJIS_CP932
1108 STATIC int option_mode = 0;
1115 unsigned char *p = NULL;
1116 unsigned char *cp_back = NULL;
1117 unsigned char codeset[32];
1121 while(*cp && *cp++!='-');
1122 while (*cp || cp_back) {
1130 case '-': /* literal options */
1131 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1135 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1136 p = (unsigned char *)long_option[i].name;
1137 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1138 if (*p == cp[j] || cp[j] == ' '){
1145 while(*cp && *cp != SPACE && cp++);
1146 if (long_option[i].alias[0]){
1148 cp = (unsigned char *)long_option[i].alias;
1150 if (strcmp(long_option[i].name, "ic=") == 0){
1151 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1152 codeset[i] = nkf_toupper(p[i]);
1155 if(strcmp(codeset, "ISO-2022-JP") == 0){
1156 input_f = JIS_INPUT;
1157 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1158 input_f = JIS_INPUT;
1162 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1163 input_f = JIS_INPUT;
1168 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1169 input_f = SJIS_INPUT;
1170 if (x0201_f==NO_X0201) x0201_f=TRUE;
1171 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1172 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1173 strcmp(codeset, "CP932") == 0 ||
1174 strcmp(codeset, "MS932") == 0){
1175 input_f = SJIS_INPUT;
1177 #ifdef SHIFTJIS_CP932
1180 #ifdef UTF8_OUTPUT_ENABLE
1181 ms_ucs_map_f = UCS_MAP_CP932;
1183 }else if(strcmp(codeset, "EUCJP") == 0 ||
1184 strcmp(codeset, "EUC-JP") == 0){
1185 input_f = JIS_INPUT;
1186 }else if(strcmp(codeset, "CP51932") == 0){
1187 input_f = JIS_INPUT;
1189 #ifdef SHIFTJIS_CP932
1192 #ifdef UTF8_OUTPUT_ENABLE
1193 ms_ucs_map_f = UCS_MAP_CP932;
1195 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1196 strcmp(codeset, "EUCJP-MS") == 0 ||
1197 strcmp(codeset, "EUCJPMS") == 0){
1198 input_f = JIS_INPUT;
1200 #ifdef SHIFTJIS_CP932
1203 #ifdef UTF8_OUTPUT_ENABLE
1204 ms_ucs_map_f = UCS_MAP_MS;
1206 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1207 strcmp(codeset, "EUCJP-ASCII") == 0){
1208 input_f = JIS_INPUT;
1210 #ifdef SHIFTJIS_CP932
1213 #ifdef UTF8_OUTPUT_ENABLE
1214 ms_ucs_map_f = UCS_MAP_ASCII;
1216 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0){
1217 input_f = SJIS_INPUT;
1219 #ifdef SHIFTJIS_CP932
1223 if (x0201_f==NO_X0201) x0201_f=TRUE;
1224 }else if(strcmp(codeset, "EUC-JISX0213") == 0){
1225 input_f = JIS_INPUT;
1228 #ifdef SHIFTJIS_CP932
1232 #ifdef UTF8_INPUT_ENABLE
1233 }else if(strcmp(codeset, "UTF-8") == 0 ||
1234 strcmp(codeset, "UTF-8N") == 0 ||
1235 strcmp(codeset, "UTF-8-BOM") == 0){
1236 input_f = UTF8_INPUT;
1237 #ifdef UNICODE_NORMALIZATION
1238 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1239 strcmp(codeset, "UTF-8-MAC") == 0){
1240 input_f = UTF8_INPUT;
1243 }else if(strcmp(codeset, "UTF-16") == 0){
1244 input_f = UTF16BE_INPUT;
1245 utf16_mode = UTF16BE_INPUT;
1246 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1247 strcmp(codeset, "UTF-16BE-BOM") == 0){
1248 input_f = UTF16BE_INPUT;
1249 utf16_mode = UTF16BE_INPUT;
1250 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1251 strcmp(codeset, "UTF-16LE-BOM") == 0){
1252 input_f = UTF16LE_INPUT;
1253 utf16_mode = UTF16LE_INPUT;
1258 if (strcmp(long_option[i].name, "oc=") == 0){
1259 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1260 codeset[i] = nkf_toupper(p[i]);
1263 if(strcmp(codeset, "ISO-2022-JP") == 0){
1264 output_conv = j_oconv;
1265 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1266 output_conv = j_oconv;
1270 #ifdef SHIFTJIS_CP932
1273 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1274 output_conv = j_oconv;
1279 #ifdef SHIFTJIS_CP932
1282 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1283 output_conv = j_oconv;
1288 #ifdef SHIFTJIS_CP932
1291 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1292 output_conv = s_oconv;
1293 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1294 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1295 strcmp(codeset, "CP932") == 0 ||
1296 strcmp(codeset, "MS932") == 0){
1297 output_conv = s_oconv;
1299 #ifdef SHIFTJIS_CP932
1303 #ifdef UTF8_OUTPUT_ENABLE
1304 ms_ucs_map_f = UCS_MAP_CP932;
1306 }else if(strcmp(codeset, "EUCJP") == 0 ||
1307 strcmp(codeset, "EUC-JP") == 0){
1308 output_conv = e_oconv;
1309 }else if(strcmp(codeset, "CP51932") == 0){
1310 output_conv = e_oconv;
1312 #ifdef SHIFTJIS_CP932
1315 #ifdef UTF8_OUTPUT_ENABLE
1316 ms_ucs_map_f = UCS_MAP_CP932;
1318 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1319 strcmp(codeset, "EUCJP-MS") == 0 ||
1320 strcmp(codeset, "EUCJPMS") == 0){
1321 output_conv = e_oconv;
1326 #ifdef SHIFTJIS_CP932
1329 #ifdef UTF8_OUTPUT_ENABLE
1330 ms_ucs_map_f = UCS_MAP_MS;
1332 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1333 strcmp(codeset, "EUCJP-ASCII") == 0){
1334 output_conv = e_oconv;
1339 #ifdef SHIFTJIS_CP932
1342 #ifdef UTF8_OUTPUT_ENABLE
1343 ms_ucs_map_f = UCS_MAP_ASCII;
1345 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0){
1346 output_conv = s_oconv;
1348 #ifdef SHIFTJIS_CP932
1351 }else if(strcmp(codeset, "EUC-JISX0213") == 0){
1352 output_conv = e_oconv;
1357 #ifdef SHIFTJIS_CP932
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 }else if(strcmp(codeset, "UTF-8") == 0){
1362 output_conv = w_oconv;
1363 }else if(strcmp(codeset, "UTF-8N") == 0){
1364 output_conv = w_oconv;
1366 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1367 output_conv = w_oconv;
1369 }else if(strcmp(codeset, "UTF-16BE") == 0){
1370 output_conv = w_oconv16;
1372 }else if(strcmp(codeset, "UTF-16") == 0 ||
1373 strcmp(codeset, "UTF-16BE-BOM") == 0){
1374 output_conv = w_oconv16;
1376 }else if(strcmp(codeset, "UTF-16LE") == 0){
1377 output_conv = w_oconv16;
1380 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1381 output_conv = w_oconv16;
1389 if (strcmp(long_option[i].name, "overwrite") == 0){
1392 preserve_time_f = TRUE;
1395 if (strcmp(long_option[i].name, "overwrite=") == 0){
1398 preserve_time_f = TRUE;
1400 backup_suffix = malloc(strlen(p) + 1);
1401 strcpy(backup_suffix, p);
1404 if (strcmp(long_option[i].name, "in-place") == 0){
1407 preserve_time_f = FALSE;
1410 if (strcmp(long_option[i].name, "in-place=") == 0){
1413 preserve_time_f = FALSE;
1415 backup_suffix = malloc(strlen(p) + 1);
1416 strcpy(backup_suffix, p);
1421 if (strcmp(long_option[i].name, "cap-input") == 0){
1425 if (strcmp(long_option[i].name, "url-input") == 0){
1430 #ifdef NUMCHAR_OPTION
1431 if (strcmp(long_option[i].name, "numchar-input") == 0){
1437 if (strcmp(long_option[i].name, "no-output") == 0){
1441 if (strcmp(long_option[i].name, "debug") == 0){
1446 if (strcmp(long_option[i].name, "cp932") == 0){
1447 #ifdef SHIFTJIS_CP932
1451 #ifdef UTF8_OUTPUT_ENABLE
1452 ms_ucs_map_f = UCS_MAP_CP932;
1456 if (strcmp(long_option[i].name, "no-cp932") == 0){
1457 #ifdef SHIFTJIS_CP932
1461 #ifdef UTF8_OUTPUT_ENABLE
1462 ms_ucs_map_f = UCS_MAP_ASCII;
1466 #ifdef SHIFTJIS_CP932
1467 if (strcmp(long_option[i].name, "cp932inv") == 0){
1474 if (strcmp(long_option[i].name, "x0212") == 0){
1481 if (strcmp(long_option[i].name, "exec-in") == 0){
1485 if (strcmp(long_option[i].name, "exec-out") == 0){
1490 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1491 if (strcmp(long_option[i].name, "internal-unicode") == 0){
1492 internal_unicode_f = TRUE;
1495 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1496 no_cp932ext_f = TRUE;
1499 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1500 no_best_fit_chars_f = TRUE;
1503 if (strcmp(long_option[i].name, "fb-skip") == 0){
1504 encode_fallback = NULL;
1507 if (strcmp(long_option[i].name, "fb-html") == 0){
1508 encode_fallback = encode_fallback_html;
1511 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1512 encode_fallback = encode_fallback_xml;
1515 if (strcmp(long_option[i].name, "fb-java") == 0){
1516 encode_fallback = encode_fallback_java;
1519 if (strcmp(long_option[i].name, "fb-perl") == 0){
1520 encode_fallback = encode_fallback_perl;
1523 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1524 encode_fallback = encode_fallback_subchar;
1527 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1528 encode_fallback = encode_fallback_subchar;
1529 unicode_subchar = 0;
1531 /* decimal number */
1532 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1533 unicode_subchar *= 10;
1534 unicode_subchar += hex2bin(p[i]);
1536 }else if(p[1] == 'x' || p[1] == 'X'){
1537 /* hexadecimal number */
1538 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1539 unicode_subchar <<= 4;
1540 unicode_subchar |= hex2bin(p[i]);
1544 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1545 unicode_subchar *= 8;
1546 unicode_subchar += hex2bin(p[i]);
1549 w16e_conv(unicode_subchar, &i, &j);
1550 unicode_subchar = i<<8 | j;
1554 #ifdef UTF8_OUTPUT_ENABLE
1555 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1556 ms_ucs_map_f = UCS_MAP_MS;
1560 #ifdef UNICODE_NORMALIZATION
1561 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1562 input_f = UTF8_INPUT;
1567 if (strcmp(long_option[i].name, "prefix=") == 0){
1568 if (' ' < p[0] && p[0] < 128){
1569 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1570 prefix_table[p[i]] = p[0];
1577 case 'b': /* buffered mode */
1580 case 'u': /* non-buffered mode */
1583 case 't': /* transparent mode */
1586 case 'j': /* JIS output */
1588 output_conv = j_oconv;
1590 case 'e': /* AT&T EUC output */
1591 output_conv = e_oconv;
1593 case 's': /* SJIS output */
1594 output_conv = s_oconv;
1596 case 'l': /* ISO8859 Latin-1 support, no conversion */
1597 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1598 input_f = LATIN1_INPUT;
1600 case 'i': /* Kanji IN ESC-$-@/B */
1601 if (*cp=='@'||*cp=='B')
1602 kanji_intro = *cp++;
1604 case 'o': /* ASCII IN ESC-(-J/B */
1605 if (*cp=='J'||*cp=='B'||*cp=='H')
1606 ascii_intro = *cp++;
1610 bit:1 katakana->hiragana
1611 bit:2 hiragana->katakana
1613 if ('9'>= *cp && *cp>='0')
1614 hira_f |= (*cp++ -'0');
1621 #if defined(MSDOS) || defined(__OS2__)
1636 #ifdef UTF8_OUTPUT_ENABLE
1637 case 'w': /* UTF-8 output */
1638 if ('1'== cp[0] && '6'==cp[1]) {
1639 output_conv = w_oconv16; cp+=2;
1641 unicode_bom_f=2; cp++;
1644 unicode_bom_f=1; cp++;
1646 } else if (cp[0] == 'B') {
1647 unicode_bom_f=2; cp++;
1649 unicode_bom_f=1; cp++;
1652 } else if (cp[0] == '8') {
1653 output_conv = w_oconv; cp++;
1656 unicode_bom_f=1; cp++;
1659 output_conv = w_oconv;
1662 #ifdef UTF8_INPUT_ENABLE
1663 case 'W': /* UTF-8 input */
1664 if ('1'== cp[0] && '6'==cp[1]) {
1665 input_f = UTF16BE_INPUT;
1666 utf16_mode = UTF16BE_INPUT;
1670 input_f = UTF16LE_INPUT;
1671 utf16_mode = UTF16LE_INPUT;
1672 } else if (cp[0] == 'B') {
1674 input_f = UTF16BE_INPUT;
1675 utf16_mode = UTF16BE_INPUT;
1677 } else if (cp[0] == '8') {
1679 input_f = UTF8_INPUT;
1681 input_f = UTF8_INPUT;
1684 /* Input code assumption */
1685 case 'J': /* JIS input */
1686 case 'E': /* AT&T EUC input */
1687 input_f = JIS_INPUT;
1689 case 'S': /* MS Kanji input */
1690 input_f = SJIS_INPUT;
1691 if (x0201_f==NO_X0201) x0201_f=TRUE;
1693 case 'Z': /* Convert X0208 alphabet to ASCII */
1694 /* bit:0 Convert X0208
1695 bit:1 Convert Kankaku to one space
1696 bit:2 Convert Kankaku to two spaces
1697 bit:3 Convert HTML Entity
1699 if ('9'>= *cp && *cp>='0')
1700 alpha_f |= 1<<(*cp++ -'0');
1704 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1705 x0201_f = FALSE; /* No X0201->X0208 conversion */
1707 ESC-(-I in JIS, EUC, MS Kanji
1708 SI/SO in JIS, EUC, MS Kanji
1709 SSO in EUC, JIS, not in MS Kanji
1710 MS Kanji (0xa0-0xdf)
1712 ESC-(-I in JIS (0x20-0x5f)
1713 SSO in EUC (0xa0-0xdf)
1714 0xa0-0xd in MS Kanji (0xa0-0xdf)
1717 case 'X': /* Assume X0201 kana */
1718 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1721 case 'F': /* preserve new lines */
1722 fold_preserve_f = TRUE;
1723 case 'f': /* folding -f60 or -f */
1726 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1728 fold_len += *cp++ - '0';
1730 if (!(0<fold_len && fold_len<BUFSIZ))
1731 fold_len = DEFAULT_FOLD;
1735 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1737 fold_margin += *cp++ - '0';
1741 case 'm': /* MIME support */
1742 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1743 if (*cp=='B'||*cp=='Q') {
1744 mime_decode_mode = *cp++;
1745 mimebuf_f = FIXED_MIME;
1746 } else if (*cp=='N') {
1747 mime_f = TRUE; cp++;
1748 } else if (*cp=='S') {
1749 mime_f = STRICT_MIME; cp++;
1750 } else if (*cp=='0') {
1751 mime_decode_f = FALSE;
1752 mime_f = FALSE; cp++;
1755 case 'M': /* MIME output */
1758 mimeout_f = FIXED_MIME; cp++;
1759 } else if (*cp=='Q') {
1761 mimeout_f = FIXED_MIME; cp++;
1766 case 'B': /* Broken JIS support */
1768 bit:1 allow any x on ESC-(-x or ESC-$-x
1769 bit:2 reset to ascii on NL
1771 if ('9'>= *cp && *cp>='0')
1772 broken_f |= 1<<(*cp++ -'0');
1777 case 'O':/* for Output file */
1781 case 'c':/* add cr code */
1784 case 'd':/* delete cr code */
1787 case 'I': /* ISO-2022-JP output */
1790 case 'L': /* line mode */
1791 if (*cp=='u') { /* unix */
1792 crmode_f = NL; cp++;
1793 } else if (*cp=='m') { /* mac */
1794 crmode_f = CR; cp++;
1795 } else if (*cp=='w') { /* windows */
1796 crmode_f = CRLF; cp++;
1797 } else if (*cp=='0') { /* no conversion */
1807 /* module: multiple options in a string are allowed for the Perl module */
1808 while(*cp && *cp++!='-');
1811 /* bogus option but ignored */
1817 #ifdef ANSI_C_PROTOTYPE
1818 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1820 struct input_code * find_inputcode_byfunc(iconv_func)
1821 int (*iconv_func)();
1825 struct input_code *p = input_code_list;
1827 if (iconv_func == p->iconv_func){
1836 #ifdef ANSI_C_PROTOTYPE
1837 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1839 void set_iconv(f, iconv_func)
1841 int (*iconv_func)();
1844 #ifdef INPUT_CODE_FIX
1852 #ifdef INPUT_CODE_FIX
1853 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1859 if (estab_f && iconv_for_check != iconv){
1860 struct input_code *p = find_inputcode_byfunc(iconv);
1862 set_input_codename(p->name);
1863 debug(input_codename);
1865 iconv_for_check = iconv;
1870 #define SCORE_L2 (1) /* JIS level-2 kanji */
1871 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1872 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1873 #ifdef SHIFTJIS_CP932
1874 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character substituted via CP932 */
1875 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1877 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1879 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1880 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1882 #define SCORE_INIT (SCORE_iMIME) /* score assigned by status_reset() */
1884 const int score_table_A0[] = {
1887 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1888 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1891 const int score_table_F0[] = {
1892 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1893 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1894 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1895 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1898 void set_code_score(ptr, score)
1899 struct input_code *ptr;
1903 ptr->score |= score;
1907 void clr_code_score(ptr, score)
1908 struct input_code *ptr;
1912 ptr->score &= ~score;
1916 void code_score(ptr)
1917 struct input_code *ptr;
1919 int c2 = ptr->buf[0];
1920 #ifdef UTF8_OUTPUT_ENABLE
1921 int c1 = ptr->buf[1];
1924 set_code_score(ptr, SCORE_ERROR);
1925 }else if (c2 == SSO){
1926 set_code_score(ptr, SCORE_KANA);
1927 #ifdef UTF8_OUTPUT_ENABLE
1928 }else if (!e2w_conv(c2, c1)){
1929 set_code_score(ptr, SCORE_NO_EXIST);
1931 }else if ((c2 & 0x70) == 0x20){
1932 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1933 }else if ((c2 & 0x70) == 0x70){
1934 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1935 }else if ((c2 & 0x70) >= 0x50){
1936 set_code_score(ptr, SCORE_L2);
1940 void status_disable(ptr)
1941 struct input_code *ptr;
1946 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1949 void status_push_ch(ptr, c)
1950 struct input_code *ptr;
1953 ptr->buf[ptr->index++] = c;
1956 void status_clear(ptr)
1957 struct input_code *ptr;
1963 void status_reset(ptr)
1964 struct input_code *ptr;
1967 ptr->score = SCORE_INIT;
1970 void status_reinit(ptr)
1971 struct input_code *ptr;
1974 ptr->_file_stat = 0;
1977 void status_check(ptr, c)
1978 struct input_code *ptr;
1981 if (c <= DEL && estab_f){
1986 void s_status(ptr, c)
1987 struct input_code *ptr;
1992 status_check(ptr, c);
1997 #ifdef NUMCHAR_OPTION
1998 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2001 }else if (0xa1 <= c && c <= 0xdf){
2002 status_push_ch(ptr, SSO);
2003 status_push_ch(ptr, c);
2006 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2008 status_push_ch(ptr, c);
2009 #ifdef SHIFTJIS_CP932
2011 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
2013 status_push_ch(ptr, c);
2014 #endif /* SHIFTJIS_CP932 */
2016 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2018 status_push_ch(ptr, c);
2019 #endif /* X0212_ENABLE */
2021 status_disable(ptr);
2025 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2026 status_push_ch(ptr, c);
2027 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2031 status_disable(ptr);
2035 #ifdef SHIFTJIS_CP932
2036 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2037 status_push_ch(ptr, c);
2038 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2039 set_code_score(ptr, SCORE_CP932);
2044 #endif /* SHIFTJIS_CP932 */
2045 #ifndef X0212_ENABLE
2046 status_disable(ptr);
2052 void e_status(ptr, c)
2053 struct input_code *ptr;
2058 status_check(ptr, c);
2063 #ifdef NUMCHAR_OPTION
2064 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2067 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2069 status_push_ch(ptr, c);
2071 }else if (0x8f == c){
2073 status_push_ch(ptr, c);
2074 #endif /* X0212_ENABLE */
2076 status_disable(ptr);
2080 if (0xa1 <= c && c <= 0xfe){
2081 status_push_ch(ptr, c);
2085 status_disable(ptr);
2090 if (0xa1 <= c && c <= 0xfe){
2092 status_push_ch(ptr, c);
2094 status_disable(ptr);
2096 #endif /* X0212_ENABLE */
2100 #ifdef UTF8_INPUT_ENABLE
2101 void w16_status(ptr, c)
2102 struct input_code *ptr;
2109 if (ptr->_file_stat == 0){
2110 if (c == 0xfe || c == 0xff){
2112 status_push_ch(ptr, c);
2113 ptr->_file_stat = 1;
2115 status_disable(ptr);
2116 ptr->_file_stat = -1;
2118 }else if (ptr->_file_stat > 0){
2120 status_push_ch(ptr, c);
2121 }else if (ptr->_file_stat < 0){
2122 status_disable(ptr);
2128 status_disable(ptr);
2129 ptr->_file_stat = -1;
2131 status_push_ch(ptr, c);
2138 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
2139 status_push_ch(ptr, c);
2142 status_disable(ptr);
2143 ptr->_file_stat = -1;
2149 void w_status(ptr, c)
2150 struct input_code *ptr;
2155 status_check(ptr, c);
2160 #ifdef NUMCHAR_OPTION
2161 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2164 }else if (0xc0 <= c && c <= 0xdf){
2166 status_push_ch(ptr, c);
2167 }else if (0xe0 <= c && c <= 0xef){
2169 status_push_ch(ptr, c);
2171 status_disable(ptr);
2176 if (0x80 <= c && c <= 0xbf){
2177 status_push_ch(ptr, c);
2178 if (ptr->index > ptr->stat){
2179 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2180 && ptr->buf[2] == 0xbf);
2181 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2182 &ptr->buf[0], &ptr->buf[1]);
2189 status_disable(ptr);
2200 int action_flag = 1;
2201 struct input_code *result = 0;
2202 struct input_code *p = input_code_list;
2204 (p->status_func)(p, c);
2207 }else if(p->stat == 0){
2218 if (result && !estab_f){
2219 set_iconv(TRUE, result->iconv_func);
2220 }else if (c <= DEL){
2221 struct input_code *ptr = input_code_list;
2236 return std_gc_buf[--std_gc_ndx];
2247 if (std_gc_ndx == STD_GC_BUFSIZE){
2250 std_gc_buf[std_gc_ndx++] = c;
2264 #if !defined(PERL_XS) && !defined(WIN32DLL)
2271 while ((c = (*i_getc)(f)) != EOF)
2280 oconv = output_conv;
2283 /* replace continuation module, from output side */
2285 /* output redirection */
2287 if (noout_f || guess_f){
2294 if (mimeout_f == TRUE) {
2295 o_base64conv = oconv; oconv = base64_conv;
2297 /* base64_count = 0; */
2301 o_crconv = oconv; oconv = cr_conv;
2304 o_rot_conv = oconv; oconv = rot_conv;
2307 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2310 o_hira_conv = oconv; oconv = hira_conv;
2313 o_fconv = oconv; oconv = fold_conv;
2316 if (alpha_f || x0201_f) {
2317 o_zconv = oconv; oconv = z_conv;
2321 i_ungetc = std_ungetc;
2322 /* input redirection */
2325 i_cgetc = i_getc; i_getc = cap_getc;
2326 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2329 i_ugetc = i_getc; i_getc = url_getc;
2330 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2333 #ifdef NUMCHAR_OPTION
2335 i_ngetc = i_getc; i_getc = numchar_getc;
2336 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2339 #ifdef UNICODE_NORMALIZATION
2340 if (nfc_f && input_f == UTF8_INPUT){
2341 i_nfc_getc = i_getc; i_getc = nfc_getc;
2342 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2345 if (mime_f && mimebuf_f==FIXED_MIME) {
2346 i_mgetc = i_getc; i_getc = mime_getc;
2347 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2350 i_bgetc = i_getc; i_getc = broken_getc;
2351 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2353 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2354 set_iconv(-TRUE, e_iconv);
2355 } else if (input_f == SJIS_INPUT) {
2356 set_iconv(-TRUE, s_iconv);
2357 #ifdef UTF8_INPUT_ENABLE
2358 } else if (input_f == UTF8_INPUT) {
2359 set_iconv(-TRUE, w_iconv);
2360 } else if (input_f == UTF16BE_INPUT) {
2361 set_iconv(-TRUE, w_iconv16);
2362 } else if (input_f == UTF16LE_INPUT) {
2363 set_iconv(-TRUE, w_iconv16);
2366 set_iconv(FALSE, e_iconv);
2370 struct input_code *p = input_code_list;
2378 Conversion main loop. Code detection only.
2387 int is_8bit = FALSE;
2389 module_connection();
2392 if(input_f == SJIS_INPUT
2393 #ifdef UTF8_INPUT_ENABLE
2394 || input_f == UTF8_INPUT || input_f == UTF16BE_INPUT || input_f == UTF16LE_INPUT
2402 output_mode = ASCII;
2405 #define NEXT continue /* no output, get next */
2406 #define SEND ; /* output c1 and c2, get next */
2407 #define LAST break /* end of loop, go closing */
2409 while ((c1 = (*i_getc)(f)) != EOF) {
2410 #ifdef INPUT_CODE_FIX
2417 /* in case of 8th bit is on */
2418 if (!estab_f&&!mime_decode_mode) {
2419 /* in case of not established yet */
2420 /* It is still ambiguous */
2421 if (h_conv(f, c2, c1)==EOF)
2427 /* in case of already established */
2429 /* ignore bogus code */
2435 /* second byte, 7 bit code */
2436 /* it might be kanji shifted */
2437 if ((c1 == DEL) || (c1 <= SPACE)) {
2438 /* ignore bogus first code */
2446 #ifdef UTF8_INPUT_ENABLE
2455 #ifdef NUMCHAR_OPTION
2456 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2459 } else if (c1 > DEL) {
2461 if (!estab_f && !iso8859_f) {
2462 /* not established yet */
2463 if (!is_8bit) is_8bit = TRUE;
2466 } else { /* estab_f==TRUE */
2471 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2472 /* SJIS X0201 Case... */
2473 if(iso2022jp_f && x0201_f==NO_X0201) {
2474 (*oconv)(GETA1, GETA2);
2481 } else if (c1==SSO && iconv != s_iconv) {
2482 /* EUC X0201 Case */
2483 c1 = (*i_getc)(f); /* skip SSO */
2485 if (SSP<=c1 && c1<0xe0) {
2486 if(iso2022jp_f && x0201_f==NO_X0201) {
2487 (*oconv)(GETA1, GETA2);
2494 } else { /* bogus code, skip SSO and one byte */
2498 /* already established */
2503 } else if ((c1 > SPACE) && (c1 != DEL)) {
2504 /* in case of Roman characters */
2506 /* output 1 shifted byte */
2510 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2511 /* output 1 shifted byte */
2512 if(iso2022jp_f && x0201_f==NO_X0201) {
2513 (*oconv)(GETA1, GETA2);
2520 /* look like bogus code */
2523 } else if (input_mode == X0208 || input_mode == X0212 ||
2524 input_mode == X0213_1 || input_mode == X0213_2) {
2525 /* in case of Kanji shifted */
2528 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2529 /* Check MIME code */
2530 if ((c1 = (*i_getc)(f)) == EOF) {
2533 } else if (c1 == '?') {
2534 /* =? is mime conversion start sequence */
2535 if(mime_f == STRICT_MIME) {
2536 /* check in real detail */
2537 if (mime_begin_strict(f) == EOF)
2541 } else if (mime_begin(f) == EOF)
2551 /* normal ASCII code */
2554 } else if (!is_8bit && c1 == SI) {
2557 } else if (!is_8bit && c1 == SO) {
2560 } else if (!is_8bit && c1 == ESC ) {
2561 if ((c1 = (*i_getc)(f)) == EOF) {
2562 /* (*oconv)(0, ESC); don't send bogus code */
2564 } else if (c1 == '$') {
2565 if ((c1 = (*i_getc)(f)) == EOF) {
2567 (*oconv)(0, ESC); don't send bogus code
2568 (*oconv)(0, '$'); */
2570 } else if (c1 == '@'|| c1 == 'B') {
2571 /* This is kanji introduction */
2574 set_input_codename("ISO-2022-JP");
2576 debug(input_codename);
2579 } else if (c1 == '(') {
2580 if ((c1 = (*i_getc)(f)) == EOF) {
2581 /* don't send bogus code
2587 } else if (c1 == '@'|| c1 == 'B') {
2588 /* This is kanji introduction */
2593 } else if (c1 == 'D'){
2597 #endif /* X0212_ENABLE */
2598 } else if (c1 == (X0213_1&0x7F)){
2599 input_mode = X0213_1;
2602 } else if (c1 == (X0213_2&0x7F)){
2603 input_mode = X0213_2;
2607 /* could be some special code */
2614 } else if (broken_f&0x2) {
2615 /* accept any ESC-(-x as broken code ... */
2625 } else if (c1 == '(') {
2626 if ((c1 = (*i_getc)(f)) == EOF) {
2627 /* don't send bogus code
2629 (*oconv)(0, '('); */
2633 /* This is X0201 kana introduction */
2634 input_mode = X0201; shift_mode = X0201;
2636 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2637 /* This is ASCII introduction (ESC-(-B/J/H), not kanji: input returns to ASCII mode */
2638 input_mode = ASCII; shift_mode = FALSE;
2640 } else if (broken_f&0x2) {
2641 input_mode = ASCII; shift_mode = FALSE;
2646 /* maintain various input_mode here */
2650 } else if ( c1 == 'N' || c1 == 'n' ){
2652 c3 = (*i_getc)(f); /* skip SS2 */
2653 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2668 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2669 input_mode = ASCII; set_iconv(FALSE, 0);
2671 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2672 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2680 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2681 if ((c1=(*i_getc)(f))!=EOF) {
2685 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2703 if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2704 int c0 = (*i_getc)(f);
2707 (*iconv)(c2, c1, c0);
2713 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2717 (*oconv)((0x8f << 8) | c2, c1);
2719 #endif /* X0212_ENABLE */
2721 (*oconv)((0x8f << 8) | c2, c1);
2724 (*oconv)(input_mode, c1); /* other special case */
2729 /* goto next_word */
2733 (*iconv)(EOF, 0, 0);
2734 if (!is_inputcode_set)
2737 struct input_code *p = input_code_list;
2738 struct input_code *result = p;
2740 if (p->score < result->score) result = p;
2743 set_input_codename(result->name);
2758 /** it must NOT be in the kanji shifted sequence */
2759 /** it must NOT be written in JIS7 */
2760 /** and it must be after 2 byte 8bit code */
2767 while ((c1 = (*i_getc)(f)) != EOF) {
2773 if (push_hold_buf(c1) == EOF || estab_f){
2779 struct input_code *p = input_code_list;
2780 struct input_code *result = p;
2785 if (p->score < result->score){
2790 set_iconv(FALSE, result->iconv_func);
2795 ** 1) EOF is detected, or
2796 ** 2) Code is established, or
2797 ** 3) Buffer is FULL (but last word is pushed)
2799 ** in 1) and 3) cases, we continue to use
2800 ** Kanji codes by oconv and leave estab_f unchanged.
2805 while (wc < hold_count){
2806 c2 = hold_buf[wc++];
2808 #ifdef NUMCHAR_OPTION
2809 || (c2 & CLASS_MASK) == CLASS_UTF16
2814 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2815 (*iconv)(X0201, c2, 0);
2818 if (wc < hold_count){
2819 c1 = hold_buf[wc++];
2828 if ((*iconv)(c2, c1, 0) < 0){
2830 if (wc < hold_count){
2831 c0 = hold_buf[wc++];
2840 (*iconv)(c2, c1, c0);
2853 if (hold_count >= HOLD_SIZE*2)
2855 hold_buf[hold_count++] = c2;
2856 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2859 int s2e_conv(c2, c1, p2, p1)
2863 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2866 STATIC const int shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
2867 #ifdef SHIFTJIS_CP932
2868 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2869 extern const unsigned short shiftjis_cp932[3][189];
2870 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2876 #endif /* SHIFTJIS_CP932 */
2878 if (!x0213_f && 0xfa <= c2 && c2 <= 0xfc){
2879 extern const unsigned short shiftjis_x0212[3][189];
2880 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2883 c2 = (0x8f << 8) | (val >> 8);
2896 if(x0213_f && c2 >= 0xF0){
2897 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
2898 c2 = 0x8F20 + shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
2899 }else{ /* 78<=k<=94 */
2900 c2 = 0x8F00 | (c2 * 2 - 0x17B);
2901 if (0x9E < c1) c2++;
2904 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
2905 if (0x9E < c1) c2++;
2908 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
2915 c2 = x0212_unshift(c2);
2929 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2932 int ret = s2e_conv(c2, c1, &c2, &c1);
2933 if (ret) return ret;
2947 }else if (c2 == 0x8f){
2951 c2 = (c2 << 8) | (c1 & 0x7f);
2953 #ifdef SHIFTJIS_CP932
2956 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2957 s2e_conv(s2, s1, &c2, &c1);
2958 if ((c2 & 0xff00) == 0){
2964 #endif /* SHIFTJIS_CP932 */
2965 #endif /* X0212_ENABLE */
2966 } else if (c2 == SSO){
2969 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2979 #ifdef UTF8_INPUT_ENABLE
2981 w2e_conv(c2, c1, c0, p2, p1)
2990 }else if (0xc0 <= c2 && c2 <= 0xef) {
2991 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2992 #ifdef NUMCHAR_OPTION
2995 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
3010 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3011 if(ignore_zwnbsp_f){
3012 ignore_zwnbsp_f = FALSE;
3013 if(c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
3017 if (c2 == 0) /* 0x00-0x7f */
3018 c1 &= 0x7F; /* 1byte */
3020 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
3022 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
3023 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
3024 return -1; /* 3bytes */
3026 else if (0xf0 <= c2)
3027 return 0; /* 4,5,6bytes */
3028 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
3029 return 0; /* trail byte */
3033 /* must be 3bytes */
3035 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
3037 }else if(c2 == 0xED){
3038 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
3040 }else if((c2 & 0xf0) == 0xe0){
3041 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
3045 if (c2 == 0 || c2 == EOF){
3046 #ifdef UTF8_OUTPUT_ENABLE
3047 } else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
3048 unsigned short val = 0;
3053 val = ww16_conv(c2, c1, c0);
3054 c2 = (val >> 8) & 0xff;
3058 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3067 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3069 w16w_conv(val, p2, p1, p0)
3077 }else if (val < 0x800){
3078 *p2 = 0xc0 | (val >> 6);
3079 *p1 = 0x80 | (val & 0x3f);
3082 *p2 = 0xe0 | (val >> 12);
3083 *p1 = 0x80 | ((val >> 6) & 0x3f);
3084 *p0 = 0x80 | (val & 0x3f);
3089 #ifdef UTF8_INPUT_ENABLE
3091 ww16_conv(c2, c1, c0)
3097 }else if (c2 >= 0xe0){
3098 val = (c2 & 0x0f) << 12;
3099 val |= (c1 & 0x3f) << 6;
3101 }else if (c2 >= 0xc0){
3102 val = (c2 & 0x1f) << 6;
3111 w16e_conv(val, p2, p1)
3122 w16w_conv(val, &c2, &c1, &c0);
3123 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3124 #ifdef NUMCHAR_OPTION
3127 *p1 = CLASS_UTF16 | val;
3136 #ifdef UTF8_INPUT_ENABLE
3138 w_iconv16(c2, c1, c0)
3143 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3144 if(ignore_zwnbsp_f){
3145 ignore_zwnbsp_f = FALSE;
3146 if (c2==0376 && c1==0377){
3147 utf16_mode = UTF16BE_INPUT;
3149 }else if(c2==0377 && c1==0376){
3150 utf16_mode = UTF16LE_INPUT;
3154 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
3156 tmp=c1; c1=c2; c2=tmp;
3158 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3161 }else if((c2>>3)==27){ /* surrogate pair */
3163 #ifdef UTF8_OUTPUT_ENABLE
3164 }else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
3166 }else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
3167 if (ret) return ret;
3173 unicode_to_jis_common(c2, c1, c0, p2, p1)
3177 extern const unsigned short *const utf8_to_euc_2bytes[];
3178 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3179 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3180 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3181 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3182 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3183 const unsigned short *const *pp;
3184 const unsigned short *const *const *ppp;
3185 STATIC const int no_best_fit_chars_table_C2[] =
3186 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
3189 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0};
3190 STATIC const int no_best_fit_chars_table_C2_ascii[] =
3191 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3192 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3193 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3194 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3195 STATIC const int no_best_fit_chars_table_932_C2[] =
3196 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3197 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3198 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3199 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3200 STATIC const int no_best_fit_chars_table_932_C3[] =
3201 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3202 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3203 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3204 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3210 }else if(c2 < 0xe0){
3211 if(no_best_fit_chars_f){
3212 if(ms_ucs_map_f == UCS_MAP_CP932){
3215 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3218 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3221 }else if(cp51932_f){
3222 if(c2 == 0xC2 && no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3224 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ascii[c1&0x3F]) return 1;
3228 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3229 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3231 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3233 if(no_best_fit_chars_f){
3234 if(ms_ucs_map_f == UCS_MAP_CP932){
3235 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3236 }else if(ms_ucs_map_f == UCS_MAP_MS){
3241 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3244 if(c0 == 0x92) return 1;
3249 if(c1 == 0x80 || c0 == 0x9C) return 1;
3257 if(c0 == 0x95) return 1;
3260 if(c0 == 0xA5) return 1;
3267 if(c0 == 0x8D) return 1;
3270 if(c0 == 0x9E && cp51932_f) return 1;
3273 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3281 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3282 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3284 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3290 w_iconv_common(c1, c0, pp, psize, p2, p1)
3292 const unsigned short *const *pp;
3297 const unsigned short *p;
3300 if (pp == 0) return 1;
3303 if (c1 < 0 || psize <= c1) return 1;
3305 if (p == 0) return 1;
3308 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3310 if (val == 0) return 1;
3311 if (no_cp932ext_f && (
3312 (val>>8) == 0x2D || /* NEC special characters */
3313 val > 0xF300 /* NEC special characters */
3321 if (c2 == SO) c2 = X0201;
3329 nkf_each_char_to_hex(f, c)
3330 void (*f)PROTO((int c2,int c1));
3333 const char *hex = "0123456789ABCDEF";
3339 (*f)(0, hex[(c>>shift)&0xF]);
3350 encode_fallback_html(c)
3357 (*oconv)(0, 0x30+(c/1000000)%10);
3359 (*oconv)(0, 0x30+(c/100000 )%10);
3361 (*oconv)(0, 0x30+(c/10000 )%10);
3363 (*oconv)(0, 0x30+(c/1000 )%10);
3365 (*oconv)(0, 0x30+(c/100 )%10);
3367 (*oconv)(0, 0x30+(c/10 )%10);
3369 (*oconv)(0, 0x30+ c %10);
3375 encode_fallback_xml(c)
3381 nkf_each_char_to_hex(oconv, c);
3387 encode_fallback_java(c)
3390 const char *hex = "0123456789ABCDEF";
3392 if((c&0x00FFFFFF) > 0xFFFF){
3396 (*oconv)(0, hex[(c>>20)&0xF]);
3397 (*oconv)(0, hex[(c>>16)&0xF]);
3401 (*oconv)(0, hex[(c>>12)&0xF]);
3402 (*oconv)(0, hex[(c>> 8)&0xF]);
3403 (*oconv)(0, hex[(c>> 4)&0xF]);
3404 (*oconv)(0, hex[ c &0xF]);
3409 encode_fallback_perl(c)
3415 nkf_each_char_to_hex(oconv, c);
3421 encode_fallback_subchar(c)
3424 c = unicode_subchar;
3425 (*oconv)((c>>8)&0xFF, c&0xFF);
3431 (*oconv)(0, (c>>shift)&0xFF);
3442 #ifdef UTF8_OUTPUT_ENABLE
3447 extern const unsigned short euc_to_utf8_1byte[];
3448 extern const unsigned short *const euc_to_utf8_2bytes[];
3449 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3450 const unsigned short *p;
3453 p = euc_to_utf8_1byte;
3455 } else if (c2 >> 8 == 0x8f){
3456 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == 0x8F22 && c1 == 0x43){
3459 extern const unsigned short *const x0212_to_utf8_2bytes[];
3460 c2 = (c2&0x7f) - 0x21;
3461 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3462 p = x0212_to_utf8_2bytes[c2];
3468 c2 = (c2&0x7f) - 0x21;
3469 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3470 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3475 c1 = (c1 & 0x7f) - 0x21;
3476 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3493 if (unicode_bom_f==2) {
3500 #ifdef NUMCHAR_OPTION
3501 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3502 w16w_conv(c1, &c2, &c1, &c0);
3506 if (c0) (*o_putc)(c0);
3513 output_mode = ASCII;
3515 } else if (c2 == ISO8859_1) {
3516 output_mode = ISO8859_1;
3517 (*o_putc)(c1 | 0x080);
3520 #ifdef UTF8_INPUT_ENABLE
3521 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
3522 val = ((c2<<8)&0xff00) + c1;
3525 val = e2w_conv(c2, c1);
3527 w16w_conv(val, &c2, &c1, &c0);
3531 if (c0) (*o_putc)(c0);
3547 if (unicode_bom_f==2) {
3549 (*o_putc)((unsigned char)'\377');
3553 (*o_putc)((unsigned char)'\377');
3558 #ifdef UTF8_INPUT_ENABLE
3559 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
3562 if (c2 == ISO8859_1) {
3565 #ifdef NUMCHAR_OPTION
3566 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3567 c2 = (c1 >> 8) & 0xff;
3571 unsigned short val = e2w_conv(c2, c1);
3572 c2 = (val >> 8) & 0xff;
3591 #ifdef NUMCHAR_OPTION
3592 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3593 w16e_conv(c1, &c2, &c1);
3594 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3595 if(encode_fallback)(*encode_fallback)(c1);
3603 } else if (c2 == 0) {
3604 output_mode = ASCII;
3606 } else if (c2 == X0201) {
3607 output_mode = JAPANESE_EUC;
3608 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3609 } else if (c2 == ISO8859_1) {
3610 output_mode = ISO8859_1;
3611 (*o_putc)(c1 | 0x080);
3613 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3614 output_mode = JAPANESE_EUC;
3615 #ifdef SHIFTJIS_CP932
3618 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3619 s2e_conv(s2, s1, &c2, &c1);
3624 output_mode = ASCII;
3626 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3629 (*o_putc)((c2 & 0x7f) | 0x080);
3630 (*o_putc)(c1 | 0x080);
3633 (*o_putc)((c2 & 0x7f) | 0x080);
3634 (*o_putc)(c1 | 0x080);
3638 if ((c1<0x21 || 0x7e<c1) ||
3639 (c2<0x21 || 0x7e<c2)) {
3640 set_iconv(FALSE, 0);
3641 return; /* too late to rescue this char */
3643 output_mode = JAPANESE_EUC;
3644 (*o_putc)(c2 | 0x080);
3645 (*o_putc)(c1 | 0x080);
3655 if ((ret & 0xff00) == 0x8f00){
3656 if (0x75 <= c && c <= 0x7f){
3657 ret = c + (0x109 - 0x75);
3660 if (0x75 <= c && c <= 0x7f){
3661 ret = c + (0x113 - 0x75);
3668 int x0212_unshift(c)
3672 if (0x7f <= c && c <= 0x88){
3673 ret = c + (0x75 - 0x7f);
3674 }else if (0x89 <= c && c <= 0x92){
3675 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3679 #endif /* X0212_ENABLE */
3682 e2s_conv(c2, c1, p2, p1)
3683 int c2, c1, *p2, *p1;
3686 if ((c2 & 0xff00) == 0x8f00){
3689 if((0x21 <= ndx && ndx <= 0x2F)){
3690 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3691 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3693 }else if(0x6E <= ndx && ndx <= 0x7E){
3694 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3695 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3701 else if(0x21 <= ndx && ndx <= 0x7e){
3703 const unsigned short *ptr;
3704 extern const unsigned short *const x0212_shiftjis[];
3705 ptr = x0212_shiftjis[ndx - 0x21];
3707 val = ptr[(c1 & 0x7f) - 0x21];
3716 c2 = x0212_shift(c2);
3718 #endif /* X0212_ENABLE */
3720 if(0x7F < c2) return 1;
3721 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3722 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3731 #ifdef NUMCHAR_OPTION
3732 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3733 w16e_conv(c1, &c2, &c1);
3734 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3735 if(encode_fallback)(*encode_fallback)(c1);
3743 } else if (c2 == 0) {
3744 output_mode = ASCII;
3746 } else if (c2 == X0201) {
3747 output_mode = SHIFT_JIS;
3749 } else if (c2 == ISO8859_1) {
3750 output_mode = ISO8859_1;
3751 (*o_putc)(c1 | 0x080);
3753 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3754 output_mode = SHIFT_JIS;
3755 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3761 if ((c1<0x20 || 0x7e<c1) ||
3762 (c2<0x20 || 0x7e<c2)) {
3763 set_iconv(FALSE, 0);
3764 return; /* too late to rescue this char */
3766 output_mode = SHIFT_JIS;
3767 e2s_conv(c2, c1, &c2, &c1);
3769 #ifdef SHIFTJIS_CP932
3771 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3772 extern const unsigned short cp932inv[2][189];
3773 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3779 #endif /* SHIFTJIS_CP932 */
3782 if (prefix_table[(unsigned char)c1]){
3783 (*o_putc)(prefix_table[(unsigned char)c1]);
3794 #ifdef NUMCHAR_OPTION
3795 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3796 w16e_conv(c1, &c2, &c1);
3797 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3798 if(encode_fallback)(*encode_fallback)(c1);
3804 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3807 (*o_putc)(ascii_intro);
3808 output_mode = ASCII;
3812 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3814 if(output_mode!=X0213_2){
3815 output_mode = X0213_2;
3819 (*o_putc)(X0213_2&0x7F);
3822 if(output_mode!=X0212){
3823 output_mode = X0212;
3827 (*o_putc)(X0212&0x7F);
3830 (*o_putc)(c2 & 0x7f);
3833 } else if (c2==X0201) {
3834 if (output_mode!=X0201) {
3835 output_mode = X0201;
3841 } else if (c2==ISO8859_1) {
3842 /* iso8859 introduction, or 8th bit on */
3843 /* Can we convert in 7bit form using ESC-'-'-A ?
3845 output_mode = ISO8859_1;
3847 } else if (c2 == 0) {
3848 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3851 (*o_putc)(ascii_intro);
3852 output_mode = ASCII;
3856 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
3858 if (output_mode!=X0213_1) {
3859 output_mode = X0213_1;
3863 (*o_putc)(X0213_1&0x7F);
3865 }else if (output_mode != X0208) {
3866 output_mode = X0208;
3869 (*o_putc)(kanji_intro);
3881 mime_prechar(c2, c1);
3882 (*o_base64conv)(c2,c1);
3886 STATIC int broken_buf[3];
3887 STATIC int broken_counter = 0;
3888 STATIC int broken_last = 0;
3895 if (broken_counter>0) {
3896 return broken_buf[--broken_counter];
3899 if (c=='$' && broken_last != ESC
3900 && (input_mode==ASCII || input_mode==X0201)) {
3903 if (c1=='@'|| c1=='B') {
3904 broken_buf[0]=c1; broken_buf[1]=c;
3911 } else if (c=='(' && broken_last != ESC
3912 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3915 if (c1=='J'|| c1=='B') {
3916 broken_buf[0]=c1; broken_buf[1]=c;
3934 if (broken_counter<2)
3935 broken_buf[broken_counter++]=c;
3939 STATIC int prev_cr = 0;
3947 if (! (c2==0&&c1==NL) ) {
3953 } else if (c1=='\r') {
3955 } else if (c1=='\n') {
3956 if (crmode_f==CRLF) {
3957 (*o_crconv)(0,'\r');
3958 } else if (crmode_f==CR) {
3959 (*o_crconv)(0,'\r');
3963 } else if (c1!='\032' || crmode_f!=NL){
3969 Return value of fold_conv()
3971 \n add newline and output char
3972 \r add newline and output nothing
3975 1 (or else) normal output
3977 fold state in prev (previous character)
3979 >0x80 Japanese (X0208/X0201)
3984 This fold algorithm does not preserve leading spaces in a line.
3985 This is the main difference from fmt.
3988 #define char_size(c2,c1) (c2?2:1)
3997 if (c1== '\r' && !fold_preserve_f) {
3998 fold_state=0; /* ignore cr */
3999 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4001 fold_state=0; /* ignore cr */
4002 } else if (c1== BS) {
4003 if (f_line>0) f_line--;
4005 } else if (c2==EOF && f_line != 0) { /* close open last line */
4007 } else if ((c1=='\n' && !fold_preserve_f)
4008 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4009 && fold_preserve_f)) {
4011 if (fold_preserve_f) {
4015 } else if ((f_prev == c1 && !fold_preserve_f)
4016 || (f_prev == '\n' && fold_preserve_f)
4017 ) { /* duplicate newline */
4020 fold_state = '\n'; /* output two newline */
4026 if (f_prev&0x80) { /* Japanese? */
4028 fold_state = 0; /* ignore given single newline */
4029 } else if (f_prev==' ') {
4033 if (++f_line<=fold_len)
4037 fold_state = '\r'; /* fold and output nothing */
4041 } else if (c1=='\f') {
4046 fold_state = '\n'; /* output newline and clear */
4047 } else if ( (c2==0 && c1==' ')||
4048 (c2==0 && c1=='\t')||
4049 (c2=='!'&& c1=='!')) {
4050 /* X0208 kankaku or ascii space */
4051 if (f_prev == ' ') {
4052 fold_state = 0; /* remove duplicate spaces */
4055 if (++f_line<=fold_len)
4056 fold_state = ' '; /* output ASCII space only */
4058 f_prev = ' '; f_line = 0;
4059 fold_state = '\r'; /* fold and output nothing */
4063 prev0 = f_prev; /* we still need this one... , but almost done */
4065 if (c2 || c2==X0201)
4066 f_prev |= 0x80; /* this is Japanese */
4067 f_line += char_size(c2,c1);
4068 if (f_line<=fold_len) { /* normal case */
4071 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4072 f_line = char_size(c2,c1);
4073 fold_state = '\n'; /* We can't wait, do fold now */
4074 } else if (c2==X0201) {
4075 /* simple kinsoku rules return 1 means no folding */
4076 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4077 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4078 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4079 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4080 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4081 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4082 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4084 fold_state = '\n';/* add one new f_line before this character */
4087 fold_state = '\n';/* add one new f_line before this character */
4090 /* kinsoku point in ASCII */
4091 if ( c1==')'|| /* { [ ( */
4102 /* just after special */
4103 } else if (!is_alnum(prev0)) {
4104 f_line = char_size(c2,c1);
4106 } else if ((prev0==' ') || /* ignored new f_line */
4107 (prev0=='\n')|| /* ignored new f_line */
4108 (prev0&0x80)) { /* X0208 - ASCII */
4109 f_line = char_size(c2,c1);
4110 fold_state = '\n';/* add one new f_line before this character */
4112 fold_state = 1; /* default no fold in ASCII */
4116 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4117 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4118 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4119 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4120 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4121 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4122 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4123 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4124 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4125 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4126 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4127 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4128 /* default no fold in kinsoku */
4131 f_line = char_size(c2,c1);
4132 /* add one new f_line before this character */
4135 f_line = char_size(c2,c1);
4137 /* add one new f_line before this character */
4142 /* terminator process */
4143 switch(fold_state) {
4162 int z_prev2=0,z_prev1=0;
4169 /* if (c2) c1 &= 0x7f; assertion */
4171 if (x0201_f && z_prev2==X0201) { /* X0201 */
4172 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
4174 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4176 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4178 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4182 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4191 if (x0201_f && c2==X0201) {
4192 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4193 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4194 z_prev1 = c1; z_prev2 = c2;
4197 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4202 /* JISX0208 Alphabet */
4203 if (alpha_f && c2 == 0x23 ) {
4205 } else if (alpha_f && c2 == 0x21 ) {
4206 /* JISX0208 Kigou */
4211 } else if (alpha_f&0x4) {
4216 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4222 case '>': entity = ">"; break;
4223 case '<': entity = "<"; break;
4224 case '\"': entity = """; break;
4225 case '&': entity = "&"; break;
4228 while (*entity) (*o_zconv)(0, *entity++);
4238 #define rot13(c) ( \
4240 (c <= 'M') ? (c + 13): \
4241 (c <= 'Z') ? (c - 13): \
4243 (c <= 'm') ? (c + 13): \
4244 (c <= 'z') ? (c - 13): \
4248 #define rot47(c) ( \
4250 ( c <= 'O' ) ? (c + 47) : \
4251 ( c <= '~' ) ? (c - 47) : \
4259 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4265 (*o_rot_conv)(c2,c1);
4272 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
4274 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
4277 (*o_hira_conv)(c2,c1);
4282 iso2022jp_check_conv(c2,c1)
4285 STATIC const int range[RANGE_NUM_MAX][2] = {
4308 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4312 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4317 for (i = 0; i < RANGE_NUM_MAX; i++) {
4318 start = range[i][0];
4321 if (c >= start && c <= end) {
4326 (*o_iso2022jp_check_conv)(c2,c1);
4330 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4332 const unsigned char *mime_pattern[] = {
4333 (const unsigned char *)"\075?EUC-JP?B?",
4334 (const unsigned char *)"\075?SHIFT_JIS?B?",
4335 (const unsigned char *)"\075?ISO-8859-1?Q?",
4336 (const unsigned char *)"\075?ISO-8859-1?B?",
4337 (const unsigned char *)"\075?ISO-2022-JP?B?",
4338 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4339 #if defined(UTF8_INPUT_ENABLE)
4340 (const unsigned char *)"\075?UTF-8?B?",
4341 (const unsigned char *)"\075?UTF-8?Q?",
4343 (const unsigned char *)"\075?US-ASCII?Q?",
4348 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4349 int (*mime_priority_func[])PROTO((int c2, int c1, int c0)) = {
4350 e_iconv, s_iconv, 0, 0, 0, 0,
4351 #if defined(UTF8_INPUT_ENABLE)
4357 const int mime_encode[] = {
4358 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4359 #if defined(UTF8_INPUT_ENABLE)
4366 const int mime_encode_method[] = {
4367 'B', 'B','Q', 'B', 'B', 'Q',
4368 #if defined(UTF8_INPUT_ENABLE)
4376 #define MAXRECOVER 20
4381 if (i_getc!=mime_getc) {
4382 i_mgetc = i_getc; i_getc = mime_getc;
4383 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4384 if(mime_f==STRICT_MIME) {
4385 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4386 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4392 unswitch_mime_getc()
4394 if(mime_f==STRICT_MIME) {
4395 i_mgetc = i_mgetc_buf;
4396 i_mungetc = i_mungetc_buf;
4399 i_ungetc = i_mungetc;
4400 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4401 mime_iconv_back = NULL;
4405 mime_begin_strict(f)
4410 const unsigned char *p,*q;
4411 int r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4413 mime_decode_mode = FALSE;
4414 /* =? has been checked */
4416 p = mime_pattern[j];
4419 for(i=2;p[i]>' ';i++) { /* start at =? */
4420 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4421 /* pattern fails, try next one */
4423 while ((p = mime_pattern[++j])) {
4424 for(k=2;k<i;k++) /* assume length(p) > i */
4425 if (p[k]!=q[k]) break;
4426 if (k==i && nkf_toupper(c1)==p[k]) break;
4428 if (p) continue; /* found next one, continue */
4429 /* all fails, output from recovery buffer */
4437 mime_decode_mode = p[i-2];
4439 mime_iconv_back = iconv;
4440 set_iconv(FALSE, mime_priority_func[j]);
4441 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4443 if (mime_decode_mode=='B') {
4444 mimebuf_f = unbuf_f;
4446 /* do MIME integrity check */
4447 return mime_integrity(f,mime_pattern[j]);
4459 /* we don't keep eof of Fifo, because it contains ?= as
4460 a terminator. It was checked in mime_integrity. */
4461 return ((mimebuf_f)?
4462 (*i_mgetc_buf)(f):Fifo(mime_input++));
4466 mime_ungetc_buf(c,f)
4471 (*i_mungetc_buf)(c,f);
4473 Fifo(--mime_input)=c;
4484 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4485 /* re-read and convert again from mime_buffer. */
4487 /* =? has been checked */
4489 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4490 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4491 /* We accept any character type even if it is broken by new lines */
4492 c1 = (*i_getc)(f); Fifo(mime_last++)= c1 ;
4493 if (c1=='\n'||c1==' '||c1=='\r'||
4494 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4496 /* Failed. But this could be another MIME preamble */
4504 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
4505 if (!(++i<MAXRECOVER) || c1==EOF) break;
4506 if (c1=='b'||c1=='B') {
4507 mime_decode_mode = 'B';
4508 } else if (c1=='q'||c1=='Q') {
4509 mime_decode_mode = 'Q';
4513 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
4514 if (!(++i<MAXRECOVER) || c1==EOF) break;
4516 mime_decode_mode = FALSE;
4522 if (!mime_decode_mode) {
4523 /* false MIME preamble, restart from mime_buffer */
4524 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4525 /* Since we are in MIME mode until buffer becomes empty, */
4526 /* we never go into mime_begin again for a while. */
4529 /* discard mime preamble, and go to MIME mode */
4531 /* do no MIME integrity check */
4532 return c1; /* used only for checking EOF */
4547 fprintf(stderr, "%s\n", str);
4553 set_input_codename (codename)
4558 strcmp(codename, "") != 0 &&
4559 strcmp(codename, input_codename) != 0)
4561 is_inputcode_mixed = TRUE;
4563 input_codename = codename;
4564 is_inputcode_set = TRUE;
4567 #if !defined(PERL_XS) && !defined(WIN32DLL)
4569 print_guessed_code (filename)
4572 char *codename = "BINARY";
4573 if (!is_inputcode_mixed) {
4574 if (strcmp(input_codename, "") == 0) {
4577 codename = input_codename;
4580 if (filename != NULL) printf("%s:", filename);
4581 printf("%s\n", codename);
4587 #ifdef ANSI_C_PROTOTYPE
4588 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
4591 hex_getc(ch, f, g, u)
4604 if (!nkf_isxdigit(c2)){
4609 if (!nkf_isxdigit(c3)){
4614 return (hex2bin(c2) << 4) | hex2bin(c3);
4621 return hex_getc(':', f, i_cgetc, i_cungetc);
4629 return (*i_cungetc)(c, f);
4636 return hex_getc('%', f, i_ugetc, i_uungetc);
4644 return (*i_uungetc)(c, f);
4648 #ifdef NUMCHAR_OPTION
4653 int (*g)() = i_ngetc;
4654 int (*u)() = i_nungetc;
4665 if (buf[i] == 'x' || buf[i] == 'X'){
4666 for (j = 0; j < 5; j++){
4668 if (!nkf_isxdigit(buf[i])){
4675 c |= hex2bin(buf[i]);
4678 for (j = 0; j < 6; j++){
4682 if (!nkf_isdigit(buf[i])){
4689 c += hex2bin(buf[i]);
4695 return CLASS_UTF16 | c;
4705 numchar_ungetc(c, f)
4709 return (*i_nungetc)(c, f);
4713 #ifdef UNICODE_NORMALIZATION
4715 /* Normalization Form C */
4720 int (*g)() = i_nfc_getc;
4721 int (*u)() = i_nfc_ungetc;
4722 int i=0, j, k=1, lower, upper;
4724 const int *array = NULL;
4725 extern const struct normalization_pair normalization_table[];
4728 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4729 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4730 while (upper >= lower) {
4731 j = (lower+upper) / 2;
4732 array = normalization_table[j].nfd;
4733 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4734 if (array[k] != buf[k]){
4735 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4742 array = normalization_table[j].nfc;
4743 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4760 return (*i_nfc_ungetc)(c, f);
4762 #endif /* UNICODE_NORMALIZATION */
4769 int c1, c2, c3, c4, cc;
4770 int t1, t2, t3, t4, mode, exit_mode;
4774 int lwsp_size = 128;
4776 if (mime_top != mime_last) { /* Something is in FIFO */
4777 return Fifo(mime_top++);
4779 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4780 mime_decode_mode=FALSE;
4781 unswitch_mime_getc();
4782 return (*i_getc)(f);
4785 if (mimebuf_f == FIXED_MIME)
4786 exit_mode = mime_decode_mode;
4789 if (mime_decode_mode == 'Q') {
4790 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4792 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4793 if (c1<=' ' || DEL<=c1) {
4794 mime_decode_mode = exit_mode; /* prepare for quit */
4797 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4801 mime_decode_mode = exit_mode; /* prepare for quit */
4802 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4803 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4804 /* end Q encoding */
4805 input_mode = exit_mode;
4807 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4808 if (lwsp_buf==NULL) {
4809 perror("can't malloc");
4812 while ((c1=(*i_getc)(f))!=EOF) {
4817 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4825 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4826 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4841 lwsp_buf[lwsp_count] = c1;
4842 if (lwsp_count++>lwsp_size){
4844 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4845 if (lwsp_buf_new==NULL) {
4848 perror("can't realloc");
4851 lwsp_buf = lwsp_buf_new;
4857 if (lwsp_count > 0) {
4858 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4862 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4863 i_ungetc(lwsp_buf[lwsp_count],f);
4871 if (c1=='='&&c2<' ') { /* this is soft wrap */
4872 while((c1 = (*i_mgetc)(f)) <=' ') {
4873 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4875 mime_decode_mode = 'Q'; /* still in MIME */
4876 goto restart_mime_q;
4879 mime_decode_mode = 'Q'; /* still in MIME */
4883 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4884 if (c2<=' ') return c2;
4885 mime_decode_mode = 'Q'; /* still in MIME */
4886 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4887 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4888 return ((hex(c2)<<4) + hex(c3));
4891 if (mime_decode_mode != 'B') {
4892 mime_decode_mode = FALSE;
4893 return (*i_mgetc)(f);
4897 /* Base64 encoding */
4899 MIME allows line breaks in the middle of
4900 Base64, but we are very pessimistic in decoding
4901 in unbuf mode because MIME encoded code may be broken by
4902 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4903 mode. Ignore incomplete MIME.
4905 mode = mime_decode_mode;
4906 mime_decode_mode = exit_mode; /* prepare for quit */
4908 while ((c1 = (*i_mgetc)(f))<=' ') {
4913 if ((c2 = (*i_mgetc)(f))<=' ') {
4916 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4917 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4920 if ((c1 == '?') && (c2 == '=')) {
4923 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4924 if (lwsp_buf==NULL) {
4925 perror("can't malloc");
4928 while ((c1=(*i_getc)(f))!=EOF) {
4933 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4941 if ((c1=(*i_getc)(f))!=EOF) {
4945 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4960 lwsp_buf[lwsp_count] = c1;
4961 if (lwsp_count++>lwsp_size){
4963 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4964 if (lwsp_buf_new==NULL) {
4967 perror("can't realloc");
4970 lwsp_buf = lwsp_buf_new;
4976 if (lwsp_count > 0) {
4977 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4981 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4982 i_ungetc(lwsp_buf[lwsp_count],f);
4991 if ((c3 = (*i_mgetc)(f))<=' ') {
4994 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4995 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4999 if ((c4 = (*i_mgetc)(f))<=' ') {
5002 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5003 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5007 mime_decode_mode = mode; /* still in MIME sigh... */
5009 /* BASE 64 decoding */
5011 t1 = 0x3f & base64decode(c1);
5012 t2 = 0x3f & base64decode(c2);
5013 t3 = 0x3f & base64decode(c3);
5014 t4 = 0x3f & base64decode(c4);
5015 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5017 Fifo(mime_last++) = cc;
5018 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5020 Fifo(mime_last++) = cc;
5021 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5023 Fifo(mime_last++) = cc;
5028 return Fifo(mime_top++);
5036 Fifo(--mime_top) = c;
5043 const unsigned char *p;
5047 /* In buffered mode, read until =? or NL or buffer full
5049 mime_input = mime_top;
5050 mime_last = mime_top;
5052 while(*p) Fifo(mime_input++) = *p++;
5055 while((c=(*i_getc)(f))!=EOF) {
5056 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5057 break; /* buffer full */
5059 if (c=='=' && d=='?') {
5060 /* checked. skip header, start decode */
5061 Fifo(mime_input++) = c;
5062 /* mime_last_input = mime_input; */
5067 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5069 /* Should we check length mod 4? */
5070 Fifo(mime_input++) = c;
5073 /* In case of Incomplete MIME, no MIME decode */
5074 Fifo(mime_input++) = c;
5075 mime_last = mime_input; /* point undecoded buffer */
5076 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5077 switch_mime_getc(); /* anyway we need buffered getc */
5088 i = c - 'A'; /* A..Z 0-25 */
5090 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5092 } else if (c > '/') {
5093 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5094 } else if (c == '+') {
5095 i = '>' /* 62 */ ; /* + 62 */
5097 i = '?' /* 63 */ ; /* / 63 */
5102 STATIC const char basis_64[] =
5103 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
5106 #define MIMEOUT_BUF_LENGTH (60)
5107 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5108 int mimeout_buf_count = 0;
5109 int mimeout_preserve_space = 0;
5110 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
5116 const unsigned char *p;
5119 p = mime_pattern[0];
5120 for(i=0;mime_encode[i];i++) {
5121 if (mode == mime_encode[i]) {
5122 p = mime_pattern[i];
5126 mimeout_mode = mime_encode_method[i];
5129 if (base64_count>45) {
5130 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5131 (*o_mputc)(mimeout_buf[i]);
5137 if (!mimeout_preserve_space && mimeout_buf_count>0
5138 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5139 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5143 if (!mimeout_preserve_space) {
5144 for (;i<mimeout_buf_count;i++) {
5145 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5146 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5147 (*o_mputc)(mimeout_buf[i]);
5154 mimeout_preserve_space = FALSE;
5160 j = mimeout_buf_count;
5161 mimeout_buf_count = 0;
5163 mime_putc(mimeout_buf[i]);
5179 switch(mimeout_mode) {
5184 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5190 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5196 if (mimeout_f!=FIXED_MIME) {
5198 } else if (mimeout_mode != 'Q')
5207 switch(mimeout_mode) {
5212 } else if(!nkf_isalnum(c)) {
5214 (*o_mputc)(itoh4(((c>>4)&0xf)));
5215 (*o_mputc)(itoh4((c&0xf)));
5224 (*o_mputc)(basis_64[c>>2]);
5229 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5235 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5236 (*o_mputc)(basis_64[c & 0x3F]);
5247 int mime_lastchar2, mime_lastchar1;
5249 void mime_prechar(c2, c1)
5254 if (base64_count + mimeout_buf_count/3*4> 66){
5255 (*o_base64conv)(EOF,0);
5256 (*o_base64conv)(0,NL);
5257 (*o_base64conv)(0,SPACE);
5259 }/*else if (mime_lastchar2){
5260 if (c1 <=DEL && !nkf_isspace(c1)){
5261 (*o_base64conv)(0,SPACE);
5265 if (c2 && mime_lastchar2 == 0
5266 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5267 (*o_base64conv)(0,SPACE);
5270 mime_lastchar2 = c2;
5271 mime_lastchar1 = c1;
5282 if (mimeout_f == FIXED_MIME){
5283 if (mimeout_mode == 'Q'){
5284 if (base64_count > 71){
5285 if (c!=CR && c!=NL) {
5292 if (base64_count > 71){
5297 if (c == EOF) { /* c==EOF */
5301 if (c != EOF) { /* c==EOF */
5307 /* mimeout_f != FIXED_MIME */
5309 if (c == EOF) { /* c==EOF */
5310 j = mimeout_buf_count;
5311 mimeout_buf_count = 0;
5314 /*if (nkf_isspace(mimeout_buf[i])){
5317 mimeout_addchar(mimeout_buf[i]);
5321 (*o_mputc)(mimeout_buf[i]);
5327 if (mimeout_mode=='Q') {
5328 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5340 if (mimeout_buf_count > 0){
5341 lastchar = mimeout_buf[mimeout_buf_count - 1];
5346 if (!mimeout_mode) {
5347 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5348 if (nkf_isspace(c)) {
5349 if (c==CR || c==NL) {
5352 for (i=0;i<mimeout_buf_count;i++) {
5353 (*o_mputc)(mimeout_buf[i]);
5354 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5361 mimeout_buf_count = 1;
5363 if (base64_count > 1
5364 && base64_count + mimeout_buf_count > 76){
5367 if (!nkf_isspace(mimeout_buf[0])){
5372 mimeout_buf[mimeout_buf_count++] = c;
5373 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5374 open_mime(output_mode);
5379 if (lastchar==CR || lastchar == NL){
5380 for (i=0;i<mimeout_buf_count;i++) {
5381 (*o_mputc)(mimeout_buf[i]);
5384 mimeout_buf_count = 0;
5386 if (lastchar==SPACE) {
5387 for (i=0;i<mimeout_buf_count-1;i++) {
5388 (*o_mputc)(mimeout_buf[i]);
5391 mimeout_buf[0] = SPACE;
5392 mimeout_buf_count = 1;
5394 open_mime(output_mode);
5397 /* mimeout_mode == 'B', 1, 2 */
5398 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5399 if (lastchar == CR || lastchar == NL){
5400 if (nkf_isblank(c)) {
5401 for (i=0;i<mimeout_buf_count;i++) {
5402 mimeout_addchar(mimeout_buf[i]);
5404 mimeout_buf_count = 0;
5405 } else if (SPACE<c && c<DEL) {
5407 for (i=0;i<mimeout_buf_count;i++) {
5408 (*o_mputc)(mimeout_buf[i]);
5411 mimeout_buf_count = 0;
5414 if (c==SPACE || c==TAB || c==CR || c==NL) {
5415 for (i=0;i<mimeout_buf_count;i++) {
5416 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5418 for (i=0;i<mimeout_buf_count;i++) {
5419 (*o_mputc)(mimeout_buf[i]);
5422 mimeout_buf_count = 0;
5425 mimeout_buf[mimeout_buf_count++] = c;
5426 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5428 for (i=0;i<mimeout_buf_count;i++) {
5429 (*o_mputc)(mimeout_buf[i]);
5432 mimeout_buf_count = 0;
5436 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5437 mimeout_buf[mimeout_buf_count++] = c;
5438 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5439 j = mimeout_buf_count;
5440 mimeout_buf_count = 0;
5442 mimeout_addchar(mimeout_buf[i]);
5449 if (mimeout_buf_count>0) {
5450 j = mimeout_buf_count;
5451 mimeout_buf_count = 0;
5453 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5455 mimeout_addchar(mimeout_buf[i]);
5461 (*o_mputc)(mimeout_buf[i]);
5463 open_mime(output_mode);
5470 #if defined(PERL_XS) || defined(WIN32DLL)
5475 struct input_code *p = input_code_list;
5488 mime_f = STRICT_MIME;
5489 mime_decode_f = FALSE;
5494 #if defined(MSDOS) || defined(__OS2__)
5499 iso2022jp_f = FALSE;
5500 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5501 ms_ucs_map_f = UCS_MAP_ASCII;
5503 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5504 internal_unicode_f = FALSE;
5506 #ifdef UTF8_INPUT_ENABLE
5507 no_cp932ext_f = FALSE;
5508 ignore_zwnbsp_f = TRUE;
5509 no_best_fit_chars_f = FALSE;
5510 encode_fallback = NULL;
5511 unicode_subchar = '?';
5513 #ifdef UTF8_OUTPUT_ENABLE
5517 #ifdef UNICODE_NORMALIZATION
5530 is_inputcode_mixed = FALSE;
5531 is_inputcode_set = FALSE;
5535 #ifdef SHIFTJIS_CP932
5545 for (i = 0; i < 256; i++){
5546 prefix_table[i] = 0;
5549 #ifdef UTF8_INPUT_ENABLE
5550 utf16_mode = UTF16BE_INPUT;
5552 mimeout_buf_count = 0;
5557 fold_preserve_f = FALSE;
5560 kanji_intro = DEFAULT_J;
5561 ascii_intro = DEFAULT_R;
5562 fold_margin = FOLD_MARGIN;
5563 output_conv = DEFAULT_CONV;
5564 oconv = DEFAULT_CONV;
5565 o_zconv = no_connection;
5566 o_fconv = no_connection;
5567 o_crconv = no_connection;
5568 o_rot_conv = no_connection;
5569 o_hira_conv = no_connection;
5570 o_base64conv = no_connection;
5571 o_iso2022jp_check_conv = no_connection;
5574 i_ungetc = std_ungetc;
5576 i_bungetc = std_ungetc;
5579 i_mungetc = std_ungetc;
5580 i_mgetc_buf = std_getc;
5581 i_mungetc_buf = std_ungetc;
5582 output_mode = ASCII;
5585 mime_decode_mode = FALSE;
5591 z_prev2=0,z_prev1=0;
5593 iconv_for_check = 0;
5595 input_codename = "";
5603 no_connection(c2,c1)
5606 no_connection2(c2,c1,0);
5610 no_connection2(c2,c1,c0)
5613 fprintf(stderr,"nkf internal module connection failure.\n");
5615 return 0; /* LINT */
5620 #define fprintf dllprintf
5625 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5626 fprintf(stderr,"Flags:\n");
5627 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5628 #ifdef DEFAULT_CODE_SJIS
5629 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5631 #ifdef DEFAULT_CODE_JIS
5632 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5634 #ifdef DEFAULT_CODE_EUC
5635 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5637 #ifdef DEFAULT_CODE_UTF8
5638 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5640 #ifdef UTF8_OUTPUT_ENABLE
5641 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5643 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5644 #ifdef UTF8_INPUT_ENABLE
5645 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5647 fprintf(stderr,"t no conversion\n");
5648 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5649 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5650 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5651 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5652 fprintf(stderr,"v Show this usage. V: show version\n");
5653 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5654 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5655 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5656 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5657 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5658 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5659 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5660 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5662 fprintf(stderr,"T Text mode output\n");
5664 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5665 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5666 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5667 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5668 fprintf(stderr,"\n");
5669 fprintf(stderr,"Long name options\n");
5670 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5671 fprintf(stderr," Specify the input or output codeset\n");
5672 fprintf(stderr," --fj --unix --mac --windows\n");
5673 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5674 fprintf(stderr," Convert for the system or code\n");
5675 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5676 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5677 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5679 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5681 #ifdef NUMCHAR_OPTION
5682 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5684 #ifdef UTF8_INPUT_ENABLE
5685 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5686 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5689 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5690 fprintf(stderr," Overwrite original listed files by filtered result\n");
5691 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5693 fprintf(stderr," -g --guess Guess the input code\n");
5694 fprintf(stderr," --help --version Show this help/the version\n");
5695 fprintf(stderr," For more information, see also man nkf\n");
5696 fprintf(stderr,"\n");
5703 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5704 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
5707 #if defined(MSDOS) && defined(__WIN16__)
5710 #if defined(MSDOS) && defined(__WIN32__)
5716 ,NKF_VERSION,NKF_RELEASE_DATE);
5717 fprintf(stderr,"\n%s\n",CopyRight);
5722 ** パッチ制作者 (Patch contributors)
5723 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5724 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5725 ** ohta@src.ricoh.co.jp (Junn Ohta)
5726 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5727 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5728 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5729 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5730 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5731 ** GHG00637@nifty-serve.or.jp (COW)