1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.111 2006/10/08 03:02:34 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-09-15"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII character classification and conversion helpers.
 *
 * Every parameter is parenthesized so that an expression argument
 * (e.g. `x & 0x7f` or `a ? b : c`) is classified as a whole.
 * NOTE: all of these macros may evaluate their argument more than once;
 * do not pass expressions with side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
/* SPACE, TAB, CR and NL are defined elsewhere in this file. */
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' '<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when bits 8..15 of c2 equal SS3 (SS3 is defined elsewhere in this file). */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * Byte bounds used for the CP932 tables.
 * NOTE(review): presumably these are Shift_JIS lead-byte values for the IBM
 * extended rows (0xFA-0xFC) and the NEC-selected rows (0xED-0xEE) - confirm
 * against the table users.
 */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * The parameter is parenthesized so expression arguments are tested as a
 * whole; it is evaluated twice, so avoid side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* rot13/47 mode */
470 static int hira_f = FALSE; /* hira/kata henkan */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
/*
 * NKF_INT32_C(n) spells an integer constant: when INT_IS_SHORT is defined
 * the constant gets an L (long) suffix, otherwise it is used as written.
 */
#if defined(INT_IS_SHORT)
#define NKF_INT32_C(n)   (n##L)
#else
#define NKF_INT32_C(n)   (n)
#endif
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
/* The top 8 bits of a value hold its class, the low 24 bits its value. */
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
/* True when the class bits of c equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
584 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
585 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
590 static int mimeout_mode = 0;
591 static int base64_count = 0;
593 /* X0208 -> ASCII converter */
596 static int f_line = 0; /* chars in line */
597 static int f_prev = 0;
598 static int fold_preserve_f = FALSE; /* preserve new lines */
599 static int fold_f = FALSE;
600 static int fold_len = 0;
603 static unsigned char kanji_intro = DEFAULT_J;
604 static unsigned char ascii_intro = DEFAULT_R;
608 #define FOLD_MARGIN 10
609 #define DEFAULT_FOLD 60
611 static int fold_margin = FOLD_MARGIN;
615 #ifdef DEFAULT_CODE_JIS
616 # define DEFAULT_CONV j_oconv
618 #ifdef DEFAULT_CODE_SJIS
619 # define DEFAULT_CONV s_oconv
621 #ifdef DEFAULT_CODE_EUC
622 # define DEFAULT_CONV e_oconv
624 #ifdef DEFAULT_CODE_UTF8
625 # define DEFAULT_CONV w_oconv
628 /* process default */
629 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
631 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
632 /* s_iconv or oconv */
633 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
635 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 /* static redirections */
645 static void (*o_putc)(nkf_char c) = std_putc;
647 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
648 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
650 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
651 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
653 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
655 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
656 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
658 /* for strict mime */
659 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
660 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
663 static int output_mode = ASCII, /* output kanji mode */
664 input_mode = ASCII, /* input kanji mode */
665 shift_mode = FALSE; /* TRUE shift out, or X0201 */
666 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
668 /* X0201 / X0208 conversion tables */
670 /* X0201 kana conversion table */
673 unsigned char cv[]= {
674 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
675 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
676 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
677 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
678 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
679 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
680 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
681 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
682 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
683 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
684 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
685 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
686 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
687 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
688 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
689 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
693 /* X0201 kana conversion table for dakuten */
696 unsigned char dv[]= {
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
702 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
703 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
704 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
705 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
706 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
708 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
715 /* X0201 kana conversion table for han-dakuten */
718 unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
741 unsigned char fv[] = {
743 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
744 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
745 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
746 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
747 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
748 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
749 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
751 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
753 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
760 static int file_out_f = FALSE;
762 static int overwrite_f = FALSE;
763 static int preserve_time_f = FALSE;
764 static int backup_f = FALSE;
765 static char *backup_suffix = "";
766 static char *get_backup_filename(const char *suffix, const char *filename);
769 static int crmode_f = 0; /* CR, NL, CRLF */
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
858 is_inputcode_mixed = FALSE;
859 is_inputcode_set = FALSE;
864 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
874 /* reopen file for stdout */
875 if (file_out_f == TRUE) {
878 outfname = malloc(strlen(origfname)
879 + strlen(".nkftmpXXXXXX")
885 strcpy(outfname, origfname);
889 for (i = strlen(outfname); i; --i){
890 if (outfname[i - 1] == '/'
891 || outfname[i - 1] == '\\'){
897 strcat(outfname, "ntXXXXXX");
899 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
902 strcat(outfname, ".nkftmpXXXXXX");
903 fd = mkstemp(outfname);
906 || (fd_backup = dup(fileno(stdout))) < 0
907 || dup2(fd, fileno(stdout)) < 0
918 outfname = "nkf.out";
921 if(freopen(outfname, "w", stdout) == NULL) {
925 if (binmode_f == TRUE) {
926 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
927 if (freopen("","wb",stdout) == NULL)
934 if (binmode_f == TRUE)
935 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
936 if (freopen("","rb",fin) == NULL)
941 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
945 char *filename = NULL;
947 if (nfiles > 1) filename = origfname;
948 if (guess_f) print_guessed_code(filename);
954 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
962 if (dup2(fd_backup, fileno(stdout)) < 0){
965 if (stat(origfname, &sb)) {
966 fprintf(stderr, "Can't stat %s\n", origfname);
968 /* restore the permissions */
969 if (chmod(outfname, sb.st_mode)) {
970 fprintf(stderr, "Can't set permission %s\n", outfname);
973 /* restore the timestamp */
975 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
976 tb[0] = tb[1] = sb.st_mtime;
977 if (utime(outfname, tb)) {
978 fprintf(stderr, "Can't set timestamp %s\n", outfname);
981 tb.actime = sb.st_atime;
982 tb.modtime = sb.st_mtime;
983 if (utime(outfname, &tb)) {
984 fprintf(stderr, "Can't set timestamp %s\n", outfname);
989 char *backup_filename = get_backup_filename(backup_suffix, origfname);
991 unlink(backup_filename);
993 if (rename(origfname, backup_filename)) {
994 perror(backup_filename);
995 fprintf(stderr, "Can't rename %s to %s\n",
996 origfname, backup_filename);
1000 if (unlink(origfname)){
1005 if (rename(outfname, origfname)) {
1007 fprintf(stderr, "Can't rename %s to %s\n",
1008 outfname, origfname);
1016 #ifdef EASYWIN /*Easy Win */
1017 if (file_out_f == FALSE)
1018 scanf("%d",&end_check);
1021 #else /* for Other OS */
1022 if (file_out_f == TRUE)
1024 #endif /*Easy Win */
1027 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied literally.
 * Otherwise `suffix` is appended to `filename`.
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller
 * (release it with free()), or NULL after reporting through perror() when
 * the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t i, j;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' (one char) is replaced by filename_length chars */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i];) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                i++;
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i++];
            }
        }
        backup_filename[j] = '\0';
    } else {
        j = suffix_length + filename_length;
        /* the buffer must hold filename + suffix + NUL, not just one byte */
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
        backup_filename[j] = '\0';
    }
    return backup_filename;
}
1095 {"katakana-hiragana","h3"},
1102 #ifdef UTF8_OUTPUT_ENABLE
1112 {"fb-subchar=", ""},
1114 #ifdef UTF8_INPUT_ENABLE
1115 {"utf8-input", "W"},
1116 {"utf16-input", "W16"},
1117 {"no-cp932ext", ""},
1118 {"no-best-fit-chars",""},
1120 #ifdef UNICODE_NORMALIZATION
1121 {"utf8mac-input", ""},
1133 #ifdef NUMCHAR_OPTION
1134 {"numchar-input", ""},
1140 #ifdef SHIFTJIS_CP932
1150 static int option_mode = 0;
1152 void options(unsigned char *cp)
1156 unsigned char *cp_back = NULL;
1161 while(*cp && *cp++!='-');
1162 while (*cp || cp_back) {
1170 case '-': /* literal options */
1171 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1175 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1176 p = (unsigned char *)long_option[i].name;
1177 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1178 if (*p == cp[j] || cp[j] == ' '){
1185 while(*cp && *cp != SPACE && cp++);
1186 if (long_option[i].alias[0]){
1188 cp = (unsigned char *)long_option[i].alias;
1190 if (strcmp(long_option[i].name, "ic=") == 0){
1191 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1192 codeset[i] = nkf_toupper(p[i]);
1195 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1196 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1197 strcmp(codeset, "CP50220") == 0 ||
1198 strcmp(codeset, "CP50221") == 0 ||
1199 strcmp(codeset, "CP50222") == 0 ||
1200 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1201 input_f = JIS_INPUT;
1202 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1203 input_f = JIS_INPUT;
1207 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1208 input_f = JIS_INPUT;
1213 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1214 input_f = SJIS_INPUT;
1215 if (x0201_f==NO_X0201) x0201_f=TRUE;
1216 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1217 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1218 strcmp(codeset, "CP932") == 0 ||
1219 strcmp(codeset, "MS932") == 0){
1220 input_f = SJIS_INPUT;
1222 #ifdef SHIFTJIS_CP932
1225 #ifdef UTF8_OUTPUT_ENABLE
1226 ms_ucs_map_f = UCS_MAP_CP932;
1228 }else if(strcmp(codeset, "EUCJP") == 0 ||
1229 strcmp(codeset, "EUC-JP") == 0){
1230 input_f = EUC_INPUT;
1231 }else if(strcmp(codeset, "CP51932") == 0){
1232 input_f = EUC_INPUT;
1234 #ifdef SHIFTJIS_CP932
1237 #ifdef UTF8_OUTPUT_ENABLE
1238 ms_ucs_map_f = UCS_MAP_CP932;
1240 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1241 strcmp(codeset, "EUCJP-MS") == 0 ||
1242 strcmp(codeset, "EUCJPMS") == 0){
1243 input_f = EUC_INPUT;
1245 #ifdef SHIFTJIS_CP932
1248 #ifdef UTF8_OUTPUT_ENABLE
1249 ms_ucs_map_f = UCS_MAP_MS;
1251 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1252 strcmp(codeset, "EUCJP-ASCII") == 0){
1253 input_f = EUC_INPUT;
1255 #ifdef SHIFTJIS_CP932
1258 #ifdef UTF8_OUTPUT_ENABLE
1259 ms_ucs_map_f = UCS_MAP_ASCII;
1261 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1262 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1263 input_f = SJIS_INPUT;
1265 #ifdef SHIFTJIS_CP932
1269 if (x0201_f==NO_X0201) x0201_f=TRUE;
1270 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1271 strcmp(codeset, "EUC-JIS-2004") == 0){
1272 input_f = EUC_INPUT;
1275 #ifdef SHIFTJIS_CP932
1279 #ifdef UTF8_INPUT_ENABLE
1280 }else if(strcmp(codeset, "UTF-8") == 0 ||
1281 strcmp(codeset, "UTF-8N") == 0 ||
1282 strcmp(codeset, "UTF-8-BOM") == 0){
1283 input_f = UTF8_INPUT;
1284 #ifdef UNICODE_NORMALIZATION
1285 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1286 strcmp(codeset, "UTF-8-MAC") == 0){
1287 input_f = UTF8_INPUT;
1290 }else if(strcmp(codeset, "UTF-16") == 0 ||
1291 strcmp(codeset, "UTF-16BE") == 0 ||
1292 strcmp(codeset, "UTF-16BE-BOM") == 0){
1293 input_f = UTF16_INPUT;
1294 input_endian = ENDIAN_BIG;
1295 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1296 strcmp(codeset, "UTF-16LE-BOM") == 0){
1297 input_f = UTF16_INPUT;
1298 input_endian = ENDIAN_LITTLE;
1299 }else if(strcmp(codeset, "UTF-32") == 0 ||
1300 strcmp(codeset, "UTF-32BE") == 0 ||
1301 strcmp(codeset, "UTF-32BE-BOM") == 0){
1302 input_f = UTF32_INPUT;
1303 input_endian = ENDIAN_BIG;
1304 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1305 strcmp(codeset, "UTF-32LE-BOM") == 0){
1306 input_f = UTF32_INPUT;
1307 input_endian = ENDIAN_LITTLE;
1312 if (strcmp(long_option[i].name, "oc=") == 0){
1313 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1314 codeset[i] = nkf_toupper(p[i]);
1317 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1318 strcmp(codeset, "CP50220") == 0){
1319 output_conv = j_oconv;
1320 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1321 output_conv = j_oconv;
1322 no_cp932ext_f = TRUE;
1323 }else if(strcmp(codeset, "CP50221") == 0 ||
1324 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1325 output_conv = j_oconv;
1327 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1328 output_conv = j_oconv;
1332 #ifdef SHIFTJIS_CP932
1335 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1336 output_conv = j_oconv;
1341 #ifdef SHIFTJIS_CP932
1344 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1345 output_conv = j_oconv;
1350 #ifdef SHIFTJIS_CP932
1353 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1354 output_conv = s_oconv;
1355 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1356 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1357 strcmp(codeset, "CP932") == 0 ||
1358 strcmp(codeset, "MS932") == 0){
1359 output_conv = s_oconv;
1361 #ifdef SHIFTJIS_CP932
1365 #ifdef UTF8_OUTPUT_ENABLE
1366 ms_ucs_map_f = UCS_MAP_CP932;
1368 }else if(strcmp(codeset, "EUCJP") == 0 ||
1369 strcmp(codeset, "EUC-JP") == 0){
1370 output_conv = e_oconv;
1371 }else if(strcmp(codeset, "CP51932") == 0){
1372 output_conv = e_oconv;
1374 #ifdef SHIFTJIS_CP932
1377 #ifdef UTF8_OUTPUT_ENABLE
1378 ms_ucs_map_f = UCS_MAP_CP932;
1380 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1381 strcmp(codeset, "EUCJP-MS") == 0 ||
1382 strcmp(codeset, "EUCJPMS") == 0){
1383 output_conv = e_oconv;
1388 #ifdef SHIFTJIS_CP932
1391 #ifdef UTF8_OUTPUT_ENABLE
1392 ms_ucs_map_f = UCS_MAP_MS;
1394 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1395 strcmp(codeset, "EUCJP-ASCII") == 0){
1396 output_conv = e_oconv;
1401 #ifdef SHIFTJIS_CP932
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_ASCII;
1407 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1408 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1409 output_conv = s_oconv;
1411 #ifdef SHIFTJIS_CP932
1414 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1415 strcmp(codeset, "EUC-JIS-2004") == 0){
1416 output_conv = e_oconv;
1421 #ifdef SHIFTJIS_CP932
1424 #ifdef UTF8_OUTPUT_ENABLE
1425 }else if(strcmp(codeset, "UTF-8") == 0){
1426 output_conv = w_oconv;
1427 }else if(strcmp(codeset, "UTF-8N") == 0){
1428 output_conv = w_oconv;
1429 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1430 output_conv = w_oconv;
1431 output_bom_f = TRUE;
1432 }else if(strcmp(codeset, "UTF-16BE") == 0){
1433 output_conv = w_oconv16;
1434 }else if(strcmp(codeset, "UTF-16") == 0 ||
1435 strcmp(codeset, "UTF-16BE-BOM") == 0){
1436 output_conv = w_oconv16;
1437 output_bom_f = TRUE;
1438 }else if(strcmp(codeset, "UTF-16LE") == 0){
1439 output_conv = w_oconv16;
1440 output_endian = ENDIAN_LITTLE;
1441 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1442 output_conv = w_oconv16;
1443 output_endian = ENDIAN_LITTLE;
1444 output_bom_f = TRUE;
1445 }else if(strcmp(codeset, "UTF-32") == 0 ||
1446 strcmp(codeset, "UTF-32BE") == 0){
1447 output_conv = w_oconv32;
1448 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1449 output_conv = w_oconv32;
1450 output_bom_f = TRUE;
1451 }else if(strcmp(codeset, "UTF-32LE") == 0){
1452 output_conv = w_oconv32;
1453 output_endian = ENDIAN_LITTLE;
1454 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1455 output_conv = w_oconv32;
1456 output_endian = ENDIAN_LITTLE;
1457 output_bom_f = TRUE;
1463 if (strcmp(long_option[i].name, "overwrite") == 0){
1466 preserve_time_f = TRUE;
1469 if (strcmp(long_option[i].name, "overwrite=") == 0){
1472 preserve_time_f = TRUE;
1474 backup_suffix = malloc(strlen((char *) p) + 1);
1475 strcpy(backup_suffix, (char *) p);
1478 if (strcmp(long_option[i].name, "in-place") == 0){
1481 preserve_time_f = FALSE;
1484 if (strcmp(long_option[i].name, "in-place=") == 0){
1487 preserve_time_f = FALSE;
1489 backup_suffix = malloc(strlen((char *) p) + 1);
1490 strcpy(backup_suffix, (char *) p);
1495 if (strcmp(long_option[i].name, "cap-input") == 0){
1499 if (strcmp(long_option[i].name, "url-input") == 0){
1504 #ifdef NUMCHAR_OPTION
1505 if (strcmp(long_option[i].name, "numchar-input") == 0){
1511 if (strcmp(long_option[i].name, "no-output") == 0){
1515 if (strcmp(long_option[i].name, "debug") == 0){
1520 if (strcmp(long_option[i].name, "cp932") == 0){
1521 #ifdef SHIFTJIS_CP932
1525 #ifdef UTF8_OUTPUT_ENABLE
1526 ms_ucs_map_f = UCS_MAP_CP932;
1530 if (strcmp(long_option[i].name, "no-cp932") == 0){
1531 #ifdef SHIFTJIS_CP932
1535 #ifdef UTF8_OUTPUT_ENABLE
1536 ms_ucs_map_f = UCS_MAP_ASCII;
1540 #ifdef SHIFTJIS_CP932
1541 if (strcmp(long_option[i].name, "cp932inv") == 0){
1548 if (strcmp(long_option[i].name, "x0212") == 0){
1555 if (strcmp(long_option[i].name, "exec-in") == 0){
1559 if (strcmp(long_option[i].name, "exec-out") == 0){
1564 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1565 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1566 no_cp932ext_f = TRUE;
1569 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1570 no_best_fit_chars_f = TRUE;
1573 if (strcmp(long_option[i].name, "fb-skip") == 0){
1574 encode_fallback = NULL;
1577 if (strcmp(long_option[i].name, "fb-html") == 0){
1578 encode_fallback = encode_fallback_html;
1581 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1582 encode_fallback = encode_fallback_xml;
1585 if (strcmp(long_option[i].name, "fb-java") == 0){
1586 encode_fallback = encode_fallback_java;
1589 if (strcmp(long_option[i].name, "fb-perl") == 0){
1590 encode_fallback = encode_fallback_perl;
1593 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1594 encode_fallback = encode_fallback_subchar;
1597 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1598 encode_fallback = encode_fallback_subchar;
1599 unicode_subchar = 0;
1601 /* decimal number */
1602 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1603 unicode_subchar *= 10;
1604 unicode_subchar += hex2bin(p[i]);
1606 }else if(p[1] == 'x' || p[1] == 'X'){
1607 /* hexadecimal number */
1608 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1609 unicode_subchar <<= 4;
1610 unicode_subchar |= hex2bin(p[i]);
1614 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1615 unicode_subchar *= 8;
1616 unicode_subchar += hex2bin(p[i]);
1619 w16e_conv(unicode_subchar, &i, &j);
1620 unicode_subchar = i<<8 | j;
1624 #ifdef UTF8_OUTPUT_ENABLE
1625 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1626 ms_ucs_map_f = UCS_MAP_MS;
1630 #ifdef UNICODE_NORMALIZATION
1631 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1632 input_f = UTF8_INPUT;
1637 if (strcmp(long_option[i].name, "prefix=") == 0){
1638 if (nkf_isgraph(p[0])){
1639 for (i = 1; nkf_isgraph(p[i]); i++){
1640 prefix_table[p[i]] = p[0];
1647 case 'b': /* buffered mode */
1650 case 'u': /* non bufferd mode */
1653 case 't': /* transparent mode */
1658 } else if (*cp=='2') {
1662 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1670 case 'j': /* JIS output */
1672 output_conv = j_oconv;
1674 case 'e': /* AT&T EUC output */
1675 output_conv = e_oconv;
1677 case 's': /* SJIS output */
1678 output_conv = s_oconv;
1680 case 'l': /* ISO8859 Latin-1 support, no conversion */
1681 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1682 input_f = LATIN1_INPUT;
1684 case 'i': /* Kanji IN ESC-$-@/B */
1685 if (*cp=='@'||*cp=='B')
1686 kanji_intro = *cp++;
1688 case 'o': /* ASCII IN ESC-(-J/B */
1689 if (*cp=='J'||*cp=='B'||*cp=='H')
1690 ascii_intro = *cp++;
1694 bit:1 katakana->hiragana
1695 bit:2 hiragana->katakana
1697 if ('9'>= *cp && *cp>='0')
1698 hira_f |= (*cp++ -'0');
1705 #if defined(MSDOS) || defined(__OS2__)
1720 #ifdef UTF8_OUTPUT_ENABLE
1721 case 'w': /* UTF-8 output */
1723 output_conv = w_oconv; cp++;
1727 output_bom_f = TRUE;
1730 if ('1'== cp[0] && '6'==cp[1]) {
1731 output_conv = w_oconv16; cp+=2;
1732 } else if ('3'== cp[0] && '2'==cp[1]) {
1733 output_conv = w_oconv32; cp+=2;
1735 output_conv = w_oconv;
1740 output_endian = ENDIAN_LITTLE;
1741 } else if (cp[0] == 'B') {
1749 output_bom_f = TRUE;
1754 #ifdef UTF8_INPUT_ENABLE
1755 case 'W': /* UTF input */
1758 input_f = UTF8_INPUT;
1760 if ('1'== cp[0] && '6'==cp[1]) {
1762 input_f = UTF16_INPUT;
1763 input_endian = ENDIAN_BIG;
1764 } else if ('3'== cp[0] && '2'==cp[1]) {
1766 input_f = UTF32_INPUT;
1767 input_endian = ENDIAN_BIG;
1769 input_f = UTF8_INPUT;
1774 input_endian = ENDIAN_LITTLE;
1775 } else if (cp[0] == 'B') {
1781 /* Input code assumption */
1782 case 'J': /* JIS input */
1783 input_f = JIS_INPUT;
1785 case 'E': /* AT&T EUC input */
1786 input_f = EUC_INPUT;
1788 case 'S': /* MS Kanji input */
1789 input_f = SJIS_INPUT;
1790 if (x0201_f==NO_X0201) x0201_f=TRUE;
1792 case 'Z': /* Convert X0208 alphabet to asii */
1793 /* bit:0 Convert X0208
1794 bit:1 Convert Kankaku to one space
1795 bit:2 Convert Kankaku to two spaces
1796 bit:3 Convert HTML Entity
1798 if ('9'>= *cp && *cp>='0')
1799 alpha_f |= 1<<(*cp++ -'0');
1803 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1804 x0201_f = FALSE; /* No X0201->X0208 conversion */
1806 ESC-(-I in JIS, EUC, MS Kanji
1807 SI/SO in JIS, EUC, MS Kanji
1808 SSO in EUC, JIS, not in MS Kanji
1809 MS Kanji (0xa0-0xdf)
1811 ESC-(-I in JIS (0x20-0x5f)
1812 SSO in EUC (0xa0-0xdf)
1813 0xa0-0xd in MS Kanji (0xa0-0xdf)
1816 case 'X': /* Assume X0201 kana */
1817 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1820 case 'F': /* prserve new lines */
1821 fold_preserve_f = TRUE;
1822 case 'f': /* folding -f60 or -f */
1825 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1827 fold_len += *cp++ - '0';
1829 if (!(0<fold_len && fold_len<BUFSIZ))
1830 fold_len = DEFAULT_FOLD;
1834 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1836 fold_margin += *cp++ - '0';
1840 case 'm': /* MIME support */
1841 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1842 if (*cp=='B'||*cp=='Q') {
1843 mime_decode_mode = *cp++;
1844 mimebuf_f = FIXED_MIME;
1845 } else if (*cp=='N') {
1846 mime_f = TRUE; cp++;
1847 } else if (*cp=='S') {
1848 mime_f = STRICT_MIME; cp++;
1849 } else if (*cp=='0') {
1850 mime_decode_f = FALSE;
1851 mime_f = FALSE; cp++;
1854 case 'M': /* MIME output */
1857 mimeout_f = FIXED_MIME; cp++;
1858 } else if (*cp=='Q') {
1860 mimeout_f = FIXED_MIME; cp++;
1865 case 'B': /* Broken JIS support */
1867 bit:1 allow any x on ESC-(-x or ESC-$-x
1868 bit:2 reset to ascii on NL
1870 if ('9'>= *cp && *cp>='0')
1871 broken_f |= 1<<(*cp++ -'0');
1876 case 'O':/* for Output file */
1880 case 'c':/* add cr code */
1883 case 'd':/* delete cr code */
1886 case 'I': /* ISO-2022-JP output */
1889 case 'L': /* line mode */
1890 if (*cp=='u') { /* unix */
1891 crmode_f = NL; cp++;
1892 } else if (*cp=='m') { /* mac */
1893 crmode_f = CR; cp++;
1894 } else if (*cp=='w') { /* windows */
1895 crmode_f = CRLF; cp++;
1896 } else if (*cp=='0') { /* no conversion */
1906 /* module: multiple options in a string are allowed for the Perl module */
1907 while(*cp && *cp++!='-');
1910 /* bogus option but ignored */
/*
 * find_inputcode_byfunc: look up the input_code_list entry whose iconv_func
 * member equals the given converter function pointer.
 *
 * NOTE(review): only the list-head initialisation and the pointer comparison
 * are visible in this listing; the loop and the return statements are not
 * shown, so the "not found" result must be confirmed against the full source.
 */
1916 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1919 struct input_code *p = input_code_list;
1921 if (iconv_func == p->iconv_func){
/*
 * set_iconv: select the input conversion routine.
 *
 *   f          - selection flag; per the inline comment below, -TRUE means
 *                "FORCE".
 *   iconv_func - candidate converter taking (c2, c1, c0).
 *
 * When the input code is established (estab_f) and the active converter
 * (iconv) differs from the one recorded in iconv_for_check, the matching
 * input_code entry is looked up, its name is stored through
 * set_input_codename() and reported through debug().
 *
 * NOTE(review): the statements that actually assign estab_f / iconv are not
 * visible in this listing.
 */
1930 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1932 #ifdef INPUT_CODE_FIX
1940 #ifdef INPUT_CODE_FIX
1941 && (f == -TRUE || !input_f) /* -TRUE means "FORCE"; otherwise only when no input_f is set */
1947 if (estab_f && iconv_for_check != iconv){
1948 struct input_code *p = find_inputcode_byfunc(iconv);
1950 set_input_codename(p->name);
1951 debug(input_codename);
/* remember the current converter for the next comparison */
1953 iconv_for_check = iconv;
/*
 * Score bits OR-ed into input_code.score (see set_code_score) while guessing
 * the input encoding.  Each macro is the previous one shifted left by one
 * bit.  Elsewhere in this file the candidate with the smallest score is
 * chosen, so a higher bit is a stronger hint against a candidate.
 * SCORE_INIT is the value status_reset() stores into a candidate's score.
 */
1958 #define SCORE_L2 (1) /* JIS level-2 kanji */
1959 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1960 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1961 #ifdef SHIFTJIS_CP932
1962 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character substituted via CP932 */
1963 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* non-existent character */
1965 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* non-existent character */
1967 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1968 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1970 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score bits indexed by the low nibble of c2; code_score() uses this table
 * when (c2 & 0x70) == 0x20.
 * NOTE(review): only part of the initialiser is visible in this listing.
 */
1972 const nkf_char score_table_A0[] = {
1975 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1976 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score bits indexed by the low nibble of c2; code_score() uses this table
 * when (c2 & 0x70) == 0x70.  The last slot carries SCORE_ERROR.
 */
1979 const nkf_char score_table_F0[] = {
1980 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1981 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1982 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1983 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* set_code_score: OR the given score bit(s) into ptr->score. */
1986 void set_code_score(struct input_code *ptr, nkf_char score)
1989 ptr->score |= score;
/* clr_code_score: clear the given score bit(s) in ptr->score. */
1993 void clr_code_score(struct input_code *ptr, nkf_char score)
1996 ptr->score &= ~score;
/*
 * code_score: classify the character currently held in ptr->buf[0..1] and
 * add the matching SCORE_* bit(s) to the candidate's score.
 *
 * NOTE(review): the condition guarding the first SCORE_ERROR branch is not
 * visible in this listing.
 */
2000 void code_score(struct input_code *ptr)
2002 nkf_char c2 = ptr->buf[0];
2003 #ifdef UTF8_OUTPUT_ENABLE
2004 nkf_char c1 = ptr->buf[1];
2007 set_code_score(ptr, SCORE_ERROR);
/* c2 == SSO: scored as kana */
2008 }else if (c2 == SSO){
2009 set_code_score(ptr, SCORE_KANA);
2010 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() yields 0: scored as a non-existent character */
2011 }else if (!e2w_conv(c2, c1)){
2012 set_code_score(ptr, SCORE_NO_EXIST);
/* rows selected by bits 4-6 of c2 use the per-nibble tables */
2014 }else if ((c2 & 0x70) == 0x20){
2015 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2016 }else if ((c2 & 0x70) == 0x70){
2017 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* remaining rows with (c2 & 0x70) >= 0x50 are scored as level-2 kanji */
2018 }else if ((c2 & 0x70) >= 0x50){
2019 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: rule out this candidate encoding.  If its converter is the
 * currently active iconv, set_iconv(FALSE, 0) is called.
 * NOTE(review): the statements that mark ptr itself as disabled are not
 * visible in this listing.
 */
2023 void status_disable(struct input_code *ptr)
2028 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to ptr->buf at ptr->index and advance the index.
 * No bounds check is visible here.
 */
2031 void status_push_ch(struct input_code *ptr, nkf_char c)
2033 ptr->buf[ptr->index++] = c;
/*
 * status_clear: NOTE(review): the body is not visible in this listing;
 * judging by the name it presumably clears the candidate's per-character
 * state -- confirm against the full source.
 */
2036 void status_clear(struct input_code *ptr)
/*
 * status_reset: set the candidate's score back to SCORE_INIT.
 * NOTE(review): any other statements of the body are not visible here.
 */
2042 void status_reset(struct input_code *ptr)
2045 ptr->score = SCORE_INIT;
/*
 * status_reinit: zero the candidate's _file_stat member.
 * NOTE(review): any other statements of the body are not visible here.
 */
2048 void status_reinit(struct input_code *ptr)
2051 ptr->_file_stat = 0;
/*
 * status_check: acts only when c <= DEL and the input code is already
 * established (estab_f).
 * NOTE(review): the action taken inside the condition is not visible in
 * this listing.
 */
2054 void status_check(struct input_code *ptr, nkf_char c)
2056 if (c <= DEL && estab_f){
/*
 * s_status: per-byte detection state for the Shift_JIS candidate (the one
 * whose bytes are converted with s2e_conv).
 *
 *   ptr - candidate state (buf/index/score are updated here)
 *   c   - next input byte
 *
 * NOTE(review): the switch/case lines on the candidate's state are not
 * visible in this listing; the grouping below is inferred from the visible
 * byte-range tests.
 */
2061 void s_status(struct input_code *ptr, nkf_char c)
2065 status_check(ptr, c);
2070 #ifdef NUMCHAR_OPTION
2071 }else if (is_unicode_capsule(c)){
/* single byte 0xa1-0xdf: stored as the pair SSO + c */
2074 }else if (0xa1 <= c && c <= 0xdf){
2075 status_push_ch(ptr, SSO);
2076 status_push_ch(ptr, c);
/* first byte of a double-byte character: 0x81-0x9f or 0xe0-0xef */
2079 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2081 status_push_ch(ptr, c);
2082 #ifdef SHIFTJIS_CP932
2084 && is_ibmext_in_sjis(c)){
2086 status_push_ch(ptr, c);
2087 #endif /* SHIFTJIS_CP932 */
/* with x0212_f set, first bytes 0xf0-0xfc are accepted as well */
2089 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2091 status_push_ch(ptr, c);
2092 #endif /* X0212_ENABLE */
2094 status_disable(ptr);
/* second byte must be 0x40-0x7e or 0x80-0xfc; the pair is then converted in place by s2e_conv() */
2098 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2099 status_push_ch(ptr, c);
2100 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2104 status_disable(ptr);
2108 #ifdef SHIFTJIS_CP932
/* same second-byte test; a zero return from s2e_conv() adds SCORE_CP932 */
2109 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2110 status_push_ch(ptr, c);
2111 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2112 set_code_score(ptr, SCORE_CP932);
2117 #endif /* SHIFTJIS_CP932 */
2118 #ifndef X0212_ENABLE
2119 status_disable(ptr);
2125 void e_status(struct input_code *ptr, nkf_char c)
2129 status_check(ptr, c);
2134 #ifdef NUMCHAR_OPTION
2135 }else if (is_unicode_capsule(c)){
2138 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2140 status_push_ch(ptr, c);
2142 }else if (0x8f == c){
2144 status_push_ch(ptr, c);
2145 #endif /* X0212_ENABLE */
2147 status_disable(ptr);
2151 if (0xa1 <= c && c <= 0xfe){
2152 status_push_ch(ptr, c);
2156 status_disable(ptr);
2161 if (0xa1 <= c && c <= 0xfe){
2163 status_push_ch(ptr, c);
2165 status_disable(ptr);
2167 #endif /* X0212_ENABLE */
2171 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte detection state for the UTF-8 candidate.
 *
 *   ptr - candidate state (buf/index are updated here)
 *   c   - next input byte
 *
 * NOTE(review): the switch/case lines on the candidate's state and the
 * assignments to ptr->stat are not visible in this listing.
 */
2172 void w_status(struct input_code *ptr, nkf_char c)
2176 status_check(ptr, c);
2181 #ifdef NUMCHAR_OPTION
2182 }else if (is_unicode_capsule(c)){
/* lead byte ranges: 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 */
2185 }else if (0xc0 <= c && c <= 0xdf){
2187 status_push_ch(ptr, c);
2188 }else if (0xe0 <= c && c <= 0xef){
2190 status_push_ch(ptr, c);
2191 }else if (0xf0 <= c && c <= 0xf4){
2193 status_push_ch(ptr, c);
2195 status_disable(ptr);
/* continuation byte must be 0x80-0xbf */
2200 if (0x80 <= c && c <= 0xbf){
2201 status_push_ch(ptr, c);
2202 if (ptr->index > ptr->stat){
/* bom is non-zero when the collected bytes are EF BB BF */
2203 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2204 && ptr->buf[2] == 0xbf);
/* convert the collected sequence in place (result written to buf[0], buf[1]) */
2205 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2206 &ptr->buf[0], &ptr->buf[1]);
2213 status_disable(ptr);
2217 if (0x80 <= c && c <= 0xbf){
2218 if (ptr->index < ptr->stat){
2219 status_push_ch(ptr, c);
2224 status_disable(ptr);
/*
 * code_status: feed one input byte to the status_func of the candidates in
 * input_code_list.  If a result candidate is found while the input code is
 * not yet established, it is installed with set_iconv(TRUE, ...).
 *
 * NOTE(review): the loop structure and the rules that set `result` and
 * `action_flag` are not visible in this listing.
 */
2231 void code_status(nkf_char c)
2233 int action_flag = 1;
2234 struct input_code *result = 0;
2235 struct input_code *p = input_code_list;
2237 if (!p->status_func) {
2241 if (!p->status_func)
2243 (p->status_func)(p, c);
2246 }else if(p->stat == 0){
2257 if (result && !estab_f){
2258 set_iconv(TRUE, result->iconv_func);
2259 }else if (c <= DEL){
2260 struct input_code *ptr = input_code_list;
/*
 * std_getc: input primitive.  The visible statement returns the most
 * recently pushed-back character from std_gc_buf (see std_ungetc).
 * NOTE(review): the guard on std_gc_ndx and the read from f are not visible
 * in this listing.
 */
2270 nkf_char std_getc(FILE *f)
2273 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push c back onto std_gc_buf so that a later std_getc returns it.
 * NOTE(review): what happens when the buffer already holds STD_GC_BUFSIZE
 * entries, and the return value, are not visible in this listing.
 */
2279 nkf_char std_ungetc(nkf_char c, FILE *f)
2281 if (std_gc_ndx == STD_GC_BUFSIZE){
2284 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc: NOTE(review): the body is not visible in this listing; judging
 * by the name it presumably writes c to the output -- confirm against the
 * full source.
 */
2289 void std_putc(nkf_char c)
2296 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: set up the filter chain with module_connection() and then read
 * f through (*i_getc) until EOF.
 * NOTE(review): the loop body and the return value are not visible in this
 * listing.
 */
2297 nkf_char noconvert(FILE *f)
2302 module_connection();
2303 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection: wire up the output and input filter chains from the
 * option flags.
 *
 * Output side: starting from output_conv, each enabled stage saves the
 * current oconv in its own o_* pointer and installs itself as the new oconv.
 * Input side: each enabled stage saves i_getc / i_ungetc in its own pair of
 * pointers and installs its getc / ungetc in their place.
 * Finally the input converter is chosen: an explicit input_f forces a
 * converter with set_iconv(-TRUE, ...); otherwise e_iconv is set with
 * set_iconv(FALSE, ...).
 *
 * NOTE(review): several of the `if` conditions guarding individual stages
 * are not visible in this listing.
 */
2310 void module_connection(void)
2312 oconv = output_conv;
2315 /* replace continuation module, from output side */
2317 /* output redirection */
2319 if (noout_f || guess_f){
2326 if (mimeout_f == TRUE) {
2327 o_base64conv = oconv; oconv = base64_conv;
2329 /* base64_count = 0; */
2333 o_crconv = oconv; oconv = cr_conv;
2336 o_rot_conv = oconv; oconv = rot_conv;
2339 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2342 o_hira_conv = oconv; oconv = hira_conv;
2345 o_fconv = oconv; oconv = fold_conv;
2348 if (alpha_f || x0201_f) {
2349 o_zconv = oconv; oconv = z_conv;
2353 i_ungetc = std_ungetc;
2354 /* input redirection */
2357 i_cgetc = i_getc; i_getc = cap_getc;
2358 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2361 i_ugetc = i_getc; i_getc = url_getc;
2362 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2365 #ifdef NUMCHAR_OPTION
2367 i_ngetc = i_getc; i_getc = numchar_getc;
2368 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2371 #ifdef UNICODE_NORMALIZATION
2372 if (nfc_f && input_f == UTF8_INPUT){
2373 i_nfc_getc = i_getc; i_getc = nfc_getc;
2374 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2377 if (mime_f && mimebuf_f==FIXED_MIME) {
2378 i_mgetc = i_getc; i_getc = mime_getc;
2379 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2382 i_bgetc = i_getc; i_getc = broken_getc;
2383 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* JIS, EUC and Latin-1 input all use e_iconv */
2385 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2386 set_iconv(-TRUE, e_iconv);
2387 } else if (input_f == SJIS_INPUT) {
2388 set_iconv(-TRUE, s_iconv);
2389 #ifdef UTF8_INPUT_ENABLE
2390 } else if (input_f == UTF8_INPUT) {
2391 set_iconv(-TRUE, w_iconv);
2392 } else if (input_f == UTF16_INPUT) {
2393 set_iconv(-TRUE, w_iconv16);
2394 } else if (input_f == UTF32_INPUT) {
2395 set_iconv(-TRUE, w_iconv32);
/* no explicit input code: start with e_iconv, not forced */
2398 set_iconv(FALSE, e_iconv);
2402 struct input_code *p = input_code_list;
2410 * Check and Ignore BOM
/*
 * check_bom: look at the first bytes of f for a byte order mark and, when
 * one is recognised, select the matching converter and byte order.
 *
 * Byte sequences recognised by the visible code:
 *   00 00 FE FF  -> w_iconv32, input_endian = ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, input_endian = ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, input_endian = ENDIAN_3412
 *   FE FF        -> w_iconv16, input_endian = ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, input_endian = ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, input_endian = ENDIAN_LITTLE
 *
 * Bytes that were read but not accepted are pushed back with (*i_ungetc) in
 * reverse order, so the stream is left as it was found.
 *
 * NOTE(review): the case labels, the conditions around each set_iconv()
 * call and the early returns are not visible in this listing.
 */
2412 void check_bom(FILE *f)
2415 switch(c2 = (*i_getc)(f)){
2417 if((c2 = (*i_getc)(f)) == 0x00){
2418 if((c2 = (*i_getc)(f)) == 0xFE){
2419 if((c2 = (*i_getc)(f)) == 0xFF){
2421 set_iconv(TRUE, w_iconv32);
2423 if (iconv == w_iconv32) {
2424 input_endian = ENDIAN_BIG;
2427 (*i_ungetc)(0xFF,f);
2428 }else (*i_ungetc)(c2,f);
2429 (*i_ungetc)(0xFE,f);
2430 }else if(c2 == 0xFF){
2431 if((c2 = (*i_getc)(f)) == 0xFE){
2433 set_iconv(TRUE, w_iconv32);
2435 if (iconv == w_iconv32) {
2436 input_endian = ENDIAN_2143;
2439 (*i_ungetc)(0xFF,f);
2440 }else (*i_ungetc)(c2,f);
2441 (*i_ungetc)(0xFF,f);
2442 }else (*i_ungetc)(c2,f);
2443 (*i_ungetc)(0x00,f);
2444 }else (*i_ungetc)(c2,f);
2445 (*i_ungetc)(0x00,f);
2448 if((c2 = (*i_getc)(f)) == 0xBB){
2449 if((c2 = (*i_getc)(f)) == 0xBF){
2451 set_iconv(TRUE, w_iconv);
2453 if (iconv == w_iconv) {
2456 (*i_ungetc)(0xBF,f);
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0xBB,f);
2459 }else (*i_ungetc)(c2,f);
2460 (*i_ungetc)(0xEF,f);
2463 if((c2 = (*i_getc)(f)) == 0xFF){
2464 if((c2 = (*i_getc)(f)) == 0x00){
2465 if((c2 = (*i_getc)(f)) == 0x00){
2467 set_iconv(TRUE, w_iconv32);
2469 if (iconv == w_iconv32) {
2470 input_endian = ENDIAN_3412;
2473 (*i_ungetc)(0x00,f);
2474 }else (*i_ungetc)(c2,f);
2475 (*i_ungetc)(0x00,f);
2476 }else (*i_ungetc)(c2,f);
2478 set_iconv(TRUE, w_iconv16);
2480 if (iconv == w_iconv16) {
2481 input_endian = ENDIAN_BIG;
2484 (*i_ungetc)(0xFF,f);
2485 }else (*i_ungetc)(c2,f);
2486 (*i_ungetc)(0xFE,f);
2489 if((c2 = (*i_getc)(f)) == 0xFE){
2490 if((c2 = (*i_getc)(f)) == 0x00){
2491 if((c2 = (*i_getc)(f)) == 0x00){
2493 set_iconv(TRUE, w_iconv32);
2495 if (iconv == w_iconv32) {
2496 input_endian = ENDIAN_LITTLE;
2499 (*i_ungetc)(0x00,f);
2500 }else (*i_ungetc)(c2,f);
2501 (*i_ungetc)(0x00,f);
2502 }else (*i_ungetc)(c2,f);
2504 set_iconv(TRUE, w_iconv16);
2506 if (iconv == w_iconv16) {
2507 input_endian = ENDIAN_LITTLE;
2510 (*i_ungetc)(0xFE,f);
2511 }else (*i_ungetc)(c2,f);
2512 (*i_ungetc)(0xFF,f);
2521 Conversion main loop. Code detection only.
/*
 * kanji_convert: main conversion loop for one input stream.
 *
 *   f - input stream, read one byte at a time through (*i_getc).
 *
 * Outline of what the visible code does:
 *   - calls module_connection() to build the filter chain, then loops until
 *     (*i_getc)(f) returns EOF;
 *   - while the input code is not established and no MIME decoding is in
 *     progress, 8-bit input is handed to h_conv() for detection;
 *   - for w_iconv16 / w_iconv32 input, further bytes are read and combined
 *     according to input_endian;
 *   - "=?" is checked as a MIME start sequence when mime_f is set
 *     (mime_begin_strict() for STRICT_MIME, mime_begin() otherwise);
 *   - ESC sequences switch input_mode (kanji introduction, X0201, X0213_1,
 *     X0213_2, back to ASCII); broken_f bits relax what is accepted;
 *   - completed characters go through (*iconv)(c2, c1, c0) and (*oconv);
 *   - after the loop (*iconv)(EOF, 0, 0) is called and, if no input code
 *     name has been set, the candidate with the smallest score names the
 *     input code.
 *
 * NEXT / SEND / LAST are local shorthands for continue / empty statement /
 * break inside the loop.
 *
 * NOTE(review): this listing omits many lines of the function (including
 * some comment delimiters), so the exact control flow and the return value
 * must be confirmed against the full source.
 */
2524 nkf_char kanji_convert(FILE *f)
2526 nkf_char c3, c2=0, c1, c0=0;
2527 int is_8bit = FALSE;
2529 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2530 #ifdef UTF8_INPUT_ENABLE
2531 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2538 output_mode = ASCII;
2541 #define NEXT continue /* no output, get next */
2542 #define SEND ; /* output c1 and c2, get next */
2543 #define LAST break /* end of loop, go closing */
2545 module_connection();
2548 while ((c1 = (*i_getc)(f)) != EOF) {
2549 #ifdef INPUT_CODE_FIX
2556 /* in case of 8th bit is on */
2557 if (!estab_f&&!mime_decode_mode) {
2558 /* in case of not established yet */
2559 /* It is still ambiguous */
2560 if (h_conv(f, c2, c1)==EOF)
2566 /* in case of already established */
2568 /* ignore bogus code */
2574 /* second byte, 7 bit code */
2575 /* it might be kanji shifted */
2576 if ((c1 == DEL) || (c1 <= SPACE)) {
2577 /* ignore bogus first code */
2584 #ifdef UTF8_INPUT_ENABLE
2585 if (iconv == w_iconv16) {
2586 if (input_endian == ENDIAN_BIG) {
2588 if ((c1 = (*i_getc)(f)) != EOF) {
2589 if (0xD8 <= c2 && c2 <= 0xDB) {
2590 if ((c0 = (*i_getc)(f)) != EOF) {
2592 if ((c3 = (*i_getc)(f)) != EOF) {
2599 if ((c2 = (*i_getc)(f)) != EOF) {
2600 if (0xD8 <= c2 && c2 <= 0xDB) {
2601 if ((c3 = (*i_getc)(f)) != EOF) {
2602 if ((c0 = (*i_getc)(f)) != EOF) {
2611 } else if(iconv == w_iconv32){
2613 if((c2 = (*i_getc)(f)) != EOF &&
2614 (c1 = (*i_getc)(f)) != EOF &&
2615 (c0 = (*i_getc)(f)) != EOF){
2616 switch(input_endian){
2618 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2621 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2624 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2627 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2637 #ifdef NUMCHAR_OPTION
2638 if (is_unicode_capsule(c1)){
2644 if (!estab_f && !iso8859_f) {
2645 /* not established yet */
2648 } else { /* estab_f==TRUE */
2653 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2654 /* SJIS X0201 Case... */
2655 if(iso2022jp_f && x0201_f==NO_X0201) {
2656 (*oconv)(GETA1, GETA2);
2663 } else if (c1==SSO && iconv != s_iconv) {
2664 /* EUC X0201 Case */
2665 c1 = (*i_getc)(f); /* skip SSO */
2667 if (SSP<=c1 && c1<0xe0) {
2668 if(iso2022jp_f && x0201_f==NO_X0201) {
2669 (*oconv)(GETA1, GETA2);
2676 } else { /* bogus code, skip SSO and one byte */
2680 /* already established */
2685 } else if ((c1 > SPACE) && (c1 != DEL)) {
2686 /* in case of Roman characters */
2688 /* output 1 shifted byte */
2692 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2693 /* output 1 shifted byte */
2694 if(iso2022jp_f && x0201_f==NO_X0201) {
2695 (*oconv)(GETA1, GETA2);
2702 /* look like bogus code */
2705 } else if (input_mode == X0208 || input_mode == X0212 ||
2706 input_mode == X0213_1 || input_mode == X0213_2) {
2707 /* in case of Kanji shifted */
2710 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2711 /* Check MIME code */
2712 if ((c1 = (*i_getc)(f)) == EOF) {
2715 } else if (c1 == '?') {
2716 /* =? is mime conversion start sequence */
2717 if(mime_f == STRICT_MIME) {
2718 /* check in real detail */
2719 if (mime_begin_strict(f) == EOF)
2723 } else if (mime_begin(f) == EOF)
2733 /* normal ASCII code */
2736 } else if (!is_8bit && c1 == SI) {
2739 } else if (!is_8bit && c1 == SO) {
2742 } else if (!is_8bit && c1 == ESC ) {
2743 if ((c1 = (*i_getc)(f)) == EOF) {
2744 /* (*oconv)(0, ESC); don't send bogus code */
2746 } else if (c1 == '$') {
2747 if ((c1 = (*i_getc)(f)) == EOF) {
2749 (*oconv)(0, ESC); don't send bogus code
2750 (*oconv)(0, '$'); */
2752 } else if (c1 == '@'|| c1 == 'B') {
2753 /* This is kanji introduction */
2756 set_input_codename("ISO-2022-JP");
2758 debug(input_codename);
2761 } else if (c1 == '(') {
2762 if ((c1 = (*i_getc)(f)) == EOF) {
2763 /* don't send bogus code
2769 } else if (c1 == '@'|| c1 == 'B') {
2770 /* This is kanji introduction */
2775 } else if (c1 == 'D'){
2779 #endif /* X0212_ENABLE */
2780 } else if (c1 == (X0213_1&0x7F)){
2781 input_mode = X0213_1;
2784 } else if (c1 == (X0213_2&0x7F)){
2785 input_mode = X0213_2;
2789 /* could be some special code */
2796 } else if (broken_f&0x2) {
2797 /* accept any ESC-(-x as broken code ... */
2807 } else if (c1 == '(') {
2808 if ((c1 = (*i_getc)(f)) == EOF) {
2809 /* don't send bogus code
2811 (*oconv)(0, '('); */
2815 /* This is X0201 kana introduction */
2816 input_mode = X0201; shift_mode = X0201;
2818 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2819 /* This is X0208 kanji introduction */
2820 input_mode = ASCII; shift_mode = FALSE;
2822 } else if (broken_f&0x2) {
2823 input_mode = ASCII; shift_mode = FALSE;
2828 /* maintain various input_mode here */
2832 } else if ( c1 == 'N' || c1 == 'n' ){
2834 c3 = (*i_getc)(f); /* skip SS2 */
2835 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2850 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2851 input_mode = ASCII; set_iconv(FALSE, 0);
2853 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2854 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2862 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2863 if ((c1=(*i_getc)(f))!=EOF) {
2867 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2885 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2888 if ((c0 = (*i_getc)(f)) != EOF) {
2891 if ((c3 = (*i_getc)(f)) != EOF) {
2893 (*iconv)(c2, c1, c0|c3);
2898 /* 3 bytes EUC or UTF-8 */
2899 if ((c0 = (*i_getc)(f)) != EOF) {
2901 (*iconv)(c2, c1, c0);
2908 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2912 (*oconv)(PREFIX_EUCG3 | c2, c1);
2914 #endif /* X0212_ENABLE */
2916 (*oconv)(PREFIX_EUCG3 | c2, c1);
2919 (*oconv)(input_mode, c1); /* other special case */
2925 /* goto next_word */
2929 (*iconv)(EOF, 0, 0);
2930 if (!is_inputcode_set)
2933 struct input_code *p = input_code_list;
2934 struct input_code *result = p;
2936 if (p->score < result->score) result = p;
2939 set_input_codename(result->name);
/*
 * h_conv: hold-buffer conversion used while the input code is still
 * undetermined.
 *
 *   f      - input stream, read through (*i_getc)
 *   c2, c1 - bytes already read by the caller
 *
 * Input bytes are collected with push_hold_buf() until EOF, until the code
 * is established, or until the buffer is full.  If nothing was established,
 * the candidate with the smallest score is installed with
 * set_iconv(FALSE, ...).  The held bytes are then replayed through (*iconv);
 * when a multi-byte character needs more bytes than are held, they are read
 * from f.
 *
 * NOTE(review): this listing omits many lines of the function (including
 * its return type line, some comment delimiters and the return statements),
 * so details must be confirmed against the full source.
 */
2946 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2948 nkf_char ret, c3, c0;
2952 /** it must NOT be in the kanji shifted sequence */
2953 /** it must NOT be written in JIS7 */
2954 /** and it must be after 2 byte 8bit code */
2960 while ((c1 = (*i_getc)(f)) != EOF) {
2966 if (push_hold_buf(c1) == EOF || estab_f){
2972 struct input_code *p = input_code_list;
2973 struct input_code *result = p;
2978 if (p->score < result->score){
2983 set_iconv(FALSE, result->iconv_func);
2988 ** 1) EOF is detected, or
2989 ** 2) Code is established, or
2990 ** 3) Buffer is FULL (but last word is pushed)
2992 ** in 1) and 3) cases, we continue to use
2993 ** Kanji codes by oconv and leave estab_f unchanged.
2998 while (hold_index < hold_count){
2999 c2 = hold_buf[hold_index++];
3001 #ifdef NUMCHAR_OPTION
3002 || is_unicode_capsule(c2)
3007 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3008 (*iconv)(X0201, c2, 0);
3011 if (hold_index < hold_count){
3012 c1 = hold_buf[hold_index++];
3022 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3025 if (hold_index < hold_count){
3026 c0 = hold_buf[hold_index++];
3027 } else if ((c0 = (*i_getc)(f)) == EOF) {
3033 if (hold_index < hold_count){
3034 c3 = hold_buf[hold_index++];
3035 } else if ((c3 = (*i_getc)(f)) == EOF) {
3040 (*iconv)(c2, c1, c0|c3);
3045 /* 3 bytes EUC or UTF-8 */
3046 if (hold_index < hold_count){
3047 c0 = hold_buf[hold_index++];
3048 } else if ((c0 = (*i_getc)(f)) == EOF) {
3054 (*iconv)(c2, c1, c0);
3057 if (c0 == EOF) break;
/*
 * push_hold_buf: append one byte to hold_buf.
 *
 * The byte is stored as (unsigned char)c2.  Returns EOF once hold_count has
 * reached HOLD_SIZE*2 after the store, otherwise the new hold_count.
 *
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this listing.
 */
3062 nkf_char push_hold_buf(nkf_char c2)
3064 if (hold_count >= HOLD_SIZE*2)
3066 hold_buf[hold_count++] = (unsigned char)c2;
3067 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert a Shift_JIS byte pair (c2, c1) to the internal two-byte
 * form used by the EUC side of this file.
 *
 *   c2, c1 - first and second input byte
 *   p2, p1 - output locations; NOTE(review): the stores through p2/p1 and
 *            the return statements are not visible in this listing.  Callers
 *            in this file treat a zero return as success.
 *
 * Visible special cases:
 *   - with cp51932_f, IBM extension bytes are looked up in shiftjis_cp932;
 *   - without x0213_f, IBM extension bytes are looked up in shiftjis_x0212
 *     and tagged with PREFIX_EUCG3;
 *   - with x0213_f and c2 >= 0xF0, the row is mapped into PREFIX_EUCG3 space.
 */
3070 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3072 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3075 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3076 #ifdef SHIFTJIS_CP932
3077 if (cp51932_f && is_ibmext_in_sjis(c2)){
3079 extern const unsigned short shiftjis_cp932[3][189];
3081 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3087 #endif /* SHIFTJIS_CP932 */
3089 if (!x0213_f && is_ibmext_in_sjis(c2)){
3091 extern const unsigned short shiftjis_x0212[3][189];
3093 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3096 c2 = PREFIX_EUCG3 | (val >> 8);
3109 if(x0213_f && c2 >= 0xF0){
3110 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3111 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3112 }else{ /* 78<=k<=94 */
3113 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3114 if (0x9E < c1) c2++;
/* ordinary case: each first byte covers two rows; a second byte above 0x9E selects the second row */
3117 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3118 if (0x9E < c1) c2++;
/* shift the second byte down; bytes above DEL use a different offset than those below */
3121 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3128 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter for Shift_JIS.  Ordinary byte pairs are converted
 * with s2e_conv(); a non-zero result from s2e_conv() is returned to the
 * caller unchanged.
 *
 * NOTE(review): the first branch, the handling of EOF / 0 / control bytes
 * and the final output call are not visible in this listing.
 */
3135 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3139 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3142 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3143 if (ret) return ret;
/*
 * e_iconv: input converter for EUC input.
 *
 * Visible branches:
 *   - c2 == 0x8f: the two following bytes are packed into c2 as
 *     (c2 << 8) | (c1 & 0x7f); under SHIFTJIS_CP932 the result is
 *     round-tripped through e2s_conv() and s2e_conv();
 *   - c2 == SSO;
 *   - c2 is EOF, 0 or below SPACE.
 *
 * NOTE(review): the body of every branch and the return statements are only
 * partly visible in this listing.
 */
3149 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3154 }else if (c2 == 0x8f){
3158 c2 = (c2 << 8) | (c1 & 0x7f);
3160 #ifdef SHIFTJIS_CP932
3163 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3164 s2e_conv(s2, s1, &c2, &c1);
3171 #endif /* SHIFTJIS_CP932 */
3172 #endif /* X0212_ENABLE */
3173 } else if (c2 == SSO){
3176 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3186 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert a UTF-8 byte sequence (c2, c1, c0) to the internal
 * two-byte form written to *p2 / *p1.
 *
 * Lead bytes 0xc0-0xef go through unicode_to_jis_common().  Under
 * NUMCHAR_OPTION, *p1 can instead receive the code point tagged with
 * CLASS_UNICODE.
 *
 * NOTE(review): the first branch, the condition for the CLASS_UNICODE
 * fallback and the return statement are not visible in this listing.
 */
3187 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3194 }else if (0xc0 <= c2 && c2 <= 0xef) {
3195 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3196 #ifdef NUMCHAR_OPTION
3199 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input converter for UTF-8.
 *
 *   c2 - lead byte; c1 - second byte; c0 - third byte, or for four-byte
 *        sequences the third and fourth bytes packed into one value (hence
 *        the 0xc0c0 / 0x8080 masks below).
 *
 * The lead byte is classified through w_iconv_utf8_1st_byte[c2 - 0xC0] and
 * the following bytes are range-checked per class.  When c0 is still 0 the
 * function returns -1 (three-byte classes) or -2 (four-byte classes).
 * NOTE(review): the callers' case labels are not visible here; those return
 * values presumably ask for one / two more bytes -- confirm.
 *
 * After validation, four-byte sequences are turned into a code point tagged
 * with CLASS_UNICODE; everything else goes through w2e_conv().
 */
3207 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3210 static const int w_iconv_utf8_1st_byte[] =
3212 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3213 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3214 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3215 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3217 if (c2 < 0 || 0xff < c2) {
3218 }else if (c2 == 0) { /* 0 : 1 byte*/
3220 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3223 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3225 if (c1 < 0x80 || 0xBF < c1) return 0;
3228 if (c0 == 0) return -1;
3229 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3234 if (c0 == 0) return -1;
3235 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3239 if (c0 == 0) return -1;
3240 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3244 if (c0 == 0) return -2;
3245 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3249 if (c0 == 0) return -2;
3250 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3254 if (c0 == 0) return -2;
3255 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3263 if (c2 == 0 || c2 == EOF){
3264 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3265 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3268 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3277 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point in val as a UTF-8 byte sequence.
 *   *p2 receives the lead byte, *p1 the second byte.
 *   *p0 receives the third byte; for a 4-byte sequence it receives the
 *   third and fourth bytes packed as (third << 8) | fourth, which is the
 *   layout ww16_conv() unpacks with (c0 & 0x3f00) >> 2.
 */
3278 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3285 }else if (val < 0x800){
3286 *p2 = 0xc0 | (val >> 6);
3287 *p1 = 0x80 | (val & 0x3f);
3289 } else if (val <= NKF_INT32_C(0xFFFF)) {
3290 *p2 = 0xe0 | (val >> 12);
3291 *p1 = 0x80 | ((val >> 6) & 0x3f);
3292 *p0 = 0x80 | (val & 0x3f);
3293 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/* 4-byte lead byte is 11110xxx and carries bits 18..20 of the code point;
   the previous 0xe0 | (val >> 16) produced a 3-byte lead byte here. */
3294 *p2 = 0xf0 | (val >> 18);
3295 *p1 = 0x80 | ((val >> 12) & 0x3f);
3296 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3305 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: assemble a code point value from UTF-8 bytes.
 * c2 is the lead byte and c1 the second byte.  For a 4-byte sequence c0
 * holds the third byte in bits 8..13 and the fourth byte in the low bits.
 */
3306 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3311 } else if (c2 >= 0xf0){
3312 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3313 val = (c2 & 0x0f) << 18;
3314 val |= (c1 & 0x3f) << 12;
3315 val |= (c0 & 0x3f00) >> 2; /* third byte sits in bits 8..13 of c0 */
3317 }else if (c2 >= 0xe0){
3318 val = (c2 & 0x0f) << 12;
3319 val |= (c1 & 0x3f) << 6;
3321 }else if (c2 >= 0xc0){
3322 val = (c2 & 0x1f) << 6;
3330 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3332 nkf_char c2, c1, c0;
3339 w16w_conv(val, &c2, &c1, &c0);
3340 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3341 #ifdef NUMCHAR_OPTION
3344 *p1 = CLASS_UNICODE | val;
3353 #ifdef UTF8_INPUT_ENABLE
3354 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3357 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3360 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3361 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3363 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3365 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3370 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3371 if (ret) return ret;
3376 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3380 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3381 } else if (is_unicode_bmp(c1)) {
3382 ret = w16e_conv(c1, &c2, &c1);
3385 c1 = CLASS_UNICODE | c1;
3387 if (ret) return ret;
3392 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3395 extern const unsigned short *const utf8_to_euc_2bytes[];
3396 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3397 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3398 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3399 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3400 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3402 const unsigned short *const *pp;
3403 const unsigned short *const *const *ppp;
3404 static const int no_best_fit_chars_table_C2[] =
3405 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3406 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3407 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3408 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3409 static const int no_best_fit_chars_table_C2_ms[] =
3410 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3411 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3412 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3413 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3414 static const int no_best_fit_chars_table_932_C2[] =
3415 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3416 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3417 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3418 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3419 static const int no_best_fit_chars_table_932_C3[] =
3420 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3421 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3422 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3423 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3429 }else if(c2 < 0xe0){
3430 if(no_best_fit_chars_f){
3431 if(ms_ucs_map_f == UCS_MAP_CP932){
3434 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3437 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3440 }else if(cp51932_f){
3443 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3446 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3449 }else if(ms_ucs_map_f == UCS_MAP_MS){
3450 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3454 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3455 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3457 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3458 }else if(c0 < 0xF0){
3459 if(no_best_fit_chars_f){
3460 if(ms_ucs_map_f == UCS_MAP_CP932){
3461 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3462 }else if(ms_ucs_map_f == UCS_MAP_MS){
3467 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3470 if(c0 == 0x92) return 1;
3475 if(c1 == 0x80 || c0 == 0x9C) return 1;
3483 if(c0 == 0x95) return 1;
3486 if(c0 == 0xA5) return 1;
3493 if(c0 == 0x8D) return 1;
3496 if(c0 == 0x9E && cp51932_f) return 1;
3499 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3507 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3508 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3510 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3512 #ifdef SHIFTJIS_CP932
3513 if (!ret && cp51932_f && is_eucg3(*p2)) {
3515 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3516 s2e_conv(s2, s1, p2, p1);
3525 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3528 const unsigned short *p;
3531 if (pp == 0) return 1;
3534 if (c1 < 0 || psize <= c1) return 1;
3536 if (p == 0) return 1;
3539 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3541 if (val == 0) return 1;
3542 if (no_cp932ext_f && (
3543 (val>>8) == 0x2D || /* NEC special characters */
3544 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3552 if (c2 == SO) c2 = X0201;
3559 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3561 const char *hex = "0123456789ABCDEF";
3567 (*f)(0, hex[(c>>shift)&0xF]);
3577 void encode_fallback_html(nkf_char c)
3582 if(c >= NKF_INT32_C(1000000))
3583 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3584 if(c >= NKF_INT32_C(100000))
3585 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3587 (*oconv)(0, 0x30+(c/10000 )%10);
3589 (*oconv)(0, 0x30+(c/1000 )%10);
3591 (*oconv)(0, 0x30+(c/100 )%10);
3593 (*oconv)(0, 0x30+(c/10 )%10);
3595 (*oconv)(0, 0x30+ c %10);
3600 void encode_fallback_xml(nkf_char c)
3605 nkf_each_char_to_hex(oconv, c);
3610 void encode_fallback_java(nkf_char c)
3612 const char *hex = "0123456789ABCDEF";
3615 if(!is_unicode_bmp(c)){
3619 (*oconv)(0, hex[(c>>20)&0xF]);
3620 (*oconv)(0, hex[(c>>16)&0xF]);
3624 (*oconv)(0, hex[(c>>12)&0xF]);
3625 (*oconv)(0, hex[(c>> 8)&0xF]);
3626 (*oconv)(0, hex[(c>> 4)&0xF]);
3627 (*oconv)(0, hex[ c &0xF]);
3631 void encode_fallback_perl(nkf_char c)
3636 nkf_each_char_to_hex(oconv, c);
3641 void encode_fallback_subchar(nkf_char c)
3643 c = unicode_subchar;
3644 (*oconv)((c>>8)&0xFF, c&0xFF);
3649 #ifdef UTF8_OUTPUT_ENABLE
3650 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3653 extern const unsigned short euc_to_utf8_1byte[];
3654 extern const unsigned short *const euc_to_utf8_2bytes[];
3655 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3656 extern const unsigned short *const x0212_to_utf8_2bytes[];
3658 const unsigned short *p;
3661 p = euc_to_utf8_1byte;
3663 } else if (is_eucg3(c2)){
3664 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3667 c2 = (c2&0x7f) - 0x21;
3668 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3669 p = x0212_to_utf8_2bytes[c2];
3675 c2 = (c2&0x7f) - 0x21;
3676 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3677 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3682 c1 = (c1 & 0x7f) - 0x21;
3683 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3688 void w_oconv(nkf_char c2, nkf_char c1)
3694 output_bom_f = FALSE;
3705 #ifdef NUMCHAR_OPTION
3706 if (c2 == 0 && is_unicode_capsule(c1)){
3707 val = c1 & VALUE_MASK;
3710 }else if (val < 0x800){
3711 (*o_putc)(0xC0 | (val >> 6));
3712 (*o_putc)(0x80 | (val & 0x3f));
3713 } else if (val <= NKF_INT32_C(0xFFFF)) {
3714 (*o_putc)(0xE0 | (val >> 12));
3715 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3716 (*o_putc)(0x80 | (val & 0x3f));
3717 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3718 (*o_putc)(0xF0 | ( val>>18));
3719 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3720 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3721 (*o_putc)(0x80 | ( val & 0x3f));
3728 output_mode = ASCII;
3730 } else if (c2 == ISO8859_1) {
3731 output_mode = ISO8859_1;
3732 (*o_putc)(c1 | 0x080);
3735 val = e2w_conv(c2, c1);
3737 w16w_conv(val, &c2, &c1, &c0);
3741 if (c0) (*o_putc)(c0);
3747 void w_oconv16(nkf_char c2, nkf_char c1)
3750 output_bom_f = FALSE;
3751 if (output_endian == ENDIAN_LITTLE){
3752 (*o_putc)((unsigned char)'\377');
3756 (*o_putc)((unsigned char)'\377');
3765 if (c2 == ISO8859_1) {
3768 #ifdef NUMCHAR_OPTION
3769 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3770 if (is_unicode_bmp(c1)) {
3771 c2 = (c1 >> 8) & 0xff;
3775 if (c1 <= UNICODE_MAX) {
3776 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3777 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3778 if (output_endian == ENDIAN_LITTLE){
3779 (*o_putc)(c2 & 0xff);
3780 (*o_putc)((c2 >> 8) & 0xff);
3781 (*o_putc)(c1 & 0xff);
3782 (*o_putc)((c1 >> 8) & 0xff);
3784 (*o_putc)((c2 >> 8) & 0xff);
3785 (*o_putc)(c2 & 0xff);
3786 (*o_putc)((c1 >> 8) & 0xff);
3787 (*o_putc)(c1 & 0xff);
3794 nkf_char val = e2w_conv(c2, c1);
3795 c2 = (val >> 8) & 0xff;
3798 if (output_endian == ENDIAN_LITTLE){
3807 void w_oconv32(nkf_char c2, nkf_char c1)
3810 output_bom_f = FALSE;
3811 if (output_endian == ENDIAN_LITTLE){
3812 (*o_putc)((unsigned char)'\377');
3820 (*o_putc)((unsigned char)'\377');
3829 if (c2 == ISO8859_1) {
3831 #ifdef NUMCHAR_OPTION
3832 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3836 c1 = e2w_conv(c2, c1);
3838 if (output_endian == ENDIAN_LITTLE){
3839 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3840 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3841 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3845 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3846 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3847 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3852 void e_oconv(nkf_char c2, nkf_char c1)
3854 #ifdef NUMCHAR_OPTION
3855 if (c2 == 0 && is_unicode_capsule(c1)){
3856 w16e_conv(c1, &c2, &c1);
3857 if (c2 == 0 && is_unicode_capsule(c1)){
3858 if(encode_fallback)(*encode_fallback)(c1);
3866 } else if (c2 == 0) {
3867 output_mode = ASCII;
3869 } else if (c2 == X0201) {
3870 output_mode = JAPANESE_EUC;
3871 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3872 } else if (c2 == ISO8859_1) {
3873 output_mode = ISO8859_1;
3874 (*o_putc)(c1 | 0x080);
3876 } else if (is_eucg3(c2)){
3877 output_mode = JAPANESE_EUC;
3878 #ifdef SHIFTJIS_CP932
3881 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3882 s2e_conv(s2, s1, &c2, &c1);
3887 output_mode = ASCII;
3889 }else if (is_eucg3(c2)){
3892 (*o_putc)((c2 & 0x7f) | 0x080);
3893 (*o_putc)(c1 | 0x080);
3896 (*o_putc)((c2 & 0x7f) | 0x080);
3897 (*o_putc)(c1 | 0x080);
3901 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3902 set_iconv(FALSE, 0);
3903 return; /* too late to rescue this char */
3905 output_mode = JAPANESE_EUC;
3906 (*o_putc)(c2 | 0x080);
3907 (*o_putc)(c1 | 0x080);
3912 nkf_char x0212_shift(nkf_char c)
3917 if (0x75 <= c && c <= 0x7f){
3918 ret = c + (0x109 - 0x75);
3921 if (0x75 <= c && c <= 0x7f){
3922 ret = c + (0x113 - 0x75);
3929 nkf_char x0212_unshift(nkf_char c)
3932 if (0x7f <= c && c <= 0x88){
3933 ret = c + (0x75 - 0x7f);
3934 }else if (0x89 <= c && c <= 0x92){
3935 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
3939 #endif /* X0212_ENABLE */
3941 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3947 if((0x21 <= ndx && ndx <= 0x2F)){
3948 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3949 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3951 }else if(0x6E <= ndx && ndx <= 0x7E){
3952 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3953 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3959 else if(nkf_isgraph(ndx)){
3961 const unsigned short *ptr;
3963 extern const unsigned short *const x0212_shiftjis[];
3965 ptr = x0212_shiftjis[ndx - 0x21];
3967 val = ptr[(c1 & 0x7f) - 0x21];
3976 c2 = x0212_shift(c2);
3978 #endif /* X0212_ENABLE */
3980 if(0x7F < c2) return 1;
3981 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3982 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3986 void s_oconv(nkf_char c2, nkf_char c1)
3988 #ifdef NUMCHAR_OPTION
3989 if (c2 == 0 && is_unicode_capsule(c1)){
3990 w16e_conv(c1, &c2, &c1);
3991 if (c2 == 0 && is_unicode_capsule(c1)){
3992 if(encode_fallback)(*encode_fallback)(c1);
4000 } else if (c2 == 0) {
4001 output_mode = ASCII;
4003 } else if (c2 == X0201) {
4004 output_mode = SHIFT_JIS;
4006 } else if (c2 == ISO8859_1) {
4007 output_mode = ISO8859_1;
4008 (*o_putc)(c1 | 0x080);
4010 } else if (is_eucg3(c2)){
4011 output_mode = SHIFT_JIS;
4012 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4018 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4019 set_iconv(FALSE, 0);
4020 return; /* too late to rescue this char */
4022 output_mode = SHIFT_JIS;
4023 e2s_conv(c2, c1, &c2, &c1);
4025 #ifdef SHIFTJIS_CP932
4027 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4029 extern const unsigned short cp932inv[2][189];
4031 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4037 #endif /* SHIFTJIS_CP932 */
4040 if (prefix_table[(unsigned char)c1]){
4041 (*o_putc)(prefix_table[(unsigned char)c1]);
4047 void j_oconv(nkf_char c2, nkf_char c1)
4049 #ifdef NUMCHAR_OPTION
4050 if (c2 == 0 && is_unicode_capsule(c1)){
4051 w16e_conv(c1, &c2, &c1);
4052 if (c2 == 0 && is_unicode_capsule(c1)){
4053 if(encode_fallback)(*encode_fallback)(c1);
4059 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4062 (*o_putc)(ascii_intro);
4063 output_mode = ASCII;
4067 } else if (is_eucg3(c2)){
4069 if(output_mode!=X0213_2){
4070 output_mode = X0213_2;
4074 (*o_putc)(X0213_2&0x7F);
4077 if(output_mode!=X0212){
4078 output_mode = X0212;
4082 (*o_putc)(X0212&0x7F);
4085 (*o_putc)(c2 & 0x7f);
4088 } else if (c2==X0201) {
4089 if (output_mode!=X0201) {
4090 output_mode = X0201;
4096 } else if (c2==ISO8859_1) {
4097 /* iso8859 introduction, or 8th bit on */
4098 /* Can we convert in 7bit form using ESC-'-'-A ?
4100 output_mode = ISO8859_1;
4102 } else if (c2 == 0) {
4103 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4106 (*o_putc)(ascii_intro);
4107 output_mode = ASCII;
4111 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4113 if (output_mode!=X0213_1) {
4114 output_mode = X0213_1;
4118 (*o_putc)(X0213_1&0x7F);
4120 }else if (output_mode != X0208) {
4121 output_mode = X0208;
4124 (*o_putc)(kanji_intro);
4131 void base64_conv(nkf_char c2, nkf_char c1)
4133 mime_prechar(c2, c1);
4134 (*o_base64conv)(c2,c1);
4138 static nkf_char broken_buf[3];
4139 static int broken_counter = 0;
4140 static int broken_last = 0;
4141 nkf_char broken_getc(FILE *f)
4145 if (broken_counter>0) {
4146 return broken_buf[--broken_counter];
4149 if (c=='$' && broken_last != ESC
4150 && (input_mode==ASCII || input_mode==X0201)) {
4153 if (c1=='@'|| c1=='B') {
4154 broken_buf[0]=c1; broken_buf[1]=c;
4161 } else if (c=='(' && broken_last != ESC
4162 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4165 if (c1=='J'|| c1=='B') {
4166 broken_buf[0]=c1; broken_buf[1]=c;
4179 nkf_char broken_ungetc(nkf_char c, FILE *f)
4181 if (broken_counter<2)
4182 broken_buf[broken_counter++]=c;
4186 static nkf_char prev_cr = 0;
4188 void cr_conv(nkf_char c2, nkf_char c1)
4192 if (! (c2==0&&c1==NL) ) {
4198 } else if (c1=='\r') {
4200 } else if (c1=='\n') {
4201 if (crmode_f==CRLF) {
4202 (*o_crconv)(0,'\r');
4203 } else if (crmode_f==CR) {
4204 (*o_crconv)(0,'\r');
4208 } else if (c1!='\032' || crmode_f!=NL){
4214 Return value of fold_conv()
4216 \n add newline and output char
4217 \r add newline and output nothing
4220 1 (or else) normal output
4222 fold state in prev (previous character)
4224 >0x80 Japanese (X0208/X0201)
4229 This fold algorithm does not preserve leading spaces in a line.
4230 This is the main difference from fmt.
4233 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding filter stage.
 * Classifies the incoming character (c2,c1) into fold_state, updating the
 * running column f_line and the previous-character marker f_prev, then
 * dispatches on fold_state in the terminator switch at the end.
 */
4235 void fold_conv(nkf_char c2, nkf_char c1)
4238 nkf_char fold_state;
4240 if (c1== '\r' && !fold_preserve_f) {
4241 fold_state=0; /* ignore cr */
4242 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4244 fold_state=0; /* ignore cr */
4245 } else if (c1== BS) {
4246 if (f_line>0) f_line--;
4248 } else if (c2==EOF && f_line != 0) { /* close open last line */
4250 } else if ((c1=='\n' && !fold_preserve_f)
4251 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4252 && fold_preserve_f)) {
4254 if (fold_preserve_f) {
4258 } else if ((f_prev == c1 && !fold_preserve_f)
4259 || (f_prev == '\n' && fold_preserve_f)
4260 ) { /* duplicate newline */
4263 fold_state = '\n'; /* output two newlines */
4269 if (f_prev&0x80) { /* Japanese? */
4271 fold_state = 0; /* ignore given single newline */
4272 } else if (f_prev==' ') {
4276 if (++f_line<=fold_len)
4280 fold_state = '\r'; /* fold and output nothing */
4284 } else if (c1=='\f') {
4287 fold_state = '\n'; /* output newline and clear */
4288 } else if ( (c2==0 && c1==' ')||
4289 (c2==0 && c1=='\t')||
4290 (c2=='!'&& c1=='!')) {
4291 /* X0208 space (kankaku) or ASCII space */
4292 if (f_prev == ' ') {
4293 fold_state = 0; /* remove duplicate spaces */
4296 if (++f_line<=fold_len)
4297 fold_state = ' '; /* output ASCII space only */
4299 f_prev = ' '; f_line = 0;
4300 fold_state = '\r'; /* fold and output nothing */
4304 prev0 = f_prev; /* we still need this one... , but almost done */
4306 if (c2 || c2==X0201)
4307 f_prev |= 0x80; /* this is Japanese */
4308 f_line += char_size(c2,c1);
4309 if (f_line<=fold_len) { /* normal case */
4312 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspensions */
4313 f_line = char_size(c2,c1);
4314 fold_state = '\n'; /* We can't wait, do fold now */
4315 } else if (c2==X0201) {
4316 /* simple kinsoku rules: fold_state 1 means no folding */
4317 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth dakuten (voiced sound mark) */
4318 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth handakuten (semi-voiced sound mark) */
4319 else if (c1==(0xa4&0x7f)) fold_state = 1; /* halfwidth ideographic comma (JIS X 0201 0xA4) */
4320 else if (c1==(0xa3&0x7f)) fold_state = 1; /* halfwidth closing corner bracket (JIS X 0201 0xA3) */
4321 else if (c1==(0xa1&0x7f)) fold_state = 1; /* halfwidth ideographic full stop (JIS X 0201 0xA1) */
4322 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4323 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4325 fold_state = '\n';/* add one new f_line before this character */
4328 fold_state = '\n';/* add one new f_line before this character */
4331 /* kinsoku point in ASCII */
4332 if ( c1==')'|| /* { [ ( */
4343 /* just after special */
4344 } else if (!is_alnum(prev0)) {
4345 f_line = char_size(c2,c1);
4347 } else if ((prev0==' ') || /* ignored new f_line */
4348 (prev0=='\n')|| /* ignored new f_line */
4349 (prev0&0x80)) { /* X0208 - ASCII */
4350 f_line = char_size(c2,c1);
4351 fold_state = '\n';/* add one new f_line before this character */
4353 fold_state = 1; /* default no fold in ASCII */
4357 if (c1=='"') fold_state = 1; /* ideographic comma */
4358 else if (c1=='#') fold_state = 1; /* ideographic full stop */
4359 else if (c1=='W') fold_state = 1; /* closing corner bracket */
4360 else if (c1=='K') fold_state = 1; /* fullwidth closing parenthesis */
4361 else if (c1=='$') fold_state = 1; /* fullwidth comma */
4362 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
4363 else if (c1=='\'') fold_state = 1; /* NOTE(review): original comment bytes were garbled; presumably fullwidth colon -- confirm */
4364 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
4365 else if (c1==')') fold_state = 1; /* fullwidth question mark */
4366 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
4367 else if (c1=='+') fold_state = 1; /* dakuten (voiced sound mark) */
4368 else if (c1==',') fold_state = 1; /* handakuten (semi-voiced sound mark) */
4369 /* default no fold in kinsoku */
4372 f_line = char_size(c2,c1);
4373 /* add one new f_line before this character */
4376 f_line = char_size(c2,c1);
4378 /* add one new f_line before this character */
4383 /* terminator process */
4384 switch(fold_state) {
4403 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: conversion filter stage that forwards characters to o_zconv.
 * When x0201_f is set, an X0201 character is held in z_prev1/z_prev2 so
 * that a following dakuten / handakuten can be combined with it.
 * NOTE(review): dv / ev / cv look like the dakuten / handakuten / plain
 * conversion tables indexed by (c - SPACE) * 2 -- confirm against their
 * definitions.
 * The entity switch emits HTML character references for the four
 * markup-significant characters.
 */
4405 void z_conv(nkf_char c2, nkf_char c1)
4408 /* if (c2) c1 &= 0x7f; assertion */
4410 if (x0201_f && z_prev2==X0201) { /* X0201 */
4411 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4413 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4415 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4417 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4421 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4430 if (x0201_f && c2==X0201) {
4431 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4432 /* wait for dakuten or handakuten */
4433 z_prev1 = c1; z_prev2 = c2;
4436 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4441 /* JISX0208 Alphabet */
4442 if (alpha_f && c2 == 0x23 ) {
4444 } else if (alpha_f && c2 == 0x21 ) {
4445 /* JISX0208 symbols (kigou) */
4450 } else if (alpha_f&0x4) {
4455 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
/* The literals below had been HTML-unescaped (">" "<" """ "&"), which made
   the quote case an invalid string literal and the others no-ops. */
4461 case '>': entity = "&gt;"; break;
4462 case '<': entity = "&lt;"; break;
4463 case '\"': entity = "&quot;"; break;
4464 case '&': entity = "&amp;"; break;
4467 while (*entity) (*o_zconv)(0, *entity++);
4477 #define rot13(c) ( \
4479 (c <= 'M') ? (c + 13): \
4480 (c <= 'Z') ? (c - 13): \
4482 (c <= 'm') ? (c + 13): \
4483 (c <= 'z') ? (c - 13): \
4487 #define rot47(c) ( \
4489 ( c <= 'O' ) ? (c + 47) : \
4490 ( c <= '~' ) ? (c - 47) : \
4494 void rot_conv(nkf_char c2, nkf_char c1)
4496 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4502 (*o_rot_conv)(c2,c1);
4505 void hira_conv(nkf_char c2, nkf_char c1)
4509 if (0x20 < c1 && c1 < 0x74) {
4511 (*o_hira_conv)(c2,c1);
4513 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4515 c1 = CLASS_UNICODE | 0x3094;
4516 (*o_hira_conv)(c2,c1);
4519 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4521 (*o_hira_conv)(c2,c1);
4526 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4529 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4531 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4535 (*o_hira_conv)(c2,c1);
4539 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4541 static const nkf_char range[RANGE_NUM_MAX][2] = {
4562 nkf_char start, end, c;
4564 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4568 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4573 for (i = 0; i < RANGE_NUM_MAX; i++) {
4574 start = range[i][0];
4577 if (c >= start && c <= end) {
4582 (*o_iso2022jp_check_conv)(c2,c1);
4586 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4588 const unsigned char *mime_pattern[] = {
4589 (const unsigned char *)"\075?EUC-JP?B?",
4590 (const unsigned char *)"\075?SHIFT_JIS?B?",
4591 (const unsigned char *)"\075?ISO-8859-1?Q?",
4592 (const unsigned char *)"\075?ISO-8859-1?B?",
4593 (const unsigned char *)"\075?ISO-2022-JP?B?",
4594 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4595 #if defined(UTF8_INPUT_ENABLE)
4596 (const unsigned char *)"\075?UTF-8?B?",
4597 (const unsigned char *)"\075?UTF-8?Q?",
4599 (const unsigned char *)"\075?US-ASCII?Q?",
4604 /* Marker used to raise the priority of the matching encoding */
4605 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4606 e_iconv, s_iconv, 0, 0, 0, 0,
4607 #if defined(UTF8_INPUT_ENABLE)
4613 const nkf_char mime_encode[] = {
4614 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4615 #if defined(UTF8_INPUT_ENABLE)
4622 const nkf_char mime_encode_method[] = {
4623 'B', 'B','Q', 'B', 'B', 'Q',
4624 #if defined(UTF8_INPUT_ENABLE)
4632 #define MAXRECOVER 20
4634 void switch_mime_getc(void)
4636 if (i_getc!=mime_getc) {
4637 i_mgetc = i_getc; i_getc = mime_getc;
4638 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4639 if(mime_f==STRICT_MIME) {
4640 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4641 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4646 void unswitch_mime_getc(void)
4648 if(mime_f==STRICT_MIME) {
4649 i_mgetc = i_mgetc_buf;
4650 i_mungetc = i_mungetc_buf;
4653 i_ungetc = i_mungetc;
4654 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4655 mime_iconv_back = NULL;
4658 nkf_char mime_begin_strict(FILE *f)
4662 const unsigned char *p,*q;
4663 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4665 mime_decode_mode = FALSE;
4666 /* =? has been checked */
4668 p = mime_pattern[j];
4671 for(i=2;p[i]>' ';i++) { /* start at =? */
4672 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4673 /* pattern fails, try next one */
4675 while (mime_pattern[++j]) {
4676 p = mime_pattern[j];
4677 for(k=2;k<i;k++) /* assume length(p) > i */
4678 if (p[k]!=q[k]) break;
4679 if (k==i && nkf_toupper(c1)==p[k]) break;
4681 p = mime_pattern[j];
4682 if (p) continue; /* found next one, continue */
4683 /* all fails, output from recovery buffer */
4691 mime_decode_mode = p[i-2];
4693 mime_iconv_back = iconv;
4694 set_iconv(FALSE, mime_priority_func[j]);
4695 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4697 if (mime_decode_mode=='B') {
4698 mimebuf_f = unbuf_f;
4700 /* do MIME integrity check */
4701 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: fetch the next character for the MIME decoder.
 * Reads through the saved getter i_mgetc_buf when mimebuf_f is set,
 * otherwise takes the next entry from the Fifo at mime_input.
 */
4709 nkf_char mime_getc_buf(FILE *f)
4711 /* we don't keep eof of Fifo, because it contains ?= as
4712 a terminator. It was checked in mime_integrity. */
4713 return ((mimebuf_f)?
4714 (*i_mgetc_buf)(f):Fifo(mime_input++));
4717 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4720 (*i_mungetc_buf)(c,f);
4722 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict MIME preamble scan, entered after "=?" was seen.
 * Every character read through i_getc is also appended to the Fifo (at
 * mime_last) so the input can be replayed if this is not a real preamble.
 * Sets mime_decode_mode to 'B' or 'Q' when the encoding letter is found,
 * or to 1 (replay the buffer without decoding) on failure.
 * Returns the last character read; callers use it only to detect EOF.
 */
4726 nkf_char mime_begin(FILE *f)
4731 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4732 /* re-read and convert again from mime_buffer. */
4734 /* =? has been checked */
4736 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4737 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4738 /* We accept any character type even if it is broken by new lines */
4739 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4740 if (c1=='\n'||c1==' '||c1=='\r'||
4741 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4743 /* Failed. But this could be another MIME preamble */
4751 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4752 if (!(++i<MAXRECOVER) || c1==EOF) break;
4753 if (c1=='b'||c1=='B') {
4754 mime_decode_mode = 'B';
4755 } else if (c1=='q'||c1=='Q') {
4756 mime_decode_mode = 'Q';
4760 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4761 if (!(++i<MAXRECOVER) || c1==EOF) break;
4763 mime_decode_mode = FALSE;
4769 if (!mime_decode_mode) {
4770 /* false MIME preamble, restart from mime_buffer */
4771 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4772 /* Since we are in MIME mode until buffer becomes empty, */
4773 /* we never go into mime_begin again for a while. */
4776 /* discard MIME preamble, and go to MIME mode */
4778 /* do no MIME integrity check */
4779 return c1; /* used only for checking EOF */
4783 void no_putc(nkf_char c)
4788 void debug(const char *str)
4791 fprintf(stderr, "%s\n", str);
4796 void set_input_codename(char *codename)
4800 strcmp(codename, "") != 0 &&
4801 strcmp(codename, input_codename) != 0)
4803 is_inputcode_mixed = TRUE;
4805 input_codename = codename;
4806 is_inputcode_set = TRUE;
4809 #if !defined(PERL_XS) && !defined(WIN32DLL)
4810 void print_guessed_code(char *filename)
4812 char *codename = "BINARY";
4813 if (!is_inputcode_mixed) {
4814 if (strcmp(input_codename, "") == 0) {
4817 codename = input_codename;
4820 if (filename != NULL) printf("%s:", filename);
4821 printf("%s\n", codename);
4827 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4829 nkf_char c1, c2, c3;
4835 if (!nkf_isxdigit(c2)){
4840 if (!nkf_isxdigit(c3)){
4845 return (hex2bin(c2) << 4) | hex2bin(c3);
4848 nkf_char cap_getc(FILE *f)
4850 return hex_getc(':', f, i_cgetc, i_cungetc);
4853 nkf_char cap_ungetc(nkf_char c, FILE *f)
4855 return (*i_cungetc)(c, f);
4858 nkf_char url_getc(FILE *f)
4860 return hex_getc('%', f, i_ugetc, i_uungetc);
4863 nkf_char url_ungetc(nkf_char c, FILE *f)
4865 return (*i_uungetc)(c, f);
4869 #ifdef NUMCHAR_OPTION
4870 nkf_char numchar_getc(FILE *f)
4872 nkf_char (*g)(FILE *) = i_ngetc;
4873 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4884 if (buf[i] == 'x' || buf[i] == 'X'){
4885 for (j = 0; j < 7; j++){
4887 if (!nkf_isxdigit(buf[i])){
4894 c |= hex2bin(buf[i]);
4897 for (j = 0; j < 8; j++){
4901 if (!nkf_isdigit(buf[i])){
4908 c += hex2bin(buf[i]);
4914 return CLASS_UNICODE | c;
4923 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4925 return (*i_nungetc)(c, f);
4929 #ifdef UNICODE_NORMALIZATION
4931 /* Normalization Form C */
4932 nkf_char nfc_getc(FILE *f)
4934 nkf_char (*g)(FILE *f) = i_nfc_getc;
4935 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4936 int i=0, j, k=1, lower, upper;
4938 const nkf_nfchar *array;
4940 extern const struct normalization_pair normalization_table[];
4944 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4945 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4946 while (upper >= lower) {
4947 j = (lower+upper) / 2;
4948 array = normalization_table[j].nfd;
4949 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4950 if (array[k] != buf[k]){
4951 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4958 array = normalization_table[j].nfc;
4959 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4960 buf[i] = (nkf_char)(array[i]);
4971 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4973 return (*i_nfc_ungetc)(c, f);
4975 #endif /* UNICODE_NORMALIZATION */
4981 nkf_char c1, c2, c3, c4, cc;
4982 nkf_char t1, t2, t3, t4, mode, exit_mode;
4983 nkf_char lwsp_count;
4986 nkf_char lwsp_size = 128;
4988 if (mime_top != mime_last) { /* Something is in FIFO */
4989 return Fifo(mime_top++);
4991 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4992 mime_decode_mode=FALSE;
4993 unswitch_mime_getc();
4994 return (*i_getc)(f);
4997 if (mimebuf_f == FIXED_MIME)
4998 exit_mode = mime_decode_mode;
5001 if (mime_decode_mode == 'Q') {
5002 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5004 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5005 if (c1<=' ' || DEL<=c1) {
5006 mime_decode_mode = exit_mode; /* prepare for quit */
5009 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5013 mime_decode_mode = exit_mode; /* prepare for quit */
5014 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5015 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5016 /* end Q encoding */
5017 input_mode = exit_mode;
5019 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5020 if (lwsp_buf==NULL) {
5021 perror("can't malloc");
5024 while ((c1=(*i_getc)(f))!=EOF) {
5029 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5037 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5038 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5053 lwsp_buf[lwsp_count] = (unsigned char)c1;
5054 if (lwsp_count++>lwsp_size){
5056 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5057 if (lwsp_buf_new==NULL) {
5059 perror("can't realloc");
5062 lwsp_buf = lwsp_buf_new;
5068 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5070 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5071 i_ungetc(lwsp_buf[lwsp_count],f);
5077 if (c1=='='&&c2<' ') { /* this is soft wrap */
5078 while((c1 = (*i_mgetc)(f)) <=' ') {
5079 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5081 mime_decode_mode = 'Q'; /* still in MIME */
5082 goto restart_mime_q;
5085 mime_decode_mode = 'Q'; /* still in MIME */
5089 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5090 if (c2<=' ') return c2;
5091 mime_decode_mode = 'Q'; /* still in MIME */
5092 return ((hex2bin(c2)<<4) + hex2bin(c3));
5095 if (mime_decode_mode != 'B') {
5096 mime_decode_mode = FALSE;
5097 return (*i_mgetc)(f);
5101 /* Base64 encoding */
5103 MIME allows line break in the middle of
5104 Base64, but we are very pessimistic in decoding
5105 in unbuf mode because MIME encoded code may broken by
5106 less or editor's control sequence (such as ESC-[-K in unbuffered
5107 mode. ignore incomplete MIME.
5109 mode = mime_decode_mode;
5110 mime_decode_mode = exit_mode; /* prepare for quit */
5112 while ((c1 = (*i_mgetc)(f))<=' ') {
5117 if ((c2 = (*i_mgetc)(f))<=' ') {
5120 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5121 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5124 if ((c1 == '?') && (c2 == '=')) {
5127 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5128 if (lwsp_buf==NULL) {
5129 perror("can't malloc");
5132 while ((c1=(*i_getc)(f))!=EOF) {
5137 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5145 if ((c1=(*i_getc)(f))!=EOF) {
5149 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5164 lwsp_buf[lwsp_count] = (unsigned char)c1;
5165 if (lwsp_count++>lwsp_size){
5167 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5168 if (lwsp_buf_new==NULL) {
5170 perror("can't realloc");
5173 lwsp_buf = lwsp_buf_new;
5179 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5181 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5182 i_ungetc(lwsp_buf[lwsp_count],f);
5189 if ((c3 = (*i_mgetc)(f))<=' ') {
5192 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5193 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5197 if ((c4 = (*i_mgetc)(f))<=' ') {
5200 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5201 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5205 mime_decode_mode = mode; /* still in MIME sigh... */
5207 /* BASE 64 decoding */
5209 t1 = 0x3f & base64decode(c1);
5210 t2 = 0x3f & base64decode(c2);
5211 t3 = 0x3f & base64decode(c3);
5212 t4 = 0x3f & base64decode(c4);
5213 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5215 Fifo(mime_last++) = (unsigned char)cc;
5216 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5218 Fifo(mime_last++) = (unsigned char)cc;
5219 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5221 Fifo(mime_last++) = (unsigned char)cc;
5226 return Fifo(mime_top++);
/*
 * mime_ungetc: push one character back into the MIME Fifo.
 *
 *   c - character to push back; it is stored truncated to unsigned char
 *   f - input stream; not referenced by the visible statement
 *
 * mime_top is decremented first and c is written into that slot, so the
 * next Fifo(mime_top++) read yields c again.
 *
 * NOTE(review): the braces and the return statement are not visible in
 * this extract; presumably c is returned -- confirm against the full file.
 */
5229 nkf_char mime_ungetc(nkf_char c, FILE *f)
5231 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: read ahead into the Fifo to check a candidate MIME
 * encoded word before it is decoded.
 *
 *   f - input stream, read through the i_getc hook
 *   p - NUL-terminated byte string that is copied into the Fifo first
 *
 * Visible behaviour:
 *   - mime_input and mime_last are both reset to mime_top;
 *   - the bytes of p are appended at mime_input;
 *   - input is then read until EOF, or until
 *     (mime_input - mime_top) & MIME_BUF_MASK is 0 ("buffer full");
 *   - when the current character is '=' and d is '?', the character is
 *     stored and the existing comment says decoding starts;
 *   - on the "incomplete MIME" path the character is stored, mime_last is
 *     set to mime_input, mime_decode_mode is set to 1 and
 *     switch_mime_getc() is called.
 *
 * NOTE(review): the declarations, the update of d, the braces and every
 * return statement are missing from this extract, so the exact nesting
 * and the return values cannot be confirmed from here.
 */
5235 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5239 /* In buffered mode, read until =? or NL or buffer full
5241 mime_input = mime_top;
5242 mime_last = mime_top;
5244 while(*p) Fifo(mime_input++) = *p++;
5247 while((c=(*i_getc)(f))!=EOF) {
5248 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5249 break; /* buffer full */
5251 if (c=='=' && d=='?') {
5252 /* checked. skip header, start decode */
5253 Fifo(mime_input++) = (unsigned char)c;
5254 /* mime_last_input = mime_input; */
/* True when c is none of '+', '/', '=', '?' or an alphanumeric; the
   statement controlled by this test is not visible in this extract. */
5259 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5261 /* Should we check length mod 4? */
5262 Fifo(mime_input++) = (unsigned char)c;
5265 /* In case of Incomplete MIME, no MIME decode */
5266 Fifo(mime_input++) = (unsigned char)c;
5267 mime_last = mime_input; /* point undecoded buffer */
5268 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5269 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 *
 *   c - character to translate
 *
 * The mapping shown by the visible branches is
 *   'A'..'Z' ->  0..25,  'a'..'z' -> 26..51,  '0'..'9' -> 52..61,
 *   '+'      -> 62,      anything else reaching the last branch -> 63.
 * The offsets are spelled as character constants: in ASCII 'G' is
 * 'a' - 26, '4' is 52, '>' is 62 and '?' is 63, as the inline comments
 * on each line state.
 *
 * NOTE(review): the declaration of i, the first range tests, the closing
 * braces and the return statement are not visible in this extract;
 * presumably i is returned -- confirm against the full file.
 */
5273 nkf_char base64decode(nkf_char c)
5278 i = c - 'A'; /* A..Z 0-25 */
5280 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5282 } else if (c > '/') {
5283 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5284 } else if (c == '+') {
5285 i = '>' /* 62 */ ; /* + 62 */
5287 i = '?' /* 63 */ ; /* / 63 */
/* The 64-character Base64 alphabet; the index is the 6-bit value. */
5292 static const char basis_64[] =
5293 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Carry between output bytes: close_mime and mimeout_addchar read its low
   bits (b64c & 0x3, b64c & 0xF) to build the next Base64 digit. */
5295 static nkf_char b64c;
/* Capacity of the pending-output buffer used by the MIME encoder. */
5296 #define MIMEOUT_BUF_LENGTH (60)
/* Characters held back until it is decided whether to encode them; one
   spare byte is allocated beyond MIMEOUT_BUF_LENGTH. */
5297 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of valid characters currently in mimeout_buf. */
5298 int mimeout_buf_count = 0;
/* When non-zero, open_mime does not strip or pass through the leading
   white space of mimeout_buf; open_mime resets it to FALSE. */
5299 int mimeout_preserve_space = 0;
/* Convert a value 0..15 to an upper-case hexadecimal digit character.
   NOTE(review): the argument c is not parenthesized in the expansion, so
   callers must pass a fully parenthesized expression (the visible call
   sites in mimeout_addchar do). */
5300 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * open_mime: start a MIME encoded word for the given output mode.
 *
 *   mode - output mode; it is looked up in mime_encode[] to select the
 *          header pattern (mime_pattern[]) and the encoding method
 *          (mime_encode_method[], stored into mimeout_mode)
 *
 * Visible steps:
 *   1. Search mime_encode[] for mode; mime_pattern[0] is the default
 *      pattern when the loop finds no match.
 *   2. When base64_count exceeds 45, a leading blank held in mimeout_buf
 *      is written through o_mputc; further output on this path is not
 *      visible here.
 *   3. Unless mimeout_preserve_space is set, leading SPACE/TAB/CR/NL
 *      characters of mimeout_buf are written through o_mputc unencoded.
 *   4. mimeout_preserve_space is cleared, the buffered count is saved in
 *      j, mimeout_buf_count is reset to 0, and buffered characters are
 *      fed back through mime_putc.
 *
 * NOTE(review): the declarations of i and j, any reset of i between
 * steps 1 and 2, the emission of the header pattern p, and all closing
 * braces are missing from this extract -- confirm against the full file.
 */
5302 void open_mime(nkf_char mode)
5304 const unsigned char *p;
/* Select the header pattern and encoding method for this mode. */
5307 p = mime_pattern[0];
5308 for(i=0;mime_encode[i];i++) {
5309 if (mode == mime_encode[i]) {
5310 p = mime_pattern[i];
5314 mimeout_mode = mime_encode_method[i];
/* The current output line is already long (count above 45). */
5317 if (base64_count>45) {
5318 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5319 (*o_mputc)(mimeout_buf[i]);
5325 if (!mimeout_preserve_space && mimeout_buf_count>0
5326 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5327 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
/* Pass leading white space through without encoding it. */
5331 if (!mimeout_preserve_space) {
5332 for (;i<mimeout_buf_count;i++) {
5333 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5334 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5335 (*o_mputc)(mimeout_buf[i]);
5342 mimeout_preserve_space = FALSE;
/* Empty the buffer logically before replaying it, because mime_putc may
   append to mimeout_buf again. */
5348 j = mimeout_buf_count;
5349 mimeout_buf_count = 0;
5351 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the MIME encoded word that is currently open.
 *
 * Dispatches on mimeout_mode.  Two visible branches flush the bits still
 * held in b64c as a final Base64 digit: one emits the low 2 bits shifted
 * left by 4, the other the low 4 bits shifted left by 2.  The function
 * then tests mimeout_f against FIXED_MIME and mimeout_mode against 'Q'.
 *
 * NOTE(review): the case labels, any padding or terminator output, the
 * statements controlled by the last two tests and the closing braces are
 * not visible in this extract -- confirm against the full file.
 */
5355 void close_mime(void)
5365 switch(mimeout_mode) {
/* One input byte pending: its low 2 bits become the top of a digit. */
5370 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
/* Two input bytes pending: the low 4 bits become the top of a digit. */
5376 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5382 if (mimeout_f!=FIXED_MIME) {
5384 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one character into the open MIME encoded word
 * and write the result through o_mputc.
 *
 *   c - character (byte value) to encode
 *
 * Dispatches on mimeout_mode.  Visible output forms:
 *   - a character for which nkf_isalnum(c) is false is written as two
 *     upper-case hex digits (high nibble, then low nibble) via itoh4;
 *   - Base64: the top 6 bits of c form the first digit; later digits
 *     combine bits kept in b64c from the previous byte with the high
 *     bits of c, and the low 6 bits of c form a digit of their own.
 *
 * NOTE(review): the case labels, the updates of b64c, mimeout_mode and
 * base64_count, the first branch of the if/else chain, any escape prefix
 * written before the hex digits, and the closing braces are missing from
 * this extract -- confirm against the full file.
 */
5389 void mimeout_addchar(nkf_char c)
5391 switch(mimeout_mode) {
5396 } else if(!nkf_isalnum(c)) {
5398 (*o_mputc)(itoh4(((c>>4)&0xf)));
5399 (*o_mputc)(itoh4((c&0xf)));
/* Base64, first byte of a group: emit its top 6 bits. */
5408 (*o_mputc)(basis_64[c>>2]);
/* Second byte: low 2 bits of the previous byte plus the high nibble. */
5413 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* Third byte: low 4 bits of the previous byte plus the top 2 bits,
   followed by the remaining low 6 bits as a separate digit. */
5419 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5420 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_lastchar2 / mime_lastchar1 hold the (c2, c1) pair from the
 * previous mime_prechar call; the visible assignments at the end of the
 * function store the current arguments into them.
 *
 * mime_prechar: hook run before a character (c2, c1) is passed on to the
 * MIME output stage.
 *
 *   c2, c1 - the character pair about to be output
 *
 * Visible behaviour: when base64_count plus the Base64-expanded size of
 * the pending buffer (mimeout_buf_count/3*4) exceeds 66, the function
 * sends (EOF,0), then (0,NL), then (0,SPACE) through o_base64conv.
 * An "else if (mime_lastchar2)" branch follows inside a comment.  A
 * further test sends (0,SPACE) when c2 is non-zero, mime_lastchar2 is 0
 * and mime_lastchar1 is non-zero and not white space.
 *
 * NOTE(review): the end of the commented-out region, the enclosing
 * condition(s) and the closing braces are missing from this extract, so
 * which of the trailing lines are live code cannot be confirmed here.
 */
5431 nkf_char mime_lastchar2, mime_lastchar1;
5433 void mime_prechar(nkf_char c2, nkf_char c1)
5437 if (base64_count + mimeout_buf_count/3*4> 66){
5438 (*o_base64conv)(EOF,0);
5439 (*o_base64conv)(0,NL);
5440 (*o_base64conv)(0,SPACE);
5442 }/*else if (mime_lastchar2){
5443 if (c1 <=DEL && !nkf_isspace(c1)){
5444 (*o_base64conv)(0,SPACE);
5448 if (c2 && mime_lastchar2 == 0
5449 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5450 (*o_base64conv)(0,SPACE);
5453 mime_lastchar2 = c2;
5454 mime_lastchar1 = c1;
/*
 * mime_putc: output-side MIME encoder entry point; receives one character
 * at a time and decides whether it is written raw (o_mputc), encoded
 * (mimeout_addchar) or held in mimeout_buf.
 *
 *   c - character to output, or EOF to flush
 *
 * Structure as far as it is visible:
 *   - mimeout_f == FIXED_MIME: line-length checks against 71 are made
 *     (separately for 'Q' mode), and EOF is distinguished from ordinary
 *     characters.
 *   - otherwise, on EOF the pending buffer is drained: its count is
 *     copied to j, mimeout_buf_count is reset, and buffered characters
 *     are passed to mimeout_addchar.
 *   - lastchar is the most recently buffered character, when there is one.
 *   - !mimeout_mode (no encoded word open): characters <= DEL in ASCII or
 *     ISO8859_1 output mode are buffered, with the buffer written raw at
 *     white space and line breaks; open_mime(output_mode) is called when
 *     the buffer exceeds MIMEOUT_BUF_LENGTH.  On the remaining path the
 *     buffer is written raw after a CR/NL, or all but a trailing SPACE is
 *     written raw, before open_mime(output_mode).
 *   - mimeout_mode 'B', 1, 2 (encoded word open): characters <= DEL in
 *     ASCII/ISO8859_1 mode are buffered and the buffer is written either
 *     encoded or raw depending on lastchar and c; other characters drain
 *     the buffer through mimeout_addchar.
 *
 * NOTE(review): roughly half of the original lines (declarations of i, j
 * and lastchar, else branches, loop headers, calls that close the encoded
 * word, returns, closing braces) are missing from this extract.  The
 * nesting described above is inferred from the order of the visible
 * lines and must be confirmed against the full file.
 */
5457 void mime_putc(nkf_char c)
/* ---- fixed-format MIME output ---- */
5462 if (mimeout_f == FIXED_MIME){
5463 if (mimeout_mode == 'Q'){
5464 if (base64_count > 71){
5465 if (c!=CR && c!=NL) {
5472 if (base64_count > 71){
5477 if (c == EOF) { /* c==EOF */
5481 if (c != EOF) { /* c!=EOF */
5487 /* mimeout_f != FIXED_MIME */
/* ---- EOF: drain whatever is still buffered ---- */
5489 if (c == EOF) { /* c==EOF */
5490 j = mimeout_buf_count;
5491 mimeout_buf_count = 0;
5495 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5498 mimeout_addchar(mimeout_buf[i]);
5502 mimeout_addchar(mimeout_buf[i]);
5506 mimeout_addchar(mimeout_buf[i]);
5512 if (mimeout_mode=='Q') {
5513 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
/* Remember the most recently buffered character. */
5525 if (mimeout_buf_count > 0){
5526 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* ---- no encoded word is open ---- */
5531 if (!mimeout_mode) {
5532 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5533 if (nkf_isspace(c)) {
5534 if (c==CR || c==NL) {
/* Write the buffered characters raw. */
5537 for (i=0;i<mimeout_buf_count;i++) {
5538 (*o_mputc)(mimeout_buf[i]);
5539 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
/* Restart the buffer with the current character. */
5545 mimeout_buf[0] = (char)c;
5546 mimeout_buf_count = 1;
5548 if (base64_count > 1
5549 && base64_count + mimeout_buf_count > 76){
5552 if (!nkf_isspace(mimeout_buf[0])){
/* Append; a full buffer forces an encoded word to be opened. */
5557 mimeout_buf[mimeout_buf_count++] = (char)c;
5558 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5559 open_mime(output_mode);
5564 if (lastchar==CR || lastchar == NL){
5565 for (i=0;i<mimeout_buf_count;i++) {
5566 (*o_mputc)(mimeout_buf[i]);
5569 mimeout_buf_count = 0;
/* Keep a trailing SPACE buffered; write everything before it raw. */
5571 if (lastchar==SPACE) {
5572 for (i=0;i<mimeout_buf_count-1;i++) {
5573 (*o_mputc)(mimeout_buf[i]);
5576 mimeout_buf[0] = SPACE;
5577 mimeout_buf_count = 1;
5579 open_mime(output_mode);
5582 /* mimeout_mode == 'B', 1, 2 */
5583 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5584 if (lastchar == CR || lastchar == NL){
/* Blank after a line break: the buffer goes out encoded. */
5585 if (nkf_isblank(c)) {
5586 for (i=0;i<mimeout_buf_count;i++) {
5587 mimeout_addchar(mimeout_buf[i]);
5589 mimeout_buf_count = 0;
/* Printable character after a line break: the buffer goes out raw. */
5590 } else if (SPACE<c && c<DEL) {
5592 for (i=0;i<mimeout_buf_count;i++) {
5593 (*o_mputc)(mimeout_buf[i]);
5596 mimeout_buf_count = 0;
5599 if (c==SPACE || c==TAB || c==CR || c==NL) {
5600 for (i=0;i<mimeout_buf_count;i++) {
5601 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5603 for (i=0;i<mimeout_buf_count;i++) {
5604 (*o_mputc)(mimeout_buf[i]);
5607 mimeout_buf_count = 0;
5610 mimeout_buf[mimeout_buf_count++] = (char)c;
5611 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5613 for (i=0;i<mimeout_buf_count;i++) {
5614 (*o_mputc)(mimeout_buf[i]);
5617 mimeout_buf_count = 0;
5621 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5622 mimeout_buf[mimeout_buf_count++] = (char)c;
5623 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5624 j = mimeout_buf_count;
5625 mimeout_buf_count = 0;
5627 mimeout_addchar(mimeout_buf[i]);
/* Drain the buffer before handling the current character. */
5634 if (mimeout_buf_count>0) {
5635 j = mimeout_buf_count;
5636 mimeout_buf_count = 0;
5638 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5640 mimeout_addchar(mimeout_buf[i]);
5646 (*o_mputc)(mimeout_buf[i]);
5648 open_mime(output_mode);
/*
 * Re-initialisation of the converter's global state, compiled only when
 * PERL_XS or WIN32DLL is defined.
 *
 * The visible statements reset option flags to FALSE or to their DEFAULT_*
 * and FOLD_MARGIN constants, clear prefix_table[0..255], empty the MIME
 * output buffer, point every o_*conv output hook at no_connection, point
 * the visible i_*ungetc and i_mgetc_buf input hooks at std_ungetc and
 * std_getc, and set output_mode to ASCII.
 *
 * NOTE(review): the function header is not visible in this extract
 * (presumably this is the body of reinit()).  Many assignments, the
 * use of p / input_code_list, and every #endif are missing as well --
 * confirm against the full file.
 */
5655 #if defined(PERL_XS) || defined(WIN32DLL)
5659 struct input_code *p = input_code_list;
/* MIME decoding options. */
5672 mime_f = STRICT_MIME;
5673 mime_decode_f = FALSE;
5678 #if defined(MSDOS) || defined(__OS2__)
5683 iso2022jp_f = FALSE;
/* Unicode-related options, present only in UTF-8 enabled builds. */
5684 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5685 ms_ucs_map_f = UCS_MAP_ASCII;
5687 #ifdef UTF8_INPUT_ENABLE
5688 no_cp932ext_f = FALSE;
5689 no_best_fit_chars_f = FALSE;
5690 encode_fallback = NULL;
5691 unicode_subchar = '?';
5692 input_endian = ENDIAN_BIG;
5694 #ifdef UTF8_OUTPUT_ENABLE
5695 output_bom_f = FALSE;
5696 output_endian = ENDIAN_BIG;
5698 #ifdef UNICODE_NORMALIZATION
5711 is_inputcode_mixed = FALSE;
5712 is_inputcode_set = FALSE;
5716 #ifdef SHIFTJIS_CP932
/* Clear every entry of the 256-entry prefix table. */
5726 for (i = 0; i < 256; i++){
5727 prefix_table[i] = 0;
5731 mimeout_buf_count = 0;
5736 fold_preserve_f = FALSE;
/* Escape-sequence selectors, fold margin and converters back to the
   compile-time defaults. */
5739 kanji_intro = DEFAULT_J;
5740 ascii_intro = DEFAULT_R;
5741 fold_margin = FOLD_MARGIN;
5742 output_conv = DEFAULT_CONV;
5743 oconv = DEFAULT_CONV;
/* Output filter hooks: no_connection reports an internal error if one is
   invoked before being connected. */
5744 o_zconv = no_connection;
5745 o_fconv = no_connection;
5746 o_crconv = no_connection;
5747 o_rot_conv = no_connection;
5748 o_hira_conv = no_connection;
5749 o_base64conv = no_connection;
5750 o_iso2022jp_check_conv = no_connection;
/* Input hooks back to the plain standard getc/ungetc wrappers. */
5753 i_ungetc = std_ungetc;
5755 i_bungetc = std_ungetc;
5758 i_mungetc = std_ungetc;
5759 i_mgetc_buf = std_getc;
5760 i_mungetc_buf = std_ungetc;
5761 output_mode = ASCII;
5764 mime_decode_mode = FALSE;
5770 z_prev2=0,z_prev1=0;
5772 iconv_for_check = 0;
5774 input_codename = "";
/*
 * no_connection: two-argument placeholder for an output hook that has
 * not been connected; it forwards to no_connection2 with c0 = 0.
 *
 *   c2, c1 - the character pair the caller tried to output
 *
 * NOTE(review): the braces are not visible in this extract.
 */
5781 void no_connection(nkf_char c2, nkf_char c1)
5783 no_connection2(c2,c1,0);
/*
 * no_connection2: report that a conversion hook was called without
 * being connected.
 *
 *   c2, c1, c0 - the character the caller tried to convert; none of them
 *                is referenced by the visible statements
 *
 * Writes "nkf internal module connection failure." to stderr.  The
 * trailing "return 0" is marked as being there for lint.
 *
 * NOTE(review): one original line between the fprintf and the return is
 * missing from this extract; presumably it terminates the program, which
 * would make the return unreachable -- confirm against the full file.
 */
5786 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5788 fprintf(stderr,"nkf internal module connection failure.\n");
5790 return 0; /* LINT */
/*
 * Usage text: prints the command-line option summary to stderr, one
 * fprintf call per output line.
 *
 * The first visible line redefines fprintf as dllprintf, so in builds
 * where that #define is active every call below goes through dllprintf.
 * Exactly which "Output code" line is printed depends on which
 * DEFAULT_CODE_* macro is defined; the UTF-8, numeric-character-reference
 * and fallback option lines are printed only when the corresponding
 * feature macro is defined.
 *
 * NOTE(review): the function header (presumably usage()), the
 * preprocessor guard around the #define, every #endif and a few
 * conditional lines are missing from this extract -- confirm against the
 * full file.
 */
5795 #define fprintf dllprintf
5799 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5800 fprintf(stderr,"Flags:\n");
5801 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
/* One of the following four lines is chosen by the build's default
   output code. */
5802 #ifdef DEFAULT_CODE_SJIS
5803 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5805 #ifdef DEFAULT_CODE_JIS
5806 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5808 #ifdef DEFAULT_CODE_EUC
5809 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5811 #ifdef DEFAULT_CODE_UTF8
5812 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5814 #ifdef UTF8_OUTPUT_ENABLE
5815 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5817 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5818 #ifdef UTF8_INPUT_ENABLE
5819 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5821 fprintf(stderr,"t no conversion\n");
5822 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5823 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5824 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5825 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5826 fprintf(stderr,"v Show this usage. V: show version\n");
5827 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5828 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5829 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5830 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5831 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5832 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5833 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5834 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5836 fprintf(stderr,"T Text mode output\n");
5838 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5839 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5840 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5841 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5842 fprintf(stderr,"\n");
/* Long (double-dash) options. */
5843 fprintf(stderr,"Long name options\n");
5844 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5845 fprintf(stderr," Specify the input or output codeset\n");
5846 fprintf(stderr," --fj --unix --mac --windows\n");
5847 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5848 fprintf(stderr," Convert for the system or code\n");
5849 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5850 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5851 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5853 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5855 #ifdef NUMCHAR_OPTION
5856 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5858 #ifdef UTF8_INPUT_ENABLE
5859 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5860 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5863 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5864 fprintf(stderr," Overwrite original listed files by filtered result\n");
5865 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5867 fprintf(stderr," -g --guess Guess the input code\n");
5868 fprintf(stderr," --help --version Show this help/the version\n");
5869 fprintf(stderr," For more information, see also man nkf\n");
5870 fprintf(stderr,"\n");
/*
 * Version banner: prints "Network Kanji Filter Version <NKF_VERSION>
 * (<NKF_RELEASE_DATE>) " to stderr, followed by the CopyRight string
 * between newlines.  The format string is assembled from adjacent string
 * literals; the #if lines select a platform-specific piece for MSDOS,
 * Win16 and Win32 builds.
 *
 * NOTE(review): the function header (presumably version()), the
 * platform-specific literals, the #endif lines and the closing brace are
 * missing from this extract -- confirm against the full file.
 */
5876 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5877 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
5880 #if defined(MSDOS) && defined(__WIN16__)
5883 #if defined(MSDOS) && defined(__WIN32__)
5889 ,NKF_VERSION,NKF_RELEASE_DATE);
5890 fprintf(stderr,"\n%s\n",CopyRight);
5895 ** パッチ制作者 (patch authors)
5896 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5897 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5898 ** ohta@src.ricoh.co.jp (Junn Ohta)
5899 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5900 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5901 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5902 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5903 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5904 ** GHG00637@nifty-serve.or.jp (COW)