1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.113 2006/10/12 16:41:25 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-10-13"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * All of these compare against character literals directly instead of
 * going through <ctype.h>.  Each parameter is fully parenthesized so that
 * an expression argument (for example `c & 0x7f`) is classified as a
 * whole.  Every macro still evaluates its argument more than once, so do
 * not pass expressions with side effects such as `*p++`.
 */
#define is_alnum(c)  \
            (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
/* SPACE, TAB, CR and NL are constants defined elsewhere in this file. */
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' '<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when the second-lowest byte of c2 (after truncation to unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* Inclusive range tested by is_ibmext_in_sjis(). */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
/* Inclusive range for the inverse CP932 table (its users are outside this excerpt). */
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END.
 * The argument is parenthesized so expression arguments are tested as a
 * whole; it is evaluated twice, so avoid side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* rot13/47 mode */
470 static int hira_f = FALSE; /* hira/kata henkan */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
508 #if defined(INT_IS_SHORT)
509 #define NKF_INT32_C(n) (n##L)
511 #define NKF_INT32_C(n) (n)
/*
 * Bit layout used for values tested by the two predicates below:
 * CLASS_MASK selects the top 8 bits of a 32-bit value and VALUE_MASK the
 * low 24 bits.  CLASS_UNICODE is the class tag that is_unicode_capsule()
 * looks for.  NKF_INT32_C is defined just above, depending on INT_IS_SHORT.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* True when the class bits of c equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
584 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
585 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
590 static int mimeout_mode = 0;
591 static int base64_count = 0;
593 /* X0208 -> ASCII converter */
596 static int f_line = 0; /* chars in line */
597 static int f_prev = 0;
598 static int fold_preserve_f = FALSE; /* preserve new lines */
599 static int fold_f = FALSE;
600 static int fold_len = 0;
603 static unsigned char kanji_intro = DEFAULT_J;
604 static unsigned char ascii_intro = DEFAULT_R;
608 #define FOLD_MARGIN 10
609 #define DEFAULT_FOLD 60
611 static int fold_margin = FOLD_MARGIN;
615 #ifdef DEFAULT_CODE_JIS
616 # define DEFAULT_CONV j_oconv
618 #ifdef DEFAULT_CODE_SJIS
619 # define DEFAULT_CONV s_oconv
621 #ifdef DEFAULT_CODE_EUC
622 # define DEFAULT_CONV e_oconv
624 #ifdef DEFAULT_CODE_UTF8
625 # define DEFAULT_CONV w_oconv
628 /* process default */
629 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
631 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
632 /* s_iconv or oconv */
633 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
635 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 /* static redirections */
645 static void (*o_putc)(nkf_char c) = std_putc;
647 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
648 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
650 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
651 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
653 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
655 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
656 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
658 /* for strict mime */
659 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
660 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
663 static int output_mode = ASCII, /* output kanji mode */
664 input_mode = ASCII, /* input kanji mode */
665 shift_mode = FALSE; /* TRUE shift out, or X0201 */
666 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
668 /* X0201 / X0208 conversion tables */
670 /* X0201 kana conversion table */
673 unsigned char cv[]= {
674 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
675 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
676 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
677 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
678 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
679 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
680 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
681 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
682 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
683 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
684 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
685 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
686 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
687 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
688 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
689 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
693 /* X0201 kana conversion table for dakuten */
696 unsigned char dv[]= {
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
702 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
703 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
704 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
705 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
706 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
708 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
715 /* X0201 kana conversion table for han-dakuten */
718 unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
741 unsigned char fv[] = {
743 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
744 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
745 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
746 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
747 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
748 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
749 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
751 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
753 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
760 static int file_out_f = FALSE;
762 static int overwrite_f = FALSE;
763 static int preserve_time_f = FALSE;
764 static int backup_f = FALSE;
765 static char *backup_suffix = "";
766 static char *get_backup_filename(const char *suffix, const char *filename);
769 static int crmode_f = 0; /* CR, NL, CRLF */
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
857 int is_argument_error = FALSE;
859 is_inputcode_mixed = FALSE;
860 is_inputcode_set = FALSE;
865 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
868 is_argument_error = TRUE;
876 /* reopen file for stdout */
877 if (file_out_f == TRUE) {
880 outfname = malloc(strlen(origfname)
881 + strlen(".nkftmpXXXXXX")
887 strcpy(outfname, origfname);
891 for (i = strlen(outfname); i; --i){
892 if (outfname[i - 1] == '/'
893 || outfname[i - 1] == '\\'){
899 strcat(outfname, "ntXXXXXX");
901 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
904 strcat(outfname, ".nkftmpXXXXXX");
905 fd = mkstemp(outfname);
908 || (fd_backup = dup(fileno(stdout))) < 0
909 || dup2(fd, fileno(stdout)) < 0
920 outfname = "nkf.out";
923 if(freopen(outfname, "w", stdout) == NULL) {
927 if (binmode_f == TRUE) {
928 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
929 if (freopen("","wb",stdout) == NULL)
936 if (binmode_f == TRUE)
937 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
938 if (freopen("","rb",fin) == NULL)
943 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
947 char *filename = NULL;
949 if (nfiles > 1) filename = origfname;
950 if (guess_f) print_guessed_code(filename);
956 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
964 if (dup2(fd_backup, fileno(stdout)) < 0){
967 if (stat(origfname, &sb)) {
968 fprintf(stderr, "Can't stat %s\n", origfname);
970 /* restore the original file's permissions */
971 if (chmod(outfname, sb.st_mode)) {
972 fprintf(stderr, "Can't set permission %s\n", outfname);
975 /* restore the original file's timestamps */
977 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
978 tb[0] = tb[1] = sb.st_mtime;
979 if (utime(outfname, tb)) {
980 fprintf(stderr, "Can't set timestamp %s\n", outfname);
983 tb.actime = sb.st_atime;
984 tb.modtime = sb.st_mtime;
985 if (utime(outfname, &tb)) {
986 fprintf(stderr, "Can't set timestamp %s\n", outfname);
991 char *backup_filename = get_backup_filename(backup_suffix, origfname);
993 unlink(backup_filename);
995 if (rename(origfname, backup_filename)) {
996 perror(backup_filename);
997 fprintf(stderr, "Can't rename %s to %s\n",
998 origfname, backup_filename);
1002 if (unlink(origfname)){
1007 if (rename(outfname, origfname)) {
1009 fprintf(stderr, "Can't rename %s to %s\n",
1010 outfname, origfname);
1017 if (is_argument_error)
1020 #ifdef EASYWIN /*Easy Win */
1021 if (file_out_f == FALSE)
1022 scanf("%d",&end_check);
1025 #else /* for Other OS */
1026 if (file_out_f == TRUE)
1028 #endif /*Easy Win */
1031 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced
 * by `filename` and all other characters are copied literally.  If it
 * contains none, `suffix` is simply appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or
 * NULL (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t size;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* every '*' disappears and one copy of filename takes its place */
        size = suffix_length - asterisk_count
            + asterisk_count * filename_length + 1;
    } else {
        /* was malloc( + 1): a one-byte buffer that strcpy then overran */
        size = filename_length + suffix_length + 1;
    }

    backup_filename = malloc(size);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        backup_filename[filename_length + suffix_length] = '\0';
    }
    return backup_filename;
}
1099 {"katakana-hiragana","h3"},
1106 #ifdef UTF8_OUTPUT_ENABLE
1116 {"fb-subchar=", ""},
1118 #ifdef UTF8_INPUT_ENABLE
1119 {"utf8-input", "W"},
1120 {"utf16-input", "W16"},
1121 {"no-cp932ext", ""},
1122 {"no-best-fit-chars",""},
1124 #ifdef UNICODE_NORMALIZATION
1125 {"utf8mac-input", ""},
1137 #ifdef NUMCHAR_OPTION
1138 {"numchar-input", ""},
1144 #ifdef SHIFTJIS_CP932
1154 static int option_mode = 0;
1156 void options(unsigned char *cp)
1160 unsigned char *cp_back = NULL;
1165 while(*cp && *cp++!='-');
1166 while (*cp || cp_back) {
1174 case '-': /* literal options */
1175 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1179 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1180 p = (unsigned char *)long_option[i].name;
1181 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1182 if (*p == cp[j] || cp[j] == ' '){
1189 while(*cp && *cp != SPACE && cp++);
1190 if (long_option[i].alias[0]){
1192 cp = (unsigned char *)long_option[i].alias;
1194 if (strcmp(long_option[i].name, "ic=") == 0){
1195 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1196 codeset[i] = nkf_toupper(p[i]);
1199 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1200 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1201 strcmp(codeset, "CP50220") == 0 ||
1202 strcmp(codeset, "CP50221") == 0 ||
1203 strcmp(codeset, "CP50222") == 0 ||
1204 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1205 input_f = JIS_INPUT;
1206 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1207 input_f = JIS_INPUT;
1211 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1212 input_f = JIS_INPUT;
1217 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1218 input_f = SJIS_INPUT;
1219 if (x0201_f==NO_X0201) x0201_f=TRUE;
1220 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1221 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1222 strcmp(codeset, "CP932") == 0 ||
1223 strcmp(codeset, "MS932") == 0){
1224 input_f = SJIS_INPUT;
1226 #ifdef SHIFTJIS_CP932
1229 #ifdef UTF8_OUTPUT_ENABLE
1230 ms_ucs_map_f = UCS_MAP_CP932;
1232 }else if(strcmp(codeset, "EUCJP") == 0 ||
1233 strcmp(codeset, "EUC-JP") == 0){
1234 input_f = EUC_INPUT;
1235 }else if(strcmp(codeset, "CP51932") == 0){
1236 input_f = EUC_INPUT;
1238 #ifdef SHIFTJIS_CP932
1241 #ifdef UTF8_OUTPUT_ENABLE
1242 ms_ucs_map_f = UCS_MAP_CP932;
1244 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1245 strcmp(codeset, "EUCJP-MS") == 0 ||
1246 strcmp(codeset, "EUCJPMS") == 0){
1247 input_f = EUC_INPUT;
1249 #ifdef SHIFTJIS_CP932
1252 #ifdef UTF8_OUTPUT_ENABLE
1253 ms_ucs_map_f = UCS_MAP_MS;
1255 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1256 strcmp(codeset, "EUCJP-ASCII") == 0){
1257 input_f = EUC_INPUT;
1259 #ifdef SHIFTJIS_CP932
1262 #ifdef UTF8_OUTPUT_ENABLE
1263 ms_ucs_map_f = UCS_MAP_ASCII;
1265 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1266 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1267 input_f = SJIS_INPUT;
1269 #ifdef SHIFTJIS_CP932
1273 if (x0201_f==NO_X0201) x0201_f=TRUE;
1274 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1275 strcmp(codeset, "EUC-JIS-2004") == 0){
1276 input_f = EUC_INPUT;
1279 #ifdef SHIFTJIS_CP932
1283 #ifdef UTF8_INPUT_ENABLE
1284 }else if(strcmp(codeset, "UTF-8") == 0 ||
1285 strcmp(codeset, "UTF-8N") == 0 ||
1286 strcmp(codeset, "UTF-8-BOM") == 0){
1287 input_f = UTF8_INPUT;
1288 #ifdef UNICODE_NORMALIZATION
1289 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1290 strcmp(codeset, "UTF-8-MAC") == 0){
1291 input_f = UTF8_INPUT;
1294 }else if(strcmp(codeset, "UTF-16") == 0 ||
1295 strcmp(codeset, "UTF-16BE") == 0 ||
1296 strcmp(codeset, "UTF-16BE-BOM") == 0){
1297 input_f = UTF16_INPUT;
1298 input_endian = ENDIAN_BIG;
1299 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1300 strcmp(codeset, "UTF-16LE-BOM") == 0){
1301 input_f = UTF16_INPUT;
1302 input_endian = ENDIAN_LITTLE;
1303 }else if(strcmp(codeset, "UTF-32") == 0 ||
1304 strcmp(codeset, "UTF-32BE") == 0 ||
1305 strcmp(codeset, "UTF-32BE-BOM") == 0){
1306 input_f = UTF32_INPUT;
1307 input_endian = ENDIAN_BIG;
1308 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1309 strcmp(codeset, "UTF-32LE-BOM") == 0){
1310 input_f = UTF32_INPUT;
1311 input_endian = ENDIAN_LITTLE;
1316 if (strcmp(long_option[i].name, "oc=") == 0){
1317 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1318 codeset[i] = nkf_toupper(p[i]);
1321 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1322 strcmp(codeset, "CP50220") == 0){
1323 output_conv = j_oconv;
1324 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1325 output_conv = j_oconv;
1326 no_cp932ext_f = TRUE;
1327 }else if(strcmp(codeset, "CP50221") == 0 ||
1328 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1329 output_conv = j_oconv;
1331 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1332 output_conv = j_oconv;
1336 #ifdef SHIFTJIS_CP932
1339 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1340 output_conv = j_oconv;
1345 #ifdef SHIFTJIS_CP932
1348 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1349 output_conv = j_oconv;
1354 #ifdef SHIFTJIS_CP932
1357 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1358 output_conv = s_oconv;
1359 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1360 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1361 strcmp(codeset, "CP932") == 0 ||
1362 strcmp(codeset, "MS932") == 0){
1363 output_conv = s_oconv;
1365 #ifdef SHIFTJIS_CP932
1369 #ifdef UTF8_OUTPUT_ENABLE
1370 ms_ucs_map_f = UCS_MAP_CP932;
1372 }else if(strcmp(codeset, "EUCJP") == 0 ||
1373 strcmp(codeset, "EUC-JP") == 0){
1374 output_conv = e_oconv;
1375 }else if(strcmp(codeset, "CP51932") == 0){
1376 output_conv = e_oconv;
1378 #ifdef SHIFTJIS_CP932
1381 #ifdef UTF8_OUTPUT_ENABLE
1382 ms_ucs_map_f = UCS_MAP_CP932;
1384 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1385 strcmp(codeset, "EUCJP-MS") == 0 ||
1386 strcmp(codeset, "EUCJPMS") == 0){
1387 output_conv = e_oconv;
1392 #ifdef SHIFTJIS_CP932
1395 #ifdef UTF8_OUTPUT_ENABLE
1396 ms_ucs_map_f = UCS_MAP_MS;
1398 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1399 strcmp(codeset, "EUCJP-ASCII") == 0){
1400 output_conv = e_oconv;
1405 #ifdef SHIFTJIS_CP932
1408 #ifdef UTF8_OUTPUT_ENABLE
1409 ms_ucs_map_f = UCS_MAP_ASCII;
1411 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1412 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1413 output_conv = s_oconv;
1415 #ifdef SHIFTJIS_CP932
1418 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1419 strcmp(codeset, "EUC-JIS-2004") == 0){
1420 output_conv = e_oconv;
1425 #ifdef SHIFTJIS_CP932
1428 #ifdef UTF8_OUTPUT_ENABLE
1429 }else if(strcmp(codeset, "UTF-8") == 0){
1430 output_conv = w_oconv;
1431 }else if(strcmp(codeset, "UTF-8N") == 0){
1432 output_conv = w_oconv;
1433 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1434 output_conv = w_oconv;
1435 output_bom_f = TRUE;
1436 }else if(strcmp(codeset, "UTF-16BE") == 0){
1437 output_conv = w_oconv16;
1438 }else if(strcmp(codeset, "UTF-16") == 0 ||
1439 strcmp(codeset, "UTF-16BE-BOM") == 0){
1440 output_conv = w_oconv16;
1441 output_bom_f = TRUE;
1442 }else if(strcmp(codeset, "UTF-16LE") == 0){
1443 output_conv = w_oconv16;
1444 output_endian = ENDIAN_LITTLE;
1445 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1446 output_conv = w_oconv16;
1447 output_endian = ENDIAN_LITTLE;
1448 output_bom_f = TRUE;
1449 }else if(strcmp(codeset, "UTF-32") == 0 ||
1450 strcmp(codeset, "UTF-32BE") == 0){
1451 output_conv = w_oconv32;
1452 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1453 output_conv = w_oconv32;
1454 output_bom_f = TRUE;
1455 }else if(strcmp(codeset, "UTF-32LE") == 0){
1456 output_conv = w_oconv32;
1457 output_endian = ENDIAN_LITTLE;
1458 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1459 output_conv = w_oconv32;
1460 output_endian = ENDIAN_LITTLE;
1461 output_bom_f = TRUE;
1467 if (strcmp(long_option[i].name, "overwrite") == 0){
1470 preserve_time_f = TRUE;
1473 if (strcmp(long_option[i].name, "overwrite=") == 0){
1476 preserve_time_f = TRUE;
1478 backup_suffix = malloc(strlen((char *) p) + 1);
1479 strcpy(backup_suffix, (char *) p);
1482 if (strcmp(long_option[i].name, "in-place") == 0){
1485 preserve_time_f = FALSE;
1488 if (strcmp(long_option[i].name, "in-place=") == 0){
1491 preserve_time_f = FALSE;
1493 backup_suffix = malloc(strlen((char *) p) + 1);
1494 strcpy(backup_suffix, (char *) p);
1499 if (strcmp(long_option[i].name, "cap-input") == 0){
1503 if (strcmp(long_option[i].name, "url-input") == 0){
1508 #ifdef NUMCHAR_OPTION
1509 if (strcmp(long_option[i].name, "numchar-input") == 0){
1515 if (strcmp(long_option[i].name, "no-output") == 0){
1519 if (strcmp(long_option[i].name, "debug") == 0){
1524 if (strcmp(long_option[i].name, "cp932") == 0){
1525 #ifdef SHIFTJIS_CP932
1529 #ifdef UTF8_OUTPUT_ENABLE
1530 ms_ucs_map_f = UCS_MAP_CP932;
1534 if (strcmp(long_option[i].name, "no-cp932") == 0){
1535 #ifdef SHIFTJIS_CP932
1539 #ifdef UTF8_OUTPUT_ENABLE
1540 ms_ucs_map_f = UCS_MAP_ASCII;
1544 #ifdef SHIFTJIS_CP932
1545 if (strcmp(long_option[i].name, "cp932inv") == 0){
1552 if (strcmp(long_option[i].name, "x0212") == 0){
1559 if (strcmp(long_option[i].name, "exec-in") == 0){
1563 if (strcmp(long_option[i].name, "exec-out") == 0){
1568 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1569 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1570 no_cp932ext_f = TRUE;
1573 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1574 no_best_fit_chars_f = TRUE;
1577 if (strcmp(long_option[i].name, "fb-skip") == 0){
1578 encode_fallback = NULL;
1581 if (strcmp(long_option[i].name, "fb-html") == 0){
1582 encode_fallback = encode_fallback_html;
1585 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1586 encode_fallback = encode_fallback_xml;
1589 if (strcmp(long_option[i].name, "fb-java") == 0){
1590 encode_fallback = encode_fallback_java;
1593 if (strcmp(long_option[i].name, "fb-perl") == 0){
1594 encode_fallback = encode_fallback_perl;
1597 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1598 encode_fallback = encode_fallback_subchar;
1601 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1602 encode_fallback = encode_fallback_subchar;
1603 unicode_subchar = 0;
1605 /* decimal number */
1606 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1607 unicode_subchar *= 10;
1608 unicode_subchar += hex2bin(p[i]);
1610 }else if(p[1] == 'x' || p[1] == 'X'){
1611 /* hexadecimal number */
1612 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1613 unicode_subchar <<= 4;
1614 unicode_subchar |= hex2bin(p[i]);
1618 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1619 unicode_subchar *= 8;
1620 unicode_subchar += hex2bin(p[i]);
1623 w16e_conv(unicode_subchar, &i, &j);
1624 unicode_subchar = i<<8 | j;
1628 #ifdef UTF8_OUTPUT_ENABLE
1629 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1630 ms_ucs_map_f = UCS_MAP_MS;
1634 #ifdef UNICODE_NORMALIZATION
1635 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1636 input_f = UTF8_INPUT;
1641 if (strcmp(long_option[i].name, "prefix=") == 0){
1642 if (nkf_isgraph(p[0])){
1643 for (i = 1; nkf_isgraph(p[i]); i++){
1644 prefix_table[p[i]] = p[0];
1651 case 'b': /* buffered mode */
1654 case 'u': /* non bufferd mode */
1657 case 't': /* transparent mode */
1662 } else if (*cp=='2') {
1666 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1674 case 'j': /* JIS output */
1676 output_conv = j_oconv;
1678 case 'e': /* AT&T EUC output */
1679 output_conv = e_oconv;
1681 case 's': /* SJIS output */
1682 output_conv = s_oconv;
1684 case 'l': /* ISO8859 Latin-1 support, no conversion */
1685 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1686 input_f = LATIN1_INPUT;
1688 case 'i': /* Kanji IN ESC-$-@/B */
1689 if (*cp=='@'||*cp=='B')
1690 kanji_intro = *cp++;
1692 case 'o': /* ASCII IN ESC-(-J/B */
1693 if (*cp=='J'||*cp=='B'||*cp=='H')
1694 ascii_intro = *cp++;
1698 bit:1 katakana->hiragana
1699 bit:2 hiragana->katakana
1701 if ('9'>= *cp && *cp>='0')
1702 hira_f |= (*cp++ -'0');
1709 #if defined(MSDOS) || defined(__OS2__)
1724 #ifdef UTF8_OUTPUT_ENABLE
1725 case 'w': /* UTF-8 output */
1727 output_conv = w_oconv; cp++;
1731 output_bom_f = TRUE;
1734 if ('1'== cp[0] && '6'==cp[1]) {
1735 output_conv = w_oconv16; cp+=2;
1736 } else if ('3'== cp[0] && '2'==cp[1]) {
1737 output_conv = w_oconv32; cp+=2;
1739 output_conv = w_oconv;
1744 output_endian = ENDIAN_LITTLE;
1745 } else if (cp[0] == 'B') {
1753 output_bom_f = TRUE;
1758 #ifdef UTF8_INPUT_ENABLE
1759 case 'W': /* UTF input */
1762 input_f = UTF8_INPUT;
1764 if ('1'== cp[0] && '6'==cp[1]) {
1766 input_f = UTF16_INPUT;
1767 input_endian = ENDIAN_BIG;
1768 } else if ('3'== cp[0] && '2'==cp[1]) {
1770 input_f = UTF32_INPUT;
1771 input_endian = ENDIAN_BIG;
1773 input_f = UTF8_INPUT;
1778 input_endian = ENDIAN_LITTLE;
1779 } else if (cp[0] == 'B') {
1785 /* Input code assumption */
1786 case 'J': /* JIS input */
1787 input_f = JIS_INPUT;
1789 case 'E': /* AT&T EUC input */
1790 input_f = EUC_INPUT;
1792 case 'S': /* MS Kanji input */
1793 input_f = SJIS_INPUT;
1794 if (x0201_f==NO_X0201) x0201_f=TRUE;
1796 case 'Z': /* Convert X0208 alphabet to asii */
1797 /* bit:0 Convert X0208
1798 bit:1 Convert Kankaku to one space
1799 bit:2 Convert Kankaku to two spaces
1800 bit:3 Convert HTML Entity
1802 if ('9'>= *cp && *cp>='0')
1803 alpha_f |= 1<<(*cp++ -'0');
1807 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1808 x0201_f = FALSE; /* No X0201->X0208 conversion */
1810 ESC-(-I in JIS, EUC, MS Kanji
1811 SI/SO in JIS, EUC, MS Kanji
1812 SSO in EUC, JIS, not in MS Kanji
1813 MS Kanji (0xa0-0xdf)
1815 ESC-(-I in JIS (0x20-0x5f)
1816 SSO in EUC (0xa0-0xdf)
1817 0xa0-0xd in MS Kanji (0xa0-0xdf)
1820 case 'X': /* Assume X0201 kana */
1821 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1824 case 'F': /* prserve new lines */
1825 fold_preserve_f = TRUE;
1826 case 'f': /* folding -f60 or -f */
1829 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1831 fold_len += *cp++ - '0';
1833 if (!(0<fold_len && fold_len<BUFSIZ))
1834 fold_len = DEFAULT_FOLD;
1838 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1840 fold_margin += *cp++ - '0';
1844 case 'm': /* MIME support */
1845 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1846 if (*cp=='B'||*cp=='Q') {
1847 mime_decode_mode = *cp++;
1848 mimebuf_f = FIXED_MIME;
1849 } else if (*cp=='N') {
1850 mime_f = TRUE; cp++;
1851 } else if (*cp=='S') {
1852 mime_f = STRICT_MIME; cp++;
1853 } else if (*cp=='0') {
1854 mime_decode_f = FALSE;
1855 mime_f = FALSE; cp++;
1858 case 'M': /* MIME output */
1861 mimeout_f = FIXED_MIME; cp++;
1862 } else if (*cp=='Q') {
1864 mimeout_f = FIXED_MIME; cp++;
1869 case 'B': /* Broken JIS support */
1871 bit:1 allow any x on ESC-(-x or ESC-$-x
1872 bit:2 reset to ascii on NL
1874 if ('9'>= *cp && *cp>='0')
1875 broken_f |= 1<<(*cp++ -'0');
1880 case 'O':/* for Output file */
1884 case 'c':/* add cr code */
1887 case 'd':/* delete cr code */
1890 case 'I': /* ISO-2022-JP output */
1893 case 'L': /* line mode */
1894 if (*cp=='u') { /* unix */
1895 crmode_f = NL; cp++;
1896 } else if (*cp=='m') { /* mac */
1897 crmode_f = CR; cp++;
1898 } else if (*cp=='w') { /* windows */
1899 crmode_f = CRLF; cp++;
1900 } else if (*cp=='0') { /* no conversion */
1910 /* multiple options in one string are allowed for the Perl module */
1911 while(*cp && *cp++!='-');
1914 /* bogus option but ignored */
/*
 * Look through input_code_list for the entry whose iconv_func equals the
 * given converter function pointer.
 * NOTE(review): the loop and the return statements are not visible in this
 * excerpt; presumably the matching entry (or 0) is returned -- confirm.
 */
1920 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1923 struct input_code *p = input_code_list;
1925 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - establishment flag passed by the caller; under
 *                INPUT_CODE_FIX the value -TRUE forces the setting even
 *                when input_f is already set (see the condition below).
 *   iconv_func - converter to install (callers also pass 0).
 * NOTE(review): the assignments to estab_f / iconv are not visible in this
 * excerpt -- confirm against the full source.
 */
1934 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1936 #ifdef INPUT_CODE_FIX
1944 #ifdef INPUT_CODE_FIX
1945 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* When the code is established and the converter changed since the last
   check, publish the name of the new input code and remember it. */
1951 if (estab_f && iconv_for_check != iconv){
1952 struct input_code *p = find_inputcode_byfunc(iconv);
1954 set_input_codename(p->name);
1955 debug(input_codename);
1957 iconv_for_check = iconv;
/*
 * Bit flags accumulated into input_code.score by set_code_score().
 * When candidates are compared, the entry with the smallest score wins
 * (see the "p->score < result->score" comparisons in this file).
 */
1962 #define SCORE_L2 (1) /* JIS level-2 kanji */
1963 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1964 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1965 #ifdef SHIFTJIS_CP932
1966 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 */
1967 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1969 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1971 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* code specified by MIME */
1972 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1974 #define SCORE_INIT (SCORE_iMIME)
/* Score flags used by code_score() when (c2 & 0x70) == 0x20;
   indexed by c2 & 0x0f. Only part of the initializer is visible here. */
1976 const nkf_char score_table_A0[] = {
1979 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1980 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Score flags used by code_score() when (c2 & 0x70) == 0x70;
   indexed by c2 & 0x0f (16 entries). */
1983 const nkf_char score_table_F0[] = {
1984 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1985 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1986 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1987 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Turn on the given SCORE_* bit(s) in ptr->score. */
1990 void set_code_score(struct input_code *ptr, nkf_char score)
1993 ptr->score |= score;
/* Turn off the given SCORE_* bit(s) in ptr->score. */
1997 void clr_code_score(struct input_code *ptr, nkf_char score)
2000 ptr->score &= ~score;
/*
 * Classify the character held in ptr->buf[0] (and buf[1] when
 * UTF8_OUTPUT_ENABLE is defined) and add the matching SCORE_* flag to the
 * candidate's score.
 * NOTE(review): the condition guarding the SCORE_ERROR branch is not
 * visible in this excerpt.
 */
2004 void code_score(struct input_code *ptr)
2006 nkf_char c2 = ptr->buf[0];
2007 #ifdef UTF8_OUTPUT_ENABLE
2008 nkf_char c1 = ptr->buf[1];
2011 set_code_score(ptr, SCORE_ERROR);
/* SSO in the first slot marks a half-width kana character */
2012 }else if (c2 == SSO){
2013 set_code_score(ptr, SCORE_KANA);
2014 #ifdef UTF8_OUTPUT_ENABLE
/* a zero result from e2w_conv() is scored as a nonexistent character */
2015 }else if (!e2w_conv(c2, c1)){
2016 set_code_score(ptr, SCORE_NO_EXIST);
/* table lookups keyed on the low nibble of c2 */
2018 }else if ((c2 & 0x70) == 0x20){
2019 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2020 }else if ((c2 & 0x70) == 0x70){
2021 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* remaining values with (c2 & 0x70) >= 0x50 are scored as level-2 kanji */
2022 }else if ((c2 & 0x70) >= 0x50){
2023 set_code_score(ptr, SCORE_L2);
/*
 * Drop this candidate from detection. If its converter is the currently
 * selected iconv, the selection is cleared with set_iconv(FALSE, 0).
 * NOTE(review): the updates to the candidate's own fields are not visible
 * in this excerpt.
 */
2027 void status_disable(struct input_code *ptr)
2032 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to the candidate's byte buffer and advance its index.
   No bounds check is visible in this excerpt. */
2035 void status_push_ch(struct input_code *ptr, nkf_char c)
2037 ptr->buf[ptr->index++] = c;
2040 void status_clear(struct input_code *ptr)
/* Reset the candidate; its score goes back to SCORE_INIT.
   Other statements of this function are not visible in this excerpt. */
2046 void status_reset(struct input_code *ptr)
2049 ptr->score = SCORE_INIT;
/* Re-initialise the candidate, clearing its _file_stat field.
   Other statements of this function are not visible in this excerpt. */
2052 void status_reinit(struct input_code *ptr)
2055 ptr->_file_stat = 0;
/* Acts only when c is at most DEL and the input code is already
   established (estab_f). The action taken is not visible in this excerpt. */
2058 void status_check(struct input_code *ptr, nkf_char c)
2060 if (c <= DEL && estab_f){
/*
 * Feed one input byte c to the Shift_JIS detection candidate ptr.
 * Bytes that fit the expected lead/trail ranges are pushed into ptr->buf;
 * a byte that fits nowhere disables the candidate.
 * NOTE(review): the switch on the candidate state and its case labels are
 * not visible in this excerpt, so the grouping below is inferred from the
 * byte ranges -- confirm.
 */
2065 void s_status(struct input_code *ptr, nkf_char c)
2069 status_check(ptr, c);
2074 #ifdef NUMCHAR_OPTION
2075 }else if (is_unicode_capsule(c)){
/* 0xa1-0xdf: single byte, stored as SSO + c */
2078 }else if (0xa1 <= c && c <= 0xdf){
2079 status_push_ch(ptr, SSO);
2080 status_push_ch(ptr, c);
/* 0x81-0x9f or 0xe0-0xef: first byte of a two-byte character */
2083 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2085 status_push_ch(ptr, c);
2086 #ifdef SHIFTJIS_CP932
2088 && is_ibmext_in_sjis(c)){
2090 status_push_ch(ptr, c);
2091 #endif /* SHIFTJIS_CP932 */
/* 0xf0-0xfc is accepted as a first byte only when x0212_f is set */
2093 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2095 status_push_ch(ptr, c);
2096 #endif /* X0212_ENABLE */
2098 status_disable(ptr);
/* second byte: 0x40-0x7e or 0x80-0xfc; the pair is converted in place */
2102 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2103 status_push_ch(ptr, c);
2104 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2108 status_disable(ptr);
2112 #ifdef SHIFTJIS_CP932
/* same second-byte test; a successful conversion adds SCORE_CP932 */
2113 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2114 status_push_ch(ptr, c);
2115 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2116 set_code_score(ptr, SCORE_CP932);
2121 #endif /* SHIFTJIS_CP932 */
2122 #ifndef X0212_ENABLE
2123 status_disable(ptr);
/*
 * Feed one input byte c to the EUC detection candidate ptr.
 * NOTE(review): the switch on the candidate state and its case labels are
 * not visible in this excerpt; the grouping below is inferred from the
 * byte ranges -- confirm.
 */
2129 void e_status(struct input_code *ptr, nkf_char c)
2133 status_check(ptr, c);
2138 #ifdef NUMCHAR_OPTION
2139 }else if (is_unicode_capsule(c)){
/* SSO or 0xa1-0xfe starts a multi-byte character */
2142 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2144 status_push_ch(ptr, c);
/* 0x8f prefix is accepted only in the X0212_ENABLE build */
2146 }else if (0x8f == c){
2148 status_push_ch(ptr, c);
2149 #endif /* X0212_ENABLE */
2151 status_disable(ptr);
/* following bytes must lie in 0xa1-0xfe, otherwise the candidate is dropped */
2155 if (0xa1 <= c && c <= 0xfe){
2156 status_push_ch(ptr, c);
2160 status_disable(ptr);
2165 if (0xa1 <= c && c <= 0xfe){
2167 status_push_ch(ptr, c);
2169 status_disable(ptr);
2171 #endif /* X0212_ENABLE */
2175 #ifdef UTF8_INPUT_ENABLE
/*
 * Feed one input byte c to the UTF-8 detection candidate ptr.
 * NOTE(review): the switch on the candidate state and its case labels are
 * not visible in this excerpt; the grouping below is inferred from the
 * byte ranges -- confirm.
 */
2176 void w_status(struct input_code *ptr, nkf_char c)
2180 status_check(ptr, c);
2185 #ifdef NUMCHAR_OPTION
2186 }else if (is_unicode_capsule(c)){
/* lead bytes: 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are handled separately */
2189 }else if (0xc0 <= c && c <= 0xdf){
2191 status_push_ch(ptr, c);
2192 }else if (0xe0 <= c && c <= 0xef){
2194 status_push_ch(ptr, c);
2195 }else if (0xf0 <= c && c <= 0xf4){
2197 status_push_ch(ptr, c);
2199 status_disable(ptr);
/* trail bytes must be 0x80-0xbf */
2204 if (0x80 <= c && c <= 0xbf){
2205 status_push_ch(ptr, c);
2206 if (ptr->index > ptr->stat){
/* bom is true when the collected bytes are EF BB BF */
2207 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2208 && ptr->buf[2] == 0xbf);
/* convert the collected sequence in place into buf[0], buf[1] */
2209 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2210 &ptr->buf[0], &ptr->buf[1]);
2217 status_disable(ptr);
2221 if (0x80 <= c && c <= 0xbf){
2222 if (ptr->index < ptr->stat){
2223 status_push_ch(ptr, c);
2228 status_disable(ptr);
/*
 * Feed one input byte c to the status_func of the entries in
 * input_code_list. If a result entry was chosen and the code is not yet
 * established, its converter is installed with set_iconv(TRUE, ...).
 * NOTE(review): the loop structure and the rule that selects "result" are
 * not visible in this excerpt.
 */
2235 void code_status(nkf_char c)
2237 int action_flag = 1;
2238 struct input_code *result = 0;
2239 struct input_code *p = input_code_list;
2241 if (!p->status_func) {
2245 if (!p->status_func)
2247 (p->status_func)(p, c);
2250 }else if(p->stat == 0){
2261 if (result && !estab_f){
2262 set_iconv(TRUE, result->iconv_func);
2263 }else if (c <= DEL){
2264 struct input_code *ptr = input_code_list;
/*
 * Base input reader. Characters pushed back by std_ungetc() are returned
 * from std_gc_buf, last pushed first.
 * NOTE(review): the guard on std_gc_ndx and the read from f are not
 * visible in this excerpt.
 */
2274 nkf_char std_getc(FILE *f)
2277 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back so that std_getc() returns it next; the value is stored in
 * std_gc_buf. A full buffer (std_gc_ndx == STD_GC_BUFSIZE) is handled
 * separately.
 * NOTE(review): the full-buffer action and the return value are not
 * visible in this excerpt.
 */
2283 nkf_char std_ungetc(nkf_char c, FILE *f)
2285 if (std_gc_ndx == STD_GC_BUFSIZE){
2288 std_gc_buf[std_gc_ndx++] = c;
2293 void std_putc(nkf_char c)
2300 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Set up the input/output module chain and read f through i_getc until
 * EOF.
 * NOTE(review): what is done with each character is not visible in this
 * excerpt.
 */
2301 nkf_char noconvert(FILE *f)
2306 module_connection();
2307 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the processing chain from the option flags.
 * Output side: oconv starts as output_conv; each enabled filter saves the
 * current oconv in its own o_* pointer and then takes its place, so the
 * filter installed last runs first.
 * Input side: i_getc / i_ungetc are wrapped the same way.
 * Finally the input converter is chosen from input_f.
 * NOTE(review): several of the guarding conditions are not visible in
 * this excerpt.
 */
2314 void module_connection(void)
2316 oconv = output_conv;
2319 /* replace continuation module, from output side */
2321 /* output redirection */
2323 if (noout_f || guess_f){
2330 if (mimeout_f == TRUE) {
2331 o_base64conv = oconv; oconv = base64_conv;
2333 /* base64_count = 0; */
2337 o_crconv = oconv; oconv = cr_conv;
2340 o_rot_conv = oconv; oconv = rot_conv;
2343 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2346 o_hira_conv = oconv; oconv = hira_conv;
2349 o_fconv = oconv; oconv = fold_conv;
2352 if (alpha_f || x0201_f) {
2353 o_zconv = oconv; oconv = z_conv;
2357 i_ungetc = std_ungetc;
2358 /* input redirection */
2361 i_cgetc = i_getc; i_getc = cap_getc;
2362 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2365 i_ugetc = i_getc; i_getc = url_getc;
2366 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2369 #ifdef NUMCHAR_OPTION
2371 i_ngetc = i_getc; i_getc = numchar_getc;
2372 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2375 #ifdef UNICODE_NORMALIZATION
2376 if (nfc_f && input_f == UTF8_INPUT){
2377 i_nfc_getc = i_getc; i_getc = nfc_getc;
2378 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2381 if (mime_f && mimebuf_f==FIXED_MIME) {
2382 i_mgetc = i_getc; i_getc = mime_getc;
2383 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2386 i_bgetc = i_getc; i_getc = broken_getc;
2387 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicitly requested input code forces its converter (-TRUE) */
2389 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2390 set_iconv(-TRUE, e_iconv);
2391 } else if (input_f == SJIS_INPUT) {
2392 set_iconv(-TRUE, s_iconv);
2393 #ifdef UTF8_INPUT_ENABLE
2394 } else if (input_f == UTF8_INPUT) {
2395 set_iconv(-TRUE, w_iconv);
2396 } else if (input_f == UTF16_INPUT) {
2397 set_iconv(-TRUE, w_iconv16);
2398 } else if (input_f == UTF32_INPUT) {
2399 set_iconv(-TRUE, w_iconv32);
/* otherwise start with e_iconv, not established */
2402 set_iconv(FALSE, e_iconv);
2406 struct input_code *p = input_code_list;
2414 * Check and Ignore BOM
/*
 * Read the first bytes of f through i_getc and look for a byte-order mark.
 * Patterns tested (bytes in stream order):
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv (UTF-8)
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * Bytes that do not complete a pattern are handed back with i_ungetc in
 * reverse order, so the stream is left as it was.
 * NOTE(review): the case labels, the condition around each set_iconv()
 * call and the early returns are not visible in this excerpt; the first
 * byte of each group is inferred from the final i_ungetc() of the group.
 */
2416 void check_bom(FILE *f)
2419 switch(c2 = (*i_getc)(f)){
/* first byte 0x00 */
2421 if((c2 = (*i_getc)(f)) == 0x00){
2422 if((c2 = (*i_getc)(f)) == 0xFE){
2423 if((c2 = (*i_getc)(f)) == 0xFF){
2425 set_iconv(TRUE, w_iconv32);
2427 if (iconv == w_iconv32) {
2428 input_endian = ENDIAN_BIG;
2431 (*i_ungetc)(0xFF,f);
2432 }else (*i_ungetc)(c2,f);
2433 (*i_ungetc)(0xFE,f);
2434 }else if(c2 == 0xFF){
2435 if((c2 = (*i_getc)(f)) == 0xFE){
2437 set_iconv(TRUE, w_iconv32);
2439 if (iconv == w_iconv32) {
2440 input_endian = ENDIAN_2143;
2443 (*i_ungetc)(0xFF,f);
2444 }else (*i_ungetc)(c2,f);
2445 (*i_ungetc)(0xFF,f);
2446 }else (*i_ungetc)(c2,f);
2447 (*i_ungetc)(0x00,f);
2448 }else (*i_ungetc)(c2,f);
2449 (*i_ungetc)(0x00,f);
/* first byte 0xEF */
2452 if((c2 = (*i_getc)(f)) == 0xBB){
2453 if((c2 = (*i_getc)(f)) == 0xBF){
2455 set_iconv(TRUE, w_iconv);
2457 if (iconv == w_iconv) {
2460 (*i_ungetc)(0xBF,f);
2461 }else (*i_ungetc)(c2,f);
2462 (*i_ungetc)(0xBB,f);
2463 }else (*i_ungetc)(c2,f);
2464 (*i_ungetc)(0xEF,f);
/* first byte 0xFE */
2467 if((c2 = (*i_getc)(f)) == 0xFF){
2468 if((c2 = (*i_getc)(f)) == 0x00){
2469 if((c2 = (*i_getc)(f)) == 0x00){
2471 set_iconv(TRUE, w_iconv32);
2473 if (iconv == w_iconv32) {
2474 input_endian = ENDIAN_3412;
2477 (*i_ungetc)(0x00,f);
2478 }else (*i_ungetc)(c2,f);
2479 (*i_ungetc)(0x00,f);
2480 }else (*i_ungetc)(c2,f);
2482 set_iconv(TRUE, w_iconv16);
2484 if (iconv == w_iconv16) {
2485 input_endian = ENDIAN_BIG;
2488 (*i_ungetc)(0xFF,f);
2489 }else (*i_ungetc)(c2,f);
2490 (*i_ungetc)(0xFE,f);
/* first byte 0xFF */
2493 if((c2 = (*i_getc)(f)) == 0xFE){
2494 if((c2 = (*i_getc)(f)) == 0x00){
2495 if((c2 = (*i_getc)(f)) == 0x00){
2497 set_iconv(TRUE, w_iconv32);
2499 if (iconv == w_iconv32) {
2500 input_endian = ENDIAN_LITTLE;
2503 (*i_ungetc)(0x00,f);
2504 }else (*i_ungetc)(c2,f);
2505 (*i_ungetc)(0x00,f);
2506 }else (*i_ungetc)(c2,f);
2508 set_iconv(TRUE, w_iconv16);
2510 if (iconv == w_iconv16) {
2511 input_endian = ENDIAN_LITTLE;
2514 (*i_ungetc)(0xFE,f);
2515 }else (*i_ungetc)(c2,f);
2516 (*i_ungetc)(0xFF,f);
2525 Conversion main loop. Code detection only.
2528 nkf_char kanji_convert(FILE *f)
2530 nkf_char c3, c2=0, c1, c0=0;
2531 int is_8bit = FALSE;
2533 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2534 #ifdef UTF8_INPUT_ENABLE
2535 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2542 output_mode = ASCII;
2545 #define NEXT continue /* no output, get next */
2546 #define SEND ; /* output c1 and c2, get next */
2547 #define LAST break /* end of loop, go closing */
2549 module_connection();
2552 while ((c1 = (*i_getc)(f)) != EOF) {
2553 #ifdef INPUT_CODE_FIX
2560 /* in case of 8th bit is on */
2561 if (!estab_f&&!mime_decode_mode) {
2562 /* in case of not established yet */
2563 /* It is still ambiguous */
2564 if (h_conv(f, c2, c1)==EOF)
2570 /* in case of already established */
2572 /* ignore bogus code */
2578 /* second byte, 7 bit code */
2579 /* it might be kanji shifted */
2580 if ((c1 == DEL) || (c1 <= SPACE)) {
2581 /* ignore bogus first code */
2588 #ifdef UTF8_INPUT_ENABLE
2589 if (iconv == w_iconv16) {
2590 if (input_endian == ENDIAN_BIG) {
2592 if ((c1 = (*i_getc)(f)) != EOF) {
2593 if (0xD8 <= c2 && c2 <= 0xDB) {
2594 if ((c0 = (*i_getc)(f)) != EOF) {
2596 if ((c3 = (*i_getc)(f)) != EOF) {
2603 if ((c2 = (*i_getc)(f)) != EOF) {
2604 if (0xD8 <= c2 && c2 <= 0xDB) {
2605 if ((c3 = (*i_getc)(f)) != EOF) {
2606 if ((c0 = (*i_getc)(f)) != EOF) {
2615 } else if(iconv == w_iconv32){
2617 if((c2 = (*i_getc)(f)) != EOF &&
2618 (c1 = (*i_getc)(f)) != EOF &&
2619 (c0 = (*i_getc)(f)) != EOF){
2620 switch(input_endian){
2622 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2625 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2628 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2631 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2641 #ifdef NUMCHAR_OPTION
2642 if (is_unicode_capsule(c1)){
2648 if (!estab_f && !iso8859_f) {
2649 /* not established yet */
2652 } else { /* estab_f==TRUE */
2657 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2658 /* SJIS X0201 Case... */
2659 if(iso2022jp_f && x0201_f==NO_X0201) {
2660 (*oconv)(GETA1, GETA2);
2667 } else if (c1==SSO && iconv != s_iconv) {
2668 /* EUC X0201 Case */
2669 c1 = (*i_getc)(f); /* skip SSO */
2671 if (SSP<=c1 && c1<0xe0) {
2672 if(iso2022jp_f && x0201_f==NO_X0201) {
2673 (*oconv)(GETA1, GETA2);
2680 } else { /* bogus code, skip SSO and one byte */
2684 /* already established */
2689 } else if ((c1 > SPACE) && (c1 != DEL)) {
2690 /* in case of Roman characters */
2692 /* output 1 shifted byte */
2696 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2697 /* output 1 shifted byte */
2698 if(iso2022jp_f && x0201_f==NO_X0201) {
2699 (*oconv)(GETA1, GETA2);
2706 /* look like bogus code */
2709 } else if (input_mode == X0208 || input_mode == X0212 ||
2710 input_mode == X0213_1 || input_mode == X0213_2) {
2711 /* in case of Kanji shifted */
2714 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2715 /* Check MIME code */
2716 if ((c1 = (*i_getc)(f)) == EOF) {
2719 } else if (c1 == '?') {
2720 /* =? is mime conversion start sequence */
2721 if(mime_f == STRICT_MIME) {
2722 /* check in real detail */
2723 if (mime_begin_strict(f) == EOF)
2727 } else if (mime_begin(f) == EOF)
2737 /* normal ASCII code */
2740 } else if (!is_8bit && c1 == SI) {
2743 } else if (!is_8bit && c1 == SO) {
2746 } else if (!is_8bit && c1 == ESC ) {
2747 if ((c1 = (*i_getc)(f)) == EOF) {
2748 /* (*oconv)(0, ESC); don't send bogus code */
2750 } else if (c1 == '$') {
2751 if ((c1 = (*i_getc)(f)) == EOF) {
2753 (*oconv)(0, ESC); don't send bogus code
2754 (*oconv)(0, '$'); */
2756 } else if (c1 == '@'|| c1 == 'B') {
2757 /* This is kanji introduction */
2760 set_input_codename("ISO-2022-JP");
2762 debug(input_codename);
2765 } else if (c1 == '(') {
2766 if ((c1 = (*i_getc)(f)) == EOF) {
2767 /* don't send bogus code
2773 } else if (c1 == '@'|| c1 == 'B') {
2774 /* This is kanji introduction */
2779 } else if (c1 == 'D'){
2783 #endif /* X0212_ENABLE */
2784 } else if (c1 == (X0213_1&0x7F)){
2785 input_mode = X0213_1;
2788 } else if (c1 == (X0213_2&0x7F)){
2789 input_mode = X0213_2;
2793 /* could be some special code */
2800 } else if (broken_f&0x2) {
2801 /* accept any ESC-(-x as broken code ... */
2811 } else if (c1 == '(') {
2812 if ((c1 = (*i_getc)(f)) == EOF) {
2813 /* don't send bogus code
2815 (*oconv)(0, '('); */
2819 /* This is X0201 kana introduction */
2820 input_mode = X0201; shift_mode = X0201;
2822 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2823 /* This is X0208 kanji introduction */
2824 input_mode = ASCII; shift_mode = FALSE;
2826 } else if (broken_f&0x2) {
2827 input_mode = ASCII; shift_mode = FALSE;
2832 /* maintain various input_mode here */
2836 } else if ( c1 == 'N' || c1 == 'n' ){
2838 c3 = (*i_getc)(f); /* skip SS2 */
2839 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2854 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2855 input_mode = ASCII; set_iconv(FALSE, 0);
2857 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2858 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2866 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2867 if ((c1=(*i_getc)(f))!=EOF) {
2871 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2889 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2892 if ((c0 = (*i_getc)(f)) != EOF) {
2895 if ((c3 = (*i_getc)(f)) != EOF) {
2897 (*iconv)(c2, c1, c0|c3);
2902 /* 3 bytes EUC or UTF-8 */
2903 if ((c0 = (*i_getc)(f)) != EOF) {
2905 (*iconv)(c2, c1, c0);
2912 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2916 (*oconv)(PREFIX_EUCG3 | c2, c1);
2918 #endif /* X0212_ENABLE */
2920 (*oconv)(PREFIX_EUCG3 | c2, c1);
2923 (*oconv)(input_mode, c1); /* other special case */
2929 /* goto next_word */
2933 (*iconv)(EOF, 0, 0);
2934 if (!is_inputcode_set)
2937 struct input_code *p = input_code_list;
2938 struct input_code *result = p;
2940 if (p->score < result->score) result = p;
2943 set_input_codename(result->name);
2950 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2952 nkf_char ret, c3, c0;
2956 /** it must NOT be in the kanji shifted sequence */
2957 /** it must NOT be written in JIS7 */
2958 /** and it must be after 2 byte 8bit code */
2964 while ((c1 = (*i_getc)(f)) != EOF) {
2970 if (push_hold_buf(c1) == EOF || estab_f){
2976 struct input_code *p = input_code_list;
2977 struct input_code *result = p;
2982 if (p->score < result->score){
2987 set_iconv(FALSE, result->iconv_func);
2992 ** 1) EOF is detected, or
2993 ** 2) Code is established, or
2994 ** 3) Buffer is FULL (but last word is pushed)
2996 ** in 1) and 3) cases, we continue to use
2997 ** Kanji codes by oconv and leave estab_f unchanged.
3002 while (hold_index < hold_count){
3003 c2 = hold_buf[hold_index++];
3005 #ifdef NUMCHAR_OPTION
3006 || is_unicode_capsule(c2)
3011 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3012 (*iconv)(X0201, c2, 0);
3015 if (hold_index < hold_count){
3016 c1 = hold_buf[hold_index++];
3026 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3029 if (hold_index < hold_count){
3030 c0 = hold_buf[hold_index++];
3031 } else if ((c0 = (*i_getc)(f)) == EOF) {
3037 if (hold_index < hold_count){
3038 c3 = hold_buf[hold_index++];
3039 } else if ((c3 = (*i_getc)(f)) == EOF) {
3044 (*iconv)(c2, c1, c0|c3);
3049 /* 3 bytes EUC or UTF-8 */
3050 if (hold_index < hold_count){
3051 c0 = hold_buf[hold_index++];
3052 } else if ((c0 = (*i_getc)(f)) == EOF) {
3058 (*iconv)(c2, c1, c0);
3061 if (c0 == EOF) break;
/*
 * Append c2, truncated to unsigned char, to hold_buf.
 * Returns EOF once hold_count has reached HOLD_SIZE*2 (the byte is still
 * stored in that case), otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this excerpt.
 */
3066 nkf_char push_hold_buf(nkf_char c2)
3068 if (hold_count >= HOLD_SIZE*2)
3070 hold_buf[hold_count++] = (unsigned char)c2;
3071 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2 = first byte, c1 = second byte) and
 * hand the result back through p2 / p1.
 * Callers in this file treat a return value of 0 as success.
 * NOTE(review): the stores through p2 / p1, the return statements and
 * several conditions are not visible in this excerpt.
 */
3074 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3076 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* indexed by [c2 - 0xF0][second-half flag]; used in the x0213_f branch */
3079 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3080 #ifdef SHIFTJIS_CP932
/* CP932 table lookup, row c2 - CP932_TABLE_BEGIN, column c1 - 0x40 */
3081 if (cp51932_f && is_ibmext_in_sjis(c2)){
3083 extern const unsigned short shiftjis_cp932[3][189];
3085 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3091 #endif /* SHIFTJIS_CP932 */
/* X0212 table lookup, row c2 - 0xfa; result is tagged with PREFIX_EUCG3 */
3093 if (!x0213_f && is_ibmext_in_sjis(c2)){
3095 extern const unsigned short shiftjis_x0212[3][189];
3097 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3100 c2 = PREFIX_EUCG3 | (val >> 8);
3113 if(x0213_f && c2 >= 0xF0){
3114 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3115 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3116 }else{ /* 78<=k<=94 */
3117 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3118 if (0x9E < c1) c2++;
/* ordinary two-byte case: second bytes above 0x9E select the next row */
3121 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3122 if (0x9E < c1) c2++;
3125 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3132 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS. Two-byte input goes through s2e_conv();
 * a non-zero result from it is returned to the caller unchanged.
 * NOTE(review): the first branch, the handling of EOF / zero / control
 * values and the final output call are not visible in this excerpt.
 */
3139 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3143 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3146 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3147 if (ret) return ret;
/*
 * Input converter for EUC input. It is also installed for JIS and Latin-1
 * input (see module_connection).
 * NOTE(review): most branch bodies and the return statements are not
 * visible in this excerpt.
 */
3153 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* 0x8f prefix: c2 is rebuilt from the following bytes */
3158 }else if (c2 == 0x8f){
3162 c2 = (c2 << 8) | (c1 & 0x7f);
3164 #ifdef SHIFTJIS_CP932
/* when the pair converts to Shift_JIS, convert it back through s2e_conv() */
3167 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3168 s2e_conv(s2, s1, &c2, &c1);
3175 #endif /* SHIFTJIS_CP932 */
3176 #endif /* X0212_ENABLE */
3177 } else if (c2 == SSO){
3180 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3190 #ifdef UTF8_INPUT_ENABLE
/*
 * Convert a UTF-8 sequence (c2, c1, c0) and hand the result back through
 * p2 / p1. Lead bytes 0xc0-0xef go through unicode_to_jis_common(). Under
 * NUMCHAR_OPTION the code point from ww16_conv(), tagged with
 * CLASS_UNICODE, may be stored in *p1 instead.
 * NOTE(review): the conditions selecting these paths and the return
 * statement are not visible in this excerpt.
 */
3191 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3198 }else if (0xc0 <= c2 && c2 <= 0xef) {
3199 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3200 #ifdef NUMCHAR_OPTION
3203 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * Input converter for UTF-8. c2 is the lead byte, c1 the second byte and
 * c0 carries the remaining byte(s); for four-byte sequences c0 holds two
 * bytes (see the 0xc0c0 / 0x8080 masks below).
 * Returning -1 or -2 while c0 is still 0 asks the caller for more bytes.
 * NOTE(review): the case labels of the switch and most return statements
 * are not visible in this excerpt.
 */
3211 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* class of each lead byte 0xC0-0xFF, indexed by c2 - 0xC0; the tens digit
   appears to be the sequence length -- TODO confirm */
3214 static const int w_iconv_utf8_1st_byte[] =
3216 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3217 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3218 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3219 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3221 if (c2 < 0 || 0xff < c2) {
3222 }else if (c2 == 0) { /* 0 : 1 byte*/
3224 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3227 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
/* two-byte form: second byte must be 0x80-0xBF */
3229 if (c1 < 0x80 || 0xBF < c1) return 0;
/* three-byte forms: the allowed range of the second byte depends on the
   lead-byte class */
3232 if (c0 == 0) return -1;
3233 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3238 if (c0 == 0) return -1;
3239 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3243 if (c0 == 0) return -1;
3244 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
/* four-byte forms: c0 holds the third and fourth bytes */
3248 if (c0 == 0) return -2;
3249 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3253 if (c0 == 0) return -2;
3254 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3258 if (c0 == 0) return -2;
3259 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3267 if (c2 == 0 || c2 == EOF){
/* four-byte sequences are carried as a CLASS_UNICODE-tagged code point */
3268 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3269 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3272 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3281 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point val as UTF-8.
 *   *p2 receives the lead byte and *p1 the second byte.
 *   *p0 receives the third byte; for code points above 0xFFFF it receives
 *   the third and fourth bytes packed as (third << 8) | fourth, which is
 *   the layout ww16_conv() unpacks.
 */
3282 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
/* Two-byte form: 110xxxxx 10xxxxxx. */
3289 }else if (val < 0x800){
3290 *p2 = 0xc0 | (val >> 6);
3291 *p1 = 0x80 | (val & 0x3f);
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
3293 } else if (val <= NKF_INT32_C(0xFFFF)) {
3294 *p2 = 0xe0 | (val >> 12);
3295 *p1 = 0x80 | ((val >> 6) & 0x3f);
3296 *p0 = 0x80 | (val & 0x3f);
/*
 * Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.  The lead byte
 * carries bits 18..20 of val, so it is built from 0xf0 and val >> 18.
 */
3297 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3298 *p2 = 0xf0 | (val >> 18);
3299 *p1 = 0x80 | ((val >> 12) & 0x3f);
3300 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3309 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: assemble a code point from the bytes of a UTF-8 sequence.
 * For four-byte sequences c0 holds the third and fourth bytes packed
 * into 16 bits (third byte in bits 8..15).
 */
3310 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3315 } else if (c2 >= 0xf0){
3316 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3317 val = (c2 & 0x0f) << 18;
3318 val |= (c1 & 0x3f) << 12;
/* Payload of the packed third byte lands in bits 6..11. */
3319 val |= (c0 & 0x3f00) >> 2;
3321 }else if (c2 >= 0xe0){
3322 val = (c2 & 0x0f) << 12;
3323 val |= (c1 & 0x3f) << 6;
3325 }else if (c2 >= 0xc0){
3326 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert the code point val by re-encoding it as UTF-8 with
 * w16w_conv() and handing those bytes to unicode_to_jis_common().
 */
3334 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3336 nkf_char c2, c1, c0;
3343 w16w_conv(val, &c2, &c1, &c0);
3344 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3345 #ifdef NUMCHAR_OPTION
/* With NUMCHAR_OPTION, *p1 can receive val tagged with CLASS_UNICODE. */
3348 *p1 = CLASS_UNICODE | val;
3357 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input-side converter for UTF-16 units given as c2 (high byte)
 * and c1 (low byte); c0 is tested against the low-surrogate range.
 */
3358 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3361 if ((c2==0 && c1 < 0x80) || c2==EOF) {
/* High surrogate (0xD8xx..0xDBxx): c0 must lie in 0xDC00..0xDFFF. */
3364 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3365 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/*
 * Combine the pair into one code point:
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000.
 */
3367 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3369 }else if ((c2>>3) == 27) { /* unpaired surrogate */
/* Everything else is converted as the 16-bit value (c2 << 8) + c1. */
3374 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3375 if (ret) return ret;
/*
 * w_iconv32: input-side converter that takes the code point in c1.
 * Code points accepted by is_unicode_bmp() are converted with w16e_conv().
 */
3380 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3384 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3385 } else if (is_unicode_bmp(c1)) {
3386 ret = w16e_conv(c1, &c2, &c1);
/* Alternative path: keep the code point itself, tagged with CLASS_UNICODE. */
3389 c1 = CLASS_UNICODE | c1;
3391 if (ret) return ret;
/*
 * unicode_to_jis_common: look up the UTF-8 sequence (c2, c1, c0) in the
 * utf8_to_euc_* tables and store the result through p2 / p1.
 * The table family (default, _ms, _932) is chosen from ms_ucs_map_f.
 * When no_best_fit_chars_f is set, selected sequences make the function
 * return 1 before any table lookup.
 */
3396 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3399 extern const unsigned short *const utf8_to_euc_2bytes[];
3400 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3401 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3402 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3403 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3404 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3406 const unsigned short *const *pp;
3407 const unsigned short *const *const *ppp;
/*
 * The four tables below are indexed by c1 & 0x3F (the payload of the second
 * byte); a non-zero entry makes the no_best_fit_chars_f checks return 1.
 */
3408 static const int no_best_fit_chars_table_C2[] =
3409 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3410 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3411 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3412 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3413 static const int no_best_fit_chars_table_C2_ms[] =
3414 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3415 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3416 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3417 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3418 static const int no_best_fit_chars_table_932_C2[] =
3419 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3420 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3421 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3422 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3423 static const int no_best_fit_chars_table_932_C3[] =
3424 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3425 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3426 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3427 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Lead byte below 0xe0: two-byte sequence, looked up with (c2, c1). */
3433 }else if(c2 < 0xe0){
3434 if(no_best_fit_chars_f){
3435 if(ms_ucs_map_f == UCS_MAP_CP932){
3438 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3441 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3444 }else if(cp51932_f){
3447 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3450 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3453 }else if(ms_ucs_map_f == UCS_MAP_MS){
3454 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3458 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3459 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3461 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * c0 below 0xF0: looked up with (c1, c0) in the table selected by c2 - 0xE0.
 * NOTE(review): a packed two-byte c0 (0x8080 and up, see w_iconv) never
 * satisfies this test, which presumably keeps four-byte input out - confirm.
 */
3462 }else if(c0 < 0xF0){
3463 if(no_best_fit_chars_f){
3464 if(ms_ucs_map_f == UCS_MAP_CP932){
3465 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3466 }else if(ms_ucs_map_f == UCS_MAP_MS){
3471 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3474 if(c0 == 0x92) return 1;
3479 if(c1 == 0x80 || c0 == 0x9C) return 1;
3487 if(c0 == 0x95) return 1;
3490 if(c0 == 0xA5) return 1;
3497 if(c0 == 0x8D) return 1;
3500 if(c0 == 0x9E && cp51932_f) return 1;
3503 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3511 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3512 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3514 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3516 #ifdef SHIFTJIS_CP932
/*
 * CP51932 mode: a successful result in the G3 set is passed through
 * e2s_conv() and, if that succeeds, back through s2e_conv().
 */
3517 if (!ret && cp51932_f && is_eucg3(*p2)) {
3519 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3520 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: table lookup shared by the two- and three-byte paths.
 * Returns 1 when there is no table, an index is out of range, or the
 * looked-up value is 0.
 */
3529 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3532 const unsigned short *p;
3535 if (pp == 0) return 1;
/* c1 is range-checked against psize, the size passed in by the caller. */
3538 if (c1 < 0 || psize <= c1) return 1;
3540 if (p == 0) return 1;
3543 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3545 if (val == 0) return 1;
/* With no_cp932ext_f set, the CP932 extension ranges named below are filtered. */
3546 if (no_cp932ext_f && (
3547 (val>>8) == 0x2D || /* NEC special characters */
3548 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A first byte of SO is reported as X0201. */
3556 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: send hexadecimal digits of c (upper-case A-F),
 * one nibble per call, to the callback f as (0, digit).
 */
3563 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3565 const char *hex = "0123456789ABCDEF";
3571 (*f)(0, hex[(c>>shift)&0xF]);
/*
 * encode_fallback_html: write c through oconv as ASCII decimal digits,
 * most significant digit first (0x30 is the code of '0').
 */
3581 void encode_fallback_html(nkf_char c)
/* The two highest digits are written only when c is large enough to have them. */
3586 if(c >= NKF_INT32_C(1000000))
3587 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3588 if(c >= NKF_INT32_C(100000))
3589 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3591 (*oconv)(0, 0x30+(c/10000 )%10);
3593 (*oconv)(0, 0x30+(c/1000 )%10);
3595 (*oconv)(0, 0x30+(c/100 )%10);
3597 (*oconv)(0, 0x30+(c/10 )%10);
3599 (*oconv)(0, 0x30+ c %10);
/* encode_fallback_xml: writes the hexadecimal digits of c through oconv via nkf_each_char_to_hex(). */
3604 void encode_fallback_xml(nkf_char c)
3609 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: write c through oconv as upper-case hexadecimal
 * digits; code points rejected by is_unicode_bmp() are tested separately.
 */
3614 void encode_fallback_java(nkf_char c)
3616 const char *hex = "0123456789ABCDEF";
3619 if(!is_unicode_bmp(c)){
/* Digits for bits 20..23 and 16..19. */
3623 (*oconv)(0, hex[(c>>20)&0xF]);
3624 (*oconv)(0, hex[(c>>16)&0xF]);
/* Four digits for the low 16 bits. */
3628 (*oconv)(0, hex[(c>>12)&0xF]);
3629 (*oconv)(0, hex[(c>> 8)&0xF]);
3630 (*oconv)(0, hex[(c>> 4)&0xF]);
3631 (*oconv)(0, hex[ c &0xF]);
/* encode_fallback_perl: writes the hexadecimal digits of c through oconv via nkf_each_char_to_hex(). */
3635 void encode_fallback_perl(nkf_char c)
3640 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: ignore the incoming value and output
 * unicode_subchar instead, split into its high and low bytes.
 */
3645 void encode_fallback_subchar(nkf_char c)
3647 c = unicode_subchar;
3648 (*oconv)((c>>8)&0xFF, c&0xFF);
3653 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the character (c2, c1) in the euc_to_utf8_* /
 * x0212_to_utf8_* tables.  The row is picked from c2 and the entry from c1,
 * each reduced to a zero-based index by (x & 0x7f) - 0x21.
 */
3654 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3657 extern const unsigned short euc_to_utf8_1byte[];
3658 extern const unsigned short *const euc_to_utf8_2bytes[];
3659 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3660 extern const unsigned short *const x0212_to_utf8_2bytes[];
3662 const unsigned short *p;
3665 p = euc_to_utf8_1byte;
/* G3 characters: one code, 0x8F22/0x43, is singled out under UCS_MAP_ASCII; the rest use the X0212 table. */
3667 } else if (is_eucg3(c2)){
3668 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3671 c2 = (c2&0x7f) - 0x21;
3672 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3673 p = x0212_to_utf8_2bytes[c2];
/* Remaining two-byte characters: the _ms table is used unless the map is UCS_MAP_ASCII. */
3679 c2 = (c2&0x7f) - 0x21;
3680 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3681 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3686 c1 = (c1 & 0x7f) - 0x21;
3687 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output-side converter that writes UTF-8 bytes through o_putc.
 * CLASS_UNICODE capsules are encoded directly from their value; other
 * characters are mapped with e2w_conv() and encoded with w16w_conv().
 */
3692 void w_oconv(nkf_char c2, nkf_char c1)
3698 output_bom_f = FALSE;
3709 #ifdef NUMCHAR_OPTION
3710 if (c2 == 0 && is_unicode_capsule(c1)){
3711 val = c1 & VALUE_MASK;
/* Two-byte form. */
3714 }else if (val < 0x800){
3715 (*o_putc)(0xC0 | (val >> 6));
3716 (*o_putc)(0x80 | (val & 0x3f));
/* Three-byte form. */
3717 } else if (val <= NKF_INT32_C(0xFFFF)) {
3718 (*o_putc)(0xE0 | (val >> 12));
3719 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3720 (*o_putc)(0x80 | (val & 0x3f));
/* Four-byte form. */
3721 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3722 (*o_putc)(0xF0 | ( val>>18));
3723 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3724 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3725 (*o_putc)(0x80 | ( val & 0x3f));
3732 output_mode = ASCII;
3734 } else if (c2 == ISO8859_1) {
3735 output_mode = ISO8859_1;
3736 (*o_putc)(c1 | 0x080);
3739 val = e2w_conv(c2, c1);
3741 w16w_conv(val, &c2, &c1, &c0);
/* The third byte is written only when it is non-zero. */
3745 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output-side converter that writes UTF-16 through o_putc,
 * with the byte order chosen by output_endian.
 */
3751 void w_oconv16(nkf_char c2, nkf_char c1)
3754 output_bom_f = FALSE;
3755 if (output_endian == ENDIAN_LITTLE){
3756 (*o_putc)((unsigned char)'\377');
3760 (*o_putc)((unsigned char)'\377');
3769 if (c2 == ISO8859_1) {
3772 #ifdef NUMCHAR_OPTION
3773 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3774 if (is_unicode_bmp(c1)) {
3775 c2 = (c1 >> 8) & 0xff;
/*
 * Outside the BMP: split into a surrogate pair.
 * 0xD7C0 == 0xD800 - (0x10000 >> 10), so the 0x10000 offset is removed
 * by the same addition that forms the high surrogate.
 */
3779 if (c1 <= UNICODE_MAX) {
3780 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3781 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3782 if (output_endian == ENDIAN_LITTLE){
3783 (*o_putc)(c2 & 0xff);
3784 (*o_putc)((c2 >> 8) & 0xff);
3785 (*o_putc)(c1 & 0xff);
3786 (*o_putc)((c1 >> 8) & 0xff);
3788 (*o_putc)((c2 >> 8) & 0xff);
3789 (*o_putc)(c2 & 0xff);
3790 (*o_putc)((c1 >> 8) & 0xff);
3791 (*o_putc)(c1 & 0xff);
/* Other characters: map with e2w_conv() and split the value into two bytes. */
3798 nkf_char val = e2w_conv(c2, c1);
3799 c2 = (val >> 8) & 0xff;
3802 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output-side converter that writes the code point held in c1
 * through o_putc one byte at a time, in the order chosen by output_endian.
 */
3811 void w_oconv32(nkf_char c2, nkf_char c1)
3814 output_bom_f = FALSE;
3815 if (output_endian == ENDIAN_LITTLE){
3816 (*o_putc)((unsigned char)'\377');
3824 (*o_putc)((unsigned char)'\377');
3833 if (c2 == ISO8859_1) {
3835 #ifdef NUMCHAR_OPTION
3836 } else if (c2 == 0 && is_unicode_capsule(c1)) {
/* Characters that are not already code points are mapped with e2w_conv(). */
3840 c1 = e2w_conv(c2, c1);
/* Little-endian: lowest byte first. */
3842 if (output_endian == ENDIAN_LITTLE){
3843 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3844 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3845 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
/* Otherwise: the same three bytes in the opposite order. */
3849 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3850 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3851 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output-side converter that writes through o_putc with the
 * eighth bit set on double-byte characters, tracking output_mode.
 */
3856 void e_oconv(nkf_char c2, nkf_char c1)
3858 #ifdef NUMCHAR_OPTION
/*
 * A CLASS_UNICODE capsule is first converted with w16e_conv(); if it is
 * still a capsule afterwards, the configured encode_fallback (if any) is used.
 */
3859 if (c2 == 0 && is_unicode_capsule(c1)){
3860 w16e_conv(c1, &c2, &c1);
3861 if (c2 == 0 && is_unicode_capsule(c1)){
3862 if(encode_fallback)(*encode_fallback)(c1);
3870 } else if (c2 == 0) {
3871 output_mode = ASCII;
/* X0201: SSO prefix followed by the byte with the eighth bit set. */
3873 } else if (c2 == X0201) {
3874 output_mode = JAPANESE_EUC;
3875 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3876 } else if (c2 == ISO8859_1) {
3877 output_mode = ISO8859_1;
3878 (*o_putc)(c1 | 0x080);
3880 } else if (is_eucg3(c2)){
3881 output_mode = JAPANESE_EUC;
3882 #ifdef SHIFTJIS_CP932
/* If e2s_conv() succeeds (returns 0), map its result back through s2e_conv(). */
3885 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3886 s2e_conv(s2, s1, &c2, &c1);
3891 output_mode = ASCII;
3893 }else if (is_eucg3(c2)){
3896 (*o_putc)((c2 & 0x7f) | 0x080);
3897 (*o_putc)(c1 | 0x080);
3900 (*o_putc)((c2 & 0x7f) | 0x080);
3901 (*o_putc)(c1 | 0x080);
/* Both bytes must pass nkf_isgraph(); otherwise set_iconv(FALSE, 0) is called and the character is dropped. */
3905 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3906 set_iconv(FALSE, 0);
3907 return; /* too late to rescue this char */
3909 output_mode = JAPANESE_EUC;
3910 (*o_putc)(c2 | 0x080);
3911 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: move a value in 0x75..0x7f up by a fixed offset.
 * Two cases are visible, each with its own target base (0x109 or 0x113).
 */
3916 nkf_char x0212_shift(nkf_char c)
3921 if (0x75 <= c && c <= 0x7f){
3922 ret = c + (0x109 - 0x75);
3925 if (0x75 <= c && c <= 0x7f){
3926 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: move a value in 0x7f..0x88 or 0x89..0x92 back down so it
 * starts at 0x75; the second range is additionally tagged with
 * PREFIX_EUCG3 | 0x80.
 */
3933 nkf_char x0212_unshift(nkf_char c)
3936 if (0x7f <= c && c <= 0x88){
3937 ret = c + (0x75 - 0x7f);
3938 }else if (0x89 <= c && c <= 0x92){
3939 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
3943 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert (c2, c1) and store the two result bytes through
 * p2 / p1.  Each output pointer is written only when it is non-NULL.
 */
3945 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
/* Rows 0x21..0x2F and 0x6E..0x7E are computed arithmetically. */
3951 if((0x21 <= ndx && ndx <= 0x2F)){
3952 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3953 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3955 }else if(0x6E <= ndx && ndx <= 0x7E){
3956 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3957 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Other graphic rows are looked up in the x0212_shiftjis table. */
3963 else if(nkf_isgraph(ndx)){
3965 const unsigned short *ptr;
3967 extern const unsigned short *const x0212_shiftjis[];
3969 ptr = x0212_shiftjis[ndx - 0x21];
3971 val = ptr[(c1 & 0x7f) - 0x21];
3980 c2 = x0212_shift(c2);
3982 #endif /* X0212_ENABLE */
/* A first byte above 0x7F is rejected with 1. */
3984 if(0x7F < c2) return 1;
/* Two rows share one lead byte; odd and even rows use different offsets for the second byte. */
3985 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3986 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output-side converter that maps characters with e2s_conv()
 * and writes through o_putc, tracking output_mode.
 */
3990 void s_oconv(nkf_char c2, nkf_char c1)
3992 #ifdef NUMCHAR_OPTION
/*
 * A CLASS_UNICODE capsule is first converted with w16e_conv(); if it is
 * still a capsule afterwards, the configured encode_fallback (if any) is used.
 */
3993 if (c2 == 0 && is_unicode_capsule(c1)){
3994 w16e_conv(c1, &c2, &c1);
3995 if (c2 == 0 && is_unicode_capsule(c1)){
3996 if(encode_fallback)(*encode_fallback)(c1);
4004 } else if (c2 == 0) {
4005 output_mode = ASCII;
4007 } else if (c2 == X0201) {
4008 output_mode = SHIFT_JIS;
4010 } else if (c2 == ISO8859_1) {
4011 output_mode = ISO8859_1;
4012 (*o_putc)(c1 | 0x080);
4014 } else if (is_eucg3(c2)){
4015 output_mode = SHIFT_JIS;
4016 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Both bytes must pass nkf_isprint(); otherwise set_iconv(FALSE, 0) is called and the character is dropped. */
4022 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4023 set_iconv(FALSE, 0);
4024 return; /* too late to rescue this char */
4026 output_mode = SHIFT_JIS;
4027 e2s_conv(c2, c1, &c2, &c1);
4029 #ifdef SHIFTJIS_CP932
/* Lead bytes inside the cp932inv table range are looked up there, indexed by c1 - 0x40. */
4031 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4033 extern const unsigned short cp932inv[2][189];
4035 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4041 #endif /* SHIFTJIS_CP932 */
/* A non-zero prefix_table entry for the second byte is written out before it. */
4044 if (prefix_table[(unsigned char)c1]){
4045 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: output-side converter that tracks the current character set in
 * output_mode and changes it (X0213_2, X0212, X0201, ISO8859_1, ASCII,
 * X0213_1, X0208) when the incoming character needs a different one.
 */
4051 void j_oconv(nkf_char c2, nkf_char c1)
4053 #ifdef NUMCHAR_OPTION
/*
 * A CLASS_UNICODE capsule is first converted with w16e_conv(); if it is
 * still a capsule afterwards, the configured encode_fallback (if any) is used.
 */
4054 if (c2 == 0 && is_unicode_capsule(c1)){
4055 w16e_conv(c1, &c2, &c1);
4056 if (c2 == 0 && is_unicode_capsule(c1)){
4057 if(encode_fallback)(*encode_fallback)(c1);
/* Leave any mode other than ASCII / ISO8859_1 by writing ascii_intro. */
4063 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4066 (*o_putc)(ascii_intro);
4067 output_mode = ASCII;
4071 } else if (is_eucg3(c2)){
4073 if(output_mode!=X0213_2){
4074 output_mode = X0213_2;
4078 (*o_putc)(X0213_2&0x7F);
4081 if(output_mode!=X0212){
4082 output_mode = X0212;
4086 (*o_putc)(X0212&0x7F);
4089 (*o_putc)(c2 & 0x7f);
4092 } else if (c2==X0201) {
4093 if (output_mode!=X0201) {
4094 output_mode = X0201;
4100 } else if (c2==ISO8859_1) {
4101 /* iso8859 introduction, or 8th bit on */
4102 /* Can we convert in 7bit form using ESC-'-'-A ?
4104 output_mode = ISO8859_1;
4106 } else if (c2 == 0) {
4107 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4110 (*o_putc)(ascii_intro);
4111 output_mode = ASCII;
4115 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4117 if (output_mode!=X0213_1) {
4118 output_mode = X0213_1;
4122 (*o_putc)(X0213_1&0x7F);
4124 }else if (output_mode != X0208) {
4125 output_mode = X0208;
4128 (*o_putc)(kanji_intro);
/* base64_conv: filter stage that calls mime_prechar() and then forwards the character to o_base64conv. */
4135 void base64_conv(nkf_char c2, nkf_char c1)
4137 mime_prechar(c2, c1);
4138 (*o_base64conv)(c2,c1);
/* Small push-back stack used by broken_getc() / broken_ungetc(), plus the last character seen. */
4142 static nkf_char broken_buf[3];
4143 static int broken_counter = 0;
4144 static int broken_last = 0;
/*
 * broken_getc: character reader that looks for "$@", "$B", "(J" and "(B"
 * arriving without a preceding ESC.
 */
4145 nkf_char broken_getc(FILE *f)
/* Pushed-back characters are returned first, last in first out. */
4149 if (broken_counter>0) {
4150 return broken_buf[--broken_counter];
/* '$' not preceded by ESC while in ASCII / X0201 mode: check for a following '@' or 'B'. */
4153 if (c=='$' && broken_last != ESC
4154 && (input_mode==ASCII || input_mode==X0201)) {
4157 if (c1=='@'|| c1=='B') {
4158 broken_buf[0]=c1; broken_buf[1]=c;
/* '(' not preceded by ESC while in X0208 / X0201 mode: check for a following 'J' or 'B'. */
4165 } else if (c=='(' && broken_last != ESC
4166 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4169 if (c1=='J'|| c1=='B') {
4170 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto broken_buf.  At most two characters are
 * accepted; further pushes are ignored by the guard below.
 */
4183 nkf_char broken_ungetc(nkf_char c, FILE *f)
4185 if (broken_counter<2)
4186 broken_buf[broken_counter++]=c;
/* State kept between calls of cr_conv(). */
4190 static nkf_char prev_cr = 0;
/*
 * cr_conv: line-ending filter.  Output goes to o_crconv and the target
 * convention is taken from crmode_f (CRLF, CR or NL).
 */
4192 void cr_conv(nkf_char c2, nkf_char c1)
4196 if (! (c2==0&&c1==NL) ) {
4202 } else if (c1=='\r') {
/* On '\n', both the CRLF and the CR targets emit a '\r'. */
4204 } else if (c1=='\n') {
4205 if (crmode_f==CRLF) {
4206 (*o_crconv)(0,'\r');
4207 } else if (crmode_f==CR) {
4208 (*o_crconv)(0,'\r');
/* '\032' is excluded from this branch only when the target is NL. */
4212 } else if (c1!='\032' || crmode_f!=NL){
4218 Return value of fold_conv()
4220 \n add newline and output char
4221 \r add newline and output nothing
4224 1 (or else) normal output
4226 fold state in prev (previous character)
4228 >0x80 Japanese (X0208/X0201)
4233 This fold algorithm does not preserve leading spaces in a line.
4234 This is the main difference from fmt.
/*
 * char_size: column width used by fold_conv() - 2 when c2 is non-zero,
 * otherwise 1.  c1 is not used.
 * NOTE(review): c2 is expanded without parentheses; the uses in fold_conv()
 * pass a plain identifier, so this is harmless there.
 */
4237 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding filter.  For each character it computes
 * fold_state, which the switch at the end acts on.  f_line holds the
 * current line width, f_prev the previous character class, and fold_len /
 * fold_margin the width limits.
 */
4239 void fold_conv(nkf_char c2, nkf_char c1)
4242 nkf_char fold_state;
4244 if (c1== '\r' && !fold_preserve_f) {
4245 fold_state=0; /* ignore cr */
4246 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4248 fold_state=0; /* ignore cr */
/* Backspace takes one column back, never below zero. */
4249 } else if (c1== BS) {
4250 if (f_line>0) f_line--;
4252 } else if (c2==EOF && f_line != 0) { /* close open last line */
/* Newline handling; which characters count as a newline depends on fold_preserve_f. */
4254 } else if ((c1=='\n' && !fold_preserve_f)
4255 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4256 && fold_preserve_f)) {
4258 if (fold_preserve_f) {
4262 } else if ((f_prev == c1 && !fold_preserve_f)
4263 || (f_prev == '\n' && fold_preserve_f)
4264 ) { /* duplicate newline */
4267 fold_state = '\n'; /* output two newline */
4273 if (f_prev&0x80) { /* Japanese? */
4275 fold_state = 0; /* ignore given single newline */
4276 } else if (f_prev==' ') {
4280 if (++f_line<=fold_len)
4284 fold_state = '\r'; /* fold and output nothing */
4288 } else if (c1=='\f') {
4291 fold_state = '\n'; /* output newline and clear */
4292 } else if ( (c2==0 && c1==' ')||
4293 (c2==0 && c1=='\t')||
4294 (c2=='!'&& c1=='!')) {
4295 /* X0208 kankaku or ascii space */
4296 if (f_prev == ' ') {
4297 fold_state = 0; /* remove duplicate spaces */
4300 if (++f_line<=fold_len)
4301 fold_state = ' '; /* output ASCII space only */
4303 f_prev = ' '; f_line = 0;
4304 fold_state = '\r'; /* fold and output nothing */
4308 prev0 = f_prev; /* we still need this one... , but almost done */
/* NOTE(review): the second test adds nothing if X0201 is a non-zero constant - confirm. */
4310 if (c2 || c2==X0201)
4311 f_prev |= 0x80; /* this is Japanese */
4312 f_line += char_size(c2,c1);
4313 if (f_line<=fold_len) { /* normal case */
4316 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4317 f_line = char_size(c2,c1);
4318 fold_state = '\n'; /* We can't wait, do fold now */
4319 } else if (c2==X0201) {
4320 /* simple kinsoku rules return 1 means no folding */
4321 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4322 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4323 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4324 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4325 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4326 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4327 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4329 fold_state = '\n';/* add one new f_line before this character */
4332 fold_state = '\n';/* add one new f_line before this character */
4335 /* kinsoku point in ASCII */
4336 if ( c1==')'|| /* { [ ( */
4347 /* just after special */
4348 } else if (!is_alnum(prev0)) {
4349 f_line = char_size(c2,c1);
4351 } else if ((prev0==' ') || /* ignored new f_line */
4352 (prev0=='\n')|| /* ignored new f_line */
4353 (prev0&0x80)) { /* X0208 - ASCII */
4354 f_line = char_size(c2,c1);
4355 fold_state = '\n';/* add one new f_line before this character */
4357 fold_state = 1; /* default no fold in ASCII */
4361 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4362 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4363 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4364 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4365 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4366 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4367 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4368 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4369 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4370 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4371 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4372 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4373 /* default no fold in kinsoku */
4376 f_line = char_size(c2,c1);
4377 /* add one new f_line before this character */
4380 f_line = char_size(c2,c1);
4382 /* add one new f_line before this character */
4387 /* terminator process */
4388 switch(fold_state) {
/* A pending X0201 character that is waiting to see whether a voicing mark follows. */
4407 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion filter.  It maps X0201 characters through the
 * cv / dv / ev tables (plain, with dakuten, with handakuten), handles the
 * JIS X 0208 alphabet and symbol rows under alpha_f, and can replace
 * '>', '<', '"' and '&' with HTML character references.  Output goes to
 * o_zconv.
 */
4409 void z_conv(nkf_char c2, nkf_char c1)
4412 /* if (c2) c1 &= 0x7f; assertion */
4414 if (x0201_f && z_prev2==X0201) { /* X0201 */
4415 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4417 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4419 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4421 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4425 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4434 if (x0201_f && c2==X0201) {
4435 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4436 /* wait for dakuten or handakuten */
4437 z_prev1 = c1; z_prev2 = c2;
4440 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4445 /* JISX0208 Alphabet */
4446 if (alpha_f && c2 == 0x23 ) {
4448 } else if (alpha_f && c2 == 0x21 ) {
4449 /* JISX0208 symbols */
4454 } else if (alpha_f&0x4) {
4459 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
/* HTML character references for the four characters that are special in markup. */
4465 case '>': entity = "&gt;"; break;
4466 case '<': entity = "&lt;"; break;
4467 case '\"': entity = "&quot;"; break;
4468 case '&': entity = "&amp;"; break;
/* Emit the chosen reference one character at a time. */
4471 while (*entity) (*o_zconv)(0, *entity++);
/*
 * rot13: rotate a letter by 13 places - up to 'M' / 'm' add 13,
 * up to 'Z' / 'z' subtract 13.
 * NOTE(review): c is expanded several times and without parentheses.
 */
4481 #define rot13(c) ( \
4483 (c <= 'M') ? (c + 13): \
4484 (c <= 'Z') ? (c - 13): \
4486 (c <= 'm') ? (c + 13): \
4487 (c <= 'z') ? (c - 13): \
/*
 * rot47: rotate a printable character by 47 places - up to 'O' add 47,
 * up to '~' subtract 47.
 * NOTE(review): c is expanded several times and without parentheses.
 */
4491 #define rot47(c) ( \
4493 ( c <= 'O' ) ? (c + 47) : \
4494 ( c <= '~' ) ? (c - 47) : \
/*
 * rot_conv: rotation filter.  Single-byte classes (c2 == 0, X0201,
 * ISO8859_1) are tested separately from the rest; the result is forwarded
 * to o_rot_conv.
 */
4498 void rot_conv(nkf_char c2, nkf_char c1)
4500 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4506 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: hiragana / katakana conversion filter.  The (possibly
 * rewritten) character is forwarded to o_hira_conv.
 */
4509 void hira_conv(nkf_char c2, nkf_char c1)
4513 if (0x20 < c1 && c1 < 0x74) {
4515 (*o_hira_conv)(c2,c1);
/*
 * c1 == 0x74 lies just outside the range handled above.  It is sent as
 * U+3094, which is only possible when the output stage is one of the
 * Unicode encoders (UTF-8, UTF-16 or UTF-32).
 */
4517 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16 || output_conv == w_oconv32)) {
4519 c1 = CLASS_UNICODE | 0x3094;
4520 (*o_hira_conv)(c2,c1);
4523 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4525 (*o_hira_conv)(c2,c1);
/* Reverse direction: U+3094, row 0x24, and the two marks 0x2135 / 0x2136. */
4530 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4533 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4535 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4539 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: filter that tests (c2, c1) against a set of
 * conditions and a table of code ranges before forwarding the character
 * to o_iso2022jp_check_conv.
 */
4543 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
/* Each row of range[] is a { start, end } pair. */
4545 static const nkf_char range[RANGE_NUM_MAX][2] = {
4566 nkf_char start, end, c;
/* Single-byte values 0x7f..0xff. */
4568 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
/* Rows 0x29..0x2f and 0x75..0x7e. */
4572 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
/* Scan the range table for an interval that contains c. */
4577 for (i = 0; i < RANGE_NUM_MAX; i++) {
4578 start = range[i][0];
4581 if (c >= start && c <= end) {
4586 (*o_iso2022jp_check_conv)(c2,c1);
4590 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Encoded-word prefixes that are recognised ("\075" is '=').  The entries
 * line up by position with mime_priority_func[], mime_encode[] and
 * mime_encode_method[].
 */
4592 const unsigned char *mime_pattern[] = {
4593 (const unsigned char *)"\075?EUC-JP?B?",
4594 (const unsigned char *)"\075?SHIFT_JIS?B?",
4595 (const unsigned char *)"\075?ISO-8859-1?Q?",
4596 (const unsigned char *)"\075?ISO-8859-1?B?",
4597 (const unsigned char *)"\075?ISO-2022-JP?B?",
4598 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4599 #if defined(UTF8_INPUT_ENABLE)
4600 (const unsigned char *)"\075?UTF-8?B?",
4601 (const unsigned char *)"\075?UTF-8?Q?",
4603 (const unsigned char *)"\075?US-ASCII?Q?",
4608 /* marker used to raise the priority of the matching input code */
/* One entry per mime_pattern[] row; 0 means no converter is tied to that pattern. */
4609 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4610 e_iconv, s_iconv, 0, 0, 0, 0,
4611 #if defined(UTF8_INPUT_ENABLE)
/* Character-set code for each mime_pattern[] row, in the same order. */
4617 const nkf_char mime_encode[] = {
4618 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4619 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding ('B' or 'Q') for each mime_pattern[] row, in the same order. */
4626 const nkf_char mime_encode_method[] = {
4627 'B', 'B','Q', 'B', 'B', 'Q',
4628 #if defined(UTF8_INPUT_ENABLE)
/* Size of the recovery buffer in mime_begin_strict() and the read limit in mime_begin(). */
4636 #define MAXRECOVER 20
/*
 * switch_mime_getc: install mime_getc / mime_ungetc as the current input
 * functions, saving the previous pair in i_mgetc / i_mungetc.
 * Nothing happens if mime_getc is already installed.
 */
4638 void switch_mime_getc(void)
4640 if (i_getc!=mime_getc) {
4641 i_mgetc = i_getc; i_getc = mime_getc;
4642 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
/* In STRICT_MIME mode a second layer (mime_getc_buf / mime_ungetc_buf) is inserted the same way. */
4643 if(mime_f==STRICT_MIME) {
4644 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4645 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc() by restoring the saved
 * input functions, and restore the converter saved in mime_iconv_back.
 */
4650 void unswitch_mime_getc(void)
4652 if(mime_f==STRICT_MIME) {
4653 i_mgetc = i_mgetc_buf;
4654 i_mungetc = i_mungetc_buf;
4657 i_ungetc = i_mungetc;
/* The saved converter is re-installed once and then forgotten. */
4658 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4659 mime_iconv_back = NULL;
/*
 * mime_begin_strict: after "=?" has been seen, match the following input
 * against mime_pattern[] character by character (case-insensitively).
 * On a match the decode mode and input converter are set up and
 * mime_integrity() is called.
 */
4662 nkf_char mime_begin_strict(FILE *f)
4666 const unsigned char *p,*q;
4667 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4669 mime_decode_mode = FALSE;
4670 /* =? has been checked */
4672 p = mime_pattern[j];
4675 for(i=2;p[i]>' ';i++) { /* start at =? */
/* Every character read is kept in r[] so it can be replayed if no pattern matches. */
4676 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4677 /* pattern fails, try next one */
/* A later pattern qualifies if it agrees with q on positions 2..i-1 and with c1 at position i. */
4679 while (mime_pattern[++j]) {
4680 p = mime_pattern[j];
4681 for(k=2;k<i;k++) /* assume length(p) > i */
4682 if (p[k]!=q[k]) break;
4683 if (k==i && nkf_toupper(c1)==p[k]) break;
4685 p = mime_pattern[j];
4686 if (p) continue; /* found next one, continue */
4687 /* all fails, output from recovery buffer */
/* The encoding letter sits two characters before the end of the pattern. */
4695 mime_decode_mode = p[i-2];
4697 mime_iconv_back = iconv;
4698 set_iconv(FALSE, mime_priority_func[j]);
4699 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4701 if (mime_decode_mode=='B') {
4702 mimebuf_f = unbuf_f;
4704 /* do MIME integrity check */
4705 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: read one character, either from the underlying input
 * function (when mimebuf_f is set) or from the Fifo at mime_input.
 */
4713 nkf_char mime_getc_buf(FILE *f)
4715 /* we don't keep eof of Fifo, because it contains ?= as
4716 a terminator. It was checked in mime_integrity. */
4717 return ((mimebuf_f)?
4718 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: counterpart of mime_getc_buf() - push c back either
 * through the underlying unget function or by stepping mime_input back
 * and storing c in the Fifo.
 */
4721 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4724 (*i_mungetc_buf)(c,f);
4726 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict counterpart of mime_begin_strict().  Every
 * character read is also stored in the Fifo so the input can be replayed
 * if it turns out not to be a MIME encoded word.
 */
4730 nkf_char mime_begin(FILE *f)
4735 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4736 /* re-read and convert again from mime_buffer. */
4738 /* =? has been checked */
4740 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4741 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4742 /* We accept any character type even if it is broken by new lines */
4743 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4744 if (c1=='\n'||c1==' '||c1=='\r'||
4745 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4747 /* Failed. But this could be another MIME preamble */
4755 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4756 if (!(++i<MAXRECOVER) || c1==EOF) break;
/* The encoding letter is accepted in either case. */
4757 if (c1=='b'||c1=='B') {
4758 mime_decode_mode = 'B';
4759 } else if (c1=='q'||c1=='Q') {
4760 mime_decode_mode = 'Q';
4764 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4765 if (!(++i<MAXRECOVER) || c1==EOF) break;
4767 mime_decode_mode = FALSE;
4773 if (!mime_decode_mode) {
4774 /* false MIME preamble, restart from mime_buffer */
4775 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4776 /* Since we are in MIME mode until buffer becomes empty, */
4777 /* we never go into mime_begin again for a while. */
4780 /* discard mime preamble, and goto MIME mode */
4782 /* do no MIME integrity check */
4783 return c1; /* used only for checking EOF */
/* no_putc: output function with the o_putc signature; NOTE(review): presumably discards c - body not shown here, confirm. */
4787 void no_putc(nkf_char c)
/* debug: print str followed by a newline on stderr. */
4792 void debug(const char *str)
4795 fprintf(stderr, "%s\n", str);
/*
 * set_input_codename: record codename as the detected input encoding.
 * If a different, non-empty name is given than the one already recorded,
 * the input is flagged as mixed.
 */
4800 void set_input_codename(char *codename)
4804 strcmp(codename, "") != 0 &&
4805 strcmp(codename, input_codename) != 0)
4807 is_inputcode_mixed = TRUE;
4809 input_codename = codename;
4810 is_inputcode_set = TRUE;
4813 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the detected input encoding on stdout,
 * prefixed with "filename:" when filename is not NULL.  "BINARY" is the
 * initial value and is kept when the input was flagged as mixed.
 */
4814 void print_guessed_code(char *filename)
4816 char *codename = "BINARY";
4817 if (!is_inputcode_mixed) {
4818 if (strcmp(input_codename, "") == 0) {
4821 codename = input_codename;
4824 if (filename != NULL) printf("%s:", filename);
4825 printf("%s\n", codename);
/*
 * hex_getc: reader shared by cap_getc() and url_getc().  ch is the escape
 * character, g the underlying get function and u the matching unget.
 * Two hexadecimal digits are combined into one byte value.
 */
4831 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4833 nkf_char c1, c2, c3;
/* Both following characters must be hexadecimal digits. */
4839 if (!nkf_isxdigit(c2)){
4844 if (!nkf_isxdigit(c3)){
4849 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: hex_getc() with ':' as the escape character, reading through i_cgetc / i_cungetc. */
4852 nkf_char cap_getc(FILE *f)
4854 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: pass the pushed-back character straight to i_cungetc. */
4857 nkf_char cap_ungetc(nkf_char c, FILE *f)
4859 return (*i_cungetc)(c, f);
/* url_getc: hex_getc() with '%' as the escape character, reading through i_ugetc / i_uungetc. */
4862 nkf_char url_getc(FILE *f)
4864 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: pass the pushed-back character straight to i_uungetc. */
4867 nkf_char url_ungetc(nkf_char c, FILE *f)
4869 return (*i_uungetc)(c, f);
4873 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: reader that recognises numeric character references in
 * hexadecimal ('x' or 'X' prefix) or decimal form and returns the value
 * tagged with CLASS_UNICODE.
 */
4874 nkf_char numchar_getc(FILE *f)
4876 nkf_char (*g)(FILE *) = i_ngetc;
4877 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* Hexadecimal form: at most 7 digits are examined. */
4888 if (buf[i] == 'x' || buf[i] == 'X'){
4889 for (j = 0; j < 7; j++){
4891 if (!nkf_isxdigit(buf[i])){
4898 c |= hex2bin(buf[i]);
/* Decimal form: at most 8 digits are examined. */
4901 for (j = 0; j < 8; j++){
4905 if (!nkf_isdigit(buf[i])){
4912 c += hex2bin(buf[i]);
4918 return CLASS_UNICODE | c;
/* numchar_ungetc: pass the pushed-back character straight to i_nungetc. */
4927 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4929 return (*i_nungetc)(c, f);
4933 #ifdef UNICODE_NORMALIZATION
4935 /* Normalization Form C */
/*
 * nfc_getc: reader that replaces a decomposed byte sequence with its
 * composed form.  The bytes collected in buf[] are compared against the
 * nfd column of normalization_table; on a match buf[] is overwritten with
 * the nfc column.
 */
4936 nkf_char nfc_getc(FILE *f)
4938 nkf_char (*g)(FILE *f) = i_nfc_getc;
4939 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4940 int i=0, j, k=1, lower, upper;
4942 const nkf_nfchar *array;
4944 extern const struct normalization_pair normalization_table[];
4948 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
/* Binary search over the table. */
4949 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4950 while (upper >= lower) {
4951 j = (lower+upper) / 2;
4952 array = normalization_table[j].nfd;
/* Compare byte by byte; the first difference decides which half to keep. */
4953 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4954 if (array[k] != buf[k]){
4955 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Copy the composed form into buf[]. */
4962 array = normalization_table[j].nfc;
4963 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4964 buf[i] = (nkf_char)(array[i]);
/* nfc_ungetc: pass the pushed-back character straight to i_nfc_ungetc. */
4975 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4977 return (*i_nfc_ungetc)(c, f);
4979 #endif /* UNICODE_NORMALIZATION */
4985 nkf_char c1, c2, c3, c4, cc;
4986 nkf_char t1, t2, t3, t4, mode, exit_mode;
4987 nkf_char lwsp_count;
4990 nkf_char lwsp_size = 128;
4992 if (mime_top != mime_last) { /* Something is in FIFO */
4993 return Fifo(mime_top++);
4995 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4996 mime_decode_mode=FALSE;
4997 unswitch_mime_getc();
4998 return (*i_getc)(f);
5001 if (mimebuf_f == FIXED_MIME)
5002 exit_mode = mime_decode_mode;
5005 if (mime_decode_mode == 'Q') {
5006 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5008 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5009 if (c1<=' ' || DEL<=c1) {
5010 mime_decode_mode = exit_mode; /* prepare for quit */
5013 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5017 mime_decode_mode = exit_mode; /* prepare for quit */
5018 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5019 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5020 /* end Q encoding */
5021 input_mode = exit_mode;
5023 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5024 if (lwsp_buf==NULL) {
5025 perror("can't malloc");
5028 while ((c1=(*i_getc)(f))!=EOF) {
5033 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5041 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5042 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5057 lwsp_buf[lwsp_count] = (unsigned char)c1;
5058 if (lwsp_count++>lwsp_size){
5060 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5061 if (lwsp_buf_new==NULL) {
5063 perror("can't realloc");
5066 lwsp_buf = lwsp_buf_new;
5072 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5074 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5075 i_ungetc(lwsp_buf[lwsp_count],f);
5081 if (c1=='='&&c2<' ') { /* this is soft wrap */
5082 while((c1 = (*i_mgetc)(f)) <=' ') {
5083 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5085 mime_decode_mode = 'Q'; /* still in MIME */
5086 goto restart_mime_q;
5089 mime_decode_mode = 'Q'; /* still in MIME */
5093 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5094 if (c2<=' ') return c2;
5095 mime_decode_mode = 'Q'; /* still in MIME */
5096 return ((hex2bin(c2)<<4) + hex2bin(c3));
5099 if (mime_decode_mode != 'B') {
5100 mime_decode_mode = FALSE;
5101 return (*i_mgetc)(f);
5105 /* Base64 encoding */
5107 MIME allows line break in the middle of
5108 Base64, but we are very pessimistic in decoding
5109 in unbuf mode because MIME encoded code may broken by
5110 less or editor's control sequence (such as ESC-[-K in unbuffered
5111 mode. ignore incomplete MIME.
5113 mode = mime_decode_mode;
5114 mime_decode_mode = exit_mode; /* prepare for quit */
5116 while ((c1 = (*i_mgetc)(f))<=' ') {
5121 if ((c2 = (*i_mgetc)(f))<=' ') {
5124 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5125 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5128 if ((c1 == '?') && (c2 == '=')) {
5131 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5132 if (lwsp_buf==NULL) {
5133 perror("can't malloc");
5136 while ((c1=(*i_getc)(f))!=EOF) {
5141 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5149 if ((c1=(*i_getc)(f))!=EOF) {
5153 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5168 lwsp_buf[lwsp_count] = (unsigned char)c1;
5169 if (lwsp_count++>lwsp_size){
5171 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5172 if (lwsp_buf_new==NULL) {
5174 perror("can't realloc");
5177 lwsp_buf = lwsp_buf_new;
5183 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5185 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5186 i_ungetc(lwsp_buf[lwsp_count],f);
5193 if ((c3 = (*i_mgetc)(f))<=' ') {
5196 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5197 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5201 if ((c4 = (*i_mgetc)(f))<=' ') {
5204 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5205 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5209 mime_decode_mode = mode; /* still in MIME sigh... */
5211 /* BASE 64 decoding */
5213 t1 = 0x3f & base64decode(c1);
5214 t2 = 0x3f & base64decode(c2);
5215 t3 = 0x3f & base64decode(c3);
5216 t4 = 0x3f & base64decode(c4);
5217 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5219 Fifo(mime_last++) = (unsigned char)cc;
5220 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5222 Fifo(mime_last++) = (unsigned char)cc;
5223 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5225 Fifo(mime_last++) = (unsigned char)cc;
5230 return Fifo(mime_top++);
/*
 * mime_ungetc: push character c back onto the MIME FIFO.
 *
 * mime_top is decremented first and c (truncated to unsigned char) is
 * stored in that slot, so it is the next value produced by a read of
 * Fifo(mime_top++).  The stream argument f is not used by the visible
 * statement.
 *
 * NOTE(review): the braces and the return statement of this function are
 * not visible in this listing (the embedded numbering skips lines around
 * 5235) -- confirm the return value against the full file.
 */
5233 nkf_char mime_ungetc(nkf_char c, FILE *f)
5235 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-read a candidate MIME encoded-word into the FIFO so
 * that it can be checked before any decoding starts.
 *
 * f  input stream; characters are fetched through (*i_getc).
 * p  NUL-terminated bytes that are copied into the FIFO first, starting at
 *    mime_top (both mime_input and mime_last are reset to mime_top).
 *
 * Characters are then appended to the FIFO until one of these happens:
 *   - (*i_getc) returns EOF;
 *   - (mime_input - mime_top) & MIME_BUF_MASK becomes 0 ("buffer full");
 *   - c is '=' while d is '?', which the code treats as a checked word
 *     (d is presumably the previously read character; its assignments are
 *     not visible here);
 *   - c is not '+', '/', '=', '?' and is_alnum(c) is false.
 * On the incomplete path the offending character is still stored, mime_last
 * is set to mime_input, mime_decode_mode is set to 1 ("no decode") and
 * switch_mime_getc() installs the buffered reader.
 *
 * NOTE(review): the declarations of c and d, the statements that follow the
 * checked-word branch, the terminator of the first comment and every return
 * statement are missing from this listing -- confirm against the full file.
 */
5239 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5243 /* In buffered mode, read until =? or NL or buffer full
5245 mime_input = mime_top;
5246 mime_last = mime_top;
5248 while(*p) Fifo(mime_input++) = *p++;
5251 while((c=(*i_getc)(f))!=EOF) {
5252 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5253 break; /* buffer full */
5255 if (c=='=' && d=='?') {
5256 /* checked. skip header, start decode */
5257 Fifo(mime_input++) = (unsigned char)c;
5258 /* mime_last_input = mime_input; */
5263 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5265 /* Should we check length mod 4? */
5266 Fifo(mime_input++) = (unsigned char)c;
5269 /* In case of Incomplete MIME, no MIME decode */
5270 Fifo(mime_input++) = (unsigned char)c;
5271 mime_last = mime_input; /* point undecoded buffer */
5272 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5273 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: translate one Base64 alphabet character into its 6-bit
 * value (0-63).
 *
 * The visible assignments use character constants as arithmetic offsets:
 *   'A'..'Z'  ->  c - 'A'          (0-25)
 *   'a'..'z'  ->  c - 'G'          (same as c - 'a' + 26, 26-51)
 *   '0'..'9'  ->  c - '0' + '4'    (same as c - '0' + 52, 52-61)
 *   '+'       ->  '>'              (62)
 *   otherwise ->  '?'              (63, the value of '/')
 *
 * NOTE(review): the declaration of i, the tests that separate the letter
 * ranges and the return statement are not visible in this listing.  The
 * final branch is a plain else, so characters outside the alphabet appear
 * to map to 63 as well -- confirm against the full file.
 */
5277 nkf_char base64decode(nkf_char c)
5282 i = c - 'A'; /* A..Z 0-25 */
5284 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5286 } else if (c > '/') {
5287 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5288 } else if (c == '+') {
5289 i = '>' /* 62 */ ; /* + 62 */
5291 i = '?' /* 63 */ ; /* / 63 */
/* The 64 output digits of Base64, indexed by 6-bit value. */
5296 static const char basis_64[] =
5297 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Carry between calls of the Base64 encoder: the encoder reads
 * (b64c & 0x3) and (b64c & 0xF) and merges them with bits of the next
 * character (presumably b64c holds the previous input byte).
 */
5299 static nkf_char b64c;
/*
 * Look-ahead buffer for MIME output.  mime_putc() stores a character and
 * only afterwards tests mimeout_buf_count > MIMEOUT_BUF_LENGTH, hence the
 * one extra slot in the array.
 */
5300 #define MIMEOUT_BUF_LENGTH (60)
5301 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5302 int mimeout_buf_count = 0;
5303 int mimeout_preserve_space = 0;
/*
 * itoh4: convert a value 0-15 to its upper-case hexadecimal digit.
 * The parameter is parenthesized at every use so that an argument
 * containing lower-precedence operators (e.g. itoh4(c & 0xf)) still
 * expands correctly.  The argument is evaluated more than once: do not
 * pass expressions with side effects.
 */
5304 #define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
/*
 * open_mime: begin MIME-encoded output for the output mode `mode`.
 *
 * mode is looked up in mime_encode[] (a zero-terminated table); the entry
 * at the matching index selects the pattern from mime_pattern[] (entry 0 is
 * the default) and mimeout_mode is loaded from mime_encode_method[] at the
 * same index.  Buffered whitespace in mimeout_buf is passed straight to
 * o_mputc unless mimeout_preserve_space is set, mimeout_preserve_space is
 * cleared, and the buffered characters are then replayed through
 * mime_putc().
 *
 * NOTE(review): local declarations, the statements that emit the selected
 * pattern, several braces/else lines and the replay loop header are not
 * visible in this listing -- confirm details against the full file.
 */
5306 void open_mime(nkf_char mode)
5308 const unsigned char *p;
/* fall back to the first pattern when mode is not found in mime_encode[] */
5311 p = mime_pattern[0];
5312 for(i=0;mime_encode[i];i++) {
5313 if (mode == mime_encode[i]) {
5314 p = mime_pattern[i];
5318 mimeout_mode = mime_encode_method[i];
/* NOTE(review): 45 is an unexplained threshold on base64_count
   (presumably the current output column) */
5321 if (base64_count>45) {
5322 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5323 (*o_mputc)(mimeout_buf[i]);
5329 if (!mimeout_preserve_space && mimeout_buf_count>0
5330 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5331 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
/* pass buffered SPACE/TAB/CR/NL characters through unencoded */
5335 if (!mimeout_preserve_space) {
5336 for (;i<mimeout_buf_count;i++) {
5337 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5338 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5339 (*o_mputc)(mimeout_buf[i]);
5346 mimeout_preserve_space = FALSE;
/* remember how much was buffered, mark the buffer empty, then feed the
   buffered characters back through mime_putc() */
5352 j = mimeout_buf_count;
5353 mimeout_buf_count = 0;
5355 mime_putc(mimeout_buf[i]);
/*
 * close_mime: the body of this function is not visible in this listing.
 *
 * NOTE(review): the embedded numbering jumps from 5359 to 5369, so the
 * statements below may belong to a separate function whose header line is
 * missing -- confirm against the full file before relying on this grouping.
 *
 * What the visible statements do: dispatch on mimeout_mode and, for two of
 * the cases, emit through o_mputc the Base64 digit for the bits still held
 * in b64c ((b64c & 0x3) << 4 in one case, (b64c & 0xF) << 2 in the other;
 * any padding output is not visible).  Afterwards the code distinguishes
 * mimeout_f != FIXED_MIME from the fixed case in which mimeout_mode is
 * not 'Q'; the actions taken in those branches are not visible.
 */
5359 void close_mime(void)
5369 switch(mimeout_mode) {
5374 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5380 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5386 if (mimeout_f!=FIXED_MIME) {
5388 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: emit one character c through o_mputc in the encoding
 * selected by mimeout_mode.
 *
 * Visible behaviour:
 *   - a character for which nkf_isalnum(c) is false is written as two
 *     upper-case hexadecimal digits (high nibble, then low nibble);
 *   - the Base64 path emits basis_64[] digits: c >> 2 for the first byte of
 *     a group, then the low 2 bits of b64c joined with the high 4 bits of
 *     c, then the low 4 bits of b64c joined with the top 2 bits of c
 *     followed by the low 6 bits of c.
 *
 * NOTE(review): the case labels, the updates of b64c / mimeout_mode /
 * base64_count and any escape prefix written before the hex digits are not
 * visible in this listing.
 */
5393 void mimeout_addchar(nkf_char c)
5395 switch(mimeout_mode) {
5400 } else if(!nkf_isalnum(c)) {
/* two hex digits: high nibble first */
5402 (*o_mputc)(itoh4(((c>>4)&0xf)));
5403 (*o_mputc)(itoh4((c&0xf)));
/* Base64: top 6 bits of the byte */
5412 (*o_mputc)(basis_64[c>>2]);
/* Base64: low 2 bits of b64c + high 4 bits of c */
5417 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* Base64: low 4 bits of b64c + top 2 bits of c, then the low 6 bits of c */
5423 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5424 (*o_mputc)(basis_64[c & 0x3F]);
/* The (c2, c1) pair most recently passed to mime_prechar(). */
5435 nkf_char mime_lastchar2, mime_lastchar1;
/*
 * mime_prechar: hook run with a character pair (c2, c1) before it is MIME
 * encoded.
 *
 * When base64_count + mimeout_buf_count/3*4 exceeds 66, three pairs are
 * sent to o_base64conv: (EOF,0), then (0,NL), then (0,SPACE) -- presumably
 * ending the current encoded word and continuing on a folded line.  The
 * pair is finally recorded in mime_lastchar2 / mime_lastchar1.
 *
 * NOTE(review): the conditions guarding the length test are not visible.
 * The text starting at "else if (mime_lastchar2)" follows a comment opener
 * whose terminator is not visible in this listing, so it cannot be told
 * from here which of the later branches are live code -- confirm against
 * the full file.
 */
5437 void mime_prechar(nkf_char c2, nkf_char c1)
5441 if (base64_count + mimeout_buf_count/3*4> 66){
5442 (*o_base64conv)(EOF,0);
5443 (*o_base64conv)(0,NL);
5444 (*o_base64conv)(0,SPACE);
5446 }/*else if (mime_lastchar2){
5447 if (c1 <=DEL && !nkf_isspace(c1)){
5448 (*o_base64conv)(0,SPACE);
5452 if (c2 && mime_lastchar2 == 0
5453 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5454 (*o_base64conv)(0,SPACE);
5457 mime_lastchar2 = c2;
5458 mime_lastchar1 = c1;
/*
 * mime_putc: MIME-encoding output stage.  c is the next output character,
 * or EOF to flush.
 *
 * Two regimes are visible:
 *   - mimeout_f == FIXED_MIME: both the 'Q' branch and the other branch
 *     test base64_count > 71 before handling c (presumably a line-length
 *     limit; the actions taken are not visible).
 *   - otherwise: characters are collected in mimeout_buf.  While no encoded
 *     word is open (!mimeout_mode), characters with c <= DEL in ASCII or
 *     ISO8859_1 output mode are buffered or written raw through o_mputc,
 *     and open_mime(output_mode) is called when the buffer exceeds
 *     MIMEOUT_BUF_LENGTH or on the other visible path at the end of that
 *     branch.  While a word is open, buffered characters are written either
 *     raw (o_mputc) or encoded (mimeout_addchar) depending on the last
 *     buffered character and on c.
 *
 * NOTE(review): local declarations, loop headers, else lines, closing
 * braces and most short statements (including calls that close the encoded
 * word and updates of base64_count) are missing from this listing, so the
 * nesting of the visible lines cannot be determined here.
 */
5461 void mime_putc(nkf_char c)
5466 if (mimeout_f == FIXED_MIME){
5467 if (mimeout_mode == 'Q'){
5468 if (base64_count > 71){
5469 if (c!=CR && c!=NL) {
5476 if (base64_count > 71){
5481 if (c == EOF) { /* c==EOF */
5485 if (c != EOF) { /* c!=EOF */
5491 /* mimeout_f != FIXED_MIME */
5493 if (c == EOF) { /* c==EOF */
/* flush: remember the buffered count in j and mark the buffer empty
   before draining it */
5494 j = mimeout_buf_count;
5495 mimeout_buf_count = 0;
5499 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5502 mimeout_addchar(mimeout_buf[i]);
5506 mimeout_addchar(mimeout_buf[i]);
5510 mimeout_addchar(mimeout_buf[i]);
5516 if (mimeout_mode=='Q') {
5517 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
/* lastchar: the most recently buffered character, if any */
5529 if (mimeout_buf_count > 0){
5530 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* no encoded word is currently open */
5535 if (!mimeout_mode) {
5536 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5537 if (nkf_isspace(c)) {
5538 if (c==CR || c==NL) {
/* write the buffered characters out unencoded */
5541 for (i=0;i<mimeout_buf_count;i++) {
5542 (*o_mputc)(mimeout_buf[i]);
5543 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
/* restart the buffer with c as its only character */
5549 mimeout_buf[0] = (char)c;
5550 mimeout_buf_count = 1;
5552 if (base64_count > 1
5553 && base64_count + mimeout_buf_count > 76){
5556 if (!nkf_isspace(mimeout_buf[0])){
/* buffer c; once the buffer overflows, start an encoded word */
5561 mimeout_buf[mimeout_buf_count++] = (char)c;
5562 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5563 open_mime(output_mode);
5568 if (lastchar==CR || lastchar == NL){
5569 for (i=0;i<mimeout_buf_count;i++) {
5570 (*o_mputc)(mimeout_buf[i]);
5573 mimeout_buf_count = 0;
/* keep a trailing SPACE buffered and write everything before it raw */
5575 if (lastchar==SPACE) {
5576 for (i=0;i<mimeout_buf_count-1;i++) {
5577 (*o_mputc)(mimeout_buf[i]);
5580 mimeout_buf[0] = SPACE;
5581 mimeout_buf_count = 1;
5583 open_mime(output_mode);
5586 /* mimeout_mode == 'B', 1, 2 */
5587 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5588 if (lastchar == CR || lastchar == NL){
/* a blank after a line break: the buffered characters are encoded */
5589 if (nkf_isblank(c)) {
5590 for (i=0;i<mimeout_buf_count;i++) {
5591 mimeout_addchar(mimeout_buf[i]);
5593 mimeout_buf_count = 0;
/* a printable ASCII character after a line break: the buffered characters
   are written raw */
5594 } else if (SPACE<c && c<DEL) {
5596 for (i=0;i<mimeout_buf_count;i++) {
5597 (*o_mputc)(mimeout_buf[i]);
5600 mimeout_buf_count = 0;
5603 if (c==SPACE || c==TAB || c==CR || c==NL) {
5604 for (i=0;i<mimeout_buf_count;i++) {
5605 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5607 for (i=0;i<mimeout_buf_count;i++) {
5608 (*o_mputc)(mimeout_buf[i]);
5611 mimeout_buf_count = 0;
5614 mimeout_buf[mimeout_buf_count++] = (char)c;
5615 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5617 for (i=0;i<mimeout_buf_count;i++) {
5618 (*o_mputc)(mimeout_buf[i]);
5621 mimeout_buf_count = 0;
/* something is already buffered and c is above SPACE and not '=':
   keep buffering, and encode the whole buffer when it overflows */
5625 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5626 mimeout_buf[mimeout_buf_count++] = (char)c;
5627 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5628 j = mimeout_buf_count;
5629 mimeout_buf_count = 0;
5631 mimeout_addchar(mimeout_buf[i]);
5638 if (mimeout_buf_count>0) {
5639 j = mimeout_buf_count;
5640 mimeout_buf_count = 0;
5642 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5644 mimeout_addchar(mimeout_buf[i]);
5650 (*o_mputc)(mimeout_buf[i]);
5652 open_mime(output_mode);
/*
 * Option/state reset routine, compiled only when PERL_XS or WIN32DLL is
 * defined.
 *
 * The visible statements put the converter's global state back to fixed
 * values: option flags (mime_f, mime_decode_f, iso2022jp_f, the UTF-8
 * related flags and endianness, fold settings), the escape-sequence
 * introducers and fold margin, the output conversion hooks (all set to
 * DEFAULT_CONV or no_connection), the input un-get/get hooks (std_ungetc /
 * std_getc), prefix_table[] (all 256 entries zeroed), the MIME buffer
 * count and decode mode, and input_codename.
 *
 * NOTE(review): the function header (presumably reinit()), the matching
 * preprocessor end lines and many short assignments are not visible in
 * this listing -- this is not the complete list of state that is reset.
 */
5659 #if defined(PERL_XS) || defined(WIN32DLL)
5663 struct input_code *p = input_code_list;
5676 mime_f = STRICT_MIME;
5677 mime_decode_f = FALSE;
5682 #if defined(MSDOS) || defined(__OS2__)
5687 iso2022jp_f = FALSE;
5688 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5689 ms_ucs_map_f = UCS_MAP_ASCII;
5691 #ifdef UTF8_INPUT_ENABLE
5692 no_cp932ext_f = FALSE;
5693 no_best_fit_chars_f = FALSE;
5694 encode_fallback = NULL;
5695 unicode_subchar = '?';
5696 input_endian = ENDIAN_BIG;
5698 #ifdef UTF8_OUTPUT_ENABLE
5699 output_bom_f = FALSE;
5700 output_endian = ENDIAN_BIG;
5702 #ifdef UNICODE_NORMALIZATION
5715 is_inputcode_mixed = FALSE;
5716 is_inputcode_set = FALSE;
5720 #ifdef SHIFTJIS_CP932
/* clear every entry of the prefix table */
5730 for (i = 0; i < 256; i++){
5731 prefix_table[i] = 0;
5735 mimeout_buf_count = 0;
5740 fold_preserve_f = FALSE;
5743 kanji_intro = DEFAULT_J;
5744 ascii_intro = DEFAULT_R;
5745 fold_margin = FOLD_MARGIN;
/* output pipeline: default converter, every optional stage disconnected */
5746 output_conv = DEFAULT_CONV;
5747 oconv = DEFAULT_CONV;
5748 o_zconv = no_connection;
5749 o_fconv = no_connection;
5750 o_crconv = no_connection;
5751 o_rot_conv = no_connection;
5752 o_hira_conv = no_connection;
5753 o_base64conv = no_connection;
5754 o_iso2022jp_check_conv = no_connection;
/* input hooks: plain stdio-level get/unget */
5757 i_ungetc = std_ungetc;
5759 i_bungetc = std_ungetc;
5762 i_mungetc = std_ungetc;
5763 i_mgetc_buf = std_getc;
5764 i_mungetc_buf = std_ungetc;
5765 output_mode = ASCII;
5768 mime_decode_mode = FALSE;
5774 z_prev2=0,z_prev1=0;
5776 iconv_for_check = 0;
5778 input_codename = "";
/*
 * no_connection: placeholder for a two-argument conversion hook that has
 * not been connected.  It forwards (c2, c1) to no_connection2() with a
 * third argument of 0 and ignores the result.
 */
5785 void no_connection(nkf_char c2, nkf_char c1)
5787 no_connection2(c2,c1,0);
/*
 * no_connection2: placeholder for a three-argument conversion hook that
 * has not been connected.  It reports
 * "nkf internal module connection failure." on stderr; none of its
 * arguments are used by the visible statements.
 *
 * NOTE(review): a line between the message and the return is not visible
 * in this listing; the return is commented as being there for lint only,
 * so that hidden line presumably terminates the program -- confirm.
 */
5790 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5792 fprintf(stderr,"nkf internal module connection failure.\n");
5794 return 0; /* LINT */
/*
 * Usage printer: writes the command-line help text to stderr, one fprintf
 * call per output line.
 *
 * The line describing the j/s/e/w output options is chosen by whichever of
 * DEFAULT_CODE_SJIS / DEFAULT_CODE_JIS / DEFAULT_CODE_EUC /
 * DEFAULT_CODE_UTF8 is defined; further lines are included only under
 * UTF8_OUTPUT_ENABLE, UTF8_INPUT_ENABLE and NUMCHAR_OPTION.
 *
 * fprintf is first redefined as dllprintf, so every call below goes through
 * dllprintf wherever that definition is active.
 *
 * NOTE(review): the function header (presumably usage()), the condition
 * guarding the fprintf redefinition, the preprocessor end lines and the
 * guards around some option lines are not visible in this listing.
 */
5799 #define fprintf dllprintf
5803 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5804 fprintf(stderr,"Flags:\n");
5805 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
/* exactly one of the following four variants names the build's default
   output code */
5806 #ifdef DEFAULT_CODE_SJIS
5807 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5809 #ifdef DEFAULT_CODE_JIS
5810 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5812 #ifdef DEFAULT_CODE_EUC
5813 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5815 #ifdef DEFAULT_CODE_UTF8
5816 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5818 #ifdef UTF8_OUTPUT_ENABLE
5819 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5821 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5822 #ifdef UTF8_INPUT_ENABLE
5823 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5825 fprintf(stderr,"t no conversion\n");
5826 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5827 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5828 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5829 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5830 fprintf(stderr,"v Show this usage. V: show version\n");
5831 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5832 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5833 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5834 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5835 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5836 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5837 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5838 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5840 fprintf(stderr,"T Text mode output\n");
5842 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5843 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5844 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5845 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5846 fprintf(stderr,"\n");
/* long (double-dash) options */
5847 fprintf(stderr,"Long name options\n");
5848 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5849 fprintf(stderr," Specify the input or output codeset\n");
5850 fprintf(stderr," --fj --unix --mac --windows\n");
5851 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5852 fprintf(stderr," Convert for the system or code\n");
5853 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5854 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5855 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5857 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5859 #ifdef NUMCHAR_OPTION
5860 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5862 #ifdef UTF8_INPUT_ENABLE
5863 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5864 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5867 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5868 fprintf(stderr," Overwrite original listed files by filtered result\n");
5869 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5871 fprintf(stderr," -g --guess Guess the input code\n");
5872 fprintf(stderr," --help --version Show this help/the version\n");
5873 fprintf(stderr," For more information, see also man nkf\n");
5874 fprintf(stderr,"\n");
/*
 * Version printer: writes the banner
 * "Network Kanji Filter Version <NKF_VERSION> (<NKF_RELEASE_DATE>) ..." to
 * stderr, followed by the CopyRight string on its own line.
 *
 * The format string is assembled from adjacent string literals; the pieces
 * selected by the MSDOS / __WIN16__ / __WIN32__ / __OS2__ conditionals are
 * not visible in this listing.
 *
 * NOTE(review): the function header (presumably version()) and the
 * preprocessor end lines are not visible either.
 */
5880 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5881 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
5884 #if defined(MSDOS) && defined(__WIN16__)
5887 #if defined(MSDOS) && defined(__WIN32__)
5893 ,NKF_VERSION,NKF_RELEASE_DATE);
5894 fprintf(stderr,"\n%s\n",CopyRight);
5899 ** Patch authors
5900 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5901 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5902 ** ohta@src.ricoh.co.jp (Junn Ohta)
5903 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5904 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5905 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5906 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5907 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5908 ** GHG00637@nifty-serve.or.jp (COW)