1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.jp
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** About UTF-8 support
31 ** This version can be used as a drop-in replacement for the previous nkf.
32 ** When invoked as e.g. "nkf -e", input that auto-detection judges to be
33 ** UTF-8 is converted to euc-jp as is.
35 ** It is still quite likely to contain bugs
36 ** (especially in auto-detection, mixed encodings and error handling).
38 ** If you find any problem, please contact
39 ** E-Mail: furukawa@tcp-ip.or.jp
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.116 2006/11/03 20:14:43 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-11-04"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These deliberately avoid <ctype.h> so the results never depend on the
 * current locale.  Every use of a parameter is parenthesized so that
 * compound arguments (e.g. `x & 0xFF`, `a ? b : c`) expand correctly.
 * The argument is still evaluated more than once, so do not pass
 * expressions with side effects (`*p++`, function calls that consume input).
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (' ' <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c)      (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                         ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                         ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* True when the upper byte of the low 16 bits of c2 equals SS3. */
#define is_eucg3(c2)    (((unsigned short)(c2) >> 8) == SS3)
283 #define CP932_TABLE_BEGIN 0xFA
284 #define CP932_TABLE_END 0xFC
285 #define CP932INV_TABLE_BEGIN 0xED
286 #define CP932INV_TABLE_END 0xEE
287 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* rot13/47 mode */
470 static int hira_f = FALSE; /* hiragana/katakana conversion */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
508 #if defined(INT_IS_SHORT)
509 #define NKF_INT32_C(n) (n##L)
511 #define NKF_INT32_C(n) (n)
513 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
514 #define CLASS_MASK NKF_INT32_C(0xFF000000)
515 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
516 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
517 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
518 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
519 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
584 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
585 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
590 static int mimeout_mode = 0;
591 static int base64_count = 0;
593 /* X0208 -> ASCII converter */
596 static int f_line = 0; /* chars in line */
597 static int f_prev = 0;
598 static int fold_preserve_f = FALSE; /* preserve new lines */
599 static int fold_f = FALSE;
600 static int fold_len = 0;
603 static unsigned char kanji_intro = DEFAULT_J;
604 static unsigned char ascii_intro = DEFAULT_R;
608 #define FOLD_MARGIN 10
609 #define DEFAULT_FOLD 60
611 static int fold_margin = FOLD_MARGIN;
615 #ifdef DEFAULT_CODE_JIS
616 # define DEFAULT_CONV j_oconv
618 #ifdef DEFAULT_CODE_SJIS
619 # define DEFAULT_CONV s_oconv
621 #ifdef DEFAULT_CODE_EUC
622 # define DEFAULT_CONV e_oconv
624 #ifdef DEFAULT_CODE_UTF8
625 # define DEFAULT_CONV w_oconv
628 /* process default */
629 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
631 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
632 /* s_iconv or oconv */
633 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
635 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 /* static redirections */
645 static void (*o_putc)(nkf_char c) = std_putc;
647 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
648 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
650 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
651 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
653 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
655 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
656 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
658 /* for strict mime */
659 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
660 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
663 static int output_mode = ASCII, /* output kanji mode */
664 input_mode = ASCII, /* input kanji mode */
665 shift_mode = FALSE; /* TRUE shift out, or X0201 */
666 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
668 /* X0201 / X0208 conversion tables */
670 /* X0201 kana conversion table */
673 unsigned char cv[]= {
674 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
675 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
676 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
677 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
678 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
679 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
680 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
681 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
682 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
683 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
684 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
685 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
686 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
687 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
688 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
689 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
693 /* X0201 kana conversion table for dakuten */
696 unsigned char dv[]= {
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
702 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
703 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
704 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
705 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
706 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
708 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
715 /* X0201 kana conversion table for han-dakuten */
718 unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
741 unsigned char fv[] = {
743 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
744 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
745 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
746 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
747 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
748 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
749 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
751 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
753 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
760 static int file_out_f = FALSE;
762 static int overwrite_f = FALSE;
763 static int preserve_time_f = FALSE;
764 static int backup_f = FALSE;
765 static char *backup_suffix = "";
766 static char *get_backup_filename(const char *suffix, const char *filename);
769 static int crmode_f = 0; /* CR, NL, CRLF */
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
857 int is_argument_error = FALSE;
859 is_inputcode_mixed = FALSE;
860 is_inputcode_set = FALSE;
865 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
868 is_argument_error = TRUE;
876 /* reopen file for stdout */
877 if (file_out_f == TRUE) {
880 outfname = malloc(strlen(origfname)
881 + strlen(".nkftmpXXXXXX")
887 strcpy(outfname, origfname);
891 for (i = strlen(outfname); i; --i){
892 if (outfname[i - 1] == '/'
893 || outfname[i - 1] == '\\'){
899 strcat(outfname, "ntXXXXXX");
901 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
904 strcat(outfname, ".nkftmpXXXXXX");
905 fd = mkstemp(outfname);
908 || (fd_backup = dup(fileno(stdout))) < 0
909 || dup2(fd, fileno(stdout)) < 0
920 outfname = "nkf.out";
923 if(freopen(outfname, "w", stdout) == NULL) {
927 if (binmode_f == TRUE) {
928 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
929 if (freopen("","wb",stdout) == NULL)
936 if (binmode_f == TRUE)
937 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
938 if (freopen("","rb",fin) == NULL)
943 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
947 char *filename = NULL;
949 if (nfiles > 1) filename = origfname;
950 if (guess_f) print_guessed_code(filename);
956 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
964 if (dup2(fd_backup, fileno(stdout)) < 0){
967 if (stat(origfname, &sb)) {
968 fprintf(stderr, "Can't stat %s\n", origfname);
970 /* restore permissions */
971 if (chmod(outfname, sb.st_mode)) {
972 fprintf(stderr, "Can't set permission %s\n", outfname);
975 /* restore timestamp */
977 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
978 tb[0] = tb[1] = sb.st_mtime;
979 if (utime(outfname, tb)) {
980 fprintf(stderr, "Can't set timestamp %s\n", outfname);
983 tb.actime = sb.st_atime;
984 tb.modtime = sb.st_mtime;
985 if (utime(outfname, &tb)) {
986 fprintf(stderr, "Can't set timestamp %s\n", outfname);
991 char *backup_filename = get_backup_filename(backup_suffix, origfname);
993 unlink(backup_filename);
995 if (rename(origfname, backup_filename)) {
996 perror(backup_filename);
997 fprintf(stderr, "Can't rename %s to %s\n",
998 origfname, backup_filename);
1002 if (unlink(origfname)){
1007 if (rename(outfname, origfname)) {
1009 fprintf(stderr, "Can't rename %s to %s\n",
1010 outfname, origfname);
1017 if (is_argument_error)
1020 #ifdef EASYWIN /*Easy Win */
1021 if (file_out_f == FALSE)
1022 scanf("%d",&end_check);
1025 #else /* for Other OS */
1026 if (file_out_f == TRUE)
1028 #endif /*Easy Win */
1031 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 * If `suffix` contains one or more '*', each '*' is replaced by `filename`
 * and the remaining characters of `suffix` are copied verbatim.
 * Otherwise the result is `filename` immediately followed by `suffix`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller (release
 * it with free()), or NULL after reporting via perror() when memory cannot
 * be allocated.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t i, j;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* every '*' disappears and is replaced by one copy of filename;
         * asterisk_count <= suffix_length, so the subtraction cannot wrap */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /* room for filename + suffix + terminating NUL */
        backup_filename = malloc(filename_length + suffix_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        /* "+ 1" copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1099 {"katakana-hiragana","h3"},
1106 #ifdef UTF8_OUTPUT_ENABLE
1116 {"fb-subchar=", ""},
1118 #ifdef UTF8_INPUT_ENABLE
1119 {"utf8-input", "W"},
1120 {"utf16-input", "W16"},
1121 {"no-cp932ext", ""},
1122 {"no-best-fit-chars",""},
1124 #ifdef UNICODE_NORMALIZATION
1125 {"utf8mac-input", ""},
1137 #ifdef NUMCHAR_OPTION
1138 {"numchar-input", ""},
1144 #ifdef SHIFTJIS_CP932
1154 static int option_mode = 0;
1156 void options(unsigned char *cp)
1160 unsigned char *cp_back = NULL;
1165 while(*cp && *cp++!='-');
1166 while (*cp || cp_back) {
1174 case '-': /* literal options */
1175 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1179 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1180 p = (unsigned char *)long_option[i].name;
1181 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1182 if (*p == cp[j] || cp[j] == ' '){
1189 while(*cp && *cp != SPACE && cp++);
1190 if (long_option[i].alias[0]){
1192 cp = (unsigned char *)long_option[i].alias;
1194 if (strcmp(long_option[i].name, "ic=") == 0){
1195 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1196 codeset[i] = nkf_toupper(p[i]);
1199 if(strcmp(codeset, "ISO-2022-JP") == 0){
1200 input_f = JIS_INPUT;
1201 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1202 strcmp(codeset, "CP50220") == 0 ||
1203 strcmp(codeset, "CP50221") == 0 ||
1204 strcmp(codeset, "CP50222") == 0){
1205 input_f = JIS_INPUT;
1206 #ifdef SHIFTJIS_CP932
1209 #ifdef UTF8_OUTPUT_ENABLE
1210 ms_ucs_map_f = UCS_MAP_CP932;
1212 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1213 input_f = JIS_INPUT;
1217 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1218 input_f = JIS_INPUT;
1223 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1224 input_f = SJIS_INPUT;
1225 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1226 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1227 strcmp(codeset, "CP932") == 0 ||
1228 strcmp(codeset, "MS932") == 0){
1229 input_f = SJIS_INPUT;
1230 #ifdef SHIFTJIS_CP932
1233 #ifdef UTF8_OUTPUT_ENABLE
1234 ms_ucs_map_f = UCS_MAP_CP932;
1236 }else if(strcmp(codeset, "EUCJP") == 0 ||
1237 strcmp(codeset, "EUC-JP") == 0){
1238 input_f = EUC_INPUT;
1239 }else if(strcmp(codeset, "CP51932") == 0){
1240 input_f = EUC_INPUT;
1241 #ifdef SHIFTJIS_CP932
1244 #ifdef UTF8_OUTPUT_ENABLE
1245 ms_ucs_map_f = UCS_MAP_CP932;
1247 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1248 strcmp(codeset, "EUCJP-MS") == 0 ||
1249 strcmp(codeset, "EUCJPMS") == 0){
1250 input_f = EUC_INPUT;
1251 #ifdef SHIFTJIS_CP932
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_MS;
1257 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1258 strcmp(codeset, "EUCJP-ASCII") == 0){
1259 input_f = EUC_INPUT;
1260 #ifdef SHIFTJIS_CP932
1263 #ifdef UTF8_OUTPUT_ENABLE
1264 ms_ucs_map_f = UCS_MAP_ASCII;
1266 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1267 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1268 input_f = SJIS_INPUT;
1270 #ifdef SHIFTJIS_CP932
1274 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1275 strcmp(codeset, "EUC-JIS-2004") == 0){
1276 input_f = EUC_INPUT;
1278 #ifdef SHIFTJIS_CP932
1282 #ifdef UTF8_INPUT_ENABLE
1283 }else if(strcmp(codeset, "UTF-8") == 0 ||
1284 strcmp(codeset, "UTF-8N") == 0 ||
1285 strcmp(codeset, "UTF-8-BOM") == 0){
1286 input_f = UTF8_INPUT;
1287 #ifdef UNICODE_NORMALIZATION
1288 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1289 strcmp(codeset, "UTF-8-MAC") == 0){
1290 input_f = UTF8_INPUT;
1293 }else if(strcmp(codeset, "UTF-16") == 0 ||
1294 strcmp(codeset, "UTF-16BE") == 0 ||
1295 strcmp(codeset, "UTF-16BE-BOM") == 0){
1296 input_f = UTF16_INPUT;
1297 input_endian = ENDIAN_BIG;
1298 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1299 strcmp(codeset, "UTF-16LE-BOM") == 0){
1300 input_f = UTF16_INPUT;
1301 input_endian = ENDIAN_LITTLE;
1302 }else if(strcmp(codeset, "UTF-32") == 0 ||
1303 strcmp(codeset, "UTF-32BE") == 0 ||
1304 strcmp(codeset, "UTF-32BE-BOM") == 0){
1305 input_f = UTF32_INPUT;
1306 input_endian = ENDIAN_BIG;
1307 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1308 strcmp(codeset, "UTF-32LE-BOM") == 0){
1309 input_f = UTF32_INPUT;
1310 input_endian = ENDIAN_LITTLE;
1315 if (strcmp(long_option[i].name, "oc=") == 0){
1316 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1317 codeset[i] = nkf_toupper(p[i]);
1320 if(strcmp(codeset, "ISO-2022-JP") == 0){
1321 output_conv = j_oconv;
1322 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1323 output_conv = j_oconv;
1325 no_cp932ext_f = TRUE;
1326 #ifdef SHIFTJIS_CP932
1329 #ifdef UTF8_OUTPUT_ENABLE
1330 ms_ucs_map_f = UCS_MAP_CP932;
1332 }else if(strcmp(codeset, "CP50220") == 0){
1333 output_conv = j_oconv;
1334 #ifdef SHIFTJIS_CP932
1337 #ifdef UTF8_OUTPUT_ENABLE
1338 ms_ucs_map_f = UCS_MAP_CP932;
1340 }else if(strcmp(codeset, "CP50221") == 0){
1341 output_conv = j_oconv;
1343 #ifdef SHIFTJIS_CP932
1346 #ifdef UTF8_OUTPUT_ENABLE
1347 ms_ucs_map_f = UCS_MAP_CP932;
1349 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1350 output_conv = j_oconv;
1354 #ifdef SHIFTJIS_CP932
1357 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1358 output_conv = j_oconv;
1363 #ifdef SHIFTJIS_CP932
1366 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1367 output_conv = j_oconv;
1372 #ifdef SHIFTJIS_CP932
1375 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1376 output_conv = s_oconv;
1377 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1378 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1379 strcmp(codeset, "CP932") == 0 ||
1380 strcmp(codeset, "MS932") == 0){
1381 output_conv = s_oconv;
1383 #ifdef SHIFTJIS_CP932
1387 #ifdef UTF8_OUTPUT_ENABLE
1388 ms_ucs_map_f = UCS_MAP_CP932;
1390 }else if(strcmp(codeset, "EUCJP") == 0 ||
1391 strcmp(codeset, "EUC-JP") == 0){
1392 output_conv = e_oconv;
1393 }else if(strcmp(codeset, "CP51932") == 0){
1394 output_conv = e_oconv;
1396 #ifdef SHIFTJIS_CP932
1399 #ifdef UTF8_OUTPUT_ENABLE
1400 ms_ucs_map_f = UCS_MAP_CP932;
1402 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1403 strcmp(codeset, "EUCJP-MS") == 0 ||
1404 strcmp(codeset, "EUCJPMS") == 0){
1405 output_conv = e_oconv;
1410 #ifdef SHIFTJIS_CP932
1413 #ifdef UTF8_OUTPUT_ENABLE
1414 ms_ucs_map_f = UCS_MAP_MS;
1416 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1417 strcmp(codeset, "EUCJP-ASCII") == 0){
1418 output_conv = e_oconv;
1423 #ifdef SHIFTJIS_CP932
1426 #ifdef UTF8_OUTPUT_ENABLE
1427 ms_ucs_map_f = UCS_MAP_ASCII;
1429 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1430 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1431 output_conv = s_oconv;
1433 #ifdef SHIFTJIS_CP932
1436 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1437 strcmp(codeset, "EUC-JIS-2004") == 0){
1438 output_conv = e_oconv;
1443 #ifdef SHIFTJIS_CP932
1446 #ifdef UTF8_OUTPUT_ENABLE
1447 }else if(strcmp(codeset, "UTF-8") == 0){
1448 output_conv = w_oconv;
1449 }else if(strcmp(codeset, "UTF-8N") == 0){
1450 output_conv = w_oconv;
1451 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1452 output_conv = w_oconv;
1453 output_bom_f = TRUE;
1454 }else if(strcmp(codeset, "UTF-16BE") == 0){
1455 output_conv = w_oconv16;
1456 }else if(strcmp(codeset, "UTF-16") == 0 ||
1457 strcmp(codeset, "UTF-16BE-BOM") == 0){
1458 output_conv = w_oconv16;
1459 output_bom_f = TRUE;
1460 }else if(strcmp(codeset, "UTF-16LE") == 0){
1461 output_conv = w_oconv16;
1462 output_endian = ENDIAN_LITTLE;
1463 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1464 output_conv = w_oconv16;
1465 output_endian = ENDIAN_LITTLE;
1466 output_bom_f = TRUE;
1467 }else if(strcmp(codeset, "UTF-32") == 0 ||
1468 strcmp(codeset, "UTF-32BE") == 0){
1469 output_conv = w_oconv32;
1470 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1471 output_conv = w_oconv32;
1472 output_bom_f = TRUE;
1473 }else if(strcmp(codeset, "UTF-32LE") == 0){
1474 output_conv = w_oconv32;
1475 output_endian = ENDIAN_LITTLE;
1476 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1477 output_conv = w_oconv32;
1478 output_endian = ENDIAN_LITTLE;
1479 output_bom_f = TRUE;
1485 if (strcmp(long_option[i].name, "overwrite") == 0){
1488 preserve_time_f = TRUE;
1491 if (strcmp(long_option[i].name, "overwrite=") == 0){
1494 preserve_time_f = TRUE;
1496 backup_suffix = malloc(strlen((char *) p) + 1);
1497 strcpy(backup_suffix, (char *) p);
1500 if (strcmp(long_option[i].name, "in-place") == 0){
1503 preserve_time_f = FALSE;
1506 if (strcmp(long_option[i].name, "in-place=") == 0){
1509 preserve_time_f = FALSE;
1511 backup_suffix = malloc(strlen((char *) p) + 1);
1512 strcpy(backup_suffix, (char *) p);
1517 if (strcmp(long_option[i].name, "cap-input") == 0){
1521 if (strcmp(long_option[i].name, "url-input") == 0){
1526 #ifdef NUMCHAR_OPTION
1527 if (strcmp(long_option[i].name, "numchar-input") == 0){
1533 if (strcmp(long_option[i].name, "no-output") == 0){
1537 if (strcmp(long_option[i].name, "debug") == 0){
1542 if (strcmp(long_option[i].name, "cp932") == 0){
1543 #ifdef SHIFTJIS_CP932
1547 #ifdef UTF8_OUTPUT_ENABLE
1548 ms_ucs_map_f = UCS_MAP_CP932;
1552 if (strcmp(long_option[i].name, "no-cp932") == 0){
1553 #ifdef SHIFTJIS_CP932
1557 #ifdef UTF8_OUTPUT_ENABLE
1558 ms_ucs_map_f = UCS_MAP_ASCII;
1562 #ifdef SHIFTJIS_CP932
1563 if (strcmp(long_option[i].name, "cp932inv") == 0){
1570 if (strcmp(long_option[i].name, "x0212") == 0){
1577 if (strcmp(long_option[i].name, "exec-in") == 0){
1581 if (strcmp(long_option[i].name, "exec-out") == 0){
1586 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1587 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1588 no_cp932ext_f = TRUE;
1591 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1592 no_best_fit_chars_f = TRUE;
1595 if (strcmp(long_option[i].name, "fb-skip") == 0){
1596 encode_fallback = NULL;
1599 if (strcmp(long_option[i].name, "fb-html") == 0){
1600 encode_fallback = encode_fallback_html;
1603 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1604 encode_fallback = encode_fallback_xml;
1607 if (strcmp(long_option[i].name, "fb-java") == 0){
1608 encode_fallback = encode_fallback_java;
1611 if (strcmp(long_option[i].name, "fb-perl") == 0){
1612 encode_fallback = encode_fallback_perl;
1615 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1616 encode_fallback = encode_fallback_subchar;
1619 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1620 encode_fallback = encode_fallback_subchar;
1621 unicode_subchar = 0;
1623 /* decimal number */
1624 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1625 unicode_subchar *= 10;
1626 unicode_subchar += hex2bin(p[i]);
1628 }else if(p[1] == 'x' || p[1] == 'X'){
1629 /* hexadecimal number */
1630 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1631 unicode_subchar <<= 4;
1632 unicode_subchar |= hex2bin(p[i]);
1636 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1637 unicode_subchar *= 8;
1638 unicode_subchar += hex2bin(p[i]);
1641 w16e_conv(unicode_subchar, &i, &j);
1642 unicode_subchar = i<<8 | j;
1646 #ifdef UTF8_OUTPUT_ENABLE
1647 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1648 ms_ucs_map_f = UCS_MAP_MS;
1652 #ifdef UNICODE_NORMALIZATION
1653 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1654 input_f = UTF8_INPUT;
1659 if (strcmp(long_option[i].name, "prefix=") == 0){
1660 if (nkf_isgraph(p[0])){
1661 for (i = 1; nkf_isgraph(p[i]); i++){
1662 prefix_table[p[i]] = p[0];
1669 case 'b': /* buffered mode */
1672 case 'u': /* non bufferd mode */
1675 case 't': /* transparent mode */
1680 } else if (*cp=='2') {
1684 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1692 case 'j': /* JIS output */
1694 output_conv = j_oconv;
1696 case 'e': /* AT&T EUC output */
1697 output_conv = e_oconv;
1699 case 's': /* SJIS output */
1700 output_conv = s_oconv;
1702 case 'l': /* ISO8859 Latin-1 support, no conversion */
1703 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1704 input_f = LATIN1_INPUT;
1706 case 'i': /* Kanji IN ESC-$-@/B */
1707 if (*cp=='@'||*cp=='B')
1708 kanji_intro = *cp++;
1710 case 'o': /* ASCII IN ESC-(-J/B */
1711 if (*cp=='J'||*cp=='B'||*cp=='H')
1712 ascii_intro = *cp++;
1716 bit:1 katakana->hiragana
1717 bit:2 hiragana->katakana
1719 if ('9'>= *cp && *cp>='0')
1720 hira_f |= (*cp++ -'0');
1727 #if defined(MSDOS) || defined(__OS2__)
1742 #ifdef UTF8_OUTPUT_ENABLE
1743 case 'w': /* UTF-8 output */
1745 output_conv = w_oconv; cp++;
1749 output_bom_f = TRUE;
1752 if ('1'== cp[0] && '6'==cp[1]) {
1753 output_conv = w_oconv16; cp+=2;
1754 } else if ('3'== cp[0] && '2'==cp[1]) {
1755 output_conv = w_oconv32; cp+=2;
1757 output_conv = w_oconv;
1762 output_endian = ENDIAN_LITTLE;
1763 } else if (cp[0] == 'B') {
1771 output_bom_f = TRUE;
1776 #ifdef UTF8_INPUT_ENABLE
1777 case 'W': /* UTF input */
1780 input_f = UTF8_INPUT;
1782 if ('1'== cp[0] && '6'==cp[1]) {
1784 input_f = UTF16_INPUT;
1785 input_endian = ENDIAN_BIG;
1786 } else if ('3'== cp[0] && '2'==cp[1]) {
1788 input_f = UTF32_INPUT;
1789 input_endian = ENDIAN_BIG;
1791 input_f = UTF8_INPUT;
1796 input_endian = ENDIAN_LITTLE;
1797 } else if (cp[0] == 'B') {
1803 /* Input code assumption */
1804 case 'J': /* JIS input */
1805 input_f = JIS_INPUT;
1807 case 'E': /* AT&T EUC input */
1808 input_f = EUC_INPUT;
1810 case 'S': /* MS Kanji input */
1811 input_f = SJIS_INPUT;
1812 if (x0201_f==NO_X0201) x0201_f=TRUE;
1814 case 'Z': /* Convert X0208 alphabet to asii */
1815 /* bit:0 Convert X0208
1816 bit:1 Convert Kankaku to one space
1817 bit:2 Convert Kankaku to two spaces
1818 bit:3 Convert HTML Entity
1820 if ('9'>= *cp && *cp>='0')
1821 alpha_f |= 1<<(*cp++ -'0');
1825 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1826 x0201_f = FALSE; /* No X0201->X0208 conversion */
1828 ESC-(-I in JIS, EUC, MS Kanji
1829 SI/SO in JIS, EUC, MS Kanji
1830 SSO in EUC, JIS, not in MS Kanji
1831 MS Kanji (0xa0-0xdf)
1833 ESC-(-I in JIS (0x20-0x5f)
1834 SSO in EUC (0xa0-0xdf)
1835 0xa0-0xd in MS Kanji (0xa0-0xdf)
1838 case 'X': /* Assume X0201 kana */
1839 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1842 case 'F': /* prserve new lines */
1843 fold_preserve_f = TRUE;
1844 case 'f': /* folding -f60 or -f */
1847 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1849 fold_len += *cp++ - '0';
1851 if (!(0<fold_len && fold_len<BUFSIZ))
1852 fold_len = DEFAULT_FOLD;
1856 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1858 fold_margin += *cp++ - '0';
1862 case 'm': /* MIME support */
1863 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1864 if (*cp=='B'||*cp=='Q') {
1865 mime_decode_mode = *cp++;
1866 mimebuf_f = FIXED_MIME;
1867 } else if (*cp=='N') {
1868 mime_f = TRUE; cp++;
1869 } else if (*cp=='S') {
1870 mime_f = STRICT_MIME; cp++;
1871 } else if (*cp=='0') {
1872 mime_decode_f = FALSE;
1873 mime_f = FALSE; cp++;
1876 case 'M': /* MIME output */
1879 mimeout_f = FIXED_MIME; cp++;
1880 } else if (*cp=='Q') {
1882 mimeout_f = FIXED_MIME; cp++;
1887 case 'B': /* Broken JIS support */
1889 bit:1 allow any x on ESC-(-x or ESC-$-x
1890 bit:2 reset to ascii on NL
1892 if ('9'>= *cp && *cp>='0')
1893 broken_f |= 1<<(*cp++ -'0');
1898 case 'O':/* for Output file */
1902 case 'c':/* add cr code */
1905 case 'd':/* delete cr code */
1908 case 'I': /* ISO-2022-JP output */
1911 case 'L': /* line mode */
1912 if (*cp=='u') { /* unix */
1913 crmode_f = NL; cp++;
1914 } else if (*cp=='m') { /* mac */
1915 crmode_f = CR; cp++;
1916 } else if (*cp=='w') { /* windows */
1917 crmode_f = CRLF; cp++;
1918 } else if (*cp=='0') { /* no conversion */
1928 /* module muliple options in a string are allowed for Perl moudle */
1929 while(*cp && *cp++!='-');
1932 /* bogus option but ignored */
/* Look up the input_code_list entry whose iconv_func equals the given
 * converter function.
 * NOTE(review): the loop and the return statements are absent from this
 * listing; presumably the matching entry is returned -- confirm. */
1938 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1941 struct input_code *p = input_code_list;
1943 if (iconv_func == p->iconv_func){
/* Select the input converter.
 * Visible behaviour: under INPUT_CODE_FIX a condition additionally requires
 * f == -TRUE ("force") or input_f being unset.  When estab_f is set and the
 * current iconv differs from iconv_for_check, the entry for iconv is looked
 * up with find_inputcode_byfunc(), its name is recorded through
 * set_input_codename() and handed to debug(), and iconv_for_check is set to
 * iconv.
 * NOTE(review): the statements assigning estab_f / iconv are absent from
 * this listing. */
1952 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1954 #ifdef INPUT_CODE_FIX
1962 #ifdef INPUT_CODE_FIX
1963 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1969 if (estab_f && iconv_for_check != iconv){
1970 struct input_code *p = find_inputcode_byfunc(iconv);
1972 set_input_codename(p->name);
1973 debug(input_codename);
1975 iconv_for_check = iconv;
/* Flag bits OR-ed into an input_code's score field (see set_code_score);
 * each flag is the previous one shifted left by one bit. */
1980 #define SCORE_L2 (1) /* JIS level-2 kanji */
1981 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1982 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1983 #ifdef SHIFTJIS_CP932
1984 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped via CP932 */
1985 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1987 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1989 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1990 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1992 #define SCORE_INIT (SCORE_iMIME)
/* Scores used by code_score(), indexed by (c2 & 0x0f), when
 * (c2 & 0x70) == 0x20.
 * NOTE(review): only part of the initializer appears in this listing. */
1994 const nkf_char score_table_A0[] = {
1997 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1998 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Scores used by code_score(), indexed by (c2 & 0x0f), when
 * (c2 & 0x70) == 0x70. */
2001 const nkf_char score_table_F0[] = {
2002 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2003 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2004 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2005 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given SCORE_* bits into ptr->score.
 * NOTE(review): any guard around the assignment is absent from this listing. */
2008 void set_code_score(struct input_code *ptr, nkf_char score)
2011 ptr->score |= score;
/* Clear the given SCORE_* bits from ptr->score.
 * NOTE(review): any guard around the assignment is absent from this listing. */
2015 void clr_code_score(struct input_code *ptr, nkf_char score)
2018 ptr->score &= ~score;
/* Update ptr's score from the character held in ptr->buf (buf[0] is used as
 * c2; buf[1] as c1 in UTF8_OUTPUT_ENABLE builds):
 *   - c2 == SSO                       -> SCORE_KANA
 *   - e2w_conv(c2, c1) yields 0       -> SCORE_NO_EXIST (UTF8_OUTPUT_ENABLE)
 *   - (c2 & 0x70) == 0x20             -> score_table_A0[c2 & 0x0f]
 *   - (c2 & 0x70) == 0x70             -> score_table_F0[c2 & 0x0f]
 *   - (c2 & 0x70) >= 0x50             -> SCORE_L2
 * NOTE(review): the condition that leads to SCORE_ERROR is absent from this
 * listing. */
2022 void code_score(struct input_code *ptr)
2024 nkf_char c2 = ptr->buf[0];
2025 #ifdef UTF8_OUTPUT_ENABLE
2026 nkf_char c1 = ptr->buf[1];
2029 set_code_score(ptr, SCORE_ERROR);
2030 }else if (c2 == SSO){
2031 set_code_score(ptr, SCORE_KANA);
2032 #ifdef UTF8_OUTPUT_ENABLE
2033 }else if (!e2w_conv(c2, c1)){
2034 set_code_score(ptr, SCORE_NO_EXIST);
2036 }else if ((c2 & 0x70) == 0x20){
2037 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2038 }else if ((c2 & 0x70) == 0x70){
2039 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2040 }else if ((c2 & 0x70) >= 0x50){
2041 set_code_score(ptr, SCORE_L2);
/* Rule out the candidate ptr.  If its converter is the currently selected
 * iconv, the selection is withdrawn with set_iconv(FALSE, 0).
 * NOTE(review): other statements of this function are absent from this
 * listing. */
2045 void status_disable(struct input_code *ptr)
2050 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to ptr->buf and advance ptr->index. */
2053 void status_push_ch(struct input_code *ptr, nkf_char c)
2055 ptr->buf[ptr->index++] = c;
/* NOTE(review): the body is absent from this listing; presumably resets the
 * candidate's per-character state -- confirm against the full source. */
2058 void status_clear(struct input_code *ptr)
/* Set ptr->score back to SCORE_INIT.
 * NOTE(review): other statements of this function are absent from this
 * listing. */
2064 void status_reset(struct input_code *ptr)
2067 ptr->score = SCORE_INIT;
/* Zero ptr->_file_stat.
 * NOTE(review): other statements of this function are absent from this
 * listing. */
2070 void status_reinit(struct input_code *ptr)
2073 ptr->_file_stat = 0;
/* Acts only when c <= DEL and estab_f is set (an input code is already
 * established).
 * NOTE(review): the action taken inside the condition is absent from this
 * listing. */
2076 void status_check(struct input_code *ptr, nkf_char c)
2078 if (c <= DEL && estab_f){
/* Feed one input byte c to the Shift_JIS candidate ptr.
 * Visible handling:
 *   - 0xa1..0xdf is pushed as the pair SSO, c;
 *   - 0x81..0x9f and 0xe0..0xef are pushed as a first byte;
 *   - with SHIFTJIS_CP932, bytes accepted by is_ibmext_in_sjis() are pushed;
 *   - with x0212_f, 0xf0..0xfc is pushed;
 *   - a following byte in 0x40..0x7e or 0x80..0xfc is pushed and the buffered
 *     pair is converted in place with s2e_conv(); in the SHIFTJIS_CP932
 *     branch a zero return from s2e_conv() adds SCORE_CP932;
 *   - anything else calls status_disable().
 * NOTE(review): the switch on the candidate's state and several statements
 * are absent from this listing. */
2083 void s_status(struct input_code *ptr, nkf_char c)
2087 status_check(ptr, c);
2092 #ifdef NUMCHAR_OPTION
2093 }else if (is_unicode_capsule(c)){
2096 }else if (0xa1 <= c && c <= 0xdf){
2097 status_push_ch(ptr, SSO);
2098 status_push_ch(ptr, c);
2101 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2103 status_push_ch(ptr, c);
2104 #ifdef SHIFTJIS_CP932
2106 && is_ibmext_in_sjis(c)){
2108 status_push_ch(ptr, c);
2109 #endif /* SHIFTJIS_CP932 */
2111 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2113 status_push_ch(ptr, c);
2114 #endif /* X0212_ENABLE */
2116 status_disable(ptr);
2120 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2121 status_push_ch(ptr, c);
2122 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2126 status_disable(ptr);
2130 #ifdef SHIFTJIS_CP932
2131 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2132 status_push_ch(ptr, c);
2133 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2134 set_code_score(ptr, SCORE_CP932);
2139 #endif /* SHIFTJIS_CP932 */
2140 #ifndef X0212_ENABLE
2141 status_disable(ptr);
/* Feed one input byte c to the candidate ptr (presumably the EUC-JP
 * candidate -- confirm against input_code_list).
 * Visible handling:
 *   - SSO or 0xa1..0xfe is pushed as a first byte;
 *   - 0x8f is pushed as a first byte (X0212_ENABLE section);
 *   - following bytes must lie in 0xa1..0xfe to be pushed;
 *   - anything else calls status_disable().
 * NOTE(review): the switch on the candidate's state and several statements
 * are absent from this listing. */
2147 void e_status(struct input_code *ptr, nkf_char c)
2151 status_check(ptr, c);
2156 #ifdef NUMCHAR_OPTION
2157 }else if (is_unicode_capsule(c)){
2160 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2162 status_push_ch(ptr, c);
2164 }else if (0x8f == c){
2166 status_push_ch(ptr, c);
2167 #endif /* X0212_ENABLE */
2169 status_disable(ptr);
2173 if (0xa1 <= c && c <= 0xfe){
2174 status_push_ch(ptr, c);
2178 status_disable(ptr);
2183 if (0xa1 <= c && c <= 0xfe){
2185 status_push_ch(ptr, c);
2187 status_disable(ptr);
2189 #endif /* X0212_ENABLE */
2193 #ifdef UTF8_INPUT_ENABLE
/* Feed one input byte c to the UTF-8 candidate ptr.
 * Visible handling:
 *   - lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are pushed;
 *   - following bytes must lie in 0x80..0xbf to be pushed;
 *   - once ptr->index exceeds ptr->stat, the buffer is checked for the
 *     sequence EF BB BF (kept in the local `bom`) and the first three bytes
 *     are converted in place with w2e_conv();
 *   - anything else calls status_disable().
 * NOTE(review): the switch on the candidate's state and several statements
 * are absent from this listing. */
2194 void w_status(struct input_code *ptr, nkf_char c)
2198 status_check(ptr, c);
2203 #ifdef NUMCHAR_OPTION
2204 }else if (is_unicode_capsule(c)){
2207 }else if (0xc0 <= c && c <= 0xdf){
2209 status_push_ch(ptr, c);
2210 }else if (0xe0 <= c && c <= 0xef){
2212 status_push_ch(ptr, c);
2213 }else if (0xf0 <= c && c <= 0xf4){
2215 status_push_ch(ptr, c);
2217 status_disable(ptr);
2222 if (0x80 <= c && c <= 0xbf){
2223 status_push_ch(ptr, c);
2224 if (ptr->index > ptr->stat){
2225 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2226 && ptr->buf[2] == 0xbf);
2227 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2228 &ptr->buf[0], &ptr->buf[1]);
2235 status_disable(ptr);
2239 if (0x80 <= c && c <= 0xbf){
2240 if (ptr->index < ptr->stat){
2241 status_push_ch(ptr, c);
2246 status_disable(ptr);
/* Pass byte c to the status_func of the entries in input_code_list.
 * When a `result` entry has been chosen and no input code is established yet
 * (!estab_f), its converter is installed with set_iconv(TRUE, ...);
 * otherwise, for c <= DEL, the list is walked again from its head.
 * NOTE(review): how `result` and `action_flag` are updated, and what the
 * second walk does, are absent from this listing. */
2253 void code_status(nkf_char c)
2255 int action_flag = 1;
2256 struct input_code *result = 0;
2257 struct input_code *p = input_code_list;
2259 if (!p->status_func) {
2263 if (!p->status_func)
2265 (p->status_func)(p, c);
2268 }else if(p->stat == 0){
2279 if (result && !estab_f){
2280 set_iconv(TRUE, result->iconv_func);
2281 }else if (c <= DEL){
2282 struct input_code *ptr = input_code_list;
/* Read one character.  The visible statement returns the most recently
 * pushed-back entry of std_gc_buf (pre-decrementing std_gc_ndx).
 * NOTE(review): the condition guarding that return and the read from f are
 * absent from this listing. */
2292 nkf_char std_getc(FILE *f)
2295 return std_gc_buf[--std_gc_ndx];
/* Push c back by storing it at std_gc_buf[std_gc_ndx++].  A full buffer
 * (std_gc_ndx == STD_GC_BUFSIZE) is tested first.
 * NOTE(review): the full-buffer handling and the return value are absent
 * from this listing. */
2301 nkf_char std_ungetc(nkf_char c, FILE *f)
2303 if (std_gc_ndx == STD_GC_BUFSIZE){
2306 std_gc_buf[std_gc_ndx++] = c;
/* NOTE(review): the body is absent from this listing; presumably writes c to
 * the standard output stream -- confirm against the full source. */
2311 void std_putc(nkf_char c)
2318 #if !defined(PERL_XS) && !defined(WIN32DLL)
/* Calls module_connection() and then reads characters through i_getc until
 * EOF.
 * NOTE(review): the per-character action and the return value are absent
 * from this listing. */
2319 nkf_char noconvert(FILE *f)
2324 module_connection();
2325 while ((c = (*i_getc)(f)) != EOF)
/* Wire up the conversion pipeline.
 * Output side: oconv starts as output_conv; each enabled filter
 * (base64_conv, cr_conv, rot_conv, iso2022jp_check_conv, hira_conv,
 * fold_conv, z_conv) saves the current oconv in its own o_* pointer and then
 * replaces oconv with itself.
 * Input side: i_getc / i_ungetc are wrapped the same way by the cap, url,
 * numchar, nfc, mime and broken getc/ungetc pairs.
 * Finally the input converter is forced with set_iconv(-TRUE, ...) according
 * to input_f; set_iconv(FALSE, e_iconv) also appears (presumably the
 * branch taken when input_f names no specific input code -- confirm).
 * NOTE(review): most of the conditions enabling each filter are absent from
 * this listing. */
2332 void module_connection(void)
2334 oconv = output_conv;
2337 /* replace continuation module, from output side */
2339 /* output redirection */
2341 if (noout_f || guess_f){
2348 if (mimeout_f == TRUE) {
2349 o_base64conv = oconv; oconv = base64_conv;
2351 /* base64_count = 0; */
2355 o_crconv = oconv; oconv = cr_conv;
2358 o_rot_conv = oconv; oconv = rot_conv;
2361 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2364 o_hira_conv = oconv; oconv = hira_conv;
2367 o_fconv = oconv; oconv = fold_conv;
2370 if (alpha_f || x0201_f) {
2371 o_zconv = oconv; oconv = z_conv;
2375 i_ungetc = std_ungetc;
2376 /* input redirection */
2379 i_cgetc = i_getc; i_getc = cap_getc;
2380 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2383 i_ugetc = i_getc; i_getc = url_getc;
2384 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2387 #ifdef NUMCHAR_OPTION
2389 i_ngetc = i_getc; i_getc = numchar_getc;
2390 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2393 #ifdef UNICODE_NORMALIZATION
2394 if (nfc_f && input_f == UTF8_INPUT){
2395 i_nfc_getc = i_getc; i_getc = nfc_getc;
2396 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2399 if (mime_f && mimebuf_f==FIXED_MIME) {
2400 i_mgetc = i_getc; i_getc = mime_getc;
2401 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2404 i_bgetc = i_getc; i_getc = broken_getc;
2405 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2407 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2408 set_iconv(-TRUE, e_iconv);
2409 } else if (input_f == SJIS_INPUT) {
2410 set_iconv(-TRUE, s_iconv);
2411 #ifdef UTF8_INPUT_ENABLE
2412 } else if (input_f == UTF8_INPUT) {
2413 set_iconv(-TRUE, w_iconv);
2414 } else if (input_f == UTF16_INPUT) {
2415 set_iconv(-TRUE, w_iconv16);
2416 } else if (input_f == UTF32_INPUT) {
2417 set_iconv(-TRUE, w_iconv32);
2420 set_iconv(FALSE, e_iconv);
2424 struct input_code *p = input_code_list;
2432 * Check and Ignore BOM
/* Inspect the first bytes of f for a byte-order mark.
 * Bytes are read through i_getc; whenever a prefix fails to match, the bytes
 * read so far are returned with i_ungetc in reverse order.  Judging from the
 * comparisons and the pushed-back values, the recognised signatures are:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * On a match set_iconv(TRUE, ...) is called and input_endian is assigned only
 * if iconv is the matching converter.
 * NOTE(review): the case labels and the conditions around set_iconv() are
 * absent from this listing; the first byte of each signature is inferred
 * from the i_ungetc calls. */
2434 void check_bom(FILE *f)
2437 switch(c2 = (*i_getc)(f)){
2439 if((c2 = (*i_getc)(f)) == 0x00){
2440 if((c2 = (*i_getc)(f)) == 0xFE){
2441 if((c2 = (*i_getc)(f)) == 0xFF){
2443 set_iconv(TRUE, w_iconv32);
2445 if (iconv == w_iconv32) {
2446 input_endian = ENDIAN_BIG;
2449 (*i_ungetc)(0xFF,f);
2450 }else (*i_ungetc)(c2,f);
2451 (*i_ungetc)(0xFE,f);
2452 }else if(c2 == 0xFF){
2453 if((c2 = (*i_getc)(f)) == 0xFE){
2455 set_iconv(TRUE, w_iconv32);
2457 if (iconv == w_iconv32) {
2458 input_endian = ENDIAN_2143;
2461 (*i_ungetc)(0xFF,f);
2462 }else (*i_ungetc)(c2,f);
2463 (*i_ungetc)(0xFF,f);
2464 }else (*i_ungetc)(c2,f);
2465 (*i_ungetc)(0x00,f);
2466 }else (*i_ungetc)(c2,f);
2467 (*i_ungetc)(0x00,f);
2470 if((c2 = (*i_getc)(f)) == 0xBB){
2471 if((c2 = (*i_getc)(f)) == 0xBF){
2473 set_iconv(TRUE, w_iconv);
2475 if (iconv == w_iconv) {
2478 (*i_ungetc)(0xBF,f);
2479 }else (*i_ungetc)(c2,f);
2480 (*i_ungetc)(0xBB,f);
2481 }else (*i_ungetc)(c2,f);
2482 (*i_ungetc)(0xEF,f);
2485 if((c2 = (*i_getc)(f)) == 0xFF){
2486 if((c2 = (*i_getc)(f)) == 0x00){
2487 if((c2 = (*i_getc)(f)) == 0x00){
2489 set_iconv(TRUE, w_iconv32);
2491 if (iconv == w_iconv32) {
2492 input_endian = ENDIAN_3412;
2495 (*i_ungetc)(0x00,f);
2496 }else (*i_ungetc)(c2,f);
2497 (*i_ungetc)(0x00,f);
2498 }else (*i_ungetc)(c2,f);
2500 set_iconv(TRUE, w_iconv16);
2502 if (iconv == w_iconv16) {
2503 input_endian = ENDIAN_BIG;
2506 (*i_ungetc)(0xFF,f);
2507 }else (*i_ungetc)(c2,f);
2508 (*i_ungetc)(0xFE,f);
2511 if((c2 = (*i_getc)(f)) == 0xFE){
2512 if((c2 = (*i_getc)(f)) == 0x00){
2513 if((c2 = (*i_getc)(f)) == 0x00){
2515 set_iconv(TRUE, w_iconv32);
2517 if (iconv == w_iconv32) {
2518 input_endian = ENDIAN_LITTLE;
2521 (*i_ungetc)(0x00,f);
2522 }else (*i_ungetc)(c2,f);
2523 (*i_ungetc)(0x00,f);
2524 }else (*i_ungetc)(c2,f);
2526 set_iconv(TRUE, w_iconv16);
2528 if (iconv == w_iconv16) {
2529 input_endian = ENDIAN_LITTLE;
2532 (*i_ungetc)(0xFE,f);
2533 }else (*i_ungetc)(c2,f);
2534 (*i_ungetc)(0xFF,f);
2543 Conversion main loop. Code detection only.
/* Main conversion loop over the stream f.
 * Calls module_connection(), then reads characters through i_getc until EOF.
 * Within the loop the visible code: defers to h_conv() while the input code
 * is not yet established; assembles UTF-16 / UTF-32 units according to
 * input_endian; handles X0201 kana; recognises "=?" MIME starts; interprets
 * ESC sequences (switching input_mode / shift_mode); and hands characters to
 * (*iconv)() or (*oconv)().
 * After the loop (*iconv)(EOF, 0, 0) is called, and when is_inputcode_set is
 * false the input_code_list entry with the lowest score is reported through
 * set_input_codename().
 * NOTE(review): many lines of this function (including NEXT / SEND / LAST
 * uses and the return value) are absent from this listing. */
2546 nkf_char kanji_convert(FILE *f)
2548 nkf_char c3, c2=0, c1, c0=0;
2549 int is_8bit = FALSE;
2551 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2552 #ifdef UTF8_INPUT_ENABLE
2553 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2560 output_mode = ASCII;
2563 #define NEXT continue /* no output, get next */
2564 #define SEND ; /* output c1 and c2, get next */
2565 #define LAST break /* end of loop, go closing */
2567 module_connection();
2570 while ((c1 = (*i_getc)(f)) != EOF) {
2571 #ifdef INPUT_CODE_FIX
2578 /* in case of 8th bit is on */
2579 if (!estab_f&&!mime_decode_mode) {
2580 /* in case of not established yet */
2581 /* It is still ambiguous */
2582 if (h_conv(f, c2, c1)==EOF)
2588 /* in case of already established */
2589 if (c1 < AT && !(X0208 && 0x80 <= c2 && c2 <= 0x92)) {
2590 /* ignore bogus code and not CP5022x UCD */
2598 /* second byte, 7 bit code */
2599 /* it might be kanji shifted */
2600 if ((c1 == DEL) || (c1 <= SPACE)) {
2601 /* ignore bogus first code */
2608 #ifdef UTF8_INPUT_ENABLE
2609 if (iconv == w_iconv16) {
2610 if (input_endian == ENDIAN_BIG) {
2612 if ((c1 = (*i_getc)(f)) != EOF) {
2613 if (0xD8 <= c2 && c2 <= 0xDB) {
2614 if ((c0 = (*i_getc)(f)) != EOF) {
2616 if ((c3 = (*i_getc)(f)) != EOF) {
2623 if ((c2 = (*i_getc)(f)) != EOF) {
2624 if (0xD8 <= c2 && c2 <= 0xDB) {
2625 if ((c3 = (*i_getc)(f)) != EOF) {
2626 if ((c0 = (*i_getc)(f)) != EOF) {
2635 } else if(iconv == w_iconv32){
2637 if((c2 = (*i_getc)(f)) != EOF &&
2638 (c1 = (*i_getc)(f)) != EOF &&
2639 (c0 = (*i_getc)(f)) != EOF){
2640 switch(input_endian){
2642 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2645 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2648 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2651 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2661 #ifdef NUMCHAR_OPTION
2662 if (is_unicode_capsule(c1)){
2668 if (!estab_f && !iso8859_f) {
2669 /* not established yet */
2672 } else { /* estab_f==TRUE */
2677 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2678 /* SJIS X0201 Case... */
2679 if(iso2022jp_f && x0201_f==NO_X0201) {
2680 (*oconv)(GETA1, GETA2);
2687 } else if (c1==SSO && iconv != s_iconv) {
2688 /* EUC X0201 Case */
2689 c1 = (*i_getc)(f); /* skip SSO */
2691 if (SSP<=c1 && c1<0xe0) {
2692 if(iso2022jp_f && x0201_f==NO_X0201) {
2693 (*oconv)(GETA1, GETA2);
2700 } else { /* bogus code, skip SSO and one byte */
2704 /* already established */
2709 } else if ((c1 > SPACE) && (c1 != DEL)) {
2710 /* in case of Roman characters */
2712 /* output 1 shifted byte */
2716 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2717 /* output 1 shifted byte */
2718 if(iso2022jp_f && x0201_f==NO_X0201) {
2719 (*oconv)(GETA1, GETA2);
2726 /* look like bogus code */
2729 } else if (input_mode == X0208 || input_mode == X0212 ||
2730 input_mode == X0213_1 || input_mode == X0213_2) {
2731 /* in case of Kanji shifted */
2734 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2735 /* Check MIME code */
2736 if ((c1 = (*i_getc)(f)) == EOF) {
2739 } else if (c1 == '?') {
2740 /* =? is mime conversion start sequence */
2741 if(mime_f == STRICT_MIME) {
2742 /* check in real detail */
2743 if (mime_begin_strict(f) == EOF)
2747 } else if (mime_begin(f) == EOF)
2757 /* normal ASCII code */
2760 } else if (!is_8bit && c1 == SI) {
2763 } else if (!is_8bit && c1 == SO) {
2766 } else if (!is_8bit && c1 == ESC ) {
2767 if ((c1 = (*i_getc)(f)) == EOF) {
2768 /* (*oconv)(0, ESC); don't send bogus code */
2770 } else if (c1 == '$') {
2771 if ((c1 = (*i_getc)(f)) == EOF) {
2773 (*oconv)(0, ESC); don't send bogus code
2774 (*oconv)(0, '$'); */
2776 } else if (c1 == '@'|| c1 == 'B') {
2777 /* This is kanji introduction */
2780 set_input_codename("ISO-2022-JP");
2782 debug(input_codename);
2785 } else if (c1 == '(') {
2786 if ((c1 = (*i_getc)(f)) == EOF) {
2787 /* don't send bogus code
2793 } else if (c1 == '@'|| c1 == 'B') {
2794 /* This is kanji introduction */
2799 } else if (c1 == 'D'){
2803 #endif /* X0212_ENABLE */
2804 } else if (c1 == (X0213_1&0x7F)){
2805 input_mode = X0213_1;
2808 } else if (c1 == (X0213_2&0x7F)){
2809 input_mode = X0213_2;
2813 /* could be some special code */
2820 } else if (broken_f&0x2) {
2821 /* accept any ESC-(-x as broken code ... */
2831 } else if (c1 == '(') {
2832 if ((c1 = (*i_getc)(f)) == EOF) {
2833 /* don't send bogus code
2835 (*oconv)(0, '('); */
2839 /* This is X0201 kana introduction */
2840 input_mode = X0201; shift_mode = X0201;
2842 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2843 /* This is X0208 kanji introduction */
2844 input_mode = ASCII; shift_mode = FALSE;
2846 } else if (broken_f&0x2) {
2847 input_mode = ASCII; shift_mode = FALSE;
2852 /* maintain various input_mode here */
2856 } else if ( c1 == 'N' || c1 == 'n' ){
2858 c3 = (*i_getc)(f); /* skip SS2 */
2859 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2874 } else if (c1 == ESC && iconv == s_iconv) {
2875 /* ESC in Shift_JIS */
2876 if ((c1 = (*i_getc)(f)) == EOF) {
2877 /* (*oconv)(0, ESC); don't send bogus code */
2879 } else if (c1 == '$') {
2881 if ((c1 = (*i_getc)(f)) == EOF) {
2883 (*oconv)(0, ESC); don't send bogus code
2884 (*oconv)(0, '$'); */
2887 if (('E' <= c1 && c1 <= 'G') ||
2888 ('O' <= c1 && c1 <= 'Q')) {
2896 static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2897 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
2898 while ((c1 = (*i_getc)(f)) != EOF) {
2899 if (SPACE <= c1 && c1 <= 'z') {
2900 (*oconv)(0, c1 + c0);
2901 } else break; /* c1 == SO */
2905 if (c1 == EOF) LAST;
2912 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2913 input_mode = ASCII; set_iconv(FALSE, 0);
2915 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2916 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2924 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2925 if ((c1=(*i_getc)(f))!=EOF) {
2929 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2941 } else if (c1 == DEL && input_mode == X0208 ) {
2951 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2954 if ((c0 = (*i_getc)(f)) != EOF) {
2957 if ((c3 = (*i_getc)(f)) != EOF) {
2959 (*iconv)(c2, c1, c0|c3);
2964 /* 3 bytes EUC or UTF-8 */
2965 if ((c0 = (*i_getc)(f)) != EOF) {
2967 (*iconv)(c2, c1, c0);
2974 if (ms_ucs_map_f == UCS_MAP_CP932 &&
2975 0x7F <= c2 && c2 <= 0x92 &&
2976 0x21 <= c1 && c1 <= 0x7E) {
2978 if(c1 == 0x7F) return 0;
2979 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2982 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2986 (*oconv)(PREFIX_EUCG3 | c2, c1);
2988 #endif /* X0212_ENABLE */
2990 (*oconv)(PREFIX_EUCG3 | c2, c1);
2993 (*oconv)(input_mode, c1); /* other special case */
2999 /* goto next_word */
3003 (*iconv)(EOF, 0, 0);
3004 if (!is_inputcode_set)
3007 struct input_code *p = input_code_list;
3008 struct input_code *result = p;
3010 if (p->score < result->score) result = p;
3013 set_input_codename(result->name);
/* Hold-and-decide conversion used while the input code is still ambiguous
 * (kanji_convert calls it when !estab_f && !mime_decode_mode).
 * Phase 1: bytes read through i_getc are stored with push_hold_buf() until
 * EOF, the code becomes established (estab_f), or the hold buffer is full.
 * If needed, the input_code_list entry with the lowest score is selected via
 * set_iconv(FALSE, ...).
 * Phase 2: the held bytes are replayed through (*iconv)(); when a character
 * needs more bytes than remain in hold_buf, the rest is read from f.
 * NOTE(review): the return type line, the return statements and several
 * conditions are absent from this listing. */
3020 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3022 nkf_char ret, c3, c0;
3026 /** it must NOT be in the kanji shifted sequence */
3027 /** it must NOT be written in JIS7 */
3028 /** and it must be after 2 byte 8bit code */
3034 while ((c1 = (*i_getc)(f)) != EOF) {
3040 if (push_hold_buf(c1) == EOF || estab_f){
3046 struct input_code *p = input_code_list;
3047 struct input_code *result = p;
3052 if (p->score < result->score){
3057 set_iconv(FALSE, result->iconv_func);
3062 ** 1) EOF is detected, or
3063 ** 2) Code is established, or
3064 ** 3) Buffer is FULL (but last word is pushed)
3066 ** in 1) and 3) cases, we continue to use
3067 ** Kanji codes by oconv and leave estab_f unchanged.
3072 while (hold_index < hold_count){
3073 c2 = hold_buf[hold_index++];
3075 #ifdef NUMCHAR_OPTION
3076 || is_unicode_capsule(c2)
3081 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3082 (*iconv)(X0201, c2, 0);
3085 if (hold_index < hold_count){
3086 c1 = hold_buf[hold_index++];
3096 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3099 if (hold_index < hold_count){
3100 c0 = hold_buf[hold_index++];
3101 } else if ((c0 = (*i_getc)(f)) == EOF) {
3107 if (hold_index < hold_count){
3108 c3 = hold_buf[hold_index++];
3109 } else if ((c3 = (*i_getc)(f)) == EOF) {
3114 (*iconv)(c2, c1, c0|c3);
3119 /* 3 bytes EUC or UTF-8 */
3120 if (hold_index < hold_count){
3121 c0 = hold_buf[hold_index++];
3122 } else if ((c0 = (*i_getc)(f)) == EOF) {
3128 (*iconv)(c2, c1, c0);
3131 if (c0 == EOF) break;
/* Append c2 (truncated to unsigned char) to hold_buf.
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is absent from this listing. */
3136 nkf_char push_hold_buf(nkf_char c2)
3138 if (hold_count >= HOLD_SIZE*2)
3140 hold_buf[hold_count++] = (unsigned char)c2;
3141 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/* Convert the Shift_JIS byte pair (c2, c1); results are presumably stored
 * through p2 / p1 (the stores are absent from this listing -- confirm).
 * Visible steps:
 *   - with cp51932_f, IBM-extension lead bytes are first remapped through
 *     the shiftjis_cp932 table;
 *   - without x0213_f, IBM-extension lead bytes are mapped through the
 *     shiftjis_x0212 table and tagged with PREFIX_EUCG3;
 *   - with x0213_f and c2 >= 0xF0, c2 is mapped to a PREFIX_EUCG3 row using
 *     shift_jisx0213_s1a3_table or arithmetic on c2;
 *   - otherwise c2 = 2*c2 - (SJ0162 or SJ6394), incremented when c1 > 0x9E,
 *     and c1 is reduced by SPACE (c1 > DEL) or 0x1F.
 * s_iconv() returns early with this function's result when it is nonzero. */
3144 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3146 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3149 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3150 #ifdef SHIFTJIS_CP932
3151 if (cp51932_f && is_ibmext_in_sjis(c2)){
3153 extern const unsigned short shiftjis_cp932[3][189];
3155 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3161 #endif /* SHIFTJIS_CP932 */
3163 if (!x0213_f && is_ibmext_in_sjis(c2)){
3165 extern const unsigned short shiftjis_x0212[3][189];
3167 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3170 c2 = PREFIX_EUCG3 | (val >> 8);
3183 if(x0213_f && c2 >= 0xF0){
3184 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3185 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3186 }else{ /* 78<=k<=94 */
3187 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3188 if (0x9E < c1) c2++;
3191 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3192 if (0x9E < c1) c2++;
3195 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3202 c2 = x0212_unshift(c2);
/* Shift_JIS input converter.
 * Visible branches:
 *   - c2 equal to EOF or 0, or below SPACE, is handled separately;
 *   - with ms_ucs_map_f == UCS_MAP_CP932, lead bytes 0xF0..0xF9 with trail
 *     bytes 0x40..0xFC become CLASS_UNICODE code points starting at 0xE000
 *     (188 per lead byte); a trail byte of 0x7F returns 0;
 *   - otherwise the pair is converted with s2e_conv() and a nonzero result
 *     is returned immediately.
 * NOTE(review): the first branch and the final output call are absent from
 * this listing. */
3209 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3213 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3215 } else if (ms_ucs_map_f == UCS_MAP_CP932 &&
3216 0xF0 <= c2 && c2 <= 0xF9 &&
3217 0x40 <= c1 && c1 <= 0xFC) {
3219 if(c1 == 0x7F) return 0;
3220 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3223 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3224 if (ret) return ret;
3230 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3235 }else if (c2 == 0x8f){
3239 if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3240 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3241 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3244 c2 = (c2 << 8) | (c1 & 0x7f);
3246 #ifdef SHIFTJIS_CP932
3249 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3250 s2e_conv(s2, s1, &c2, &c1);
3257 #endif /* SHIFTJIS_CP932 */
3259 #endif /* X0212_ENABLE */
3260 } else if (c2 == SSO){
3263 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3266 if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3267 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3268 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3279 #ifdef UTF8_INPUT_ENABLE
3280 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3287 }else if (0xc0 <= c2 && c2 <= 0xef) {
3288 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3289 #ifdef NUMCHAR_OPTION
3292 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3300 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3303 static const int w_iconv_utf8_1st_byte[] =
3305 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3306 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3307 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3308 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3310 if (c2 < 0 || 0xff < c2) {
3311 }else if (c2 == 0) { /* 0 : 1 byte*/
3313 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3316 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3318 if (c1 < 0x80 || 0xBF < c1) return 0;
3321 if (c0 == 0) return -1;
3322 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3327 if (c0 == 0) return -1;
3328 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3332 if (c0 == 0) return -1;
3333 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3337 if (c0 == 0) return -2;
3338 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3342 if (c0 == 0) return -2;
3343 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3347 if (c0 == 0) return -2;
3348 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3356 if (c2 == 0 || c2 == EOF){
3357 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3358 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3361 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3370 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: split the Unicode value `val` into UTF-8 bytes.
 *
 *   *p2 receives the lead byte,
 *   *p1 receives the second byte,
 *   *p0 receives the third byte; for a four-byte sequence it packs the
 *       third and fourth bytes as (third << 8) | fourth, which is the
 *       layout ww16_conv() unpacks with (c0 & 0x3f00) >> 2.
 *
 * Fix: the four-byte branch used to build the lead byte as
 * 0xe0 | (val >> 16), i.e. a three-byte lead with the wrong bits.  A
 * four-byte lead is 11110xxx carrying bits 18..20 of the value, which is
 * what ww16_conv() decodes ((c2 & 0x0f) << 18) and what w_oconv() emits
 * (0xF0 | (val >> 18)).
 */
3371 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3378 }else if (val < 0x800){
/* two bytes: 110xxxxx 10xxxxxx */
3379 *p2 = 0xc0 | (val >> 6);
3380 *p1 = 0x80 | (val & 0x3f);
3382 } else if (val <= NKF_INT32_C(0xFFFF)) {
/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
3383 *p2 = 0xe0 | (val >> 12);
3384 *p1 = 0x80 | ((val >> 6) & 0x3f);
3385 *p0 = 0x80 | (val & 0x3f);
3386 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/* four bytes: 11110xxx 10xxxxxx, then third/fourth packed into *p0 */
3387 *p2 = 0xf0 | (val >> 18);
3388 *p1 = 0x80 | ((val >> 12) & 0x3f);
3389 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3398 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: assemble a Unicode value from UTF-8 bytes.
 *
 * c2 is the lead byte and c1 the second byte.  In the four-byte branch
 * c0 carries both the third and the fourth byte (third in bits 8..13).
 * The branches below are selected by the lead byte: >= 0xf0 four bytes,
 * >= 0xe0 three bytes, >= 0xc0 two bytes.
 */
3399 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3404 } else if (c2 >= 0xf0){
3405 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3406 val = (c2 & 0x0f) << 18;
3407 val |= (c1 & 0x3f) << 12;
/* payload of the third byte sits in bits 8..13 of c0; move it to bits 6..11 */
3408 val |= (c0 & 0x3f00) >> 2;
3410 }else if (c2 >= 0xe0){
3411 val = (c2 & 0x0f) << 12;
3412 val |= (c1 & 0x3f) << 6;
3414 }else if (c2 >= 0xc0){
3415 val = (c2 & 0x1f) << 6;
3423 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3425 nkf_char c2, c1, c0;
3432 w16w_conv(val, &c2, &c1, &c0);
3433 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3434 #ifdef NUMCHAR_OPTION
3437 *p1 = CLASS_UNICODE | val;
3446 #ifdef UTF8_INPUT_ENABLE
3447 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3450 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3453 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3454 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3456 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3458 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3463 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3464 if (ret) return ret;
3469 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3473 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3474 } else if (is_unicode_bmp(c1)) {
3475 ret = w16e_conv(c1, &c2, &c1);
3478 c1 = CLASS_UNICODE | c1;
3480 if (ret) return ret;
3485 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3488 extern const unsigned short *const utf8_to_euc_2bytes[];
3489 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3490 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3491 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3492 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3493 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3495 const unsigned short *const *pp;
3496 const unsigned short *const *const *ppp;
3497 static const int no_best_fit_chars_table_C2[] =
3498 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3499 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3500 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3501 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3502 static const int no_best_fit_chars_table_C2_ms[] =
3503 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3504 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3505 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3506 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3507 static const int no_best_fit_chars_table_932_C2[] =
3508 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3509 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3510 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3511 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3512 static const int no_best_fit_chars_table_932_C3[] =
3513 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3514 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3515 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3522 }else if(c2 < 0xe0){
3523 if(no_best_fit_chars_f){
3524 if(ms_ucs_map_f == UCS_MAP_CP932){
3527 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3530 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3533 }else if(cp51932_f){
3536 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3539 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3542 }else if(ms_ucs_map_f == UCS_MAP_MS){
3543 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3547 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3548 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3550 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3551 }else if(c0 < 0xF0){
3552 if(no_best_fit_chars_f){
3553 if(ms_ucs_map_f == UCS_MAP_CP932){
3554 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3555 }else if(ms_ucs_map_f == UCS_MAP_MS){
3560 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3563 if(c0 == 0x92) return 1;
3568 if(c1 == 0x80 || c0 == 0x9C) return 1;
3576 if(c0 == 0x95) return 1;
3579 if(c0 == 0xA5) return 1;
3586 if(c0 == 0x8D) return 1;
3589 if(c0 == 0x9E && cp51932_f) return 1;
3592 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3600 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3601 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3603 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3605 #ifdef SHIFTJIS_CP932
3606 if (!ret && cp51932_f && is_eucg3(*p2)) {
3608 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3609 s2e_conv(s2, s1, p2, p1);
3618 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3621 const unsigned short *p;
3624 if (pp == 0) return 1;
3627 if (c1 < 0 || psize <= c1) return 1;
3629 if (p == 0) return 1;
3632 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3634 if (val == 0) return 1;
3635 if (no_cp932ext_f && (
3636 (val>>8) == 0x2D || /* NEC special characters */
3637 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3645 if (c2 == SO) c2 = X0201;
3652 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3654 const char *hex = "0123456789ABCDEF";
3660 (*f)(0, hex[(c>>shift)&0xF]);
3670 void encode_fallback_html(nkf_char c)
3675 if(c >= NKF_INT32_C(1000000))
3676 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3677 if(c >= NKF_INT32_C(100000))
3678 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3680 (*oconv)(0, 0x30+(c/10000 )%10);
3682 (*oconv)(0, 0x30+(c/1000 )%10);
3684 (*oconv)(0, 0x30+(c/100 )%10);
3686 (*oconv)(0, 0x30+(c/10 )%10);
3688 (*oconv)(0, 0x30+ c %10);
3693 void encode_fallback_xml(nkf_char c)
3698 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: write the value c through oconv as upper-case
 * hexadecimal digits, one digit per (*oconv)(0, digit) call.
 *
 * Values outside the BMP (per is_unicode_bmp) additionally get the two
 * digits for bits 20..23 and 16..19; the four digits for bits 0..15 are
 * written by the last four calls.
 */
3703 void encode_fallback_java(nkf_char c)
3705 const char *hex = "0123456789ABCDEF";
3708 if(!is_unicode_bmp(c)){
/* high-order digits, only needed above 0xFFFF */
3712 (*oconv)(0, hex[(c>>20)&0xF]);
3713 (*oconv)(0, hex[(c>>16)&0xF]);
/* low 16 bits, most significant nibble first */
3717 (*oconv)(0, hex[(c>>12)&0xF]);
3718 (*oconv)(0, hex[(c>> 8)&0xF]);
3719 (*oconv)(0, hex[(c>> 4)&0xF]);
3720 (*oconv)(0, hex[ c &0xF]);
3724 void encode_fallback_perl(nkf_char c)
3729 nkf_each_char_to_hex(oconv, c);
3734 void encode_fallback_subchar(nkf_char c)
3736 c = unicode_subchar;
3737 (*oconv)((c>>8)&0xFF, c&0xFF);
3742 #ifdef UTF8_OUTPUT_ENABLE
3743 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3746 extern const unsigned short euc_to_utf8_1byte[];
3747 extern const unsigned short *const euc_to_utf8_2bytes[];
3748 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3749 extern const unsigned short *const x0212_to_utf8_2bytes[];
3751 const unsigned short *p;
3754 p = euc_to_utf8_1byte;
3756 } else if (is_eucg3(c2)){
3757 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3760 c2 = (c2&0x7f) - 0x21;
3761 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3762 p = x0212_to_utf8_2bytes[c2];
3768 c2 = (c2&0x7f) - 0x21;
3769 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3770 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3775 c1 = (c1 & 0x7f) - 0x21;
3776 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3781 void w_oconv(nkf_char c2, nkf_char c1)
3787 output_bom_f = FALSE;
3798 #ifdef NUMCHAR_OPTION
3799 if (c2 == 0 && is_unicode_capsule(c1)){
3800 val = c1 & VALUE_MASK;
3803 }else if (val < 0x800){
3804 (*o_putc)(0xC0 | (val >> 6));
3805 (*o_putc)(0x80 | (val & 0x3f));
3806 } else if (val <= NKF_INT32_C(0xFFFF)) {
3807 (*o_putc)(0xE0 | (val >> 12));
3808 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3809 (*o_putc)(0x80 | (val & 0x3f));
3810 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3811 (*o_putc)(0xF0 | ( val>>18));
3812 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3813 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3814 (*o_putc)(0x80 | ( val & 0x3f));
3821 output_mode = ASCII;
3823 } else if (c2 == ISO8859_1) {
3824 output_mode = ISO8859_1;
3825 (*o_putc)(c1 | 0x080);
3828 val = e2w_conv(c2, c1);
3830 w16w_conv(val, &c2, &c1, &c0);
3834 if (c0) (*o_putc)(c0);
3840 void w_oconv16(nkf_char c2, nkf_char c1)
3843 output_bom_f = FALSE;
3844 if (output_endian == ENDIAN_LITTLE){
3845 (*o_putc)((unsigned char)'\377');
3849 (*o_putc)((unsigned char)'\377');
3858 if (c2 == ISO8859_1) {
3861 #ifdef NUMCHAR_OPTION
3862 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3863 if (is_unicode_bmp(c1)) {
3864 c2 = (c1 >> 8) & 0xff;
3868 if (c1 <= UNICODE_MAX) {
3869 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3870 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3871 if (output_endian == ENDIAN_LITTLE){
3872 (*o_putc)(c2 & 0xff);
3873 (*o_putc)((c2 >> 8) & 0xff);
3874 (*o_putc)(c1 & 0xff);
3875 (*o_putc)((c1 >> 8) & 0xff);
3877 (*o_putc)((c2 >> 8) & 0xff);
3878 (*o_putc)(c2 & 0xff);
3879 (*o_putc)((c1 >> 8) & 0xff);
3880 (*o_putc)(c1 & 0xff);
3887 nkf_char val = e2w_conv(c2, c1);
3888 c2 = (val >> 8) & 0xff;
3892 if (output_endian == ENDIAN_LITTLE){
3901 void w_oconv32(nkf_char c2, nkf_char c1)
3904 output_bom_f = FALSE;
3905 if (output_endian == ENDIAN_LITTLE){
3906 (*o_putc)((unsigned char)'\377');
3914 (*o_putc)((unsigned char)'\377');
3923 if (c2 == ISO8859_1) {
3925 #ifdef NUMCHAR_OPTION
3926 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3930 c1 = e2w_conv(c2, c1);
3933 if (output_endian == ENDIAN_LITTLE){
3934 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3935 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3936 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3940 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3941 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3942 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3947 void e_oconv(nkf_char c2, nkf_char c1)
3949 #ifdef NUMCHAR_OPTION
3950 if (c2 == 0 && is_unicode_capsule(c1)){
3951 w16e_conv(c1, &c2, &c1);
3952 if (c2 == 0 && is_unicode_capsule(c1)){
3953 c2 = c1 & VALUE_MASK;
3954 if (ms_ucs_map_f == UCS_MAP_MS &&
3955 0xE000 <= c2 && c2 <= 0xE757) {
3959 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3960 c1 = 0x21 + c1 % 94;
3962 if (encode_fallback) (*encode_fallback)(c1);
3971 } else if (c2 == 0) {
3972 output_mode = ASCII;
3974 } else if (c2 == X0201) {
3975 output_mode = JAPANESE_EUC;
3976 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3977 } else if (c2 == ISO8859_1) {
3978 output_mode = ISO8859_1;
3979 (*o_putc)(c1 | 0x080);
3981 } else if (is_eucg3(c2)){
3982 output_mode = JAPANESE_EUC;
3983 #ifdef SHIFTJIS_CP932
3986 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3987 s2e_conv(s2, s1, &c2, &c1);
3992 output_mode = ASCII;
3994 }else if (is_eucg3(c2)){
3997 (*o_putc)((c2 & 0x7f) | 0x080);
3998 (*o_putc)(c1 | 0x080);
4001 (*o_putc)((c2 & 0x7f) | 0x080);
4002 (*o_putc)(c1 | 0x080);
4006 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4007 set_iconv(FALSE, 0);
4008 return; /* too late to rescue this char */
4010 output_mode = JAPANESE_EUC;
4011 (*o_putc)(c2 | 0x080);
4012 (*o_putc)(c1 | 0x080);
4017 nkf_char x0212_shift(nkf_char c)
4022 if (0x75 <= c && c <= 0x7f){
4023 ret = c + (0x109 - 0x75);
4026 if (0x75 <= c && c <= 0x7f){
4027 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map a shifted row value back into the 0x75..0x7e range.
 *
 *   0x7f..0x88 -> 0x75..0x7e (plain value)
 *   0x89..0x92 -> 0x75..0x7e combined with PREFIX_EUCG3 | 0x80
 *
 * NOTE(review): presumably the inverse of x0212_shift() -- confirm against
 * that function, which is only partly visible here.
 */
4034 nkf_char x0212_unshift(nkf_char c)
4037 if (0x7f <= c && c <= 0x88){
4038 ret = c + (0x75 - 0x7f);
4039 }else if (0x89 <= c && c <= 0x92){
4040 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4044 #endif /* X0212_ENABLE */
4046 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4052 if((0x21 <= ndx && ndx <= 0x2F)){
4053 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4054 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4056 }else if(0x6E <= ndx && ndx <= 0x7E){
4057 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4058 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4064 else if(nkf_isgraph(ndx)){
4066 const unsigned short *ptr;
4068 extern const unsigned short *const x0212_shiftjis[];
4070 ptr = x0212_shiftjis[ndx - 0x21];
4072 val = ptr[(c1 & 0x7f) - 0x21];
4081 c2 = x0212_shift(c2);
4083 #endif /* X0212_ENABLE */
4085 if(0x7F < c2) return 1;
4086 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4087 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4091 void s_oconv(nkf_char c2, nkf_char c1)
4093 #ifdef NUMCHAR_OPTION
4094 if (c2 == 0 && is_unicode_capsule(c1)){
4095 w16e_conv(c1, &c2, &c1);
4096 if (c2 == 0 && is_unicode_capsule(c1)){
4097 c2 = c1 & VALUE_MASK;
4098 if (ms_ucs_map_f == UCS_MAP_CP932 &&
4099 0xE000 <= c2 && c2 <= 0xE757) {
4102 c2 = c1 / 188 + 0xF0;
4104 c1 += 0x40 + (c1 > 0x3e);
4106 if(encode_fallback)(*encode_fallback)(c1);
4115 } else if (c2 == 0) {
4116 output_mode = ASCII;
4118 } else if (c2 == X0201) {
4119 output_mode = SHIFT_JIS;
4121 } else if (c2 == ISO8859_1) {
4122 output_mode = ISO8859_1;
4123 (*o_putc)(c1 | 0x080);
4125 } else if (is_eucg3(c2)){
4126 output_mode = SHIFT_JIS;
4127 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4133 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4134 set_iconv(FALSE, 0);
4135 return; /* too late to rescue this char */
4137 output_mode = SHIFT_JIS;
4138 e2s_conv(c2, c1, &c2, &c1);
4140 #ifdef SHIFTJIS_CP932
4142 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4144 extern const unsigned short cp932inv[2][189];
4146 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4152 #endif /* SHIFTJIS_CP932 */
4155 if (prefix_table[(unsigned char)c1]){
4156 (*o_putc)(prefix_table[(unsigned char)c1]);
4162 void j_oconv(nkf_char c2, nkf_char c1)
4164 #ifdef NUMCHAR_OPTION
4165 if (c2 == 0 && is_unicode_capsule(c1)){
4166 w16e_conv(c1, &c2, &c1);
4167 if (c2 == 0 && is_unicode_capsule(c1)){
4168 c2 = c1 & VALUE_MASK;
4169 if (ms_ucs_map_f == UCS_MAP_CP932 &&
4170 0xE000 <= c2 && c2 <= 0xE757) {
4173 c2 = 0x7F + c1 / 94;
4174 c1 = 0x21 + c1 % 94;
4176 if (encode_fallback) (*encode_fallback)(c1);
4183 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4186 (*o_putc)(ascii_intro);
4187 output_mode = ASCII;
4191 } else if (is_eucg3(c2)){
4193 if(output_mode!=X0213_2){
4194 output_mode = X0213_2;
4198 (*o_putc)(X0213_2&0x7F);
4201 if(output_mode!=X0212){
4202 output_mode = X0212;
4206 (*o_putc)(X0212&0x7F);
4209 (*o_putc)(c2 & 0x7f);
4212 } else if (c2==X0201) {
4213 if (output_mode!=X0201) {
4214 output_mode = X0201;
4220 } else if (c2==ISO8859_1) {
4221 /* iso8859 introduction, or 8th bit on */
4222 /* Can we convert in 7bit form using ESC-'-'-A ?
4224 output_mode = ISO8859_1;
4226 } else if (c2 == 0) {
4227 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4230 (*o_putc)(ascii_intro);
4231 output_mode = ASCII;
4235 if(ms_ucs_map_f == UCS_MAP_CP932
4236 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4237 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4239 if (output_mode!=X0213_1) {
4240 output_mode = X0213_1;
4244 (*o_putc)(X0213_1&0x7F);
4246 }else if (output_mode != X0208) {
4247 output_mode = X0208;
4250 (*o_putc)(kanji_intro);
4257 void base64_conv(nkf_char c2, nkf_char c1)
4259 mime_prechar(c2, c1);
4260 (*o_base64conv)(c2,c1);
4264 static nkf_char broken_buf[3];
4265 static int broken_counter = 0;
4266 static int broken_last = 0;
4267 nkf_char broken_getc(FILE *f)
4271 if (broken_counter>0) {
4272 return broken_buf[--broken_counter];
4275 if (c=='$' && broken_last != ESC
4276 && (input_mode==ASCII || input_mode==X0201)) {
4279 if (c1=='@'|| c1=='B') {
4280 broken_buf[0]=c1; broken_buf[1]=c;
4287 } else if (c=='(' && broken_last != ESC
4288 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4291 if (c1=='J'|| c1=='B') {
4292 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto broken_buf so that broken_getc() can
 * hand it out again.  The parameter f is not used by the visible lines.
 */
4305 nkf_char broken_ungetc(nkf_char c, FILE *f)
/* At most two values are accepted; when two are already pending c is not stored. */
4307 if (broken_counter<2)
4308 broken_buf[broken_counter++]=c;
4312 static nkf_char prev_cr = 0;
4314 void cr_conv(nkf_char c2, nkf_char c1)
4318 if (! (c2==0&&c1==NL) ) {
4324 } else if (c1=='\r') {
4326 } else if (c1=='\n') {
4327 if (crmode_f==CRLF) {
4328 (*o_crconv)(0,'\r');
4329 } else if (crmode_f==CR) {
4330 (*o_crconv)(0,'\r');
4334 } else if (c1!='\032' || crmode_f!=NL){
4340 Return value of fold_conv()
4342 \n add newline and output char
4343 \r add newline and output nothing
4346 1 (or else) normal output
4348 fold state in prev (previous character)
4350 >0x80 Japanese (X0208/X0201)
4355 This fold algorithm does not preserve leading space in a line.
4356 This is the main difference from fmt.
4359 #define char_size(c2,c1) (c2?2:1)
4361 void fold_conv(nkf_char c2, nkf_char c1)
4364 nkf_char fold_state;
4366 if (c1== '\r' && !fold_preserve_f) {
4367 fold_state=0; /* ignore cr */
4368 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4370 fold_state=0; /* ignore cr */
4371 } else if (c1== BS) {
4372 if (f_line>0) f_line--;
4374 } else if (c2==EOF && f_line != 0) { /* close open last line */
4376 } else if ((c1=='\n' && !fold_preserve_f)
4377 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4378 && fold_preserve_f)) {
4380 if (fold_preserve_f) {
4384 } else if ((f_prev == c1 && !fold_preserve_f)
4385 || (f_prev == '\n' && fold_preserve_f)
4386 ) { /* duplicate newline */
4389 fold_state = '\n'; /* output two newline */
4395 if (f_prev&0x80) { /* Japanese? */
4397 fold_state = 0; /* ignore given single newline */
4398 } else if (f_prev==' ') {
4402 if (++f_line<=fold_len)
4406 fold_state = '\r'; /* fold and output nothing */
4410 } else if (c1=='\f') {
4413 fold_state = '\n'; /* output newline and clear */
4414 } else if ( (c2==0 && c1==' ')||
4415 (c2==0 && c1=='\t')||
4416 (c2=='!'&& c1=='!')) {
4417 /* X0208 kankaku or ascii space */
4418 if (f_prev == ' ') {
4419 fold_state = 0; /* remove duplicate spaces */
4422 if (++f_line<=fold_len)
4423 fold_state = ' '; /* output ASCII space only */
4425 f_prev = ' '; f_line = 0;
4426 fold_state = '\r'; /* fold and output nothing */
4430 prev0 = f_prev; /* we still need this one... , but almost done */
4432 if (c2 || c2==X0201)
4433 f_prev |= 0x80; /* this is Japanese */
4434 f_line += char_size(c2,c1);
4435 if (f_line<=fold_len) { /* normal case */
4438 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4439 f_line = char_size(c2,c1);
4440 fold_state = '\n'; /* We can't wait, do fold now */
4441 } else if (c2==X0201) {
4442 /* simple kinsoku rules return 1 means no folding */
4443 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4444 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4445 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4446 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4447 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4448 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4449 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4451 fold_state = '\n';/* add one new f_line before this character */
4454 fold_state = '\n';/* add one new f_line before this character */
4457 /* kinsoku point in ASCII */
4458 if ( c1==')'|| /* { [ ( */
4469 /* just after special */
4470 } else if (!is_alnum(prev0)) {
4471 f_line = char_size(c2,c1);
4473 } else if ((prev0==' ') || /* ignored new f_line */
4474 (prev0=='\n')|| /* ignored new f_line */
4475 (prev0&0x80)) { /* X0208 - ASCII */
4476 f_line = char_size(c2,c1);
4477 fold_state = '\n';/* add one new f_line before this character */
4479 fold_state = 1; /* default no fold in ASCII */
4483 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4484 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4485 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4486 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4487 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4488 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4489 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4490 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4491 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4492 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4493 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4494 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4495 /* default no fold in kinsoku */
4498 f_line = char_size(c2,c1);
4499 /* add one new f_line before this character */
4502 f_line = char_size(c2,c1);
4504 /* add one new f_line before this character */
4509 /* terminator process */
4510 switch(fold_state) {
4529 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: conversion stage that sits in front of o_zconv.
 *
 * Visible behaviour:
 *  - With x0201_f set, a JIS X 0201 character that has a dv[] or ev[]
 *    table entry is held back in z_prev1/z_prev2.  On the next call it is
 *    emitted through dv[] when followed by a dakuten (0xde & 0x7f), through
 *    ev[] when followed by a handakuten (0xdf & 0x7f) and an ev[] entry
 *    exists, and through cv[] otherwise.
 *  - With alpha_f set, JIS X 0208 rows 0x23 (alphabet) and 0x21 (symbols)
 *    get their own handling.
 *  - The final visible branch replaces > < " & by HTML entities.
 *
 * Fix: the entity strings had lost their escaping (">", "<", a bare quote,
 * "&"); the quote case was not even a valid C literal and the others would
 * have re-emitted the character unchanged.  They are restored to the HTML
 * entity names.
 */
4531 void z_conv(nkf_char c2, nkf_char c1)
4534 /* if (c2) c1 &= 0x7f; assertion */
4536 if (x0201_f && z_prev2==X0201) { /* X0201 */
4537 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4539 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4541 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4543 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4547 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4556 if (x0201_f && c2==X0201) {
4557 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4558 /* wait for a following dakuten or handakuten */
4559 z_prev1 = c1; z_prev2 = c2;
4562 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4567 /* JISX0208 Alphabet */
4568 if (alpha_f && c2 == 0x23 ) {
4570 } else if (alpha_f && c2 == 0x21 ) {
4571 /* JISX0208 Kigou (symbols) */
4576 } else if (alpha_f&0x4) {
4581 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
/* HTML entity replacements for the four markup-significant characters */
4587 case '>': entity = "&gt;"; break;
4588 case '<': entity = "&lt;"; break;
4589 case '\"': entity = "&quot;"; break;
4590 case '&': entity = "&amp;"; break;
/* emit the entity one character at a time as single-byte output */
4593 while (*entity) (*o_zconv)(0, *entity++);
4603 #define rot13(c) ( \
4605 (c <= 'M') ? (c + 13): \
4606 (c <= 'Z') ? (c - 13): \
4608 (c <= 'm') ? (c + 13): \
4609 (c <= 'z') ? (c - 13): \
4613 #define rot47(c) ( \
4615 ( c <= 'O' ) ? (c + 47) : \
4616 ( c <= '~' ) ? (c - 47) : \
4620 void rot_conv(nkf_char c2, nkf_char c1)
4622 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4628 (*o_rot_conv)(c2,c1);
4631 void hira_conv(nkf_char c2, nkf_char c1)
4635 if (0x20 < c1 && c1 < 0x74) {
4637 (*o_hira_conv)(c2,c1);
4639 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4641 c1 = CLASS_UNICODE | 0x3094;
4642 (*o_hira_conv)(c2,c1);
4645 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4647 (*o_hira_conv)(c2,c1);
4652 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4655 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4657 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4661 (*o_hira_conv)(c2,c1);
4665 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4667 static const nkf_char range[RANGE_NUM_MAX][2] = {
4688 nkf_char start, end, c;
4690 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4694 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4699 for (i = 0; i < RANGE_NUM_MAX; i++) {
4700 start = range[i][0];
4703 if (c >= start && c <= end) {
4708 (*o_iso2022jp_check_conv)(c2,c1);
4712 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4714 const unsigned char *mime_pattern[] = {
4715 (const unsigned char *)"\075?EUC-JP?B?",
4716 (const unsigned char *)"\075?SHIFT_JIS?B?",
4717 (const unsigned char *)"\075?ISO-8859-1?Q?",
4718 (const unsigned char *)"\075?ISO-8859-1?B?",
4719 (const unsigned char *)"\075?ISO-2022-JP?B?",
4720 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4721 #if defined(UTF8_INPUT_ENABLE)
4722 (const unsigned char *)"\075?UTF-8?B?",
4723 (const unsigned char *)"\075?UTF-8?Q?",
4725 (const unsigned char *)"\075?US-ASCII?Q?",
4730 /* Marker used to raise the priority of the matching input code. */
4731 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4732 e_iconv, s_iconv, 0, 0, 0, 0,
4733 #if defined(UTF8_INPUT_ENABLE)
4739 const nkf_char mime_encode[] = {
4740 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4741 #if defined(UTF8_INPUT_ENABLE)
4748 const nkf_char mime_encode_method[] = {
4749 'B', 'B','Q', 'B', 'B', 'Q',
4750 #if defined(UTF8_INPUT_ENABLE)
4758 #define MAXRECOVER 20
/*
 * switch_mime_getc: install mime_getc/mime_ungetc as the current input
 * functions.
 *
 * The previous i_getc/i_ungetc are saved in i_mgetc/i_mungetc.  When
 * mime_f is STRICT_MIME a second layer is inserted: the saved functions
 * move to i_mgetc_buf/i_mungetc_buf and i_mgetc/i_mungetc are pointed at
 * mime_getc_buf/mime_ungetc_buf.
 */
4760 void switch_mime_getc(void)
/* nothing to do when mime_getc is already installed */
4762 if (i_getc!=mime_getc) {
4763 i_mgetc = i_getc; i_getc = mime_getc;
4764 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4765 if(mime_f==STRICT_MIME) {
4766 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4767 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc().
 *
 * In STRICT_MIME mode the buffered layer is removed first by restoring
 * i_mgetc/i_mungetc from i_mgetc_buf/i_mungetc_buf.  The saved unget
 * function is then put back into i_ungetc, and a converter remembered in
 * mime_iconv_back is re-installed through set_iconv() and then cleared.
 */
4772 void unswitch_mime_getc(void)
4774 if(mime_f==STRICT_MIME) {
4775 i_mgetc = i_mgetc_buf;
4776 i_mungetc = i_mungetc_buf;
4779 i_ungetc = i_mungetc;
/* restore the input converter that was active before the MIME word, if any */
4780 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4781 mime_iconv_back = NULL;
4784 nkf_char mime_begin_strict(FILE *f)
4788 const unsigned char *p,*q;
4789 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4791 mime_decode_mode = FALSE;
4792 /* =? has been checked */
4794 p = mime_pattern[j];
4797 for(i=2;p[i]>' ';i++) { /* start at =? */
4798 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4799 /* pattern fails, try next one */
4801 while (mime_pattern[++j]) {
4802 p = mime_pattern[j];
4803 for(k=2;k<i;k++) /* assume length(p) > i */
4804 if (p[k]!=q[k]) break;
4805 if (k==i && nkf_toupper(c1)==p[k]) break;
4807 p = mime_pattern[j];
4808 if (p) continue; /* found next one, continue */
4809 /* all fails, output from recovery buffer */
4817 mime_decode_mode = p[i-2];
4819 mime_iconv_back = iconv;
4820 set_iconv(FALSE, mime_priority_func[j]);
4821 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4823 if (mime_decode_mode=='B') {
4824 mimebuf_f = unbuf_f;
4826 /* do MIME integrity check */
4827 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: fetch the next input value for the MIME decoder.
 *
 * When mimebuf_f is set the value comes from the underlying
 * (*i_mgetc_buf)(f); otherwise it is read from the Fifo at mime_input,
 * which is then advanced.
 */
4835 nkf_char mime_getc_buf(FILE *f)
4837 /* We do not track EOF for the Fifo because the buffered text ends with
4838 the "?=" terminator, which was already checked in mime_integrity. */
4839 return ((mimebuf_f)?
4840 (*i_mgetc_buf)(f):Fifo(mime_input++));
4843 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4846 (*i_mungetc_buf)(c,f);
4848 Fifo(--mime_input) = (unsigned char)c;
4852 nkf_char mime_begin(FILE *f)
4857 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4858 /* re-read and convert again from mime_buffer. */
4860 /* =? has been checked */
4862 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4863 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4864 /* We accept any character type even if it is breaked by new lines */
4865 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4866 if (c1=='\n'||c1==' '||c1=='\r'||
4867 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4869 /* Failed. But this could be another MIME preemble */
4877 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4878 if (!(++i<MAXRECOVER) || c1==EOF) break;
4879 if (c1=='b'||c1=='B') {
4880 mime_decode_mode = 'B';
4881 } else if (c1=='q'||c1=='Q') {
4882 mime_decode_mode = 'Q';
4886 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4887 if (!(++i<MAXRECOVER) || c1==EOF) break;
4889 mime_decode_mode = FALSE;
4895 if (!mime_decode_mode) {
4896 /* false MIME premble, restart from mime_buffer */
4897 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4898 /* Since we are in MIME mode until buffer becomes empty, */
4899 /* we never go into mime_begin again for a while. */
4902 /* discard mime preemble, and goto MIME mode */
4904 /* do no MIME integrity check */
4905 return c1; /* used only for checking EOF */
4909 void no_putc(nkf_char c)
4914 void debug(const char *str)
4917 fprintf(stderr, "%s\n", str);
4922 void set_input_codename(char *codename)
4926 strcmp(codename, "") != 0 &&
4927 strcmp(codename, input_codename) != 0)
4929 is_inputcode_mixed = TRUE;
4931 input_codename = codename;
4932 is_inputcode_set = TRUE;
4935 #if !defined(PERL_XS) && !defined(WIN32DLL)
4936 void print_guessed_code(char *filename)
4938 char *codename = "BINARY";
4939 if (!is_inputcode_mixed) {
4940 if (strcmp(input_codename, "") == 0) {
4943 codename = input_codename;
4946 if (filename != NULL) printf("%s:", filename);
4947 printf("%s\n", codename);
/*
 * hex_getc: shared reader for two-hex-digit escapes.
 *
 *   ch - introducer character (cap_getc passes ':', url_getc passes '%')
 *   f  - input stream handed to g and u
 *   g  - underlying getc function
 *   u  - underlying ungetc function
 *
 * On success the two hex digits c2 and c3 are combined into one byte,
 * c2 being the high nibble.
 * NOTE(review): the handling when c2 or c3 is not a hex digit is not
 * visible in this view -- confirm against the full function.
 */
4953 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4955 nkf_char c1, c2, c3;
/* first digit after the introducer must be hexadecimal */
4961 if (!nkf_isxdigit(c2)){
/* second digit must be hexadecimal as well */
4966 if (!nkf_isxdigit(c3)){
4971 return (hex2bin(c2) << 4) | hex2bin(c3);
4974 nkf_char cap_getc(FILE *f)
4976 return hex_getc(':', f, i_cgetc, i_cungetc);
4979 nkf_char cap_ungetc(nkf_char c, FILE *f)
4981 return (*i_cungetc)(c, f);
4984 nkf_char url_getc(FILE *f)
4986 return hex_getc('%', f, i_ugetc, i_uungetc);
4989 nkf_char url_ungetc(nkf_char c, FILE *f)
4991 return (*i_uungetc)(c, f);
4995 #ifdef NUMCHAR_OPTION
4996 nkf_char numchar_getc(FILE *f)
4998 nkf_char (*g)(FILE *) = i_ngetc;
4999 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5010 if (buf[i] == 'x' || buf[i] == 'X'){
5011 for (j = 0; j < 7; j++){
5013 if (!nkf_isxdigit(buf[i])){
5020 c |= hex2bin(buf[i]);
5023 for (j = 0; j < 8; j++){
5027 if (!nkf_isdigit(buf[i])){
5034 c += hex2bin(buf[i]);
5040 return CLASS_UNICODE | c;
5049 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5051 return (*i_nungetc)(c, f);
5055 #ifdef UNICODE_NORMALIZATION
5057 /* Normalization Form C */
5058 nkf_char nfc_getc(FILE *f)
5060 nkf_char (*g)(FILE *f) = i_nfc_getc;
5061 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5062 int i=0, j, k=1, lower, upper;
5064 const nkf_nfchar *array;
5066 extern const struct normalization_pair normalization_table[];
5070 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5071 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5072 while (upper >= lower) {
5073 j = (lower+upper) / 2;
5074 array = normalization_table[j].nfd;
5075 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5076 if (array[k] != buf[k]){
5077 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5084 array = normalization_table[j].nfc;
5085 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5086 buf[i] = (nkf_char)(array[i]);
5097 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5099 return (*i_nfc_ungetc)(c, f);
5101 #endif /* UNICODE_NORMALIZATION */
5107 nkf_char c1, c2, c3, c4, cc;
5108 nkf_char t1, t2, t3, t4, mode, exit_mode;
5109 nkf_char lwsp_count;
5112 nkf_char lwsp_size = 128;
5114 if (mime_top != mime_last) { /* Something is in FIFO */
5115 return Fifo(mime_top++);
5117 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5118 mime_decode_mode=FALSE;
5119 unswitch_mime_getc();
5120 return (*i_getc)(f);
5123 if (mimebuf_f == FIXED_MIME)
5124 exit_mode = mime_decode_mode;
5127 if (mime_decode_mode == 'Q') {
5128 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5130 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5131 if (c1<=' ' || DEL<=c1) {
5132 mime_decode_mode = exit_mode; /* prepare for quit */
5135 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5139 mime_decode_mode = exit_mode; /* prepare for quit */
5140 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5141 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5142 /* end Q encoding */
5143 input_mode = exit_mode;
5145 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5146 if (lwsp_buf==NULL) {
5147 perror("can't malloc");
5150 while ((c1=(*i_getc)(f))!=EOF) {
5155 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5163 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5164 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5179 lwsp_buf[lwsp_count] = (unsigned char)c1;
5180 if (lwsp_count++>lwsp_size){
5182 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5183 if (lwsp_buf_new==NULL) {
5185 perror("can't realloc");
5188 lwsp_buf = lwsp_buf_new;
5194 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5196 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5197 i_ungetc(lwsp_buf[lwsp_count],f);
5203 if (c1=='='&&c2<' ') { /* this is soft wrap */
5204 while((c1 = (*i_mgetc)(f)) <=' ') {
5205 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5207 mime_decode_mode = 'Q'; /* still in MIME */
5208 goto restart_mime_q;
5211 mime_decode_mode = 'Q'; /* still in MIME */
5215 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5216 if (c2<=' ') return c2;
5217 mime_decode_mode = 'Q'; /* still in MIME */
5218 return ((hex2bin(c2)<<4) + hex2bin(c3));
5221 if (mime_decode_mode != 'B') {
5222 mime_decode_mode = FALSE;
5223 return (*i_mgetc)(f);
5227 /* Base64 encoding */
5229 MIME allows line breaks in the middle of
5230 Base64, but we are very pessimistic when decoding
5231 in unbuffered mode, because MIME-encoded text may be broken by
5232 less or an editor's control sequence (such as ESC-[-K) in unbuffered
5233 mode. Ignore incomplete MIME.
5235 mode = mime_decode_mode;
5236 mime_decode_mode = exit_mode; /* prepare for quit */
5238 while ((c1 = (*i_mgetc)(f))<=' ') {
5243 if ((c2 = (*i_mgetc)(f))<=' ') {
5246 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5247 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5250 if ((c1 == '?') && (c2 == '=')) {
5253 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5254 if (lwsp_buf==NULL) {
5255 perror("can't malloc");
5258 while ((c1=(*i_getc)(f))!=EOF) {
5263 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5271 if ((c1=(*i_getc)(f))!=EOF) {
5275 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5290 lwsp_buf[lwsp_count] = (unsigned char)c1;
5291 if (lwsp_count++>lwsp_size){
5293 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5294 if (lwsp_buf_new==NULL) {
5296 perror("can't realloc");
5299 lwsp_buf = lwsp_buf_new;
5305 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5307 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5308 i_ungetc(lwsp_buf[lwsp_count],f);
5315 if ((c3 = (*i_mgetc)(f))<=' ') {
5318 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5319 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5323 if ((c4 = (*i_mgetc)(f))<=' ') {
5326 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5327 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5331 mime_decode_mode = mode; /* still in MIME sigh... */
5333 /* BASE 64 decoding */
5335 t1 = 0x3f & base64decode(c1);
5336 t2 = 0x3f & base64decode(c2);
5337 t3 = 0x3f & base64decode(c3);
5338 t4 = 0x3f & base64decode(c4);
5339 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5341 Fifo(mime_last++) = (unsigned char)cc;
5342 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5344 Fifo(mime_last++) = (unsigned char)cc;
5345 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5347 Fifo(mime_last++) = (unsigned char)cc;
5352 return Fifo(mime_top++);
5355 nkf_char mime_ungetc(nkf_char c, FILE *f)
5357 Fifo(--mime_top) = (unsigned char)c;
5361 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5365 /* In buffered mode, read until =? or NL or buffer full
5367 mime_input = mime_top;
5368 mime_last = mime_top;
5370 while(*p) Fifo(mime_input++) = *p++;
5373 while((c=(*i_getc)(f))!=EOF) {
5374 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5375 break; /* buffer full */
5377 if (c=='=' && d=='?') {
5378 /* checked. skip header, start decode */
5379 Fifo(mime_input++) = (unsigned char)c;
5380 /* mime_last_input = mime_input; */
5385 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5387 /* Should we check length mod 4? */
5388 Fifo(mime_input++) = (unsigned char)c;
5391 /* In case of Incomplete MIME, no MIME decode */
5392 Fifo(mime_input++) = (unsigned char)c;
5393 mime_last = mime_input; /* point undecoded buffer */
5394 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5395 switch_mime_getc(); /* anyway we need buffered getc */
5399 nkf_char base64decode(nkf_char c)
5404 i = c - 'A'; /* A..Z 0-25 */
5406 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5408 } else if (c > '/') {
5409 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5410 } else if (c == '+') {
5411 i = '>' /* 62 */ ; /* + 62 */
5413 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet: basis_64[v] is the output character for the 6-bit
   value v (0..63). Indexed by mimeout_addchar() and close_mime(). */
5418 static const char basis_64[] =
5419 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Base64 encoder carry state: mimeout_addchar() merges its low 2 or low 4
   bits with the high bits of the next byte, and close_mime() emits those
   bits on their own. Presumably the previously consumed input byte --
   TODO confirm at the assignment site. */
5421 static nkf_char b64c;
/* Threshold for the MIME output look-ahead buffer: mime_putc() flushes
   once mimeout_buf_count exceeds this value. */
5422 #define MIMEOUT_BUF_LENGTH (60)
/* Look-ahead buffer for MIME output. It has one slot more than
   MIMEOUT_BUF_LENGTH because mime_putc() stores a byte first and only
   afterwards tests mimeout_buf_count > MIMEOUT_BUF_LENGTH. */
5423 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of bytes currently held in mimeout_buf. */
5424 int mimeout_buf_count = 0;
/* Flag tested by open_mime() before it deals with whitespace held in
   mimeout_buf; open_mime() also sets it back to FALSE. */
5425 int mimeout_preserve_space = 0;
/*
 * itoh4(c): map a 4-bit value (0..15) to its uppercase hexadecimal digit
 * character ('0'..'9', 'A'..'F').
 *
 * Every use of the parameter is parenthesized so that an operand such as
 * `x & 0xf` expands with the intended precedence. The argument is still
 * evaluated more than once, so it must not have side effects.
 */
#define itoh4(c) (((c) >= 10) ? ((c) + 'A' - 10) : ((c) + '0'))
5428 void open_mime(nkf_char mode)
5430 const unsigned char *p;
5433 p = mime_pattern[0];
5434 for(i=0;mime_encode[i];i++) {
5435 if (mode == mime_encode[i]) {
5436 p = mime_pattern[i];
5440 mimeout_mode = mime_encode_method[i];
5443 if (base64_count>45) {
5444 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5445 (*o_mputc)(mimeout_buf[i]);
5451 if (!mimeout_preserve_space && mimeout_buf_count>0
5452 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5453 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5457 if (!mimeout_preserve_space) {
5458 for (;i<mimeout_buf_count;i++) {
5459 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5460 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5461 (*o_mputc)(mimeout_buf[i]);
5468 mimeout_preserve_space = FALSE;
5474 j = mimeout_buf_count;
5475 mimeout_buf_count = 0;
5477 mime_putc(mimeout_buf[i]);
5481 void close_mime(void)
5491 switch(mimeout_mode) {
5496 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5502 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5508 if (mimeout_f!=FIXED_MIME) {
5510 } else if (mimeout_mode != 'Q')
5515 void mimeout_addchar(nkf_char c)
5517 switch(mimeout_mode) {
5522 } else if(!nkf_isalnum(c)) {
5524 (*o_mputc)(itoh4(((c>>4)&0xf)));
5525 (*o_mputc)(itoh4((c&0xf)));
5534 (*o_mputc)(basis_64[c>>2]);
5539 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5545 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5546 (*o_mputc)(basis_64[c & 0x3F]);
/* (c2, c1) arguments saved by mime_prechar(). That function consults the
   saved values when deciding whether to emit a separating SPACE, and then
   stores its current arguments here. */
5557 nkf_char mime_lastchar2, mime_lastchar1;
5559 void mime_prechar(nkf_char c2, nkf_char c1)
5563 if (base64_count + mimeout_buf_count/3*4> 66){
5564 (*o_base64conv)(EOF,0);
5565 (*o_base64conv)(0,NL);
5566 (*o_base64conv)(0,SPACE);
5568 }/*else if (mime_lastchar2){
5569 if (c1 <=DEL && !nkf_isspace(c1)){
5570 (*o_base64conv)(0,SPACE);
5574 if (c2 && mime_lastchar2 == 0
5575 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5576 (*o_base64conv)(0,SPACE);
5579 mime_lastchar2 = c2;
5580 mime_lastchar1 = c1;
5583 void mime_putc(nkf_char c)
5588 if (mimeout_f == FIXED_MIME){
5589 if (mimeout_mode == 'Q'){
5590 if (base64_count > 71){
5591 if (c!=CR && c!=NL) {
5598 if (base64_count > 71){
5603 if (c == EOF) { /* c==EOF */
5607 if (c != EOF) { /* c==EOF */
5613 /* mimeout_f != FIXED_MIME */
5615 if (c == EOF) { /* c==EOF */
5616 j = mimeout_buf_count;
5617 mimeout_buf_count = 0;
5621 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5624 mimeout_addchar(mimeout_buf[i]);
5628 mimeout_addchar(mimeout_buf[i]);
5632 mimeout_addchar(mimeout_buf[i]);
5638 if (mimeout_mode=='Q') {
5639 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5651 if (mimeout_buf_count > 0){
5652 lastchar = mimeout_buf[mimeout_buf_count - 1];
5657 if (!mimeout_mode) {
5658 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5659 if (nkf_isspace(c)) {
5660 if (c==CR || c==NL) {
5663 for (i=0;i<mimeout_buf_count;i++) {
5664 (*o_mputc)(mimeout_buf[i]);
5665 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5671 mimeout_buf[0] = (char)c;
5672 mimeout_buf_count = 1;
5674 if (base64_count > 1
5675 && base64_count + mimeout_buf_count > 76){
5678 if (!nkf_isspace(mimeout_buf[0])){
5683 mimeout_buf[mimeout_buf_count++] = (char)c;
5684 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5685 open_mime(output_mode);
5690 if (lastchar==CR || lastchar == NL){
5691 for (i=0;i<mimeout_buf_count;i++) {
5692 (*o_mputc)(mimeout_buf[i]);
5695 mimeout_buf_count = 0;
5697 if (lastchar==SPACE) {
5698 for (i=0;i<mimeout_buf_count-1;i++) {
5699 (*o_mputc)(mimeout_buf[i]);
5702 mimeout_buf[0] = SPACE;
5703 mimeout_buf_count = 1;
5705 open_mime(output_mode);
5708 /* mimeout_mode == 'B', 1, 2 */
5709 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5710 if (lastchar == CR || lastchar == NL){
5711 if (nkf_isblank(c)) {
5712 for (i=0;i<mimeout_buf_count;i++) {
5713 mimeout_addchar(mimeout_buf[i]);
5715 mimeout_buf_count = 0;
5716 } else if (SPACE<c && c<DEL) {
5718 for (i=0;i<mimeout_buf_count;i++) {
5719 (*o_mputc)(mimeout_buf[i]);
5722 mimeout_buf_count = 0;
5725 if (c==SPACE || c==TAB || c==CR || c==NL) {
5726 for (i=0;i<mimeout_buf_count;i++) {
5727 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5729 for (i=0;i<mimeout_buf_count;i++) {
5730 (*o_mputc)(mimeout_buf[i]);
5733 mimeout_buf_count = 0;
5736 mimeout_buf[mimeout_buf_count++] = (char)c;
5737 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5739 for (i=0;i<mimeout_buf_count;i++) {
5740 (*o_mputc)(mimeout_buf[i]);
5743 mimeout_buf_count = 0;
5747 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5748 mimeout_buf[mimeout_buf_count++] = (char)c;
5749 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5750 j = mimeout_buf_count;
5751 mimeout_buf_count = 0;
5753 mimeout_addchar(mimeout_buf[i]);
5760 if (mimeout_buf_count>0) {
5761 j = mimeout_buf_count;
5762 mimeout_buf_count = 0;
5764 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5766 mimeout_addchar(mimeout_buf[i]);
5772 (*o_mputc)(mimeout_buf[i]);
5774 open_mime(output_mode);
5781 #if defined(PERL_XS) || defined(WIN32DLL)
5785 struct input_code *p = input_code_list;
5798 mime_f = STRICT_MIME;
5799 mime_decode_f = FALSE;
5804 #if defined(MSDOS) || defined(__OS2__)
5809 iso2022jp_f = FALSE;
5810 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5811 ms_ucs_map_f = UCS_MAP_ASCII;
5813 #ifdef UTF8_INPUT_ENABLE
5814 no_cp932ext_f = FALSE;
5815 no_best_fit_chars_f = FALSE;
5816 encode_fallback = NULL;
5817 unicode_subchar = '?';
5818 input_endian = ENDIAN_BIG;
5820 #ifdef UTF8_OUTPUT_ENABLE
5821 output_bom_f = FALSE;
5822 output_endian = ENDIAN_BIG;
5824 #ifdef UNICODE_NORMALIZATION
5837 is_inputcode_mixed = FALSE;
5838 is_inputcode_set = FALSE;
5842 #ifdef SHIFTJIS_CP932
5852 for (i = 0; i < 256; i++){
5853 prefix_table[i] = 0;
5857 mimeout_buf_count = 0;
5862 fold_preserve_f = FALSE;
5865 kanji_intro = DEFAULT_J;
5866 ascii_intro = DEFAULT_R;
5867 fold_margin = FOLD_MARGIN;
5868 output_conv = DEFAULT_CONV;
5869 oconv = DEFAULT_CONV;
5870 o_zconv = no_connection;
5871 o_fconv = no_connection;
5872 o_crconv = no_connection;
5873 o_rot_conv = no_connection;
5874 o_hira_conv = no_connection;
5875 o_base64conv = no_connection;
5876 o_iso2022jp_check_conv = no_connection;
5879 i_ungetc = std_ungetc;
5881 i_bungetc = std_ungetc;
5884 i_mungetc = std_ungetc;
5885 i_mgetc_buf = std_getc;
5886 i_mungetc_buf = std_ungetc;
5887 output_mode = ASCII;
5890 mime_decode_mode = FALSE;
5896 z_prev2=0,z_prev1=0;
5898 iconv_for_check = 0;
5900 input_codename = "";
5907 void no_connection(nkf_char c2, nkf_char c1)
5909 no_connection2(c2,c1,0);
5912 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5914 fprintf(stderr,"nkf internal module connection failure.\n");
5916 return 0; /* LINT */
5921 #define fprintf dllprintf
5925 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5926 fprintf(stderr,"Flags:\n");
5927 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5928 #ifdef DEFAULT_CODE_SJIS
5929 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5931 #ifdef DEFAULT_CODE_JIS
5932 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5934 #ifdef DEFAULT_CODE_EUC
5935 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5937 #ifdef DEFAULT_CODE_UTF8
5938 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5940 #ifdef UTF8_OUTPUT_ENABLE
5941 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5943 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5944 #ifdef UTF8_INPUT_ENABLE
5945 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5947 fprintf(stderr,"t no conversion\n");
5948 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5949 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5950 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5951 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5952 fprintf(stderr,"v Show this usage. V: show version\n");
5953 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5954 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5955 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5956 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5957 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5958 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5959 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5960 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5962 fprintf(stderr,"T Text mode output\n");
5964 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5965 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5966 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5967 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5968 fprintf(stderr,"\n");
5969 fprintf(stderr,"Long name options\n");
5970 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5971 fprintf(stderr," Specify the input or output codeset\n");
5972 fprintf(stderr," --fj --unix --mac --windows\n");
5973 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5974 fprintf(stderr," Convert for the system or code\n");
5975 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5976 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5977 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5979 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5981 #ifdef NUMCHAR_OPTION
5982 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5984 #ifdef UTF8_INPUT_ENABLE
5985 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5986 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5989 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5990 fprintf(stderr," Overwrite original listed files by filtered result\n");
5991 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5993 fprintf(stderr," -g --guess Guess the input code\n");
5994 fprintf(stderr," --help --version Show this help/the version\n");
5995 fprintf(stderr," For more information, see also man nkf\n");
5996 fprintf(stderr,"\n");
6002 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6003 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6006 #if defined(MSDOS) && defined(__WIN16__)
6009 #if defined(MSDOS) && defined(__WIN32__)
6015 ,NKF_VERSION,NKF_RELEASE_DATE);
6016 fprintf(stderr,"\n%s\n",CopyRight);
6021 ** Patch authors:
6022 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
6023 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
6024 ** ohta@src.ricoh.co.jp (Junn Ohta)
6025 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
6026 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
6027 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
6028 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
6029 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
6030 ** GHG00637@nifty-serve.or.jp (COW)