1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 * nkf is currently maintained at SourceForge:
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.141 2007/10/02 08:41:03 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-10-02"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII character classification helpers.
 *
 * Every macro parameter is parenthesized so that expression arguments
 * (for example `a | b` or `x + 1`) expand with the intended precedence.
 * Arguments are still evaluated more than once, so they must not have
 * side effects.
 *
 * SP, TAB, CR, LF and SS3 are expected to be defined outside this block.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')

/* Value of one hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit character for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when bits 8..15 of c2 equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in the inclusive range CP932_TABLE_BEGIN..CP932_TABLE_END. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
233 #define HOLD_SIZE 1024
234 #if defined(INT_IS_SHORT)
235 #define IOBUF_SIZE 2048
237 #define IOBUF_SIZE 16384
240 #define DEFAULT_J 'B'
241 #define DEFAULT_R 'B'
243 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
244 #define SJ6394 0x0161 /* 63 - 94 ku offset */
246 #define RANGE_NUM_MAX 18
251 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
252 #define sizeof_euc_to_utf8_1byte 94
253 #define sizeof_euc_to_utf8_2bytes 94
254 #define sizeof_utf8_to_euc_C2 64
255 #define sizeof_utf8_to_euc_E5B8 64
256 #define sizeof_utf8_to_euc_2bytes 112
257 #define sizeof_utf8_to_euc_3bytes 16
260 /* MIME preprocessor */
262 #ifdef EASYWIN /*Easy Win */
263 extern POINT _BufferSize;
272 void (*status_func)(struct input_code *, nkf_char);
273 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
277 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
280 static const char *CopyRight = COPY_RIGHT;
282 #if !defined(PERL_XS) && !defined(WIN32DLL)
283 static nkf_char noconvert(FILE *f);
285 static void module_connection(void);
286 static nkf_char kanji_convert(FILE *f);
287 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
288 static nkf_char push_hold_buf(nkf_char c2);
289 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
290 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
291 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
292 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
293 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
295 * 0: Shift_JIS, eucJP-ascii
300 #define UCS_MAP_ASCII 0
302 #define UCS_MAP_CP932 2
303 #define UCS_MAP_CP10001 3
304 static int ms_ucs_map_f = UCS_MAP_ASCII;
306 #ifdef UTF8_INPUT_ENABLE
307 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
308 static int no_cp932ext_f = FALSE;
309 /* ignore ZERO WIDTH NO-BREAK SPACE */
310 static int no_best_fit_chars_f = FALSE;
311 static int input_endian = ENDIAN_BIG;
312 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
313 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
314 static void encode_fallback_html(nkf_char c);
315 static void encode_fallback_xml(nkf_char c);
316 static void encode_fallback_java(nkf_char c);
317 static void encode_fallback_perl(nkf_char c);
318 static void encode_fallback_subchar(nkf_char c);
319 static void (*encode_fallback)(nkf_char c) = NULL;
320 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
321 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
322 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
323 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
324 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
325 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
326 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
327 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
328 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
329 static void w_status(struct input_code *, nkf_char);
331 #ifdef UTF8_OUTPUT_ENABLE
332 static int output_bom_f = FALSE;
333 static int output_endian = ENDIAN_BIG;
334 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
335 static void w_oconv(nkf_char c2,nkf_char c1);
336 static void w_oconv16(nkf_char c2,nkf_char c1);
337 static void w_oconv32(nkf_char c2,nkf_char c1);
339 static void e_oconv(nkf_char c2,nkf_char c1);
340 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
341 static void s_oconv(nkf_char c2,nkf_char c1);
342 static void j_oconv(nkf_char c2,nkf_char c1);
343 static void fold_conv(nkf_char c2,nkf_char c1);
344 static void nl_conv(nkf_char c2,nkf_char c1);
345 static void z_conv(nkf_char c2,nkf_char c1);
346 static void rot_conv(nkf_char c2,nkf_char c1);
347 static void hira_conv(nkf_char c2,nkf_char c1);
348 static void base64_conv(nkf_char c2,nkf_char c1);
349 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
350 static void no_connection(nkf_char c2,nkf_char c1);
351 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
353 static void code_score(struct input_code *ptr);
354 static void code_status(nkf_char c);
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_begin(FILE *f);
364 static nkf_char mime_getc(FILE *f);
365 static nkf_char mime_ungetc(nkf_char c,FILE *f);
367 static void switch_mime_getc(void);
368 static void unswitch_mime_getc(void);
369 static nkf_char mime_begin_strict(FILE *f);
370 static nkf_char mime_getc_buf(FILE *f);
371 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
372 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
374 static nkf_char base64decode(nkf_char c);
375 static void mime_prechar(nkf_char c2, nkf_char c1);
376 static void mime_putc(nkf_char c);
377 static void open_mime(nkf_char c);
378 static void close_mime(void);
379 static void eof_mime(void);
380 static void mimeout_addchar(nkf_char c);
382 static void usage(void);
383 static void version(void);
385 static void options(unsigned char *c);
386 #if defined(PERL_XS) || defined(WIN32DLL)
387 static void reinit(void);
392 #if !defined(PERL_XS) && !defined(WIN32DLL)
393 static unsigned char stdibuf[IOBUF_SIZE];
394 static unsigned char stdobuf[IOBUF_SIZE];
396 static unsigned char hold_buf[HOLD_SIZE*2];
397 static int hold_count = 0;
399 /* MIME preprocessor fifo */
401 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
402 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
403 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
404 static unsigned char mime_buf[MIME_BUF_SIZE];
405 static unsigned int mime_top = 0;
406 static unsigned int mime_last = 0; /* decoded */
407 static unsigned int mime_input = 0; /* undecoded */
408 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Mode flags and their initial values. */
411 static int unbuf_f = FALSE;
412 static int estab_f = FALSE;
413 static int nop_f = FALSE;
414 static int binmode_f = TRUE; /* binary mode */
415 static int rot_f = FALSE; /* rot14/43 mode */
416 static int hira_f = FALSE; /* hira/kata henkan */
417 static int input_f = FALSE; /* non fixed input code */
418 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
419 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
420 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
421 static int mimebuf_f = FALSE; /* MIME buffered input */
422 static int broken_f = FALSE; /* convert ESC-less broken JIS */
423 static int iso8859_f = FALSE; /* ISO8859 through */
424 static int mimeout_f = FALSE; /* base64 mode */
425 #if defined(MSDOS) || defined(__OS2__)
426 static int x0201_f = TRUE; /* Assume JISX0201 kana */
428 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
430 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
432 #ifdef UNICODE_NORMALIZATION
/* Input hooks for the UNICODE_NORMALIZATION stage; both start out pointing at the std_* routines. */
433 static int nfc_f = FALSE;
434 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc (was labelled "ugetc", presumably a copy-paste slip -- TODO confirm) */
435 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
436 static nkf_char nfc_getc(FILE *f);
437 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
441 static int cap_f = FALSE;
442 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
443 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
444 static nkf_char cap_getc(FILE *f);
445 static nkf_char cap_ungetc(nkf_char c,FILE *f);
447 static int url_f = FALSE;
448 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
449 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
450 static nkf_char url_getc(FILE *f);
451 static nkf_char url_ungetc(nkf_char c,FILE *f);
/*
 * NKF_INT32_C(n) spells the integer constant n with an L suffix when
 * INT_IS_SHORT is defined, and leaves it unchanged otherwise.
 */
#if defined(INT_IS_SHORT)
#define NKF_INT32_C(n)   (n##L)
#else
#define NKF_INT32_C(n)   (n)
#endif
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
/*
 * The argument is parenthesized so that expressions such as
 * `CLASS_UNICODE | code` are masked as a whole; without the parentheses
 * `&` would bind to the right-hand operand only.
 */
/* True when the bits of c selected by CLASS_MASK equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the bits of c selected by VALUE_MASK are at most 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
467 #ifdef NUMCHAR_OPTION
/* Input hooks for the NUMCHAR_OPTION stage; both start out pointing at the std_* routines. */
468 static int numchar_f = FALSE;
469 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of numchar_getc (was labelled "ugetc", presumably a copy-paste slip -- TODO confirm) */
470 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
471 static nkf_char numchar_getc(FILE *f);
472 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
476 static int noout_f = FALSE;
477 static void no_putc(nkf_char c);
478 static nkf_char debug_f = FALSE;
479 static void debug(const char *str);
480 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
483 static int guess_f = FALSE;
485 static void print_guessed_code(char *filename);
487 static void set_input_codename(char *codename);
490 static int exec_f = 0;
493 #ifdef SHIFTJIS_CP932
494 /* invert IBM extended characters to others */
495 static int cp51932_f = FALSE;
497 /* invert NEC-selected IBM extended characters to IBM extended characters */
498 static int cp932inv_f = TRUE;
500 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
501 #endif /* SHIFTJIS_CP932 */
504 static int x0212_f = FALSE;
505 static nkf_char x0212_shift(nkf_char c);
506 static nkf_char x0212_unshift(nkf_char c);
508 static int x0213_f = FALSE;
510 static unsigned char prefix_table[256];
512 static void set_code_score(struct input_code *ptr, nkf_char score);
513 static void clr_code_score(struct input_code *ptr, nkf_char score);
514 static void status_disable(struct input_code *ptr);
515 static void status_push_ch(struct input_code *ptr, nkf_char c);
516 static void status_clear(struct input_code *ptr);
517 static void status_reset(struct input_code *ptr);
518 static void status_reinit(struct input_code *ptr);
519 static void status_check(struct input_code *ptr, nkf_char c);
520 static void e_status(struct input_code *, nkf_char);
521 static void s_status(struct input_code *, nkf_char);
523 struct input_code input_code_list[] = {
524 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
525 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
526 #ifdef UTF8_INPUT_ENABLE
527 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
528 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
529 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
534 static int mimeout_mode = 0;
535 static int base64_count = 0;
537 /* X0208 -> ASCII converter */
540 static int f_line = 0; /* chars in line */
541 static int f_prev = 0;
542 static int fold_preserve_f = FALSE; /* preserve new lines */
543 static int fold_f = FALSE;
544 static int fold_len = 0;
547 static unsigned char kanji_intro = DEFAULT_J;
548 static unsigned char ascii_intro = DEFAULT_R;
552 #define FOLD_MARGIN 10
553 #define DEFAULT_FOLD 60
555 static int fold_margin = FOLD_MARGIN;
559 #ifdef DEFAULT_CODE_JIS
560 # define DEFAULT_CONV j_oconv
562 #ifdef DEFAULT_CODE_SJIS
563 # define DEFAULT_CONV s_oconv
565 #ifdef DEFAULT_CODE_EUC
566 # define DEFAULT_CONV e_oconv
568 #ifdef DEFAULT_CODE_UTF8
569 # define DEFAULT_CONV w_oconv
572 /* process default */
573 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
575 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
576 /* s_iconv or oconv */
577 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
579 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
580 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
587 /* static redirections */
589 static void (*o_putc)(nkf_char c) = std_putc;
591 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
592 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
594 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
595 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
597 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
599 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
600 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
602 /* for strict mime */
603 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
604 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
607 static int output_mode = ASCII, /* output kanji mode */
608 input_mode = ASCII, /* input kanji mode */
609 shift_mode = FALSE; /* TRUE shift out, or X0201 */
610 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
612 /* X0201 / X0208 conversion tables */
614 /* X0201 kana conversion table */
616 static const unsigned char cv[]= {
617 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
618 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
619 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
620 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
621 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
622 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
623 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
624 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
625 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
626 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
627 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
628 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
629 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
630 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
631 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
632 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
636 /* X0201 kana conversion table for dakuten */
638 static const unsigned char dv[]= {
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
644 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
645 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
646 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
647 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
648 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
650 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 /* X0201 kana conversion table for handakuten */
659 static const unsigned char ev[]= {
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
671 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 /* X0208 kigou conversion table */
680 /* 0x8140 - 0x819e */
681 static const unsigned char fv[] = {
683 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
684 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
685 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
686 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
687 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
688 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
689 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
690 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
691 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
692 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
693 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
694 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
699 static int file_out_f = FALSE;
701 static int overwrite_f = FALSE;
702 static int preserve_time_f = FALSE;
703 static int backup_f = FALSE;
704 static char *backup_suffix = "";
705 static char *get_backup_filename(const char *suffix, const char *filename);
708 static int nlmode_f = 0; /* CR, LF, CRLF */
709 static int input_nextline = 0; /* 0: unestablished, EOF: MIXED */
710 static nkf_char prev_cr = 0; /* CR or 0 */
711 #ifdef EASYWIN /*Easy Win */
712 static int end_check;
715 #define STD_GC_BUFSIZE (256)
716 nkf_char std_gc_buf[STD_GC_BUFSIZE];
720 #include "nkf32dll.c"
721 #elif defined(PERL_XS)
723 int main(int argc, char **argv)
728 char *outfname = NULL;
731 #ifdef EASYWIN /*Easy Win */
732 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
735 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
736 cp = (unsigned char *)*argv;
741 if (pipe(fds) < 0 || (pid = fork()) < 0){
752 execvp(argv[1], &argv[1]);
766 if(x0201_f == WISH_TRUE)
767 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
769 if (binmode_f == TRUE)
770 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
771 if (freopen("","wb",stdout) == NULL)
778 setbuf(stdout, (char *) NULL);
780 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
783 if (binmode_f == TRUE)
784 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
785 if (freopen("","rb",stdin) == NULL) return (-1);
789 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
793 kanji_convert(stdin);
794 if (guess_f) print_guessed_code(NULL);
798 int is_argument_error = FALSE;
800 input_codename = NULL;
805 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
808 is_argument_error = TRUE;
816 /* reopen file for stdout */
817 if (file_out_f == TRUE) {
820 outfname = malloc(strlen(origfname)
821 + strlen(".nkftmpXXXXXX")
827 strcpy(outfname, origfname);
831 for (i = strlen(outfname); i; --i){
832 if (outfname[i - 1] == '/'
833 || outfname[i - 1] == '\\'){
839 strcat(outfname, "ntXXXXXX");
841 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
844 strcat(outfname, ".nkftmpXXXXXX");
845 fd = mkstemp(outfname);
848 || (fd_backup = dup(fileno(stdout))) < 0
849 || dup2(fd, fileno(stdout)) < 0
860 outfname = "nkf.out";
863 if(freopen(outfname, "w", stdout) == NULL) {
867 if (binmode_f == TRUE) {
868 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
869 if (freopen("","wb",stdout) == NULL)
876 if (binmode_f == TRUE)
877 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
878 if (freopen("","rb",fin) == NULL)
883 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
887 char *filename = NULL;
889 if (nfiles > 1) filename = origfname;
890 if (guess_f) print_guessed_code(filename);
896 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
904 if (dup2(fd_backup, fileno(stdout)) < 0){
907 if (stat(origfname, &sb)) {
908 fprintf(stderr, "Can't stat %s\n", origfname);
910 /* Restore the original file's permissions. */
911 if (chmod(outfname, sb.st_mode)) {
912 fprintf(stderr, "Can't set permission %s\n", outfname);
915 /* Restore the original file's timestamps. */
917 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
918 tb[0] = tb[1] = sb.st_mtime;
919 if (utime(outfname, tb)) {
920 fprintf(stderr, "Can't set timestamp %s\n", outfname);
923 tb.actime = sb.st_atime;
924 tb.modtime = sb.st_mtime;
925 if (utime(outfname, &tb)) {
926 fprintf(stderr, "Can't set timestamp %s\n", outfname);
931 char *backup_filename = get_backup_filename(backup_suffix, origfname);
933 unlink(backup_filename);
935 if (rename(origfname, backup_filename)) {
936 perror(backup_filename);
937 fprintf(stderr, "Can't rename %s to %s\n",
938 origfname, backup_filename);
942 if (unlink(origfname)){
947 if (rename(outfname, origfname)) {
949 fprintf(stderr, "Can't rename %s to %s\n",
950 outfname, origfname);
957 if (is_argument_error)
960 #ifdef EASYWIN /*Easy Win */
961 if (file_out_f == FALSE)
962 scanf("%d",&end_check);
965 #else /* for Other OS */
966 if (file_out_f == TRUE)
971 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters, each '*' is replaced by
 * a copy of `filename` and every other character of `suffix` is copied
 * verbatim. If `suffix` contains no '*', the result is `filename`
 * immediately followed by `suffix`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller (release
 * it with free()), or NULL after reporting through perror() when the
 * allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t size;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each '*' disappears and is replaced by one copy of filename. */
        size = (suffix_length - asterisk_count)
            + asterisk_count * filename_length + 1;
    } else {
        /*
         * filename + suffix + NUL.  This used to be `malloc( + 1)`, i.e. a
         * one-byte buffer that the following copies overflowed.
         */
        size = filename_length + suffix_length + 1;
    }

    backup_filename = malloc(size);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        /* suffix_length + 1 also copies the terminating NUL. */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1014 static const struct {
1038 {"katakana-hiragana","h3"},
1045 #ifdef UTF8_OUTPUT_ENABLE
1055 {"fb-subchar=", ""},
1057 #ifdef UTF8_INPUT_ENABLE
1058 {"utf8-input", "W"},
1059 {"utf16-input", "W16"},
1060 {"no-cp932ext", ""},
1061 {"no-best-fit-chars",""},
1063 #ifdef UNICODE_NORMALIZATION
1064 {"utf8mac-input", ""},
1076 #ifdef NUMCHAR_OPTION
1077 {"numchar-input", ""},
1083 #ifdef SHIFTJIS_CP932
1093 static int option_mode = 0;
1095 void options(unsigned char *cp)
1099 unsigned char *cp_back = NULL;
1104 while(*cp && *cp++!='-');
1105 while (*cp || cp_back) {
1113 case '-': /* literal options */
1114 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1118 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1119 p = (unsigned char *)long_option[i].name;
1120 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1121 if (*p == cp[j] || cp[j] == SP){
1128 while(*cp && *cp != SP && cp++);
1129 if (long_option[i].alias[0]){
1131 cp = (unsigned char *)long_option[i].alias;
1133 if (strcmp(long_option[i].name, "ic=") == 0){
1134 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1135 codeset[i] = nkf_toupper(p[i]);
1138 if(strcmp(codeset, "ISO-2022-JP") == 0){
1139 input_f = JIS_INPUT;
1140 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1141 strcmp(codeset, "CP50220") == 0 ||
1142 strcmp(codeset, "CP50221") == 0 ||
1143 strcmp(codeset, "CP50222") == 0){
1144 input_f = JIS_INPUT;
1145 #ifdef SHIFTJIS_CP932
1148 #ifdef UTF8_OUTPUT_ENABLE
1149 ms_ucs_map_f = UCS_MAP_CP932;
1151 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1152 input_f = JIS_INPUT;
1156 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1157 input_f = JIS_INPUT;
1162 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1163 input_f = SJIS_INPUT;
1164 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1165 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1166 strcmp(codeset, "CP932") == 0 ||
1167 strcmp(codeset, "MS932") == 0){
1168 input_f = SJIS_INPUT;
1169 #ifdef SHIFTJIS_CP932
1172 #ifdef UTF8_OUTPUT_ENABLE
1173 ms_ucs_map_f = UCS_MAP_CP932;
1175 }else if(strcmp(codeset, "CP10001") == 0){
1176 input_f = SJIS_INPUT;
1177 #ifdef SHIFTJIS_CP932
1180 #ifdef UTF8_OUTPUT_ENABLE
1181 ms_ucs_map_f = UCS_MAP_CP10001;
1183 }else if(strcmp(codeset, "EUCJP") == 0 ||
1184 strcmp(codeset, "EUC-JP") == 0){
1185 input_f = EUC_INPUT;
1186 }else if(strcmp(codeset, "CP51932") == 0){
1187 input_f = EUC_INPUT;
1188 #ifdef SHIFTJIS_CP932
1191 #ifdef UTF8_OUTPUT_ENABLE
1192 ms_ucs_map_f = UCS_MAP_CP932;
1194 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1195 strcmp(codeset, "EUCJP-MS") == 0 ||
1196 strcmp(codeset, "EUCJPMS") == 0){
1197 input_f = EUC_INPUT;
1198 #ifdef SHIFTJIS_CP932
1201 #ifdef UTF8_OUTPUT_ENABLE
1202 ms_ucs_map_f = UCS_MAP_MS;
1204 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1205 strcmp(codeset, "EUCJP-ASCII") == 0){
1206 input_f = EUC_INPUT;
1207 #ifdef SHIFTJIS_CP932
1210 #ifdef UTF8_OUTPUT_ENABLE
1211 ms_ucs_map_f = UCS_MAP_ASCII;
1213 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1214 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1215 input_f = SJIS_INPUT;
1217 #ifdef SHIFTJIS_CP932
1220 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1221 strcmp(codeset, "EUC-JIS-2004") == 0){
1222 input_f = EUC_INPUT;
1224 #ifdef SHIFTJIS_CP932
1227 #ifdef UTF8_INPUT_ENABLE
1228 }else if(strcmp(codeset, "UTF-8") == 0 ||
1229 strcmp(codeset, "UTF-8N") == 0 ||
1230 strcmp(codeset, "UTF-8-BOM") == 0){
1231 input_f = UTF8_INPUT;
1232 #ifdef UNICODE_NORMALIZATION
1233 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1234 strcmp(codeset, "UTF-8-MAC") == 0){
1235 input_f = UTF8_INPUT;
1238 }else if(strcmp(codeset, "UTF-16") == 0 ||
1239 strcmp(codeset, "UTF-16BE") == 0 ||
1240 strcmp(codeset, "UTF-16BE-BOM") == 0){
1241 input_f = UTF16_INPUT;
1242 input_endian = ENDIAN_BIG;
1243 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1244 strcmp(codeset, "UTF-16LE-BOM") == 0){
1245 input_f = UTF16_INPUT;
1246 input_endian = ENDIAN_LITTLE;
1247 }else if(strcmp(codeset, "UTF-32") == 0 ||
1248 strcmp(codeset, "UTF-32BE") == 0 ||
1249 strcmp(codeset, "UTF-32BE-BOM") == 0){
1250 input_f = UTF32_INPUT;
1251 input_endian = ENDIAN_BIG;
1252 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1253 strcmp(codeset, "UTF-32LE-BOM") == 0){
1254 input_f = UTF32_INPUT;
1255 input_endian = ENDIAN_LITTLE;
1260 if (strcmp(long_option[i].name, "oc=") == 0){
1262 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1263 codeset[i] = nkf_toupper(p[i]);
1266 if(strcmp(codeset, "ISO-2022-JP") == 0){
1267 output_conv = j_oconv;
1268 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1269 output_conv = j_oconv;
1270 no_cp932ext_f = TRUE;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_CP932;
1277 }else if(strcmp(codeset, "CP50220") == 0){
1278 output_conv = j_oconv;
1280 #ifdef SHIFTJIS_CP932
1283 #ifdef UTF8_OUTPUT_ENABLE
1284 ms_ucs_map_f = UCS_MAP_CP932;
1286 }else if(strcmp(codeset, "CP50221") == 0){
1287 output_conv = j_oconv;
1288 #ifdef SHIFTJIS_CP932
1291 #ifdef UTF8_OUTPUT_ENABLE
1292 ms_ucs_map_f = UCS_MAP_CP932;
1294 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1295 output_conv = j_oconv;
1299 #ifdef SHIFTJIS_CP932
1302 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1303 output_conv = j_oconv;
1308 #ifdef SHIFTJIS_CP932
1311 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1312 output_conv = s_oconv;
1313 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1314 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1315 strcmp(codeset, "CP932") == 0 ||
1316 strcmp(codeset, "MS932") == 0){
1317 output_conv = s_oconv;
1318 #ifdef UTF8_OUTPUT_ENABLE
1319 ms_ucs_map_f = UCS_MAP_CP932;
1321 }else if(strcmp(codeset, "CP10001") == 0){
1322 output_conv = s_oconv;
1323 #ifdef UTF8_OUTPUT_ENABLE
1324 ms_ucs_map_f = UCS_MAP_CP10001;
1326 }else if(strcmp(codeset, "EUCJP") == 0 ||
1327 strcmp(codeset, "EUC-JP") == 0){
1328 output_conv = e_oconv;
1329 }else if(strcmp(codeset, "CP51932") == 0){
1330 output_conv = e_oconv;
1331 #ifdef SHIFTJIS_CP932
1334 #ifdef UTF8_OUTPUT_ENABLE
1335 ms_ucs_map_f = UCS_MAP_CP932;
1337 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1338 strcmp(codeset, "EUCJP-MS") == 0 ||
1339 strcmp(codeset, "EUCJPMS") == 0){
1340 output_conv = e_oconv;
1344 #ifdef UTF8_OUTPUT_ENABLE
1345 ms_ucs_map_f = UCS_MAP_MS;
1347 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1348 strcmp(codeset, "EUCJP-ASCII") == 0){
1349 output_conv = e_oconv;
1353 #ifdef UTF8_OUTPUT_ENABLE
1354 ms_ucs_map_f = UCS_MAP_ASCII;
1356 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1357 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1358 output_conv = s_oconv;
1360 #ifdef SHIFTJIS_CP932
1363 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1364 strcmp(codeset, "EUC-JIS-2004") == 0){
1365 output_conv = e_oconv;
1370 #ifdef SHIFTJIS_CP932
1373 #ifdef UTF8_OUTPUT_ENABLE
1374 }else if(strcmp(codeset, "UTF-8") == 0){
1375 output_conv = w_oconv;
1376 }else if(strcmp(codeset, "UTF-8N") == 0){
1377 output_conv = w_oconv;
1378 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1379 output_conv = w_oconv;
1380 output_bom_f = TRUE;
1381 }else if(strcmp(codeset, "UTF-16BE") == 0){
1382 output_conv = w_oconv16;
1383 }else if(strcmp(codeset, "UTF-16") == 0 ||
1384 strcmp(codeset, "UTF-16BE-BOM") == 0){
1385 output_conv = w_oconv16;
1386 output_bom_f = TRUE;
1387 }else if(strcmp(codeset, "UTF-16LE") == 0){
1388 output_conv = w_oconv16;
1389 output_endian = ENDIAN_LITTLE;
1390 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1391 output_conv = w_oconv16;
1392 output_endian = ENDIAN_LITTLE;
1393 output_bom_f = TRUE;
1394 }else if(strcmp(codeset, "UTF-32") == 0 ||
1395 strcmp(codeset, "UTF-32BE") == 0){
1396 output_conv = w_oconv32;
1397 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1398 output_conv = w_oconv32;
1399 output_bom_f = TRUE;
1400 }else if(strcmp(codeset, "UTF-32LE") == 0){
1401 output_conv = w_oconv32;
1402 output_endian = ENDIAN_LITTLE;
1403 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1404 output_conv = w_oconv32;
1405 output_endian = ENDIAN_LITTLE;
1406 output_bom_f = TRUE;
1412 if (strcmp(long_option[i].name, "overwrite") == 0){
1415 preserve_time_f = TRUE;
1418 if (strcmp(long_option[i].name, "overwrite=") == 0){
1421 preserve_time_f = TRUE;
1423 backup_suffix = malloc(strlen((char *) p) + 1);
1424 strcpy(backup_suffix, (char *) p);
1427 if (strcmp(long_option[i].name, "in-place") == 0){
1430 preserve_time_f = FALSE;
1433 if (strcmp(long_option[i].name, "in-place=") == 0){
1436 preserve_time_f = FALSE;
1438 backup_suffix = malloc(strlen((char *) p) + 1);
1439 strcpy(backup_suffix, (char *) p);
1444 if (strcmp(long_option[i].name, "cap-input") == 0){
1448 if (strcmp(long_option[i].name, "url-input") == 0){
1453 #ifdef NUMCHAR_OPTION
1454 if (strcmp(long_option[i].name, "numchar-input") == 0){
1460 if (strcmp(long_option[i].name, "no-output") == 0){
1464 if (strcmp(long_option[i].name, "debug") == 0){
1469 if (strcmp(long_option[i].name, "cp932") == 0){
1470 #ifdef SHIFTJIS_CP932
1474 #ifdef UTF8_OUTPUT_ENABLE
1475 ms_ucs_map_f = UCS_MAP_CP932;
1479 if (strcmp(long_option[i].name, "no-cp932") == 0){
1480 #ifdef SHIFTJIS_CP932
1484 #ifdef UTF8_OUTPUT_ENABLE
1485 ms_ucs_map_f = UCS_MAP_ASCII;
1489 #ifdef SHIFTJIS_CP932
1490 if (strcmp(long_option[i].name, "cp932inv") == 0){
1497 if (strcmp(long_option[i].name, "x0212") == 0){
1504 if (strcmp(long_option[i].name, "exec-in") == 0){
1508 if (strcmp(long_option[i].name, "exec-out") == 0){
1513 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1514 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1515 no_cp932ext_f = TRUE;
1518 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1519 no_best_fit_chars_f = TRUE;
1522 if (strcmp(long_option[i].name, "fb-skip") == 0){
1523 encode_fallback = NULL;
1526 if (strcmp(long_option[i].name, "fb-html") == 0){
1527 encode_fallback = encode_fallback_html;
1530 if (strcmp(long_option[i].name, "fb-xml") == 0){
1531 encode_fallback = encode_fallback_xml;
1534 if (strcmp(long_option[i].name, "fb-java") == 0){
1535 encode_fallback = encode_fallback_java;
1538 if (strcmp(long_option[i].name, "fb-perl") == 0){
1539 encode_fallback = encode_fallback_perl;
1542 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1543 encode_fallback = encode_fallback_subchar;
1546 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1547 encode_fallback = encode_fallback_subchar;
1548 unicode_subchar = 0;
1550 /* decimal number */
1551 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1552 unicode_subchar *= 10;
1553 unicode_subchar += hex2bin(p[i]);
1555 }else if(p[1] == 'x' || p[1] == 'X'){
1556 /* hexadecimal number */
1557 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1558 unicode_subchar <<= 4;
1559 unicode_subchar |= hex2bin(p[i]);
1563 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1564 unicode_subchar *= 8;
1565 unicode_subchar += hex2bin(p[i]);
1568 w16e_conv(unicode_subchar, &i, &j);
1569 unicode_subchar = i<<8 | j;
1573 #ifdef UTF8_OUTPUT_ENABLE
1574 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1575 ms_ucs_map_f = UCS_MAP_MS;
1579 #ifdef UNICODE_NORMALIZATION
1580 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1581 input_f = UTF8_INPUT;
1586 if (strcmp(long_option[i].name, "prefix=") == 0){
1587 if (nkf_isgraph(p[0])){
1588 for (i = 1; nkf_isgraph(p[i]); i++){
1589 prefix_table[p[i]] = p[0];
1596 case 'b': /* buffered mode */
1599 case 'u': /* non bufferd mode */
1602 case 't': /* transparent mode */
1607 } else if (*cp=='2') {
1611 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1619 case 'j': /* JIS output */
1621 output_conv = j_oconv;
1623 case 'e': /* AT&T EUC output */
1624 output_conv = e_oconv;
1627 case 's': /* SJIS output */
1628 output_conv = s_oconv;
1630 case 'l': /* ISO8859 Latin-1 support, no conversion */
1631 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1632 input_f = LATIN1_INPUT;
1634 case 'i': /* Kanji IN ESC-$-@/B */
1635 if (*cp=='@'||*cp=='B')
1636 kanji_intro = *cp++;
1638 case 'o': /* ASCII IN ESC-(-J/B */
1639 if (*cp=='J'||*cp=='B'||*cp=='H')
1640 ascii_intro = *cp++;
1644 bit:1 katakana->hiragana
1645 bit:2 hiragana->katakana
1647 if ('9'>= *cp && *cp>='0')
1648 hira_f |= (*cp++ -'0');
1655 #if defined(MSDOS) || defined(__OS2__)
1670 #ifdef UTF8_OUTPUT_ENABLE
1671 case 'w': /* UTF-8 output */
1673 output_conv = w_oconv; cp++;
1677 output_bom_f = TRUE;
1680 if ('1'== cp[0] && '6'==cp[1]) {
1681 output_conv = w_oconv16; cp+=2;
1682 } else if ('3'== cp[0] && '2'==cp[1]) {
1683 output_conv = w_oconv32; cp+=2;
1685 output_conv = w_oconv;
1690 output_endian = ENDIAN_LITTLE;
1691 } else if (cp[0] == 'B') {
1699 output_bom_f = TRUE;
1704 #ifdef UTF8_INPUT_ENABLE
1705 case 'W': /* UTF input */
1708 input_f = UTF8_INPUT;
1710 if ('1'== cp[0] && '6'==cp[1]) {
1712 input_f = UTF16_INPUT;
1713 input_endian = ENDIAN_BIG;
1714 } else if ('3'== cp[0] && '2'==cp[1]) {
1716 input_f = UTF32_INPUT;
1717 input_endian = ENDIAN_BIG;
1719 input_f = UTF8_INPUT;
1724 input_endian = ENDIAN_LITTLE;
1725 } else if (cp[0] == 'B') {
1731 /* Input code assumption */
1732 case 'J': /* JIS input */
1733 input_f = JIS_INPUT;
1735 case 'E': /* AT&T EUC input */
1736 input_f = EUC_INPUT;
1738 case 'S': /* MS Kanji input */
1739 input_f = SJIS_INPUT;
1740 if (x0201_f==NO_X0201) x0201_f=TRUE;
1742 case 'Z': /* Convert X0208 alphabet to asii */
1744 bit:0 Convert JIS X 0208 Alphabet to ASCII
1745 bit:1 Convert Kankaku to one space
1746 bit:2 Convert Kankaku to two spaces
1747 bit:3 Convert HTML Entity
1748 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1750 while ('0'<= *cp && *cp <='9') {
1751 alpha_f |= 1 << (*cp++ - '0');
1753 if (!alpha_f) alpha_f = 1;
1755 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1756 x0201_f = FALSE; /* No X0201->X0208 conversion */
1758 ESC-(-I in JIS, EUC, MS Kanji
1759 SI/SO in JIS, EUC, MS Kanji
1760 SSO in EUC, JIS, not in MS Kanji
1761 MS Kanji (0xa0-0xdf)
1763 ESC-(-I in JIS (0x20-0x5f)
1764 SSO in EUC (0xa0-0xdf)
1765 0xa0-0xd in MS Kanji (0xa0-0xdf)
1768 case 'X': /* Assume X0201 kana */
1769 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1772 case 'F': /* prserve new lines */
1773 fold_preserve_f = TRUE;
1774 case 'f': /* folding -f60 or -f */
1777 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1779 fold_len += *cp++ - '0';
1781 if (!(0<fold_len && fold_len<BUFSIZ))
1782 fold_len = DEFAULT_FOLD;
1786 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1788 fold_margin += *cp++ - '0';
1792 case 'm': /* MIME support */
1793 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1794 if (*cp=='B'||*cp=='Q') {
1795 mime_decode_mode = *cp++;
1796 mimebuf_f = FIXED_MIME;
1797 } else if (*cp=='N') {
1798 mime_f = TRUE; cp++;
1799 } else if (*cp=='S') {
1800 mime_f = STRICT_MIME; cp++;
1801 } else if (*cp=='0') {
1802 mime_decode_f = FALSE;
1803 mime_f = FALSE; cp++;
1806 case 'M': /* MIME output */
1809 mimeout_f = FIXED_MIME; cp++;
1810 } else if (*cp=='Q') {
1812 mimeout_f = FIXED_MIME; cp++;
1817 case 'B': /* Broken JIS support */
1819 bit:1 allow any x on ESC-(-x or ESC-$-x
1820 bit:2 reset to ascii on NL
1822 if ('9'>= *cp && *cp>='0')
1823 broken_f |= 1<<(*cp++ -'0');
1828 case 'O':/* for Output file */
1832 case 'c':/* add cr code */
1835 case 'd':/* delete cr code */
1838 case 'I': /* ISO-2022-JP output */
1841 case 'L': /* line mode */
1842 if (*cp=='u') { /* unix */
1843 nlmode_f = LF; cp++;
1844 } else if (*cp=='m') { /* mac */
1845 nlmode_f = CR; cp++;
1846 } else if (*cp=='w') { /* windows */
1847 nlmode_f = CRLF; cp++;
1848 } else if (*cp=='0') { /* no conversion */
1858 /* module: multiple options in a string are allowed for the Perl module */
1859 while(*cp && *cp++!='-');
1862 /* bogus option but ignored */
/*
 * Look up an input_code entry by its converter function.
 * The visible code starts at the head of input_code_list and compares
 * p->iconv_func against the iconv_func argument.
 * NOTE(review): the loop and the return statements are not visible in this
 * excerpt; presumably the matching entry (or 0) is returned -- confirm.
 */
1868 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1871 struct input_code *p = input_code_list;
1873 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *
 * f          - mode flag; per the visible condition, -TRUE means force.
 * iconv_func - candidate converter function.
 *
 * When estab_f is set and iconv differs from iconv_for_check, the visible
 * code looks up the input_code entry for iconv with find_inputcode_byfunc,
 * passes its name to set_input_codename and stores iconv in iconv_for_check.
 * NOTE(review): the lines between the visible ones are missing from this
 * excerpt, so the exact guards are unknown -- confirm against the full file.
 */
1882 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1884 #ifdef INPUT_CODE_FIX
1892 #ifdef INPUT_CODE_FIX
1893 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1899 if (estab_f && iconv_for_check != iconv){
1900 struct input_code *p = find_inputcode_byfunc(iconv);
1902 set_input_codename(p->name);
1905 iconv_for_check = iconv;
/*
 * Score bits OR-ed into input_code.score by set_code_score.
 * Each constant is the previous one shifted left by one bit.
 * Candidates are compared with p->score < result->score elsewhere in this
 * file, so a lower score is preferred.
 */
1910 #define SCORE_L2 (1) /* JIS level-2 kanji */
1911 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1912 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1913 #ifdef SHIFTJIS_CP932
1914 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932 substitution (character remapped via CP932) */
1915 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1917 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1919 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1920 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1922 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score lookup used by code_score when (c2 & 0x70) == 0x20;
 * the index is c2 & 0x0f.
 * NOTE(review): only part of the initializer is visible in this excerpt.
 */
1924 static const char score_table_A0[] = {
1927 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1928 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score lookup used by code_score when (c2 & 0x70) == 0x70;
 * the index is c2 & 0x0f.
 */
1931 static const char score_table_F0[] = {
1932 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1933 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1934 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1935 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * Set the given score bits on a candidate: ptr->score |= score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
1938 void set_code_score(struct input_code *ptr, nkf_char score)
1941 ptr->score |= score;
/*
 * Clear the given score bits on a candidate: ptr->score &= ~score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
1945 void clr_code_score(struct input_code *ptr, nkf_char score)
1948 ptr->score &= ~score;
/*
 * Update ptr->score from the character currently held in ptr->buf.
 * c2 is ptr->buf[0]; c1 (ptr->buf[1]) is only read when UTF8_OUTPUT_ENABLE
 * is defined, where it is passed to e2w_conv.
 * Visible classification, in order:
 *   - (condition not visible in this excerpt) -> SCORE_ERROR
 *   - c2 == SSO                               -> SCORE_KANA
 *   - e2w_conv(c2, c1) is zero                -> SCORE_NO_EXIST
 *   - (c2 & 0x70) == 0x20                     -> score_table_A0[c2 & 0x0f]
 *   - (c2 & 0x70) == 0x70                     -> score_table_F0[c2 & 0x0f]
 *   - (c2 & 0x70) >= 0x50                     -> SCORE_L2
 */
1952 void code_score(struct input_code *ptr)
1954 nkf_char c2 = ptr->buf[0];
1955 #ifdef UTF8_OUTPUT_ENABLE
1956 nkf_char c1 = ptr->buf[1];
1959 set_code_score(ptr, SCORE_ERROR);
1960 }else if (c2 == SSO){
1961 set_code_score(ptr, SCORE_KANA);
1962 #ifdef UTF8_OUTPUT_ENABLE
1963 }else if (!e2w_conv(c2, c1)){
1964 set_code_score(ptr, SCORE_NO_EXIST);
1966 }else if ((c2 & 0x70) == 0x20){
1967 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1968 }else if ((c2 & 0x70) == 0x70){
1969 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1970 }else if ((c2 & 0x70) >= 0x50){
1971 set_code_score(ptr, SCORE_L2);
/*
 * Mark a candidate encoding as ruled out.
 * If the active converter (iconv) is the one belonging to this candidate,
 * it is reset through set_iconv(FALSE, 0).
 * NOTE(review): the lines that update ptr itself are not visible here.
 */
1975 void status_disable(struct input_code *ptr)
1980 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append byte c to the candidate buffer: stores c at ptr->buf[ptr->index]
 * and advances ptr->index. No bounds check is visible in this excerpt.
 */
1983 void status_push_ch(struct input_code *ptr, nkf_char c)
1985 ptr->buf[ptr->index++] = c;
/*
 * NOTE(review): only the signature is visible in this excerpt; presumably
 * this clears the per-character state of the candidate -- confirm.
 */
1988 void status_clear(struct input_code *ptr)
/*
 * Reset a candidate; the visible line restores ptr->score to SCORE_INIT.
 * NOTE(review): other statements of the body are not visible here.
 */
1994 void status_reset(struct input_code *ptr)
1997 ptr->score = SCORE_INIT;
/*
 * Reinitialize a candidate; the visible line zeroes ptr->_file_stat.
 * NOTE(review): other statements of the body are not visible here.
 */
2000 void status_reinit(struct input_code *ptr)
2003 ptr->_file_stat = 0;
/*
 * Common check run by the *_status functions for each byte.
 * The visible condition fires when c <= DEL and the input encoding is
 * already established (estab_f).
 * NOTE(review): the action taken inside the branch is not visible here.
 */
2006 void status_check(struct input_code *ptr, nkf_char c)
2008 if (c <= DEL && estab_f){
/*
 * Feed one byte c to the Shift_JIS candidate state (ptr).
 *
 * Visible first-byte handling:
 *   - 0xa1..0xdf: pushed as SSO followed by the byte itself
 *   - 0x81..0x9f and 0xe0..0xef: pushed as a lead byte
 *   - bytes accepted by is_ibmext_in_sjis (SHIFTJIS_CP932 builds)
 *   - 0xf0..0xfc when x0212_f is set
 *   - otherwise the candidate is disabled with status_disable
 * Visible second-byte handling: 0x40..0x7e or 0x80..0xfc is pushed and the
 * pair is converted in place by s2e_conv; anything else disables the
 * candidate. In the SHIFTJIS_CP932 branch a zero return from s2e_conv sets
 * SCORE_CP932.
 * NOTE(review): the switch/case lines and stat updates are missing from this
 * excerpt.
 */
2013 void s_status(struct input_code *ptr, nkf_char c)
2017 status_check(ptr, c);
2022 #ifdef NUMCHAR_OPTION
2023 }else if (is_unicode_capsule(c)){
2026 }else if (0xa1 <= c && c <= 0xdf){
2027 status_push_ch(ptr, SSO);
2028 status_push_ch(ptr, c);
2031 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2033 status_push_ch(ptr, c);
2034 #ifdef SHIFTJIS_CP932
2036 && is_ibmext_in_sjis(c)){
2038 status_push_ch(ptr, c);
2039 #endif /* SHIFTJIS_CP932 */
2041 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2043 status_push_ch(ptr, c);
2044 #endif /* X0212_ENABLE */
2046 status_disable(ptr);
2050 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2051 status_push_ch(ptr, c);
2052 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2056 status_disable(ptr);
2060 #ifdef SHIFTJIS_CP932
2061 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2062 status_push_ch(ptr, c);
2063 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2064 set_code_score(ptr, SCORE_CP932);
2069 #endif /* SHIFTJIS_CP932 */
2070 #ifndef X0212_ENABLE
2071 status_disable(ptr);
/*
 * Feed one byte c to the EUC candidate state (ptr).
 *
 * Visible first-byte handling: SSO or 0xa1..0xfe is pushed as a lead byte;
 * 0x8f is pushed as well (the closing #endif names X0212_ENABLE);
 * otherwise the candidate is disabled.
 * Visible trailing-byte handling: 0xa1..0xfe is pushed, anything else
 * disables the candidate.
 * NOTE(review): the switch/case lines and stat updates are missing from this
 * excerpt.
 */
2077 void e_status(struct input_code *ptr, nkf_char c)
2081 status_check(ptr, c);
2086 #ifdef NUMCHAR_OPTION
2087 }else if (is_unicode_capsule(c)){
2090 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2092 status_push_ch(ptr, c);
2094 }else if (0x8f == c){
2096 status_push_ch(ptr, c);
2097 #endif /* X0212_ENABLE */
2099 status_disable(ptr);
2103 if (0xa1 <= c && c <= 0xfe){
2104 status_push_ch(ptr, c);
2108 status_disable(ptr);
2113 if (0xa1 <= c && c <= 0xfe){
2115 status_push_ch(ptr, c);
2117 status_disable(ptr);
2119 #endif /* X0212_ENABLE */
2123 #ifdef UTF8_INPUT_ENABLE
/*
 * Feed one byte c to the UTF-8 candidate state (ptr).
 *
 * Visible lead-byte handling: 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are
 * pushed; otherwise the candidate is disabled.
 * Visible continuation handling: 0x80..0xbf is pushed; once ptr->index
 * exceeds ptr->stat the first three buffered bytes are tested against
 * EF BB BF (bom) and converted in place with w2e_conv. In the last visible
 * branch a continuation byte is pushed only while ptr->index < ptr->stat.
 * Any other byte disables the candidate.
 * NOTE(review): the switch/case lines, the use of bom and the stat updates
 * are missing from this excerpt.
 */
2124 void w_status(struct input_code *ptr, nkf_char c)
2128 status_check(ptr, c);
2133 #ifdef NUMCHAR_OPTION
2134 }else if (is_unicode_capsule(c)){
2137 }else if (0xc0 <= c && c <= 0xdf){
2139 status_push_ch(ptr, c);
2140 }else if (0xe0 <= c && c <= 0xef){
2142 status_push_ch(ptr, c);
2143 }else if (0xf0 <= c && c <= 0xf4){
2145 status_push_ch(ptr, c);
2147 status_disable(ptr);
2152 if (0x80 <= c && c <= 0xbf){
2153 status_push_ch(ptr, c);
2154 if (ptr->index > ptr->stat){
2155 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2156 && ptr->buf[2] == 0xbf);
2157 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2158 &ptr->buf[0], &ptr->buf[1]);
2165 status_disable(ptr);
2169 if (0x80 <= c && c <= 0xbf){
2170 if (ptr->index < ptr->stat){
2171 status_push_ch(ptr, c);
2176 status_disable(ptr);
/*
 * Feed one input byte c to the candidate encodings in input_code_list by
 * calling p->status_func(p, c).
 * If a result candidate was chosen and the encoding is not yet established
 * (estab_f is false), it is installed with set_iconv(TRUE, ...).
 * Otherwise, when c <= DEL, the list is walked again from its head.
 * NOTE(review): how result and action_flag are computed and what the second
 * walk does are not visible in this excerpt.
 */
2183 void code_status(nkf_char c)
2185 int action_flag = 1;
2186 struct input_code *result = 0;
2187 struct input_code *p = input_code_list;
2189 if (!p->status_func) {
2193 if (!p->status_func)
2195 (p->status_func)(p, c);
2198 }else if(p->stat == 0){
2209 if (result && !estab_f){
2210 set_iconv(TRUE, result->iconv_func);
2211 }else if (c <= DEL){
2212 struct input_code *ptr = input_code_list;
/*
 * Base character reader for stream f.
 * The visible line returns the most recently pushed-back character from
 * std_gc_buf, decrementing std_gc_ndx.
 * NOTE(review): the guard on std_gc_ndx and the fallback read from f are not
 * visible in this excerpt.
 */
2222 nkf_char std_getc(FILE *f)
2225 return std_gc_buf[--std_gc_ndx];
/*
 * Push character c back so that std_getc returns it next.
 * The character is stored at std_gc_buf[std_gc_ndx] and the index advances.
 * The buffer-full case (std_gc_ndx == STD_GC_BUFSIZE) is tested first.
 * NOTE(review): the handling of the full case and the return value are not
 * visible in this excerpt.
 */
2231 nkf_char std_ungetc(nkf_char c, FILE *f)
2233 if (std_gc_ndx == STD_GC_BUFSIZE){
2236 std_gc_buf[std_gc_ndx++] = c;
/*
 * NOTE(review): only the signature is visible in this excerpt; presumably
 * this is the base output routine for one character -- confirm.
 */
2241 void std_putc(nkf_char c)
2248 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode: sets up the filter chain with module_connection and
 * then reads characters from f through i_getc until EOF.
 * NOTE(review): the loop body and return value are not visible here.
 */
2249 nkf_char noconvert(FILE *f)
2254 module_connection();
2255 while ((c = (*i_getc)(f)) != EOF)
/*
 * Wire up the conversion pipeline according to the option flags.
 *
 * Output side: oconv starts as output_conv; each enabled filter saves the
 * current oconv into its own o_* slot and installs itself as oconv, so the
 * chain is built from the output end backwards.
 * Input side: the same pattern is applied to i_getc / i_ungetc, each reader
 * saving the previous pair before replacing it.
 * Finally the input converter is chosen from input_f with
 * set_iconv(-TRUE, ...) (forced); the last visible call is
 * set_iconv(FALSE, e_iconv).
 * NOTE(review): several guarding conditions are missing from this excerpt.
 */
2262 void module_connection(void)
2264 oconv = output_conv;
2267 /* replace continuation module, from output side */
2269 /* output redirection */
2271 if (noout_f || guess_f){
2278 if (mimeout_f == TRUE) {
2279 o_base64conv = oconv; oconv = base64_conv;
2281 /* base64_count = 0; */
2284 if (nlmode_f || guess_f) {
2285 o_nlconv = oconv; oconv = nl_conv;
2288 o_rot_conv = oconv; oconv = rot_conv;
2291 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2294 o_hira_conv = oconv; oconv = hira_conv;
2297 o_fconv = oconv; oconv = fold_conv;
2300 if (alpha_f || x0201_f) {
2301 o_zconv = oconv; oconv = z_conv;
2305 i_ungetc = std_ungetc;
2306 /* input redirection */
2309 i_cgetc = i_getc; i_getc = cap_getc;
2310 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2313 i_ugetc = i_getc; i_getc = url_getc;
2314 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2317 #ifdef NUMCHAR_OPTION
2319 i_ngetc = i_getc; i_getc = numchar_getc;
2320 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2323 #ifdef UNICODE_NORMALIZATION
2324 if (nfc_f && input_f == UTF8_INPUT){
2325 i_nfc_getc = i_getc; i_getc = nfc_getc;
2326 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2329 if (mime_f && mimebuf_f==FIXED_MIME) {
2330 i_mgetc = i_getc; i_getc = mime_getc;
2331 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2334 i_bgetc = i_getc; i_getc = broken_getc;
2335 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2337 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2338 set_iconv(-TRUE, e_iconv);
2339 } else if (input_f == SJIS_INPUT) {
2340 set_iconv(-TRUE, s_iconv);
2341 #ifdef UTF8_INPUT_ENABLE
2342 } else if (input_f == UTF8_INPUT) {
2343 set_iconv(-TRUE, w_iconv);
2344 } else if (input_f == UTF16_INPUT) {
2345 set_iconv(-TRUE, w_iconv16);
2346 } else if (input_f == UTF32_INPUT) {
2347 set_iconv(-TRUE, w_iconv32);
2350 set_iconv(FALSE, e_iconv);
2354 struct input_code *p = input_code_list;
2362 * Check and Ignore BOM
/*
 * Detect a Unicode byte order mark at the start of stream f.
 *
 * Bytes are read with i_getc; when a sequence does not match, the bytes
 * already consumed are pushed back with i_ungetc in reverse order.
 * Byte sequences recognized by the visible code:
 *   00 00 FE FF -> w_iconv32, input_endian = ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, input_endian = ENDIAN_2143
 *   EF BB BF    -> w_iconv (UTF-8)
 *   FE FF 00 00 -> w_iconv32, input_endian = ENDIAN_3412
 *   FE FF       -> w_iconv16, input_endian = ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, input_endian = ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, input_endian = ENDIAN_LITTLE
 * The endianness is only stored when iconv already equals the matching
 * converter.
 * NOTE(review): the case labels and the guards around the set_iconv calls
 * are missing from this excerpt; the first bytes above are inferred from the
 * values pushed back by the final i_ungetc of each case.
 */
2364 void check_bom(FILE *f)
2367 switch(c2 = (*i_getc)(f)){
2369 if((c2 = (*i_getc)(f)) == 0x00){
2370 if((c2 = (*i_getc)(f)) == 0xFE){
2371 if((c2 = (*i_getc)(f)) == 0xFF){
2373 set_iconv(TRUE, w_iconv32);
2375 if (iconv == w_iconv32) {
2376 input_endian = ENDIAN_BIG;
2379 (*i_ungetc)(0xFF,f);
2380 }else (*i_ungetc)(c2,f);
2381 (*i_ungetc)(0xFE,f);
2382 }else if(c2 == 0xFF){
2383 if((c2 = (*i_getc)(f)) == 0xFE){
2385 set_iconv(TRUE, w_iconv32);
2387 if (iconv == w_iconv32) {
2388 input_endian = ENDIAN_2143;
2391 (*i_ungetc)(0xFF,f);
2392 }else (*i_ungetc)(c2,f);
2393 (*i_ungetc)(0xFF,f);
2394 }else (*i_ungetc)(c2,f);
2395 (*i_ungetc)(0x00,f);
2396 }else (*i_ungetc)(c2,f);
2397 (*i_ungetc)(0x00,f);
2400 if((c2 = (*i_getc)(f)) == 0xBB){
2401 if((c2 = (*i_getc)(f)) == 0xBF){
2403 set_iconv(TRUE, w_iconv);
2405 if (iconv == w_iconv) {
2408 (*i_ungetc)(0xBF,f);
2409 }else (*i_ungetc)(c2,f);
2410 (*i_ungetc)(0xBB,f);
2411 }else (*i_ungetc)(c2,f);
2412 (*i_ungetc)(0xEF,f);
2415 if((c2 = (*i_getc)(f)) == 0xFF){
2416 if((c2 = (*i_getc)(f)) == 0x00){
2417 if((c2 = (*i_getc)(f)) == 0x00){
2419 set_iconv(TRUE, w_iconv32);
2421 if (iconv == w_iconv32) {
2422 input_endian = ENDIAN_3412;
2425 (*i_ungetc)(0x00,f);
2426 }else (*i_ungetc)(c2,f);
2427 (*i_ungetc)(0x00,f);
2428 }else (*i_ungetc)(c2,f);
2430 set_iconv(TRUE, w_iconv16);
2432 if (iconv == w_iconv16) {
2433 input_endian = ENDIAN_BIG;
2436 (*i_ungetc)(0xFF,f);
2437 }else (*i_ungetc)(c2,f);
2438 (*i_ungetc)(0xFE,f);
2441 if((c2 = (*i_getc)(f)) == 0xFE){
2442 if((c2 = (*i_getc)(f)) == 0x00){
2443 if((c2 = (*i_getc)(f)) == 0x00){
2445 set_iconv(TRUE, w_iconv32);
2447 if (iconv == w_iconv32) {
2448 input_endian = ENDIAN_LITTLE;
2451 (*i_ungetc)(0x00,f);
2452 }else (*i_ungetc)(c2,f);
2453 (*i_ungetc)(0x00,f);
2454 }else (*i_ungetc)(c2,f);
2456 set_iconv(TRUE, w_iconv16);
2458 if (iconv == w_iconv16) {
2459 input_endian = ENDIAN_LITTLE;
2462 (*i_ungetc)(0xFE,f);
2463 }else (*i_ungetc)(c2,f);
2464 (*i_ungetc)(0xFF,f);
2473 Conversion main loop. Code detection only.
/*
 * Main conversion loop for one input stream f.
 *
 * After module_connection() has wired the pipeline, bytes are read through
 * i_getc until EOF. c2 holds a pending first byte and c1 the current byte;
 * complete characters are handed to (*iconv)(c2, c1, c0) or directly to
 * (*oconv). The visible branches cover: 8-bit input before the encoding is
 * established (delegated to h_conv), UTF-16 / UTF-32 code unit assembly,
 * X0201 kana, MIME encoded words introduced by =?, SI / SO, ESC sequences
 * that switch input_mode, and LF / CR handling.
 * The local macros NEXT, SEND and LAST expand to continue, an empty
 * statement and break.
 * After the loop (*iconv)(EOF, 0, 0) is called; if input_codename is still
 * unset, the input_code entry with the lowest score is chosen and its name
 * recorded with set_input_codename.
 * NOTE(review): many lines of this function are missing from this excerpt,
 * so the summary above covers only the visible statements.
 */
2476 nkf_char kanji_convert(FILE *f)
2478 nkf_char c3, c2=0, c1, c0=0;
2479 int is_8bit = FALSE;
2481 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2482 #ifdef UTF8_INPUT_ENABLE
2483 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2490 output_mode = ASCII;
2493 #define NEXT continue /* no output, get next */
2494 #define SEND ; /* output c1 and c2, get next */
2495 #define LAST break /* end of loop, go closing */
2497 module_connection();
2500 while ((c1 = (*i_getc)(f)) != EOF) {
2501 #ifdef INPUT_CODE_FIX
2507 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2508 /* in case of 8th bit is on */
2509 if (!estab_f&&!mime_decode_mode) {
2510 /* in case of not established yet */
2511 /* It is still ambiguous */
2512 if (h_conv(f, c2, c1)==EOF)
2518 /* in case of already established */
2520 /* ignore bogus code and not CP5022x UCD */
2528 /* second byte, 7 bit code */
2529 /* it might be kanji shifted */
2530 if ((c1 == DEL) || (c1 <= SP)) {
2531 /* ignore bogus first code */
2538 #ifdef UTF8_INPUT_ENABLE
2539 if (iconv == w_iconv16) {
2540 if (input_endian == ENDIAN_BIG) {
2542 if ((c1 = (*i_getc)(f)) != EOF) {
2543 if (0xD8 <= c2 && c2 <= 0xDB) {
2544 if ((c0 = (*i_getc)(f)) != EOF) {
2546 if ((c3 = (*i_getc)(f)) != EOF) {
2553 if ((c2 = (*i_getc)(f)) != EOF) {
2554 if (0xD8 <= c2 && c2 <= 0xDB) {
2555 if ((c3 = (*i_getc)(f)) != EOF) {
2556 if ((c0 = (*i_getc)(f)) != EOF) {
2565 } else if(iconv == w_iconv32){
2567 if((c2 = (*i_getc)(f)) != EOF &&
2568 (c1 = (*i_getc)(f)) != EOF &&
2569 (c0 = (*i_getc)(f)) != EOF){
2570 switch(input_endian){
2572 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2575 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2578 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2581 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2591 #ifdef NUMCHAR_OPTION
2592 if (is_unicode_capsule(c1)){
2596 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2598 if (!estab_f && !iso8859_f) {
2599 /* not established yet */
2602 } else { /* estab_f==TRUE */
2607 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2608 /* SJIS X0201 Case... */
2609 if(iso2022jp_f && x0201_f==NO_X0201) {
2610 (*oconv)(GETA1, GETA2);
2617 } else if (c1==SSO && iconv != s_iconv) {
2618 /* EUC X0201 Case */
2619 c1 = (*i_getc)(f); /* skip SSO */
2621 if (SSP<=c1 && c1<0xe0) {
2622 if(iso2022jp_f && x0201_f==NO_X0201) {
2623 (*oconv)(GETA1, GETA2);
2630 } else { /* bogus code, skip SSO and one byte */
2633 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2634 (c1 == 0xFD || c1 == 0xFE)) {
2640 /* already established */
2645 } else if ((c1 > SP) && (c1 != DEL)) {
2646 /* in case of Roman characters */
2648 /* output 1 shifted byte */
2652 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2653 /* output 1 shifted byte */
2654 if(iso2022jp_f && x0201_f==NO_X0201) {
2655 (*oconv)(GETA1, GETA2);
2662 /* look like bogus code */
2665 } else if (input_mode == X0208 || input_mode == X0212 ||
2666 input_mode == X0213_1 || input_mode == X0213_2) {
2667 /* in case of Kanji shifted */
2670 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2671 /* Check MIME code */
2672 if ((c1 = (*i_getc)(f)) == EOF) {
2675 } else if (c1 == '?') {
2676 /* =? is mime conversion start sequence */
2677 if(mime_f == STRICT_MIME) {
2678 /* check in real detail */
2679 if (mime_begin_strict(f) == EOF)
2683 } else if (mime_begin(f) == EOF)
2693 /* normal ASCII code */
2696 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2699 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2702 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2703 if ((c1 = (*i_getc)(f)) == EOF) {
2704 /* (*oconv)(0, ESC); don't send bogus code */
2706 } else if (c1 == '$') {
2707 if ((c1 = (*i_getc)(f)) == EOF) {
2709 (*oconv)(0, ESC); don't send bogus code
2710 (*oconv)(0, '$'); */
2712 } else if (c1 == '@'|| c1 == 'B') {
2713 /* This is kanji introduction */
2716 set_input_codename("ISO-2022-JP");
2718 debug("ISO-2022-JP");
2721 } else if (c1 == '(') {
2722 if ((c1 = (*i_getc)(f)) == EOF) {
2723 /* don't send bogus code
2729 } else if (c1 == '@'|| c1 == 'B') {
2730 /* This is kanji introduction */
2735 } else if (c1 == 'D'){
2739 #endif /* X0212_ENABLE */
2740 } else if (c1 == (X0213_1&0x7F)){
2741 input_mode = X0213_1;
2744 } else if (c1 == (X0213_2&0x7F)){
2745 input_mode = X0213_2;
2749 /* could be some special code */
2756 } else if (broken_f&0x2) {
2757 /* accept any ESC-(-x as broken code ... */
2767 } else if (c1 == '(') {
2768 if ((c1 = (*i_getc)(f)) == EOF) {
2769 /* don't send bogus code
2771 (*oconv)(0, '('); */
2775 /* This is X0201 kana introduction */
2776 input_mode = X0201; shift_mode = X0201;
2778 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2779 /* This is X0208 kanji introduction */
2780 input_mode = ASCII; shift_mode = FALSE;
2782 } else if (broken_f&0x2) {
2783 input_mode = ASCII; shift_mode = FALSE;
2788 /* maintain various input_mode here */
2792 } else if ( c1 == 'N' || c1 == 'n'){
2794 c3 = (*i_getc)(f); /* skip SS2 */
2795 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2810 } else if (c1 == ESC && iconv == s_iconv) {
2811 /* ESC in Shift_JIS */
2812 if ((c1 = (*i_getc)(f)) == EOF) {
2813 /* (*oconv)(0, ESC); don't send bogus code */
2815 } else if (c1 == '$') {
2817 if ((c1 = (*i_getc)(f)) == EOF) {
2819 (*oconv)(0, ESC); don't send bogus code
2820 (*oconv)(0, '$'); */
2823 if (('E' <= c1 && c1 <= 'G') ||
2824 ('O' <= c1 && c1 <= 'Q')) {
2832 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2833 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2834 while ((c1 = (*i_getc)(f)) != EOF) {
2835 if (SP <= c1 && c1 <= 'z') {
2836 (*oconv)(0, c1 + c0);
2837 } else break; /* c1 == SO */
2841 if (c1 == EOF) LAST;
2848 } else if (c1 == LF || c1 == CR) {
2850 input_mode = ASCII; set_iconv(FALSE, 0);
2852 } else if (mime_decode_f && !mime_decode_mode){
2854 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2862 } else { /* if (c1 == CR)*/
2863 if ((c1=(*i_getc)(f))!=EOF) {
2867 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2881 } else if (c1 == DEL && input_mode == X0208) {
2891 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2894 if ((c0 = (*i_getc)(f)) != EOF) {
2897 if ((c3 = (*i_getc)(f)) != EOF) {
2899 (*iconv)(c2, c1, c0|c3);
2904 /* 3 bytes EUC or UTF-8 */
2905 if ((c0 = (*i_getc)(f)) != EOF) {
2907 (*iconv)(c2, c1, c0);
2915 0x7F <= c2 && c2 <= 0x92 &&
2916 0x21 <= c1 && c1 <= 0x7E) {
2918 if(c1 == 0x7F) return 0;
2919 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2922 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2926 (*oconv)(PREFIX_EUCG3 | c2, c1);
2928 #endif /* X0212_ENABLE */
2930 (*oconv)(PREFIX_EUCG3 | c2, c1);
2933 (*oconv)(input_mode, c1); /* other special case */
2939 /* goto next_word */
2943 (*iconv)(EOF, 0, 0);
2944 if (!input_codename)
2947 struct input_code *p = input_code_list;
2948 struct input_code *result = p;
2950 if (p->score < result->score) result = p;
2953 set_input_codename(result->name);
2955 debug(result->name);
/*
 * Hold-buffer conversion, called from kanji_convert while the input
 * encoding is not yet established.
 *
 * Further bytes are read from f and stored with push_hold_buf until EOF,
 * until the buffer is full, or until estab_f becomes set. The visible code
 * then picks the input_code entry with the lowest score among those having
 * a status_func and installs it with set_iconv(TRUE, ...). Finally the held
 * bytes are replayed through (*iconv); when iconv needs more bytes than the
 * buffer holds, they are read from f.
 * NOTE(review): the return type line, the return statements and many
 * interior lines are missing from this excerpt.
 */
2963 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2965 nkf_char ret, c3, c0;
2969 /** it must NOT be in the kanji shift sequence */
2970 /** it must NOT be written in JIS7 */
2971 /** and it must be after 2 byte 8bit code */
2977 while ((c1 = (*i_getc)(f)) != EOF) {
2983 if (push_hold_buf(c1) == EOF || estab_f){
2989 struct input_code *p = input_code_list;
2990 struct input_code *result = p;
2995 if (p->status_func && p->score < result->score){
3000 set_iconv(TRUE, result->iconv_func);
3005 ** 1) EOF is detected, or
3006 ** 2) Code is established, or
3007 ** 3) Buffer is FULL (but last word is pushed)
3009 ** in 1) and 3) cases, we continue to use
3010 ** Kanji codes by oconv and leave estab_f unchanged.
3015 while (hold_index < hold_count){
3016 c2 = hold_buf[hold_index++];
3018 #ifdef NUMCHAR_OPTION
3019 || is_unicode_capsule(c2)
3024 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3025 (*iconv)(X0201, c2, 0);
3028 if (hold_index < hold_count){
3029 c1 = hold_buf[hold_index++];
3039 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3042 if (hold_index < hold_count){
3043 c0 = hold_buf[hold_index++];
3044 } else if ((c0 = (*i_getc)(f)) == EOF) {
3050 if (hold_index < hold_count){
3051 c3 = hold_buf[hold_index++];
3052 } else if ((c3 = (*i_getc)(f)) == EOF) {
3057 (*iconv)(c2, c1, c0|c3);
3062 /* 3 bytes EUC or UTF-8 */
3063 if (hold_index < hold_count){
3064 c0 = hold_buf[hold_index++];
3065 } else if ((c0 = (*i_getc)(f)) == EOF) {
3071 (*iconv)(c2, c1, c0);
3074 if (c0 == EOF) break;
/*
 * Append one byte to hold_buf.
 * The byte is stored as an unsigned char at hold_buf[hold_count] and
 * hold_count is advanced. Returns EOF once hold_count has reached
 * HOLD_SIZE*2, otherwise the new hold_count.
 * NOTE(review): the statement under the initial capacity check is not
 * visible in this excerpt.
 */
3079 nkf_char push_hold_buf(nkf_char c2)
3081 if (hold_count >= HOLD_SIZE*2)
3083 hold_buf[hold_count++] = (unsigned char)c2;
3084 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2, c1); the caller passes output pointers
 * p2 and p1 for the converted pair.
 *
 * Visible steps:
 *   - SHIFTJIS_CP932: IBM extension lead bytes are looked up in
 *     shiftjis_cp932 unless cp932inv_f is set; a second lookup uses cp932inv
 *     for lead bytes within CP932INV_TABLE_BEGIN..CP932INV_TABLE_END.
 *   - Without x0213_f, IBM extension lead bytes are looked up in
 *     shiftjis_x0212 and the result is tagged with PREFIX_EUCG3.
 *   - With x0213_f and c2 >= 0xF0, the row is derived from
 *     shift_jisx0213_s1a3_table or from c2 * 2 - 0x17B, tagged with
 *     PREFIX_EUCG3.
 *   - Otherwise c2 = c2 + c2 - (SJ0162 or SJ6394), incremented when
 *     c1 > 0x9E, and c1 is reduced by SP or 0x1F.
 * NOTE(review): the stores through p2 / p1 and the return values are not
 * visible in this excerpt.
 */
3087 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3089 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3092 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3093 #ifdef SHIFTJIS_CP932
3094 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3095 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3102 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3103 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3109 #endif /* SHIFTJIS_CP932 */
3111 if (!x0213_f && is_ibmext_in_sjis(c2)){
3112 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3115 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3128 if(x0213_f && c2 >= 0xF0){
3129 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3130 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3131 }else{ /* 78<=k<=94 */
3132 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3133 if (0x9E < c1) c2++;
3136 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3137 if (0x9E < c1) c2++;
3140 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3147 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.
 *
 * Visible branches:
 *   - c2 is EOF, 0 or below SP: handled separately (body not visible).
 *   - Without x0213_f, lead bytes 0xF0..0xF9 with trail bytes 0x40..0xFC are
 *     mapped into the Unicode private use area starting at 0xE000 and tagged
 *     with CLASS_UNICODE; a trail byte of 0x7F returns 0.
 *   - Otherwise the pair is converted with s2e_conv and a non-zero result is
 *     returned to the caller.
 * NOTE(review): the first branch and the final output call are missing from
 * this excerpt.
 */
3154 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3158 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3160 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3162 if(c1 == 0x7F) return 0;
3163 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3166 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3167 if (ret) return ret;
/*
 * Input converter for EUC.
 *
 * Visible branches:
 *   - c2 == 0x8f: three-byte form. Without cp51932_f and x0213_f, c1 in
 *     0xF5..0xFE with c0 in 0xA1..0xFE is mapped into the Unicode private
 *     use area at offset 0xE3AC and tagged with CLASS_UNICODE; otherwise c2
 *     becomes (c2 << 8) | (c1 & 0x7f). In SHIFTJIS_CP932 builds the pair may
 *     be round-tripped through e2s_conv and s2e_conv.
 *   - c2 == SSO: handled separately (body not visible).
 *   - c2 is EOF, 0 or below SP: handled separately (body not visible).
 *   - Otherwise, without cp51932_f and with ms_ucs_map_f set, c2 in
 *     0xF5..0xFE with c1 in 0xA1..0xFE is mapped into the private use area
 *     at offset 0xE000. With cp51932_f and c2 in 0x79..0x7c the pair is
 *     round-tripped through e2s_conv and s2e_conv.
 * NOTE(review): the first branch, the output calls and the return value are
 * missing from this excerpt.
 */
3173 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3178 }else if (c2 == 0x8f){
3182 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3183 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3184 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3187 c2 = (c2 << 8) | (c1 & 0x7f);
3189 #ifdef SHIFTJIS_CP932
3192 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3193 s2e_conv(s2, s1, &c2, &c1);
3200 #endif /* SHIFTJIS_CP932 */
3202 #endif /* X0212_ENABLE */
3203 } else if (c2 == SSO){
3206 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3209 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3210 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3211 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3216 #ifdef SHIFTJIS_CP932
3217 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3219 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3220 s2e_conv(s2, s1, &c2, &c1);
3227 #endif /* SHIFTJIS_CP932 */
3234 #ifdef UTF8_INPUT_ENABLE
3235 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3242 }else if (0xc0 <= c2 && c2 <= 0xef) {
3243 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3244 #ifdef NUMCHAR_OPTION
3247 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3255 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3258 static const char w_iconv_utf8_1st_byte[] =
3260 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3261 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3262 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3263 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3265 if (c2 < 0 || 0xff < c2) {
3266 }else if (c2 == 0) { /* 0 : 1 byte*/
3268 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3271 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3273 if (c1 < 0x80 || 0xBF < c1) return 0;
3276 if (c0 == 0) return -1;
3277 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3282 if (c0 == 0) return -1;
3283 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3287 if (c0 == 0) return -1;
3288 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3292 if (c0 == 0) return -2;
3293 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3297 if (c0 == 0) return -2;
3298 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3302 if (c0 == 0) return -2;
3303 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3311 if (c2 == 0 || c2 == EOF){
3312 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3313 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3316 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3325 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3326 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3333 }else if (val < 0x800){
3334 *p2 = 0xc0 | (val >> 6);
3335 *p1 = 0x80 | (val & 0x3f);
3337 } else if (val <= NKF_INT32_C(0xFFFF)) {
3338 *p2 = 0xe0 | (val >> 12);
3339 *p1 = 0x80 | ((val >> 6) & 0x3f);
3340 *p0 = 0x80 | (val & 0x3f);
3341 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3342 *p2 = 0xe0 | (val >> 16);
3343 *p1 = 0x80 | ((val >> 12) & 0x3f);
3344 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3353 #ifdef UTF8_INPUT_ENABLE
3354 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3359 } else if (c2 >= 0xf0){
3360 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3361 val = (c2 & 0x0f) << 18;
3362 val |= (c1 & 0x3f) << 12;
3363 val |= (c0 & 0x3f00) >> 2;
3365 }else if (c2 >= 0xe0){
3366 val = (c2 & 0x0f) << 12;
3367 val |= (c1 & 0x3f) << 6;
3369 }else if (c2 >= 0xc0){
3370 val = (c2 & 0x1f) << 6;
3378 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3380 nkf_char c2, c1, c0;
3387 w16w_conv(val, &c2, &c1, &c0);
3388 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3389 #ifdef NUMCHAR_OPTION
3392 *p1 = CLASS_UNICODE | val;
3401 #ifdef UTF8_INPUT_ENABLE
3402 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3405 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3408 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3409 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3411 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3413 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3418 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3419 if (ret) return ret;
3424 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3428 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3429 } else if (is_unicode_bmp(c1)) {
3430 ret = w16e_conv(c1, &c2, &c1);
3433 c1 = CLASS_UNICODE | c1;
3435 if (ret) return ret;
3440 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3442 const unsigned short *const *pp;
3443 const unsigned short *const *const *ppp;
3444 static const char no_best_fit_chars_table_C2[] =
3445 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3446 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3447 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3448 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3449 static const char no_best_fit_chars_table_C2_ms[] =
3450 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3451 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3452 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3453 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3454 static const char no_best_fit_chars_table_932_C2[] =
3455 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3456 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3457 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3458 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3459 static const char no_best_fit_chars_table_932_C3[] =
3460 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3461 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3462 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3463 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3469 }else if(c2 < 0xe0){
3470 if(no_best_fit_chars_f){
3471 if(ms_ucs_map_f == UCS_MAP_CP932){
3474 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3477 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3480 }else if(!cp932inv_f){
3483 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3486 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3489 }else if(ms_ucs_map_f == UCS_MAP_MS){
3490 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3491 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3509 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3510 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3511 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3513 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3514 }else if(c0 < 0xF0){
3515 if(no_best_fit_chars_f){
3516 if(ms_ucs_map_f == UCS_MAP_CP932){
3517 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3518 }else if(ms_ucs_map_f == UCS_MAP_MS){
3523 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3526 if(c0 == 0x92) return 1;
3531 if(c1 == 0x80 || c0 == 0x9C) return 1;
3534 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3539 if(c0 == 0x94) return 1;
3542 if(c0 == 0xBB) return 1;
3552 if(c0 == 0x95) return 1;
3555 if(c0 == 0xA5) return 1;
3562 if(c0 == 0x8D) return 1;
3565 if(c0 == 0x9E && !cp932inv_f) return 1;
3568 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3576 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3577 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3578 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3580 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3582 #ifdef SHIFTJIS_CP932
3583 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3585 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3586 s2e_conv(s2, s1, p2, p1);
3595 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3598 const unsigned short *p;
3601 if (pp == 0) return 1;
3604 if (c1 < 0 || psize <= c1) return 1;
3606 if (p == 0) return 1;
3609 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3611 if (val == 0) return 1;
3612 if (no_cp932ext_f && (
3613 (val>>8) == 0x2D || /* NEC special characters */
3614 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3622 if (c2 == SO) c2 = X0201;
3629 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3636 (*f)(0, bin2hex(c>>shift));
3646 void encode_fallback_html(nkf_char c)
3651 if(c >= NKF_INT32_C(1000000))
3652 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3653 if(c >= NKF_INT32_C(100000))
3654 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3656 (*oconv)(0, 0x30+(c/10000 )%10);
3658 (*oconv)(0, 0x30+(c/1000 )%10);
3660 (*oconv)(0, 0x30+(c/100 )%10);
3662 (*oconv)(0, 0x30+(c/10 )%10);
3664 (*oconv)(0, 0x30+ c %10);
3669 void encode_fallback_xml(nkf_char c)
3674 nkf_each_char_to_hex(oconv, c);
3679 void encode_fallback_java(nkf_char c)
3683 if(!is_unicode_bmp(c)){
3687 (*oconv)(0, bin2hex(c>>20));
3688 (*oconv)(0, bin2hex(c>>16));
3692 (*oconv)(0, bin2hex(c>>12));
3693 (*oconv)(0, bin2hex(c>> 8));
3694 (*oconv)(0, bin2hex(c>> 4));
3695 (*oconv)(0, bin2hex(c ));
3699 void encode_fallback_perl(nkf_char c)
3704 nkf_each_char_to_hex(oconv, c);
3709 void encode_fallback_subchar(nkf_char c)
3711 c = unicode_subchar;
3712 (*oconv)((c>>8)&0xFF, c&0xFF);
3717 #ifdef UTF8_OUTPUT_ENABLE
3718 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3720 const unsigned short *p;
3723 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3731 p = euc_to_utf8_1byte;
3733 } else if (is_eucg3(c2)){
3734 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3737 c2 = (c2&0x7f) - 0x21;
3738 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3739 p = x0212_to_utf8_2bytes[c2];
3745 c2 = (c2&0x7f) - 0x21;
3746 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3748 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3749 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3750 euc_to_utf8_2bytes_ms[c2];
3755 c1 = (c1 & 0x7f) - 0x21;
3756 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3761 void w_oconv(nkf_char c2, nkf_char c1)
3767 output_bom_f = FALSE;
3778 #ifdef NUMCHAR_OPTION
3779 if (c2 == 0 && is_unicode_capsule(c1)){
3780 val = c1 & VALUE_MASK;
3783 }else if (val < 0x800){
3784 (*o_putc)(0xC0 | (val >> 6));
3785 (*o_putc)(0x80 | (val & 0x3f));
3786 } else if (val <= NKF_INT32_C(0xFFFF)) {
3787 (*o_putc)(0xE0 | (val >> 12));
3788 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3789 (*o_putc)(0x80 | (val & 0x3f));
3790 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3791 (*o_putc)(0xF0 | ( val>>18));
3792 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3793 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3794 (*o_putc)(0x80 | ( val & 0x3f));
3801 output_mode = ASCII;
3803 } else if (c2 == ISO8859_1) {
3804 output_mode = ISO8859_1;
3805 (*o_putc)(c1 | 0x080);
3808 val = e2w_conv(c2, c1);
3810 w16w_conv(val, &c2, &c1, &c0);
3814 if (c0) (*o_putc)(c0);
3820 void w_oconv16(nkf_char c2, nkf_char c1)
3823 output_bom_f = FALSE;
3824 if (output_endian == ENDIAN_LITTLE){
3825 (*o_putc)((unsigned char)'\377');
3829 (*o_putc)((unsigned char)'\377');
3838 if (c2 == ISO8859_1) {
3841 #ifdef NUMCHAR_OPTION
3842 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3843 if (is_unicode_bmp(c1)) {
3844 c2 = (c1 >> 8) & 0xff;
3848 if (c1 <= UNICODE_MAX) {
3849 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3850 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3851 if (output_endian == ENDIAN_LITTLE){
3852 (*o_putc)(c2 & 0xff);
3853 (*o_putc)((c2 >> 8) & 0xff);
3854 (*o_putc)(c1 & 0xff);
3855 (*o_putc)((c1 >> 8) & 0xff);
3857 (*o_putc)((c2 >> 8) & 0xff);
3858 (*o_putc)(c2 & 0xff);
3859 (*o_putc)((c1 >> 8) & 0xff);
3860 (*o_putc)(c1 & 0xff);
3867 nkf_char val = e2w_conv(c2, c1);
3868 c2 = (val >> 8) & 0xff;
3872 if (output_endian == ENDIAN_LITTLE){
3881 void w_oconv32(nkf_char c2, nkf_char c1)
3884 output_bom_f = FALSE;
3885 if (output_endian == ENDIAN_LITTLE){
3886 (*o_putc)((unsigned char)'\377');
3894 (*o_putc)((unsigned char)'\377');
3903 if (c2 == ISO8859_1) {
3905 #ifdef NUMCHAR_OPTION
3906 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3910 c1 = e2w_conv(c2, c1);
3913 if (output_endian == ENDIAN_LITTLE){
3914 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3915 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3916 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3920 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3921 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3922 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3927 void e_oconv(nkf_char c2, nkf_char c1)
3929 #ifdef NUMCHAR_OPTION
3930 if (c2 == 0 && is_unicode_capsule(c1)){
3931 w16e_conv(c1, &c2, &c1);
3932 if (c2 == 0 && is_unicode_capsule(c1)){
3933 c2 = c1 & VALUE_MASK;
3934 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3938 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3939 c1 = 0x21 + c1 % 94;
3942 (*o_putc)((c2 & 0x7f) | 0x080);
3943 (*o_putc)(c1 | 0x080);
3945 (*o_putc)((c2 & 0x7f) | 0x080);
3946 (*o_putc)(c1 | 0x080);
3950 if (encode_fallback) (*encode_fallback)(c1);
3959 } else if (c2 == 0) {
3960 output_mode = ASCII;
3962 } else if (c2 == X0201) {
3963 output_mode = JAPANESE_EUC;
3964 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3965 } else if (c2 == ISO8859_1) {
3966 output_mode = ISO8859_1;
3967 (*o_putc)(c1 | 0x080);
3969 } else if (is_eucg3(c2)){
3970 output_mode = JAPANESE_EUC;
3971 #ifdef SHIFTJIS_CP932
3974 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3975 s2e_conv(s2, s1, &c2, &c1);
3980 output_mode = ASCII;
3982 }else if (is_eucg3(c2)){
3985 (*o_putc)((c2 & 0x7f) | 0x080);
3986 (*o_putc)(c1 | 0x080);
3989 (*o_putc)((c2 & 0x7f) | 0x080);
3990 (*o_putc)(c1 | 0x080);
3994 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3995 set_iconv(FALSE, 0);
3996 return; /* too late to rescue this char */
3998 output_mode = JAPANESE_EUC;
3999 (*o_putc)(c2 | 0x080);
4000 (*o_putc)(c1 | 0x080);
4005 nkf_char x0212_shift(nkf_char c)
4010 if (0x75 <= c && c <= 0x7f){
4011 ret = c + (0x109 - 0x75);
4014 if (0x75 <= c && c <= 0x7f){
4015 ret = c + (0x113 - 0x75);
4022 nkf_char x0212_unshift(nkf_char c)
4025 if (0x7f <= c && c <= 0x88){
4026 ret = c + (0x75 - 0x7f);
4027 }else if (0x89 <= c && c <= 0x92){
4028 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4032 #endif /* X0212_ENABLE */
4034 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4040 if((0x21 <= ndx && ndx <= 0x2F)){
4041 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4042 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4044 }else if(0x6E <= ndx && ndx <= 0x7E){
4045 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4046 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4052 else if(nkf_isgraph(ndx)){
4054 const unsigned short *ptr;
4055 ptr = x0212_shiftjis[ndx - 0x21];
4057 val = ptr[(c1 & 0x7f) - 0x21];
4066 c2 = x0212_shift(c2);
4068 #endif /* X0212_ENABLE */
4070 if(0x7F < c2) return 1;
4071 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4072 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4076 void s_oconv(nkf_char c2, nkf_char c1)
4078 #ifdef NUMCHAR_OPTION
4079 if (c2 == 0 && is_unicode_capsule(c1)){
4080 w16e_conv(c1, &c2, &c1);
4081 if (c2 == 0 && is_unicode_capsule(c1)){
4082 c2 = c1 & VALUE_MASK;
4083 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4086 c2 = c1 / 188 + 0xF0;
4088 c1 += 0x40 + (c1 > 0x3e);
4093 if(encode_fallback)(*encode_fallback)(c1);
4102 } else if (c2 == 0) {
4103 output_mode = ASCII;
4105 } else if (c2 == X0201) {
4106 output_mode = SHIFT_JIS;
4108 } else if (c2 == ISO8859_1) {
4109 output_mode = ISO8859_1;
4110 (*o_putc)(c1 | 0x080);
4112 } else if (is_eucg3(c2)){
4113 output_mode = SHIFT_JIS;
4114 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4120 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4121 set_iconv(FALSE, 0);
4122 return; /* too late to rescue this char */
4124 output_mode = SHIFT_JIS;
4125 e2s_conv(c2, c1, &c2, &c1);
4127 #ifdef SHIFTJIS_CP932
4129 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4130 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4136 #endif /* SHIFTJIS_CP932 */
4139 if (prefix_table[(unsigned char)c1]){
4140 (*o_putc)(prefix_table[(unsigned char)c1]);
4146 void j_oconv(nkf_char c2, nkf_char c1)
4148 #ifdef NUMCHAR_OPTION
4149 if (c2 == 0 && is_unicode_capsule(c1)){
4150 w16e_conv(c1, &c2, &c1);
4151 if (c2 == 0 && is_unicode_capsule(c1)){
4152 c2 = c1 & VALUE_MASK;
4153 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4156 c2 = 0x7F + c1 / 94;
4157 c1 = 0x21 + c1 % 94;
4159 if (encode_fallback) (*encode_fallback)(c1);
4166 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4169 (*o_putc)(ascii_intro);
4170 output_mode = ASCII;
4174 } else if (is_eucg3(c2)){
4176 if(output_mode!=X0213_2){
4177 output_mode = X0213_2;
4181 (*o_putc)(X0213_2&0x7F);
4184 if(output_mode!=X0212){
4185 output_mode = X0212;
4189 (*o_putc)(X0212&0x7F);
4192 (*o_putc)(c2 & 0x7f);
4195 } else if (c2==X0201) {
4196 if (output_mode!=X0201) {
4197 output_mode = X0201;
4203 } else if (c2==ISO8859_1) {
4204 /* iso8859 introduction, or 8th bit on */
4205 /* Can we convert in 7bit form using ESC-'-'-A ?
4207 output_mode = ISO8859_1;
4209 } else if (c2 == 0) {
4210 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4213 (*o_putc)(ascii_intro);
4214 output_mode = ASCII;
4219 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4220 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4222 if (output_mode!=X0213_1) {
4223 output_mode = X0213_1;
4227 (*o_putc)(X0213_1&0x7F);
4229 }else if (output_mode != X0208) {
4230 output_mode = X0208;
4233 (*o_putc)(kanji_intro);
4240 void base64_conv(nkf_char c2, nkf_char c1)
4242 mime_prechar(c2, c1);
4243 (*o_base64conv)(c2,c1);
4247 static nkf_char broken_buf[3]; /* pushback storage shared by broken_getc() and broken_ungetc() */
4248 static int broken_counter = 0; /* number of entries currently held in broken_buf; broken_getc() drains them last-in first-out */
4249 static int broken_last = 0; /* compared against ESC in broken_getc(); presumably the previously returned character -- TODO confirm */
4250 nkf_char broken_getc(FILE *f)
4254 if (broken_counter>0) {
4255 return broken_buf[--broken_counter];
4258 if (c=='$' && broken_last != ESC
4259 && (input_mode==ASCII || input_mode==X0201)) {
4262 if (c1=='@'|| c1=='B') {
4263 broken_buf[0]=c1; broken_buf[1]=c;
4270 } else if (c=='(' && broken_last != ESC
4271 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4274 if (c1=='J'|| c1=='B') {
4275 broken_buf[0]=c1; broken_buf[1]=c;
4288 nkf_char broken_ungetc(nkf_char c, FILE *f)
4290 if (broken_counter<2)
4291 broken_buf[broken_counter++]=c;
4295 void nl_conv(nkf_char c2, nkf_char c1)
4297 if (guess_f && input_nextline != EOF) {
4298 if (c2 == 0 && c1 == LF) {
4299 if (!input_nextline) input_nextline = prev_cr ? CRLF : LF;
4300 else if (input_nextline != (prev_cr ? CRLF : LF)) input_nextline = EOF;
4301 } else if (c2 == 0 && c1 == CR && input_nextline == LF) input_nextline = EOF;
4303 else if (!input_nextline) input_nextline = CR;
4304 else if (input_nextline != CR) input_nextline = EOF;
4306 if (prev_cr || c2 == 0 && c1 == LF) {
4308 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4309 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4311 if (c2 == 0 && c1 == CR) prev_cr = CR;
4312 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4316 Return value of fold_conv()
4318 LF add newline and output char
4319 CR add newline and output nothing
4322 1 (or else) normal output
4324 fold state in prev (previous character)
4326 >0x80 Japanese (X0208/X0201)
4331 This fold algorithm does not preserve leading space in a line.
4332 This is the main difference from fmt.
4335 #define char_size(c2,c1) (c2?2:1) /* width added to f_line per character in fold_conv(): 2 when c2 is non-zero, otherwise 1; c1 is not examined */
4337 void fold_conv(nkf_char c2, nkf_char c1)
4340 nkf_char fold_state;
4342 if (c1== CR && !fold_preserve_f) {
4343 fold_state=0; /* ignore cr */
4344 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4346 fold_state=0; /* ignore cr */
4347 } else if (c1== BS) {
4348 if (f_line>0) f_line--;
4350 } else if (c2==EOF && f_line != 0) { /* close open last line */
4352 } else if ((c1==LF && !fold_preserve_f)
4353 || ((c1==CR||(c1==LF&&f_prev!=CR))
4354 && fold_preserve_f)) {
4356 if (fold_preserve_f) {
4360 } else if ((f_prev == c1 && !fold_preserve_f)
4361 || (f_prev == LF && fold_preserve_f)
4362 ) { /* duplicate newline */
4365 fold_state = LF; /* output two newline */
4371 if (f_prev&0x80) { /* Japanese? */
4373 fold_state = 0; /* ignore given single newline */
4374 } else if (f_prev==SP) {
4378 if (++f_line<=fold_len)
4382 fold_state = CR; /* fold and output nothing */
4386 } else if (c1=='\f') {
4389 fold_state = LF; /* output newline and clear */
4390 } else if ( (c2==0 && c1==SP)||
4391 (c2==0 && c1==TAB)||
4392 (c2=='!'&& c1=='!')) {
4393 /* X0208 kankaku or ascii space */
4395 fold_state = 0; /* remove duplicate spaces */
4398 if (++f_line<=fold_len)
4399 fold_state = SP; /* output ASCII space only */
4401 f_prev = SP; f_line = 0;
4402 fold_state = CR; /* fold and output nothing */
4406 prev0 = f_prev; /* we still need this one... , but almost done */
4408 if (c2 || c2==X0201)
4409 f_prev |= 0x80; /* this is Japanese */
4410 f_line += char_size(c2,c1);
4411 if (f_line<=fold_len) { /* normal case */
4414 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4415 f_line = char_size(c2,c1);
4416 fold_state = LF; /* We can't wait, do fold now */
4417 } else if (c2==X0201) {
4418 /* simple kinsoku rules return 1 means no folding */
4419 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4420 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4421 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4422 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4423 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4424 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4425 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4427 fold_state = LF;/* add one new f_line before this character */
4430 fold_state = LF;/* add one new f_line before this character */
4433 /* kinsoku point in ASCII */
4434 if ( c1==')'|| /* { [ ( */
4445 /* just after special */
4446 } else if (!is_alnum(prev0)) {
4447 f_line = char_size(c2,c1);
4449 } else if ((prev0==SP) || /* ignored new f_line */
4450 (prev0==LF)|| /* ignored new f_line */
4451 (prev0&0x80)) { /* X0208 - ASCII */
4452 f_line = char_size(c2,c1);
4453 fold_state = LF;/* add one new f_line before this character */
4455 fold_state = 1; /* default no fold in ASCII */
4459 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4460 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4461 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4462 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4463 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4464 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4465 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4466 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4467 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4468 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4469 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4470 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4471 /* default no fold in kinsoku */
4474 f_line = char_size(c2,c1);
4475 /* add one new f_line before this character */
4478 f_line = char_size(c2,c1);
4480 /* add one new f_line before this character */
4485 /* terminator process */
4486 switch(fold_state) {
4505 nkf_char z_prev2=0,z_prev1=0;
4507 void z_conv(nkf_char c2, nkf_char c1)
4510 /* if (c2) c1 &= 0x7f; assertion */
4512 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4518 if (z_prev2 == X0201) {
4520 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4522 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4524 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4526 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4531 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4534 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4535 /* wait for dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
4540 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4551 if (alpha_f&1 && c2 == 0x23) {
4552 /* JISX0208 Alphabet */
4554 } else if (c2 == 0x21) {
4555 /* JISX0208 Kigou */
4560 } else if (alpha_f&4) {
4565 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4571 if (alpha_f&8 && c2 == 0) {
4575 case '>': entity = ">"; break;
4576 case '<': entity = "<"; break;
4577 case '\"': entity = """; break;
4578 case '&': entity = "&"; break;
4581 while (*entity) (*o_zconv)(0, *entity++);
4587 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4592 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4596 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4600 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4604 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4608 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4612 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4616 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4620 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4625 (*o_zconv)(X0201, c);
4628 } else if (c2 == 0x25) {
4629 /* JISX0208 Katakana */
4630 static const int fullwidth_to_halfwidth[] =
4632 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4633 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4634 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4635 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4636 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4637 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4638 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4639 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4640 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4641 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4642 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4643 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4645 if (fullwidth_to_halfwidth[c1-0x20]){
4646 c2 = fullwidth_to_halfwidth[c1-0x20];
4647 (*o_zconv)(X0201, c2>>8);
4649 (*o_zconv)(X0201, c2&0xFF);
4659 #define rot13(c) ( \
4661 (c <= 'M') ? (c + 13): \
4662 (c <= 'Z') ? (c - 13): \
4664 (c <= 'm') ? (c + 13): \
4665 (c <= 'z') ? (c - 13): \
4669 #define rot47(c) ( \
4671 ( c <= 'O') ? (c + 47) : \
4672 ( c <= '~') ? (c - 47) : \
4676 void rot_conv(nkf_char c2, nkf_char c1)
4678 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4684 (*o_rot_conv)(c2,c1);
4687 void hira_conv(nkf_char c2, nkf_char c1)
4691 if (0x20 < c1 && c1 < 0x74) {
4693 (*o_hira_conv)(c2,c1);
4695 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4697 c1 = CLASS_UNICODE | 0x3094;
4698 (*o_hira_conv)(c2,c1);
4701 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4703 (*o_hira_conv)(c2,c1);
4708 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4711 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4713 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4717 (*o_hira_conv)(c2,c1);
4721 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4723 static const nkf_char range[RANGE_NUM_MAX][2] = {
4744 nkf_char start, end, c;
4746 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4750 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4755 for (i = 0; i < RANGE_NUM_MAX; i++) {
4756 start = range[i][0];
4759 if (c >= start && c <= end) {
4764 (*o_iso2022jp_check_conv)(c2,c1);
4768 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4770 static const unsigned char *mime_pattern[] = {
4771 (const unsigned char *)"\075?EUC-JP?B?",
4772 (const unsigned char *)"\075?SHIFT_JIS?B?",
4773 (const unsigned char *)"\075?ISO-8859-1?Q?",
4774 (const unsigned char *)"\075?ISO-8859-1?B?",
4775 (const unsigned char *)"\075?ISO-2022-JP?B?",
4776 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4777 #if defined(UTF8_INPUT_ENABLE)
4778 (const unsigned char *)"\075?UTF-8?B?",
4779 (const unsigned char *)"\075?UTF-8?Q?",
4781 (const unsigned char *)"\075?US-ASCII?Q?",
4786 /* Marker used to raise the priority of the corresponding input code */
4787 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4788 e_iconv, s_iconv, 0, 0, 0, 0,
4789 #if defined(UTF8_INPUT_ENABLE)
4795 static const nkf_char mime_encode[] = {
4796 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4797 #if defined(UTF8_INPUT_ENABLE)
4804 static const nkf_char mime_encode_method[] = {
4805 'B', 'B','Q', 'B', 'B', 'Q',
4806 #if defined(UTF8_INPUT_ENABLE)
4814 #define MAXRECOVER 20 /* size of the recovery buffer in mime_begin_strict() and the scan limit of the preamble loop in mime_begin() */
4816 void switch_mime_getc(void)
4818 if (i_getc!=mime_getc) {
4819 i_mgetc = i_getc; i_getc = mime_getc;
4820 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4821 if(mime_f==STRICT_MIME) {
4822 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4823 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4828 void unswitch_mime_getc(void)
4830 if(mime_f==STRICT_MIME) {
4831 i_mgetc = i_mgetc_buf;
4832 i_mungetc = i_mungetc_buf;
4835 i_ungetc = i_mungetc;
4836 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4837 mime_iconv_back = NULL;
4840 nkf_char mime_begin_strict(FILE *f)
4844 const unsigned char *p,*q;
4845 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4847 mime_decode_mode = FALSE;
4848 /* =? has been checked */
4850 p = mime_pattern[j];
4853 for(i=2;p[i]>SP;i++) { /* start at =? */
4854 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4855 /* pattern fails, try next one */
4857 while (mime_pattern[++j]) {
4858 p = mime_pattern[j];
4859 for(k=2;k<i;k++) /* assume length(p) > i */
4860 if (p[k]!=q[k]) break;
4861 if (k==i && nkf_toupper(c1)==p[k]) break;
4863 p = mime_pattern[j];
4864 if (p) continue; /* found next one, continue */
4865 /* all fails, output from recovery buffer */
4873 mime_decode_mode = p[i-2];
4875 mime_iconv_back = iconv;
4876 set_iconv(FALSE, mime_priority_func[j]);
4877 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4879 if (mime_decode_mode=='B') {
4880 mimebuf_f = unbuf_f;
4882 /* do MIME integrity check */
4883 return mime_integrity(f,mime_pattern[j]);
4891 nkf_char mime_getc_buf(FILE *f)
4893 /* we don't keep eof of Fifo, because it contains ?= as
4894 a terminator. It was checked in mime_integrity. */
4895 return ((mimebuf_f)?
4896 (*i_mgetc_buf)(f):Fifo(mime_input++));
4899 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4902 (*i_mungetc_buf)(c,f);
4904 Fifo(--mime_input) = (unsigned char)c;
4908 nkf_char mime_begin(FILE *f)
4913 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4914 /* re-read and convert again from mime_buffer. */
4916 /* =? has been checked */
4918 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4919 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4920 /* We accept any character type even if it is broken by new lines */
4921 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4922 if (c1==LF||c1==SP||c1==CR||
4923 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4925 /* Failed. But this could be another MIME preamble */
4933 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4934 if (!(++i<MAXRECOVER) || c1==EOF) break;
4935 if (c1=='b'||c1=='B') {
4936 mime_decode_mode = 'B';
4937 } else if (c1=='q'||c1=='Q') {
4938 mime_decode_mode = 'Q';
4942 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4943 if (!(++i<MAXRECOVER) || c1==EOF) break;
4945 mime_decode_mode = FALSE;
4951 if (!mime_decode_mode) {
4952 /* false MIME preamble, restart from mime_buffer */
4953 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4954 /* Since we are in MIME mode until buffer becomes empty, */
4955 /* we never go into mime_begin again for a while. */
4958 /* discard MIME preamble, and go to MIME mode */
4960 /* do no MIME integrity check */
4961 return c1; /* used only for checking EOF */
4965 void no_putc(nkf_char c)
4970 void debug(const char *str)
4973 fprintf(stderr, "%s\n", str ? str : "NULL");
4978 void set_input_codename(char *codename)
4980 if (!input_codename) {
4981 input_codename = codename;
4982 } else if (strcmp(codename, input_codename) != 0) {
4983 input_codename = "";
4987 #if !defined(PERL_XS) && !defined(WIN32DLL)
4988 void print_guessed_code(char *filename)
4990 char *codename = "BINARY";
4991 char *str_nlmode = NULL;
4992 if (filename != NULL) printf("%s: ", filename);
4993 if (input_codename && !*input_codename) {
4997 (input_codename ? input_codename : "ASCII"),
4998 input_nextline == CR ? " (CR)" :
4999 input_nextline == LF ? " (LF)" :
5000 input_nextline == CRLF ? " (CRLF)" :
5001 input_nextline == EOF ? " (MIXED NL)" :
/* Shared helper for the escaped-hex input filters.
 * ch: escape introducer (cap_getc passes a colon, url_getc a percent sign).
 * f:  input stream.
 * g:  function used to read a character from f.
 * u:  function used to push a character back onto f.
 * When c2 and c3 are both hex digits the result is the byte value
 * (hex2bin(c2) << 4) | hex2bin(c3).
 * The reads into c1, c2 and c3 and the handling of non-hex characters are
 * not visible in this excerpt. */
5009 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5011 nkf_char c1, c2, c3;
5017 if (!nkf_isxdigit(c2)){
5022 if (!nkf_isxdigit(c3)){
/* combine the two hex digits, high nibble first */
5027 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Input filter: delegates to hex_getc with a colon as the escape
 * introducer, reading through i_cgetc and pushing back through i_cungetc. */
5030 nkf_char cap_getc(FILE *f)
5032 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push-back counterpart of cap_getc: forwards c and f unchanged to the
 * i_cungetc hook and returns its result. */
5035 nkf_char cap_ungetc(nkf_char c, FILE *f)
5037 return (*i_cungetc)(c, f);
/* Input filter: delegates to hex_getc with a percent sign as the escape
 * introducer, reading through i_ugetc and pushing back through i_uungetc. */
5040 nkf_char url_getc(FILE *f)
5042 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push-back counterpart of url_getc: forwards c and f unchanged to the
 * i_uungetc hook and returns its result. */
5045 nkf_char url_ungetc(nkf_char c, FILE *f)
5047 return (*i_uungetc)(c, f);
5051 #ifdef NUMCHAR_OPTION
/* Input filter for numeric character references.
 * Reads through i_ngetc and pushes back through i_nungetc.
 * Visible behaviour: when the marker character in buf is x or X, up to 7
 * hex digits are folded into c with hex2bin; otherwise up to 8 decimal
 * digits are folded in, also via hex2bin. A decoded value is returned
 * tagged with CLASS_UNICODE.
 * The recognition of the reference prefix, the shift or multiply applied
 * to c between digits, and the failure paths are not visible in this
 * excerpt. */
5052 nkf_char numchar_getc(FILE *f)
5054 nkf_char (*g)(FILE *) = i_ngetc;
5055 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5066 if (buf[i] == 'x' || buf[i] == 'X'){
/* hexadecimal form */
5067 for (j = 0; j < 7; j++){
5069 if (!nkf_isxdigit(buf[i])){
5076 c |= hex2bin(buf[i]);
/* decimal form */
5079 for (j = 0; j < 8; j++){
5083 if (!nkf_isdigit(buf[i])){
5090 c += hex2bin(buf[i]);
5096 return CLASS_UNICODE | c;
/* Push-back counterpart of numchar_getc: forwards c and f unchanged to the
 * i_nungetc hook and returns its result. */
5105 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5107 return (*i_nungetc)(c, f);
5111 #ifdef UNICODE_NORMALIZATION
5113 /* Normalization Form C */
/* Input filter that looks the bytes held in buf up in normalization_table.
 * Reads through i_nfc_getc and pushes back through i_nfc_ungetc.
 * Visible behaviour: a binary search over the table indices
 * 0 .. NORMALIZATION_TABLE_LENGTH-1 compares each entry's nfd byte
 * sequence with buf; on a mismatch at position k the search window moves
 * up when the table byte is smaller and down otherwise. The nfc byte
 * sequence of entry j is then copied into buf.
 * How buf is filled, the condition under which the copy happens and the
 * value returned are not visible in this excerpt. */
5114 nkf_char nfc_getc(FILE *f)
5116 nkf_char (*g)(FILE *f) = i_nfc_getc;
5117 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5118 int i=0, j, k=1, lower, upper;
5120 const nkf_nfchar *array;
5123 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5124 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5125 while (upper >= lower) {
5126 j = (lower+upper) / 2;
5127 array = normalization_table[j].nfd;
5128 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5129 if (array[k] != buf[k]){
5130 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* replace the buffered bytes with the composed form of entry j */
5137 array = normalization_table[j].nfc;
5138 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5139 buf[i] = (nkf_char)(array[i]);
/* Push-back counterpart of nfc_getc: forwards c and f unchanged to the
 * i_nfc_ungetc hook and returns its result. */
5150 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5152 return (*i_nfc_ungetc)(c, f);
5154 #endif /* UNICODE_NORMALIZATION */
/* Body of the MIME decoding reader. The function header is not visible in
 * this excerpt; a comment further down the file refers to it as mime_getc.
 *
 * Visible behaviour, in order:
 *  - Bytes already queued in the Fifo (mime_top != mime_last) are returned
 *    first.
 *  - When mime_decode_mode is 1 or FALSE the mode is reset to FALSE,
 *    unswitch_mime_getc is called and one character is read via i_getc.
 *  - With mimebuf_f == FIXED_MIME, exit_mode keeps the current decode mode;
 *    the alternative assignment is not visible here.
 *  - Mode Q (quoted-printable): an underscore yields SP unless FIXED_MIME;
 *    the pair question-mark equals ends the encoded word and sets
 *    input_mode to exit_mode; an equals sign followed by a character below
 *    SP is treated as a soft line wrap; otherwise the two characters after
 *    the equals sign are combined as (hex2bin(c2)<<4) + hex2bin(c3).
 *  - Any mode other than Q or B: decoding is switched off and one character
 *    is read via i_mgetc.
 *  - Mode B (base64): four characters c1..c4 are read, each mapped through
 *    base64decode and masked to 6 bits, and the resulting bytes are
 *    appended to the Fifo; the function then returns the next Fifo byte.
 *  - After an end marker, following characters are collected in lwsp_buf,
 *    obtained with malloc(lwsp_size+5) and regrown with realloc through a
 *    temporary pointer; collected characters are pushed back with i_ungetc
 *    in reverse order when the visible condition on c1 and the last
 *    buffered character holds.
 *
 * NOTE(review): in the soft-wrap loop the condition reads one character and
 * the loop body immediately reads another, which is only compared with EOF.
 * Characters read in the body are therefore never tested against SP;
 * confirm whether the second read is intended.
 * Labels targeted by the goto statements, the bodies of several branches
 * and the release of lwsp_buf are not visible in this excerpt. */
5160 nkf_char c1, c2, c3, c4, cc;
5161 nkf_char t1, t2, t3, t4, mode, exit_mode;
5162 nkf_char lwsp_count;
5165 nkf_char lwsp_size = 128;
5167 if (mime_top != mime_last) { /* Something is in FIFO */
5168 return Fifo(mime_top++);
5170 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5171 mime_decode_mode=FALSE;
5172 unswitch_mime_getc();
5173 return (*i_getc)(f);
5176 if (mimebuf_f == FIXED_MIME)
5177 exit_mode = mime_decode_mode;
5180 if (mime_decode_mode == 'Q') {
5181 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5183 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5184 if (c1<=SP || DEL<=c1) {
5185 mime_decode_mode = exit_mode; /* prepare for quit */
5188 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5192 mime_decode_mode = exit_mode; /* prepare for quit */
5193 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5194 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5195 /* end Q encoding */
5196 input_mode = exit_mode;
5198 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5199 if (lwsp_buf==NULL) {
5200 perror("can't malloc");
5203 while ((c1=(*i_getc)(f))!=EOF) {
5208 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5216 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5217 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5232 lwsp_buf[lwsp_count] = (unsigned char)c1;
5233 if (lwsp_count++>lwsp_size){
5235 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5236 if (lwsp_buf_new==NULL) {
5238 perror("can't realloc");
5241 lwsp_buf = lwsp_buf_new;
5247 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5249 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5250 i_ungetc(lwsp_buf[lwsp_count],f);
5256 if (c1=='='&&c2<SP) { /* this is soft wrap */
5257 while((c1 = (*i_mgetc)(f)) <=SP) {
5258 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5260 mime_decode_mode = 'Q'; /* still in MIME */
5261 goto restart_mime_q;
5264 mime_decode_mode = 'Q'; /* still in MIME */
5268 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5269 if (c2<=SP) return c2;
5270 mime_decode_mode = 'Q'; /* still in MIME */
5271 return ((hex2bin(c2)<<4) + hex2bin(c3));
5274 if (mime_decode_mode != 'B') {
5275 mime_decode_mode = FALSE;
5276 return (*i_mgetc)(f);
5280 /* Base64 encoding */
5282 MIME allows line break in the middle of
5283 Base64, but we are very pessimistic in decoding
5284 in unbuf mode because MIME encoded code may broken by
5285 less or editor's control sequence (such as ESC-[-K in unbuffered
5286 mode. ignore incomplete MIME.
5288 mode = mime_decode_mode;
5289 mime_decode_mode = exit_mode; /* prepare for quit */
5291 while ((c1 = (*i_mgetc)(f))<=SP) {
5296 if ((c2 = (*i_mgetc)(f))<=SP) {
5299 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5300 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5303 if ((c1 == '?') && (c2 == '=')) {
5306 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5307 if (lwsp_buf==NULL) {
5308 perror("can't malloc");
5311 while ((c1=(*i_getc)(f))!=EOF) {
5316 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5324 if ((c1=(*i_getc)(f))!=EOF) {
5328 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5343 lwsp_buf[lwsp_count] = (unsigned char)c1;
5344 if (lwsp_count++>lwsp_size){
5346 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5347 if (lwsp_buf_new==NULL) {
5349 perror("can't realloc");
5352 lwsp_buf = lwsp_buf_new;
5358 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5360 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5361 i_ungetc(lwsp_buf[lwsp_count],f);
5368 if ((c3 = (*i_mgetc)(f))<=SP) {
5371 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5372 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5376 if ((c4 = (*i_mgetc)(f))<=SP) {
5379 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5380 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5384 mime_decode_mode = mode; /* still in MIME sigh... */
5386 /* BASE 64 decoding */
5388 t1 = 0x3f & base64decode(c1);
5389 t2 = 0x3f & base64decode(c2);
5390 t3 = 0x3f & base64decode(c3);
5391 t4 = 0x3f & base64decode(c4);
5392 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5394 Fifo(mime_last++) = (unsigned char)cc;
5395 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5397 Fifo(mime_last++) = (unsigned char)cc;
5398 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5400 Fifo(mime_last++) = (unsigned char)cc;
5405 return Fifo(mime_top++);
/* Pushes c back onto the MIME Fifo: mime_top is decremented first and the
 * byte is stored in the slot it then designates. f is not used on the
 * visible line. The return statement is not visible in this excerpt. */
5408 nkf_char mime_ungetc(nkf_char c, FILE *f)
5410 Fifo(--mime_top) = (unsigned char)c;
/* Pre-scans an encoded word into the Fifo before decoding is started.
 * f: input stream, read through i_getc.
 * p: zero-terminated byte pattern that is copied into the Fifo first.
 *
 * Visible behaviour: mime_input and mime_last are reset to mime_top, p is
 * copied in, then characters are read until EOF. The scan stops when
 * (mime_input - mime_top) masked with MIME_BUF_MASK is zero (buffer full).
 * An equals sign seen while d holds a question mark marks a checked
 * encoded word; that character is stored as well. Only plus, slash,
 * equals, question mark and alphanumerics are accepted as content.
 * For an incomplete word the last character is stored, mime_last is set
 * to mime_input, mime_decode_mode becomes 1 (serve the Fifo undecoded) and
 * switch_mime_getc is called.
 * The maintenance of d and the return values are not visible in this
 * excerpt. */
5414 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5418 /* In buffered mode, read until =? or NL or buffer full
5420 mime_input = mime_top;
5421 mime_last = mime_top;
5423 while(*p) Fifo(mime_input++) = *p++;
5426 while((c=(*i_getc)(f))!=EOF) {
5427 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5428 break; /* buffer full */
5430 if (c=='=' && d=='?') {
5431 /* checked. skip header, start decode */
5432 Fifo(mime_input++) = (unsigned char)c;
5433 /* mime_last_input = mime_input; */
5438 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5440 /* Should we check length mod 4? */
5441 Fifo(mime_input++) = (unsigned char)c;
5444 /* In case of Incomplete MIME, no MIME decode */
5445 Fifo(mime_input++) = (unsigned char)c;
5446 mime_last = mime_input; /* point undecoded buffer */
5447 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5448 switch_mime_getc(); /* anyway we need buffered getc */
/* Maps one base64 alphabet character to its 6-bit value, using character
 * arithmetic instead of a lookup table:
 *   A..Z -> 0..25, a..z -> 26..51, 0..9 -> 52..61, plus -> 62,
 *   anything that reaches the final branch -> 63.
 * The constants are written as character literals whose codes equal the
 * intended numbers (the trailing comments on each line give the values).
 * The tests selecting the first two branches and the return statement are
 * not visible in this excerpt. No validation of c is visible. */
5452 nkf_char base64decode(nkf_char c)
5457 i = c - 'A'; /* A..Z 0-25 */
5459 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5461 } else if (c > '/') {
5462 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5463 } else if (c == '+') {
5464 i = '>' /* 62 */ ; /* + 62 */
5466 i = '?' /* 63 */ ; /* / 63 */
/* Base64 output alphabet, indexed by 6-bit value. */
5471 static const char basis_64[] =
5472 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Bits carried between calls of the base64 encoder; read by close_mime and
 * mimeout_addchar. The assignment is not visible in this excerpt. */
5474 static nkf_char b64c;
/* Capacity of the pending-output buffer; the array has one extra slot. */
5475 #define MIMEOUT_BUF_LENGTH (60)
5476 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5477 int mimeout_buf_count = 0;
/* When set, open_mime leaves buffered whitespace alone; open_mime resets
 * it to FALSE. */
5478 int mimeout_preserve_space = 0;
/* Upper-case hex digit for a value 0..15.
 * NOTE(review): the macro argument is not parenthesized in the expansion;
 * the visible callers pass fully parenthesized expressions. */
5479 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/* Starts a MIME encoded word for the given output mode.
 * mode: compared against the entries of mime_encode[] to choose the
 *       preamble pattern p (mime_pattern[0] is the default) and the
 *       encoding method stored in mimeout_mode.
 *
 * Visible behaviour: when base64_count exceeds 45, a leading blank held in
 * mimeout_buf is written through o_mputc, and unless
 * mimeout_preserve_space is set one leading SP, TAB, CR or LF is stepped
 * over. Unless mimeout_preserve_space is set, buffered whitespace
 * characters are written through o_mputc. mimeout_preserve_space is then
 * cleared, the buffer count is saved in j and reset to zero, and buffered
 * characters are fed back through mime_putc.
 * The line break output, the emission of p, the loop bounds of the final
 * replay and the initialisation of i before the base64_count test are not
 * visible in this excerpt. */
5481 void open_mime(nkf_char mode)
5483 const unsigned char *p;
5486 p = mime_pattern[0];
5487 for(i=0;mime_pattern[i];i++) {
5488 if (mode == mime_encode[i]) {
5489 p = mime_pattern[i];
5493 mimeout_mode = mime_encode_method[i];
5496 if (base64_count>45) {
5497 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5498 (*o_mputc)(mimeout_buf[i]);
5504 if (!mimeout_preserve_space && mimeout_buf_count>0
5505 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5506 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5510 if (!mimeout_preserve_space) {
5511 for (;i<mimeout_buf_count;i++) {
5512 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5513 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5514 (*o_mputc)(mimeout_buf[i]);
5521 mimeout_preserve_space = FALSE;
/* detach the pending characters before replaying them through mime_putc */
5527 j = mimeout_buf_count;
5528 mimeout_buf_count = 0;
5530 mime_putc(mimeout_buf[i]);
/* Finishes the current MIME encoded word.
 * Visible behaviour: depending on mimeout_mode, the bits still held in
 * b64c are flushed as one final base64 character (low 2 bits shifted left
 * by 4, or low 4 bits shifted left by 2). The tail of the function
 * distinguishes mimeout_f != FIXED_MIME from the fixed case with a mode
 * other than Q.
 * The case labels, padding output, terminator output and the statements
 * in the final branches are not visible in this excerpt. */
5534 void close_mime(void)
5544 switch(mimeout_mode) {
5549 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5555 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5561 if (mimeout_f!=FIXED_MIME) {
5563 } else if (mimeout_mode != 'Q')
/* Encodes one character c according to mimeout_mode and writes the result
 * through o_mputc.
 * Visible behaviour: in one branch a non-alphanumeric c is written as two
 * upper-case hex digits (high nibble, then low nibble) via itoh4. In the
 * base64 branches the output characters are taken from basis_64: the top
 * 6 bits of c; the low 2 bits of b64c joined with the top 4 bits of c; the
 * low 4 bits of b64c joined with the top 2 bits of c, followed by the low
 * 6 bits of c.
 * The case labels, the escape prefix of the hex form, the updates of b64c,
 * mimeout_mode and base64_count are not visible in this excerpt. */
5568 void mimeout_addchar(nkf_char c)
5570 switch(mimeout_mode) {
5575 } else if(!nkf_isalnum(c)) {
5577 (*o_mputc)(itoh4(((c>>4)&0xf)));
5578 (*o_mputc)(itoh4((c&0xf)));
5587 (*o_mputc)(basis_64[c>>2]);
5592 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5598 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5599 (*o_mputc)(basis_64[c & 0x3F]);
5610 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/* Called with the next character pair (c2, c1) before it is encoded, to
 * fold the output line when it is getting long.
 * Visible behaviour: the projected width is base64_count plus
 * mimeout_buf_count/3*4. When it exceeds 73 in one branch, or 66 in the
 * other, three calls are made through o_base64conv: (EOF,0), then (0,LF),
 * then (0,SP).
 * The conditions that select between the two thresholds are not visible in
 * this excerpt. The remainder of the block is commented-out code that
 * refers to mime_lastchar1 and mime_lastchar2. */
5612 void mime_prechar(nkf_char c2, nkf_char c1)
5616 if (base64_count + mimeout_buf_count/3*4> 73){
5617 (*o_base64conv)(EOF,0);
5618 (*o_base64conv)(0,LF);
5619 (*o_base64conv)(0,SP);
5622 if (base64_count + mimeout_buf_count/3*4> 66){
5623 (*o_base64conv)(EOF,0);
5624 (*o_base64conv)(0,LF);
5625 (*o_base64conv)(0,SP);
5627 }/*else if (mime_lastchar2){
5628 if (c1 <=DEL && !nkf_isspace(c1)){
5629 (*o_base64conv)(0,SP);
5633 if (c2 && mime_lastchar2 == 0
5634 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5635 (*o_base64conv)(0,SP);
5638 /*mime_lastchar2 = c2;
5639 mime_lastchar1 = c1;*/
/* Output filter that decides, character by character, what is written
 * verbatim and what goes into a MIME encoded word.
 * c: next output character, or EOF to flush.
 *
 * Visible structure:
 *  - mimeout_f == FIXED_MIME: line length is checked against 71 columns
 *    (separately for mode Q and otherwise) and EOF is distinguished from
 *    ordinary characters.
 *  - Otherwise, on EOF the pending buffer count is moved to j, the buffer
 *    is marked empty and buffered characters go through mimeout_addchar.
 *  - Mode Q: characters up to DEL in ASCII or ISO8859_1 output mode are
 *    examined for CR or LF, for values up to SP, and for a width above 70.
 *  - lastchar is taken from the end of mimeout_buf when it is not empty.
 *  - Not in an encoded word (mimeout_mode is zero): plain characters are
 *    collected in mimeout_buf; the buffer is written through o_mputc on
 *    line ends, and open_mime(output_mode) is called when the buffer
 *    exceeds MIMEOUT_BUF_LENGTH or in the remaining visible cases.
 *  - In an encoded word: buffered characters are either encoded with
 *    mimeout_addchar or written verbatim with o_mputc, depending on
 *    lastchar, on whether c is blank, and on the buffer contents.
 *
 * NOTE(review): the EOF path reads mimeout_buf[j-1]; no visible guard
 * ensures j is greater than zero. Confirm against the full file.
 * Many branch bodies, loop headers and all return statements are not
 * visible in this excerpt. */
5642 void mime_putc(nkf_char c)
5647 if (mimeout_f == FIXED_MIME){
5648 if (mimeout_mode == 'Q'){
5649 if (base64_count > 71){
5650 if (c!=CR && c!=LF) {
5657 if (base64_count > 71){
5662 if (c == EOF) { /* c==EOF */
5666 if (c != EOF) { /* c!=EOF */
5672 /* mimeout_f != FIXED_MIME */
5674 if (c == EOF) { /* c==EOF */
5675 j = mimeout_buf_count;
5676 mimeout_buf_count = 0;
5679 if (!nkf_isblank(mimeout_buf[j-1])) {
5681 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5684 mimeout_addchar(mimeout_buf[i]);
5688 mimeout_addchar(mimeout_buf[i]);
5692 mimeout_addchar(mimeout_buf[i]);
5698 mimeout_addchar(mimeout_buf[i]);
5704 if (mimeout_mode=='Q') {
5705 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5706 if (c == CR || c == LF) {
5711 } else if (c <= SP) {
5713 if (base64_count > 70) {
5717 if (!nkf_isblank(c)) {
5728 if (mimeout_buf_count > 0){
5729 lastchar = mimeout_buf[mimeout_buf_count - 1];
5734 if (!mimeout_mode) {
5735 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5736 if (nkf_isspace(c)) {
5737 if (c==CR || c==LF) {
5740 for (i=0;i<mimeout_buf_count;i++) {
5741 (*o_mputc)(mimeout_buf[i]);
5742 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5748 mimeout_buf[0] = (char)c;
5749 mimeout_buf_count = 1;
5751 if (base64_count > 1
5752 && base64_count + mimeout_buf_count > 76
5753 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5756 if (!nkf_isspace(mimeout_buf[0])){
5761 mimeout_buf[mimeout_buf_count++] = (char)c;
5762 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5763 open_mime(output_mode);
5768 if (lastchar==CR || lastchar == LF){
5769 for (i=0;i<mimeout_buf_count;i++) {
5770 (*o_mputc)(mimeout_buf[i]);
5773 mimeout_buf_count = 0;
5776 for (i=0;i<mimeout_buf_count-1;i++) {
5777 (*o_mputc)(mimeout_buf[i]);
5780 mimeout_buf[0] = SP;
5781 mimeout_buf_count = 1;
5783 open_mime(output_mode);
5786 /* mimeout_mode == 'B', 1, 2 */
5787 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5788 if (lastchar == CR || lastchar == LF){
5789 if (nkf_isblank(c)) {
5790 for (i=0;i<mimeout_buf_count;i++) {
5791 mimeout_addchar(mimeout_buf[i]);
5793 mimeout_buf_count = 0;
5794 } else if (SP<c && c<DEL) {
5796 for (i=0;i<mimeout_buf_count;i++) {
5797 (*o_mputc)(mimeout_buf[i]);
5800 mimeout_buf_count = 0;
5803 if (c==SP || c==TAB || c==CR || c==LF) {
5804 for (i=0;i<mimeout_buf_count;i++) {
5805 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5807 for (i=0;i<mimeout_buf_count;i++) {
5808 (*o_mputc)(mimeout_buf[i]);
5811 mimeout_buf_count = 0;
5814 mimeout_buf[mimeout_buf_count++] = (char)c;
5815 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5817 for (i=0;i<mimeout_buf_count;i++) {
5818 (*o_mputc)(mimeout_buf[i]);
5821 mimeout_buf_count = 0;
5825 if (mimeout_buf_count>0 && SP<c && c!='=') {
5826 mimeout_buf[mimeout_buf_count++] = (char)c;
5827 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5828 j = mimeout_buf_count;
5829 mimeout_buf_count = 0;
5831 mimeout_addchar(mimeout_buf[i]);
5838 if (mimeout_buf_count>0) {
5839 j = mimeout_buf_count;
5840 mimeout_buf_count = 0;
5842 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5844 mimeout_addchar(mimeout_buf[i]);
5850 (*o_mputc)(mimeout_buf[i]);
5852 open_mime(output_mode);
5859 #if defined(PERL_XS) || defined(WIN32DLL)
/* Body of the routine that puts the converter back into its default state.
 * The function header is not visible in this excerpt (NOTE(review): in the
 * full file this is presumably reinit; confirm).
 *
 * Visible assignments: MIME decoding is set to STRICT_MIME with decoding
 * disabled; the Unicode related options fall back to UCS_MAP_ASCII, a
 * question mark as substitute character and big-endian input and output
 * without a byte order mark; all 256 prefix_table entries are cleared; the
 * pending MIME output buffer is emptied; kanji_intro, ascii_intro,
 * fold_margin, output_conv and oconv return to their DEFAULT_ or
 * FOLD_MARGIN values; every o_*conv hook is pointed at no_connection; the
 * visible input push-back hooks are pointed at std_ungetc and i_mgetc_buf
 * at std_getc; output_mode becomes ASCII and input_codename NULL.
 * The preprocessor conditionals that remain have lost their closing lines
 * in this excerpt, and many further assignments are not visible. */
5863 struct input_code *p = input_code_list;
5876 mime_f = STRICT_MIME;
5877 mime_decode_f = FALSE;
5882 #if defined(MSDOS) || defined(__OS2__)
5887 iso2022jp_f = FALSE;
5888 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5889 ms_ucs_map_f = UCS_MAP_ASCII;
5891 #ifdef UTF8_INPUT_ENABLE
5892 no_cp932ext_f = FALSE;
5893 no_best_fit_chars_f = FALSE;
5894 encode_fallback = NULL;
5895 unicode_subchar = '?';
5896 input_endian = ENDIAN_BIG;
5898 #ifdef UTF8_OUTPUT_ENABLE
5899 output_bom_f = FALSE;
5900 output_endian = ENDIAN_BIG;
5902 #ifdef UNICODE_NORMALIZATION
5918 #ifdef SHIFTJIS_CP932
5928 for (i = 0; i < 256; i++){
5929 prefix_table[i] = 0;
5933 mimeout_buf_count = 0;
5938 fold_preserve_f = FALSE;
5941 kanji_intro = DEFAULT_J;
5942 ascii_intro = DEFAULT_R;
5943 fold_margin = FOLD_MARGIN;
5944 output_conv = DEFAULT_CONV;
5945 oconv = DEFAULT_CONV;
/* unconnected output stages report an internal error when called */
5946 o_zconv = no_connection;
5947 o_fconv = no_connection;
5948 o_nlconv = no_connection;
5949 o_rot_conv = no_connection;
5950 o_hira_conv = no_connection;
5951 o_base64conv = no_connection;
5952 o_iso2022jp_check_conv = no_connection;
5955 i_ungetc = std_ungetc;
5957 i_bungetc = std_ungetc;
5960 i_mungetc = std_ungetc;
5961 i_mgetc_buf = std_getc;
5962 i_mungetc_buf = std_ungetc;
5963 output_mode = ASCII;
5966 mime_decode_mode = FALSE;
5974 z_prev2=0,z_prev1=0;
5976 iconv_for_check = 0;
5978 input_codename = NULL;
/* Placeholder for an output conversion stage that has not been connected.
 * Forwards c2 and c1 to no_connection2 with 0 as the third character and
 * discards the result. */
5985 void no_connection(nkf_char c2, nkf_char c1)
5987 no_connection2(c2,c1,0);
/* Reports that an unconnected conversion stage was invoked by writing a
 * diagnostic to stderr. None of the three character arguments is used on
 * the visible lines. The final return of 0 only exists to satisfy lint;
 * whatever happens between the message and that return is not visible in
 * this excerpt. */
5990 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5992 fprintf(stderr,"nkf internal module connection failure.\n");
5994 return 0; /* LINT */
5999 #define fprintf dllprintf
/* Body of the usage printer; the function header is not visible in this
 * excerpt (NOTE(review): presumably usage; confirm).
 * Every line of help text is written to stderr with fprintf. The line
 * describing the default output code is selected by whichever
 * DEFAULT_CODE_SJIS, DEFAULT_CODE_JIS, DEFAULT_CODE_EUC or
 * DEFAULT_CODE_UTF8 macro is defined, and the UTF-8, numeric character
 * reference and fallback option lines are compiled only when the
 * corresponding feature macros are defined. The closing lines of these
 * conditionals, and the conditions guarding a few other lines, are not
 * visible in this excerpt. */
6003 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6004 fprintf(stderr,"Flags:\n");
6005 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6006 #ifdef DEFAULT_CODE_SJIS
6007 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6009 #ifdef DEFAULT_CODE_JIS
6010 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6012 #ifdef DEFAULT_CODE_EUC
6013 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6015 #ifdef DEFAULT_CODE_UTF8
6016 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6018 #ifdef UTF8_OUTPUT_ENABLE
6019 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6021 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6022 #ifdef UTF8_INPUT_ENABLE
6023 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6025 fprintf(stderr,"t no conversion\n");
6026 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6027 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6028 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6029 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6030 fprintf(stderr,"v Show this usage. V: show version\n");
6031 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6032 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6033 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6034 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6035 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6036 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6037 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6038 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6039 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6041 fprintf(stderr,"T Text mode output\n");
6043 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6044 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6045 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6046 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6047 fprintf(stderr,"\n");
6048 fprintf(stderr,"Long name options\n");
6049 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6050 fprintf(stderr," Specify the input or output codeset\n");
6051 fprintf(stderr," --fj --unix --mac --windows\n");
6052 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6053 fprintf(stderr," Convert for the system or code\n");
6054 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6055 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6056 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6058 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6060 #ifdef NUMCHAR_OPTION
6061 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6063 #ifdef UTF8_INPUT_ENABLE
6064 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6065 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6068 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6069 fprintf(stderr," Overwrite original listed files by filtered result\n");
6070 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6072 fprintf(stderr," -g --guess Guess the input code\n");
6073 fprintf(stderr," --help --version Show this help/the version\n");
6074 fprintf(stderr," For more information, see also man nkf\n");
6075 fprintf(stderr,"\n");
6081 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6082 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6085 #if defined(MSDOS) && defined(__WIN16__)
6088 #if defined(MSDOS) && defined(__WIN32__)
6094 ,NKF_VERSION,NKF_RELEASE_DATE);
6095 fprintf(stderr,"\n%s\n",CopyRight);