1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.137 2007/10/01 14:29:21 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-10-01"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * Every macro parameter is parenthesized so that compound arguments
 * (for example `a | b` or `x ? y : z`) are evaluated as a unit.
 * The argument is still evaluated more than once, so do not pass
 * expressions with side effects (`*p++`, `getc(f)`, ...).
 *
 * SP, TAB, CR, LF and SS3 are not defined in this block; they must be
 * provided elsewhere before nkf_isblank, nkf_isspace, nkf_isprint or
 * is_eucg3 are expanded.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP<=(c) && (c)<='~')
#define nkf_isgraph(c)  ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second-lowest byte of c2 (truncated to unsigned short) is SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

/* Lead-byte bounds used by the CP932 tables (inclusive). */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
233 #define HOLD_SIZE 1024
234 #if defined(INT_IS_SHORT)
235 #define IOBUF_SIZE 2048
237 #define IOBUF_SIZE 16384
240 #define DEFAULT_J 'B'
241 #define DEFAULT_R 'B'
243 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
244 #define SJ6394 0x0161 /* 63 - 94 ku offset */
246 #define RANGE_NUM_MAX 18
251 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
252 #define sizeof_euc_to_utf8_1byte 94
253 #define sizeof_euc_to_utf8_2bytes 94
254 #define sizeof_utf8_to_euc_C2 64
255 #define sizeof_utf8_to_euc_E5B8 64
256 #define sizeof_utf8_to_euc_2bytes 112
257 #define sizeof_utf8_to_euc_3bytes 16
260 /* MIME preprocessor */
262 #ifdef EASYWIN /*Easy Win */
263 extern POINT _BufferSize;
272 void (*status_func)(struct input_code *, nkf_char);
273 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
277 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
280 static const char *CopyRight = COPY_RIGHT;
282 #if !defined(PERL_XS) && !defined(WIN32DLL)
283 static nkf_char noconvert(FILE *f);
285 static void module_connection(void);
286 static nkf_char kanji_convert(FILE *f);
287 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
288 static nkf_char push_hold_buf(nkf_char c2);
289 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
290 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
291 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
292 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
293 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
295 * 0: Shift_JIS, eucJP-ascii
300 #define UCS_MAP_ASCII 0
302 #define UCS_MAP_CP932 2
303 #define UCS_MAP_CP10001 3
304 static int ms_ucs_map_f = UCS_MAP_ASCII;
306 #ifdef UTF8_INPUT_ENABLE
307 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
308 static int no_cp932ext_f = FALSE;
309 /* ignore ZERO WIDTH NO-BREAK SPACE */
310 static int no_best_fit_chars_f = FALSE;
311 static int input_endian = ENDIAN_BIG;
312 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
313 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
314 static void encode_fallback_html(nkf_char c);
315 static void encode_fallback_xml(nkf_char c);
316 static void encode_fallback_java(nkf_char c);
317 static void encode_fallback_perl(nkf_char c);
318 static void encode_fallback_subchar(nkf_char c);
319 static void (*encode_fallback)(nkf_char c) = NULL;
320 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
321 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
322 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
323 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
324 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
325 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
326 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
327 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
328 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
329 static void w_status(struct input_code *, nkf_char);
331 #ifdef UTF8_OUTPUT_ENABLE
332 static int output_bom_f = FALSE;
333 static int output_endian = ENDIAN_BIG;
334 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
335 static void w_oconv(nkf_char c2,nkf_char c1);
336 static void w_oconv16(nkf_char c2,nkf_char c1);
337 static void w_oconv32(nkf_char c2,nkf_char c1);
339 static void e_oconv(nkf_char c2,nkf_char c1);
340 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
341 static void s_oconv(nkf_char c2,nkf_char c1);
342 static void j_oconv(nkf_char c2,nkf_char c1);
343 static void fold_conv(nkf_char c2,nkf_char c1);
344 static void nl_conv(nkf_char c2,nkf_char c1);
345 static void z_conv(nkf_char c2,nkf_char c1);
346 static void rot_conv(nkf_char c2,nkf_char c1);
347 static void hira_conv(nkf_char c2,nkf_char c1);
348 static void base64_conv(nkf_char c2,nkf_char c1);
349 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
350 static void no_connection(nkf_char c2,nkf_char c1);
351 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
353 static void code_score(struct input_code *ptr);
354 static void code_status(nkf_char c);
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_begin(FILE *f);
364 static nkf_char mime_getc(FILE *f);
365 static nkf_char mime_ungetc(nkf_char c,FILE *f);
367 static void switch_mime_getc(void);
368 static void unswitch_mime_getc(void);
369 static nkf_char mime_begin_strict(FILE *f);
370 static nkf_char mime_getc_buf(FILE *f);
371 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
372 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
374 static nkf_char base64decode(nkf_char c);
375 static void mime_prechar(nkf_char c2, nkf_char c1);
376 static void mime_putc(nkf_char c);
377 static void open_mime(nkf_char c);
378 static void close_mime(void);
379 static void eof_mime(void);
380 static void mimeout_addchar(nkf_char c);
382 static void usage(void);
383 static void version(void);
385 static void options(unsigned char *c);
386 #if defined(PERL_XS) || defined(WIN32DLL)
387 static void reinit(void);
392 #if !defined(PERL_XS) && !defined(WIN32DLL)
393 static unsigned char stdibuf[IOBUF_SIZE];
394 static unsigned char stdobuf[IOBUF_SIZE];
396 static unsigned char hold_buf[HOLD_SIZE*2];
397 static int hold_count = 0;
399 /* MIME preprocessor fifo */
401 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
402 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
403 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
404 static unsigned char mime_buf[MIME_BUF_SIZE];
405 static unsigned int mime_top = 0;
406 static unsigned int mime_last = 0; /* decoded */
407 static unsigned int mime_input = 0; /* undecoded */
408 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* File-scope option flags with their start-up defaults. */
411 static int unbuf_f = FALSE;
412 static int estab_f = FALSE;
413 static int nop_f = FALSE;
414 static int binmode_f = TRUE; /* binary mode */
415 static int rot_f = FALSE; /* rot13/47 mode */
416 static int hira_f = FALSE; /* hiragana/katakana conversion */
417 static int input_f = FALSE; /* non fixed input code */
418 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
419 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
420 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
421 static int mimebuf_f = FALSE; /* MIME buffered input */
422 static int broken_f = FALSE; /* convert ESC-less broken JIS */
423 static int iso8859_f = FALSE; /* ISO8859 through */
424 static int mimeout_f = FALSE; /* base64 mode */
425 #if defined(MSDOS) || defined(__OS2__)
426 static int x0201_f = TRUE; /* Assume JISX0201 kana */
428 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
430 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
432 #ifdef UNICODE_NORMALIZATION
433 static int nfc_f = FALSE;
434 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
435 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
436 static nkf_char nfc_getc(FILE *f);
437 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
441 static int cap_f = FALSE;
442 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
443 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
444 static nkf_char cap_getc(FILE *f);
445 static nkf_char cap_ungetc(nkf_char c,FILE *f);
447 static int url_f = FALSE;
448 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
449 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
450 static nkf_char url_getc(FILE *f);
451 static nkf_char url_ungetc(nkf_char c,FILE *f);
/*
 * NKF_INT32_C(n) spells the integer constant n; when INT_IS_SHORT is
 * defined an L suffix is appended so the constant is at least a long.
 */
#if defined(INT_IS_SHORT)
#define NKF_INT32_C(n)   (n##L)
#else
#define NKF_INT32_C(n)   (n)
#endif
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
/* The top 8 bits of a value hold its class, the low 24 bits its payload. */
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
/*
 * The argument is parenthesized so that compound expressions such as
 * `CLASS_UNICODE | ch` are masked as a whole; `&` binds tighter than `|`.
 */
/* True when the class bits of c equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the low 24 bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
467 #ifdef NUMCHAR_OPTION
468 static int numchar_f = FALSE;
469 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
470 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
471 static nkf_char numchar_getc(FILE *f);
472 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
476 static int noout_f = FALSE;
477 static void no_putc(nkf_char c);
478 static nkf_char debug_f = FALSE;
479 static void debug(const char *str);
480 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
483 static int guess_f = FALSE;
485 static void print_guessed_code(char *filename);
487 static void set_input_codename(char *codename);
488 static int is_inputcode_mixed = FALSE;
491 static int exec_f = 0;
494 #ifdef SHIFTJIS_CP932
495 /* invert IBM extended characters to others */
496 static int cp51932_f = FALSE;
498 /* invert NEC-selected IBM extended characters to IBM extended characters */
499 static int cp932inv_f = TRUE;
501 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
502 #endif /* SHIFTJIS_CP932 */
505 static int x0212_f = FALSE;
506 static nkf_char x0212_shift(nkf_char c);
507 static nkf_char x0212_unshift(nkf_char c);
509 static int x0213_f = FALSE;
511 static unsigned char prefix_table[256];
513 static void set_code_score(struct input_code *ptr, nkf_char score);
514 static void clr_code_score(struct input_code *ptr, nkf_char score);
515 static void status_disable(struct input_code *ptr);
516 static void status_push_ch(struct input_code *ptr, nkf_char c);
517 static void status_clear(struct input_code *ptr);
518 static void status_reset(struct input_code *ptr);
519 static void status_reinit(struct input_code *ptr);
520 static void status_check(struct input_code *ptr, nkf_char c);
521 static void e_status(struct input_code *, nkf_char);
522 static void s_status(struct input_code *, nkf_char);
524 struct input_code input_code_list[] = {
525 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
526 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
527 #ifdef UTF8_INPUT_ENABLE
528 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
529 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
530 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
535 static int mimeout_mode = 0;
536 static int base64_count = 0;
538 /* X0208 -> ASCII converter */
541 static int f_line = 0; /* chars in line */
542 static int f_prev = 0;
543 static int fold_preserve_f = FALSE; /* preserve new lines */
544 static int fold_f = FALSE;
545 static int fold_len = 0;
548 static unsigned char kanji_intro = DEFAULT_J;
549 static unsigned char ascii_intro = DEFAULT_R;
553 #define FOLD_MARGIN 10
554 #define DEFAULT_FOLD 60
556 static int fold_margin = FOLD_MARGIN;
560 #ifdef DEFAULT_CODE_JIS
561 # define DEFAULT_CONV j_oconv
563 #ifdef DEFAULT_CODE_SJIS
564 # define DEFAULT_CONV s_oconv
566 #ifdef DEFAULT_CODE_EUC
567 # define DEFAULT_CONV e_oconv
569 #ifdef DEFAULT_CODE_UTF8
570 # define DEFAULT_CONV w_oconv
573 /* process default */
574 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
576 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
577 /* s_iconv or oconv */
578 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
580 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
588 /* static redirections */
/*
 * Replaceable input/output hooks.  Each pointer starts out at the plain
 * std_getc / std_ungetc / std_putc implementation.
 */
590 static void (*o_putc)(nkf_char c) = std_putc;
592 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
593 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
595 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc (presumably broken_getc, going by the naming pattern) */
596 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
598 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
600 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
601 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
603 /* for strict mime */
604 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
605 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
608 static int output_mode = ASCII, /* output kanji mode */
609 input_mode = ASCII, /* input kanji mode */
610 shift_mode = FALSE; /* TRUE shift out, or X0201 */
611 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
613 /* X0201 / X0208 conversion tables */
615 /* X0201 kana conversion table */
617 static const unsigned char cv[]= {
618 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
619 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
620 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
621 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
622 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
623 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
624 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
625 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
626 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
627 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
628 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
629 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
630 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
631 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
632 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
633 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
637 /* X0201 kana conversion table for dakuten (voiced sound mark) */
639 static const unsigned char dv[]= {
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
645 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
646 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
647 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
648 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
649 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
651 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 /* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
660 static const unsigned char ev[]= {
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
672 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 /* X0208 kigou conversion table */
681 /* 0x8140 - 0x819e */
682 static const unsigned char fv[] = {
684 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
685 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
686 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
687 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
688 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
689 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
690 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
691 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
692 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
694 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
700 static int file_out_f = FALSE;
702 static int overwrite_f = FALSE;
703 static int preserve_time_f = FALSE;
704 static int backup_f = FALSE;
705 static char *backup_suffix = "";
706 static char *get_backup_filename(const char *suffix, const char *filename);
709 static int nlmode_f = 0; /* CR, LF, CRLF */
710 static nkf_char prev_cr = 0;
711 #ifdef EASYWIN /*Easy Win */
712 static int end_check;
715 #define STD_GC_BUFSIZE (256)
716 nkf_char std_gc_buf[STD_GC_BUFSIZE];
720 #include "nkf32dll.c"
721 #elif defined(PERL_XS)
723 int main(int argc, char **argv)
728 char *outfname = NULL;
731 #ifdef EASYWIN /*Easy Win */
732 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
735 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
736 cp = (unsigned char *)*argv;
741 if (pipe(fds) < 0 || (pid = fork()) < 0){
752 execvp(argv[1], &argv[1]);
766 if(x0201_f == WISH_TRUE)
767 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
769 if (binmode_f == TRUE)
770 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
771 if (freopen("","wb",stdout) == NULL)
778 setbuf(stdout, (char *) NULL);
780 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
783 if (binmode_f == TRUE)
784 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
785 if (freopen("","rb",stdin) == NULL) return (-1);
789 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
793 kanji_convert(stdin);
794 if (guess_f) print_guessed_code(NULL);
798 int is_argument_error = FALSE;
800 is_inputcode_mixed = FALSE;
801 input_codename = NULL;
805 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
808 is_argument_error = TRUE;
816 /* reopen file for stdout */
817 if (file_out_f == TRUE) {
820 outfname = malloc(strlen(origfname)
821 + strlen(".nkftmpXXXXXX")
827 strcpy(outfname, origfname);
831 for (i = strlen(outfname); i; --i){
832 if (outfname[i - 1] == '/'
833 || outfname[i - 1] == '\\'){
839 strcat(outfname, "ntXXXXXX");
841 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
844 strcat(outfname, ".nkftmpXXXXXX");
845 fd = mkstemp(outfname);
848 || (fd_backup = dup(fileno(stdout))) < 0
849 || dup2(fd, fileno(stdout)) < 0
860 outfname = "nkf.out";
863 if(freopen(outfname, "w", stdout) == NULL) {
867 if (binmode_f == TRUE) {
868 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
869 if (freopen("","wb",stdout) == NULL)
876 if (binmode_f == TRUE)
877 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
878 if (freopen("","rb",fin) == NULL)
883 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
887 char *filename = NULL;
889 if (nfiles > 1) filename = origfname;
890 if (guess_f) print_guessed_code(filename);
896 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
904 if (dup2(fd_backup, fileno(stdout)) < 0){
907 if (stat(origfname, &sb)) {
908 fprintf(stderr, "Can't stat %s\n", origfname);
910 /* restore the original file's permissions */
911 if (chmod(outfname, sb.st_mode)) {
912 fprintf(stderr, "Can't set permission %s\n", outfname);
915 /* restore the original file's timestamps */
917 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
918 tb[0] = tb[1] = sb.st_mtime;
919 if (utime(outfname, tb)) {
920 fprintf(stderr, "Can't set timestamp %s\n", outfname);
923 tb.actime = sb.st_atime;
924 tb.modtime = sb.st_mtime;
925 if (utime(outfname, &tb)) {
926 fprintf(stderr, "Can't set timestamp %s\n", outfname);
931 char *backup_filename = get_backup_filename(backup_suffix, origfname);
933 unlink(backup_filename);
935 if (rename(origfname, backup_filename)) {
936 perror(backup_filename);
937 fprintf(stderr, "Can't rename %s to %s\n",
938 origfname, backup_filename);
942 if (unlink(origfname)){
947 if (rename(outfname, origfname)) {
949 fprintf(stderr, "Can't rename %s to %s\n",
950 outfname, origfname);
957 if (is_argument_error)
960 #ifdef EASYWIN /*Easy Win */
961 if (file_out_f == FALSE)
962 scanf("%d",&end_check);
965 #else /* for Other OS */
966 if (file_out_f == TRUE)
971 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from the template
 * `suffix`.
 *
 * When `suffix` contains one or more '*' characters, every '*' is replaced
 * by `filename` and all other characters are copied unchanged.  Otherwise
 * `suffix` is appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or NULL
 * (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t i, j;

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' expands to filename_length bytes */
        backup_filename = malloc((suffix_length - asterisk_count)
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /* room for filename + suffix + terminating NUL */
        backup_filename = malloc(filename_length + suffix_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        /* the + 1 copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1014 static const struct {
1038 {"katakana-hiragana","h3"},
1045 #ifdef UTF8_OUTPUT_ENABLE
1055 {"fb-subchar=", ""},
1057 #ifdef UTF8_INPUT_ENABLE
1058 {"utf8-input", "W"},
1059 {"utf16-input", "W16"},
1060 {"no-cp932ext", ""},
1061 {"no-best-fit-chars",""},
1063 #ifdef UNICODE_NORMALIZATION
1064 {"utf8mac-input", ""},
1076 #ifdef NUMCHAR_OPTION
1077 {"numchar-input", ""},
1083 #ifdef SHIFTJIS_CP932
1093 static int option_mode = 0;
1095 void options(unsigned char *cp)
1099 unsigned char *cp_back = NULL;
1104 while(*cp && *cp++!='-');
1105 while (*cp || cp_back) {
1113 case '-': /* literal options */
1114 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1118 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1119 p = (unsigned char *)long_option[i].name;
1120 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1121 if (*p == cp[j] || cp[j] == SP){
1128 while(*cp && *cp != SP && cp++);
1129 if (long_option[i].alias[0]){
1131 cp = (unsigned char *)long_option[i].alias;
1133 if (strcmp(long_option[i].name, "ic=") == 0){
1134 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1135 codeset[i] = nkf_toupper(p[i]);
1138 if(strcmp(codeset, "ISO-2022-JP") == 0){
1139 input_f = JIS_INPUT;
1140 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1141 strcmp(codeset, "CP50220") == 0 ||
1142 strcmp(codeset, "CP50221") == 0 ||
1143 strcmp(codeset, "CP50222") == 0){
1144 input_f = JIS_INPUT;
1145 #ifdef SHIFTJIS_CP932
1148 #ifdef UTF8_OUTPUT_ENABLE
1149 ms_ucs_map_f = UCS_MAP_CP932;
1151 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1152 input_f = JIS_INPUT;
1156 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1157 input_f = JIS_INPUT;
1162 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1163 input_f = SJIS_INPUT;
1164 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1165 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1166 strcmp(codeset, "CP932") == 0 ||
1167 strcmp(codeset, "MS932") == 0){
1168 input_f = SJIS_INPUT;
1169 #ifdef SHIFTJIS_CP932
1172 #ifdef UTF8_OUTPUT_ENABLE
1173 ms_ucs_map_f = UCS_MAP_CP932;
1175 }else if(strcmp(codeset, "CP10001") == 0){
1176 input_f = SJIS_INPUT;
1177 #ifdef SHIFTJIS_CP932
1180 #ifdef UTF8_OUTPUT_ENABLE
1181 ms_ucs_map_f = UCS_MAP_CP10001;
1183 }else if(strcmp(codeset, "EUCJP") == 0 ||
1184 strcmp(codeset, "EUC-JP") == 0){
1185 input_f = EUC_INPUT;
1186 }else if(strcmp(codeset, "CP51932") == 0){
1187 input_f = EUC_INPUT;
1188 #ifdef SHIFTJIS_CP932
1191 #ifdef UTF8_OUTPUT_ENABLE
1192 ms_ucs_map_f = UCS_MAP_CP932;
1194 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1195 strcmp(codeset, "EUCJP-MS") == 0 ||
1196 strcmp(codeset, "EUCJPMS") == 0){
1197 input_f = EUC_INPUT;
1198 #ifdef SHIFTJIS_CP932
1201 #ifdef UTF8_OUTPUT_ENABLE
1202 ms_ucs_map_f = UCS_MAP_MS;
1204 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1205 strcmp(codeset, "EUCJP-ASCII") == 0){
1206 input_f = EUC_INPUT;
1207 #ifdef SHIFTJIS_CP932
1210 #ifdef UTF8_OUTPUT_ENABLE
1211 ms_ucs_map_f = UCS_MAP_ASCII;
1213 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1214 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1215 input_f = SJIS_INPUT;
1217 #ifdef SHIFTJIS_CP932
1220 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1221 strcmp(codeset, "EUC-JIS-2004") == 0){
1222 input_f = EUC_INPUT;
1224 #ifdef SHIFTJIS_CP932
1227 #ifdef UTF8_INPUT_ENABLE
1228 }else if(strcmp(codeset, "UTF-8") == 0 ||
1229 strcmp(codeset, "UTF-8N") == 0 ||
1230 strcmp(codeset, "UTF-8-BOM") == 0){
1231 input_f = UTF8_INPUT;
1232 #ifdef UNICODE_NORMALIZATION
1233 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1234 strcmp(codeset, "UTF-8-MAC") == 0){
1235 input_f = UTF8_INPUT;
1238 }else if(strcmp(codeset, "UTF-16") == 0 ||
1239 strcmp(codeset, "UTF-16BE") == 0 ||
1240 strcmp(codeset, "UTF-16BE-BOM") == 0){
1241 input_f = UTF16_INPUT;
1242 input_endian = ENDIAN_BIG;
1243 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1244 strcmp(codeset, "UTF-16LE-BOM") == 0){
1245 input_f = UTF16_INPUT;
1246 input_endian = ENDIAN_LITTLE;
1247 }else if(strcmp(codeset, "UTF-32") == 0 ||
1248 strcmp(codeset, "UTF-32BE") == 0 ||
1249 strcmp(codeset, "UTF-32BE-BOM") == 0){
1250 input_f = UTF32_INPUT;
1251 input_endian = ENDIAN_BIG;
1252 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1253 strcmp(codeset, "UTF-32LE-BOM") == 0){
1254 input_f = UTF32_INPUT;
1255 input_endian = ENDIAN_LITTLE;
1260 if (strcmp(long_option[i].name, "oc=") == 0){
1262 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1263 codeset[i] = nkf_toupper(p[i]);
1266 if(strcmp(codeset, "ISO-2022-JP") == 0){
1267 output_conv = j_oconv;
1268 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1269 output_conv = j_oconv;
1270 no_cp932ext_f = TRUE;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_CP932;
1277 }else if(strcmp(codeset, "CP50220") == 0){
1278 output_conv = j_oconv;
1280 #ifdef SHIFTJIS_CP932
1283 #ifdef UTF8_OUTPUT_ENABLE
1284 ms_ucs_map_f = UCS_MAP_CP932;
1286 }else if(strcmp(codeset, "CP50221") == 0){
1287 output_conv = j_oconv;
1288 #ifdef SHIFTJIS_CP932
1291 #ifdef UTF8_OUTPUT_ENABLE
1292 ms_ucs_map_f = UCS_MAP_CP932;
1294 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1295 output_conv = j_oconv;
1299 #ifdef SHIFTJIS_CP932
1302 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1303 output_conv = j_oconv;
1308 #ifdef SHIFTJIS_CP932
1311 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1312 output_conv = s_oconv;
1313 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1314 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1315 strcmp(codeset, "CP932") == 0 ||
1316 strcmp(codeset, "MS932") == 0){
1317 output_conv = s_oconv;
1318 #ifdef UTF8_OUTPUT_ENABLE
1319 ms_ucs_map_f = UCS_MAP_CP932;
1321 }else if(strcmp(codeset, "CP10001") == 0){
1322 output_conv = s_oconv;
1323 #ifdef UTF8_OUTPUT_ENABLE
1324 ms_ucs_map_f = UCS_MAP_CP10001;
1326 }else if(strcmp(codeset, "EUCJP") == 0 ||
1327 strcmp(codeset, "EUC-JP") == 0){
1328 output_conv = e_oconv;
1329 }else if(strcmp(codeset, "CP51932") == 0){
1330 output_conv = e_oconv;
1331 #ifdef SHIFTJIS_CP932
1334 #ifdef UTF8_OUTPUT_ENABLE
1335 ms_ucs_map_f = UCS_MAP_CP932;
1337 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1338 strcmp(codeset, "EUCJP-MS") == 0 ||
1339 strcmp(codeset, "EUCJPMS") == 0){
1340 output_conv = e_oconv;
1344 #ifdef UTF8_OUTPUT_ENABLE
1345 ms_ucs_map_f = UCS_MAP_MS;
1347 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1348 strcmp(codeset, "EUCJP-ASCII") == 0){
1349 output_conv = e_oconv;
1353 #ifdef UTF8_OUTPUT_ENABLE
1354 ms_ucs_map_f = UCS_MAP_ASCII;
1356 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1357 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1358 output_conv = s_oconv;
1360 #ifdef SHIFTJIS_CP932
1363 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1364 strcmp(codeset, "EUC-JIS-2004") == 0){
1365 output_conv = e_oconv;
1370 #ifdef SHIFTJIS_CP932
1373 #ifdef UTF8_OUTPUT_ENABLE
1374 }else if(strcmp(codeset, "UTF-8") == 0){
1375 output_conv = w_oconv;
1376 }else if(strcmp(codeset, "UTF-8N") == 0){
1377 output_conv = w_oconv;
1378 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1379 output_conv = w_oconv;
1380 output_bom_f = TRUE;
1381 }else if(strcmp(codeset, "UTF-16BE") == 0){
1382 output_conv = w_oconv16;
1383 }else if(strcmp(codeset, "UTF-16") == 0 ||
1384 strcmp(codeset, "UTF-16BE-BOM") == 0){
1385 output_conv = w_oconv16;
1386 output_bom_f = TRUE;
1387 }else if(strcmp(codeset, "UTF-16LE") == 0){
1388 output_conv = w_oconv16;
1389 output_endian = ENDIAN_LITTLE;
1390 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1391 output_conv = w_oconv16;
1392 output_endian = ENDIAN_LITTLE;
1393 output_bom_f = TRUE;
1394 }else if(strcmp(codeset, "UTF-32") == 0 ||
1395 strcmp(codeset, "UTF-32BE") == 0){
1396 output_conv = w_oconv32;
1397 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1398 output_conv = w_oconv32;
1399 output_bom_f = TRUE;
1400 }else if(strcmp(codeset, "UTF-32LE") == 0){
1401 output_conv = w_oconv32;
1402 output_endian = ENDIAN_LITTLE;
1403 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1404 output_conv = w_oconv32;
1405 output_endian = ENDIAN_LITTLE;
1406 output_bom_f = TRUE;
1412 if (strcmp(long_option[i].name, "overwrite") == 0){
1415 preserve_time_f = TRUE;
1418 if (strcmp(long_option[i].name, "overwrite=") == 0){
1421 preserve_time_f = TRUE;
1423 backup_suffix = malloc(strlen((char *) p) + 1);
1424 strcpy(backup_suffix, (char *) p);
1427 if (strcmp(long_option[i].name, "in-place") == 0){
1430 preserve_time_f = FALSE;
1433 if (strcmp(long_option[i].name, "in-place=") == 0){
1436 preserve_time_f = FALSE;
1438 backup_suffix = malloc(strlen((char *) p) + 1);
1439 strcpy(backup_suffix, (char *) p);
1444 if (strcmp(long_option[i].name, "cap-input") == 0){
1448 if (strcmp(long_option[i].name, "url-input") == 0){
1453 #ifdef NUMCHAR_OPTION
1454 if (strcmp(long_option[i].name, "numchar-input") == 0){
1460 if (strcmp(long_option[i].name, "no-output") == 0){
1464 if (strcmp(long_option[i].name, "debug") == 0){
1469 if (strcmp(long_option[i].name, "cp932") == 0){
1470 #ifdef SHIFTJIS_CP932
1474 #ifdef UTF8_OUTPUT_ENABLE
1475 ms_ucs_map_f = UCS_MAP_CP932;
1479 if (strcmp(long_option[i].name, "no-cp932") == 0){
1480 #ifdef SHIFTJIS_CP932
1484 #ifdef UTF8_OUTPUT_ENABLE
1485 ms_ucs_map_f = UCS_MAP_ASCII;
1489 #ifdef SHIFTJIS_CP932
1490 if (strcmp(long_option[i].name, "cp932inv") == 0){
1497 if (strcmp(long_option[i].name, "x0212") == 0){
1504 if (strcmp(long_option[i].name, "exec-in") == 0){
1508 if (strcmp(long_option[i].name, "exec-out") == 0){
1513 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1514 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1515 no_cp932ext_f = TRUE;
1518 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1519 no_best_fit_chars_f = TRUE;
1522 if (strcmp(long_option[i].name, "fb-skip") == 0){
1523 encode_fallback = NULL;
1526 if (strcmp(long_option[i].name, "fb-html") == 0){
1527 encode_fallback = encode_fallback_html;
1530 if (strcmp(long_option[i].name, "fb-xml") == 0){
1531 encode_fallback = encode_fallback_xml;
1534 if (strcmp(long_option[i].name, "fb-java") == 0){
1535 encode_fallback = encode_fallback_java;
1538 if (strcmp(long_option[i].name, "fb-perl") == 0){
1539 encode_fallback = encode_fallback_perl;
1542 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1543 encode_fallback = encode_fallback_subchar;
1546 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1547 encode_fallback = encode_fallback_subchar;
1548 unicode_subchar = 0;
1550 /* decimal number */
1551 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1552 unicode_subchar *= 10;
1553 unicode_subchar += hex2bin(p[i]);
1555 }else if(p[1] == 'x' || p[1] == 'X'){
1556 /* hexadecimal number */
1557 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1558 unicode_subchar <<= 4;
1559 unicode_subchar |= hex2bin(p[i]);
1563 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1564 unicode_subchar *= 8;
1565 unicode_subchar += hex2bin(p[i]);
1568 w16e_conv(unicode_subchar, &i, &j);
1569 unicode_subchar = i<<8 | j;
1573 #ifdef UTF8_OUTPUT_ENABLE
1574 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1575 ms_ucs_map_f = UCS_MAP_MS;
1579 #ifdef UNICODE_NORMALIZATION
1580 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1581 input_f = UTF8_INPUT;
1586 if (strcmp(long_option[i].name, "prefix=") == 0){
1587 if (nkf_isgraph(p[0])){
1588 for (i = 1; nkf_isgraph(p[i]); i++){
1589 prefix_table[p[i]] = p[0];
1596 case 'b': /* buffered mode */
1599 case 'u': /* non bufferd mode */
1602 case 't': /* transparent mode */
1607 } else if (*cp=='2') {
1611 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1619 case 'j': /* JIS output */
1621 output_conv = j_oconv;
1623 case 'e': /* AT&T EUC output */
1624 output_conv = e_oconv;
1627 case 's': /* SJIS output */
1628 output_conv = s_oconv;
1630 case 'l': /* ISO8859 Latin-1 support, no conversion */
1631 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1632 input_f = LATIN1_INPUT;
1634 case 'i': /* Kanji IN ESC-$-@/B */
1635 if (*cp=='@'||*cp=='B')
1636 kanji_intro = *cp++;
1638 case 'o': /* ASCII IN ESC-(-J/B */
1639 if (*cp=='J'||*cp=='B'||*cp=='H')
1640 ascii_intro = *cp++;
1644 bit:1 katakana->hiragana
1645 bit:2 hiragana->katakana
1647 if ('9'>= *cp && *cp>='0')
1648 hira_f |= (*cp++ -'0');
1655 #if defined(MSDOS) || defined(__OS2__)
1670 #ifdef UTF8_OUTPUT_ENABLE
1671 case 'w': /* UTF-8 output */
1673 output_conv = w_oconv; cp++;
1677 output_bom_f = TRUE;
1680 if ('1'== cp[0] && '6'==cp[1]) {
1681 output_conv = w_oconv16; cp+=2;
1682 } else if ('3'== cp[0] && '2'==cp[1]) {
1683 output_conv = w_oconv32; cp+=2;
1685 output_conv = w_oconv;
1690 output_endian = ENDIAN_LITTLE;
1691 } else if (cp[0] == 'B') {
1699 output_bom_f = TRUE;
1704 #ifdef UTF8_INPUT_ENABLE
1705 case 'W': /* UTF input */
1708 input_f = UTF8_INPUT;
1710 if ('1'== cp[0] && '6'==cp[1]) {
1712 input_f = UTF16_INPUT;
1713 input_endian = ENDIAN_BIG;
1714 } else if ('3'== cp[0] && '2'==cp[1]) {
1716 input_f = UTF32_INPUT;
1717 input_endian = ENDIAN_BIG;
1719 input_f = UTF8_INPUT;
1724 input_endian = ENDIAN_LITTLE;
1725 } else if (cp[0] == 'B') {
1731 /* Input code assumption */
1732 case 'J': /* JIS input */
1733 input_f = JIS_INPUT;
1735 case 'E': /* AT&T EUC input */
1736 input_f = EUC_INPUT;
1738 case 'S': /* MS Kanji input */
1739 input_f = SJIS_INPUT;
1740 if (x0201_f==NO_X0201) x0201_f=TRUE;
1742 case 'Z': /* Convert X0208 alphabet to asii */
1744 bit:0 Convert JIS X 0208 Alphabet to ASCII
1745 bit:1 Convert Kankaku to one space
1746 bit:2 Convert Kankaku to two spaces
1747 bit:3 Convert HTML Entity
1748 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1750 while ('0'<= *cp && *cp <='9') {
1751 alpha_f |= 1 << (*cp++ - '0');
1753 if (!alpha_f) alpha_f = 1;
1755 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1756 x0201_f = FALSE; /* No X0201->X0208 conversion */
1758 ESC-(-I in JIS, EUC, MS Kanji
1759 SI/SO in JIS, EUC, MS Kanji
1760 SSO in EUC, JIS, not in MS Kanji
1761 MS Kanji (0xa0-0xdf)
1763 ESC-(-I in JIS (0x20-0x5f)
1764 SSO in EUC (0xa0-0xdf)
1765 0xa0-0xd in MS Kanji (0xa0-0xdf)
1768 case 'X': /* Assume X0201 kana */
1769 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1772 case 'F': /* prserve new lines */
1773 fold_preserve_f = TRUE;
1774 case 'f': /* folding -f60 or -f */
1777 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1779 fold_len += *cp++ - '0';
1781 if (!(0<fold_len && fold_len<BUFSIZ))
1782 fold_len = DEFAULT_FOLD;
1786 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1788 fold_margin += *cp++ - '0';
1792 case 'm': /* MIME support */
1793 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1794 if (*cp=='B'||*cp=='Q') {
1795 mime_decode_mode = *cp++;
1796 mimebuf_f = FIXED_MIME;
1797 } else if (*cp=='N') {
1798 mime_f = TRUE; cp++;
1799 } else if (*cp=='S') {
1800 mime_f = STRICT_MIME; cp++;
1801 } else if (*cp=='0') {
1802 mime_decode_f = FALSE;
1803 mime_f = FALSE; cp++;
1806 case 'M': /* MIME output */
1809 mimeout_f = FIXED_MIME; cp++;
1810 } else if (*cp=='Q') {
1812 mimeout_f = FIXED_MIME; cp++;
1817 case 'B': /* Broken JIS support */
1819 bit:1 allow any x on ESC-(-x or ESC-$-x
1820 bit:2 reset to ascii on NL
1822 if ('9'>= *cp && *cp>='0')
1823 broken_f |= 1<<(*cp++ -'0');
1828 case 'O':/* for Output file */
1832 case 'c':/* add cr code */
1835 case 'd':/* delete cr code */
1838 case 'I': /* ISO-2022-JP output */
1841 case 'L': /* line mode */
1842 if (*cp=='u') { /* unix */
1843 nlmode_f = LF; cp++;
1844 } else if (*cp=='m') { /* mac */
1845 nlmode_f = CR; cp++;
1846 } else if (*cp=='w') { /* windows */
1847 nlmode_f = CRLF; cp++;
1848 } else if (*cp=='0') { /* no conversion */
1858 /* multiple options in a single string are allowed for the Perl module */
1859 while(*cp && *cp++!='-');
1862 /* bogus option but ignored */
/*
 * find_inputcode_byfunc: starts at input_code_list and compares an entry's
 * iconv_func member with the converter passed in.
 * NOTE(review): this listing is line-sampled (the leading numbers are the
 * original file line numbers). The loop and the return statements are not
 * shown; presumably the matching entry is returned -- confirm in the full
 * file.
 */
1868 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1871 struct input_code *p = input_code_list;
1873 if (iconv_func == p->iconv_func){
/*
 * set_iconv: takes a flag f and an input converter iconv_func.
 * Visible behavior: when INPUT_CODE_FIX is defined, one condition also
 * requires f == -TRUE ("FORCE") or input_f to be unset. When estab_f is set
 * and iconv differs from iconv_for_check, the entry for iconv is looked up
 * with find_inputcode_byfunc(), its name is passed to set_input_codename(),
 * and iconv_for_check is set to iconv.
 * NOTE(review): line-sampled listing; the statements guarded by the first
 * condition are not shown.
 */
1882 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1884 #ifdef INPUT_CODE_FIX
1892 #ifdef INPUT_CODE_FIX
1893 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1899 if (estab_f && iconv_for_check != iconv){
1900 struct input_code *p = find_inputcode_byfunc(iconv);
1902 set_input_codename(p->name);
1905 iconv_for_check = iconv;
/*
 * Score bit flags and lookup tables for input-code detection.
 * Each macro is the previous one shifted left by one bit, starting from
 * SCORE_L2 == 1. SCORE_NO_EXIST is defined twice: once on top of
 * SCORE_CP932 and once on top of SCORE_DEPEND.
 * NOTE(review): line-sampled listing; the #else/#endif of the
 * SHIFTJIS_CP932 conditional and some table rows are not shown.
 */
1910 #define SCORE_L2 (1) /* JIS level-2 kanji */
1911 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1912 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1913 #ifdef SHIFTJIS_CP932
1914 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 */
1915 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1917 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1919 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1920 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1922 #define SCORE_INIT (SCORE_iMIME)
1924 static const char score_table_A0[] = {
1927 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1928 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1931 static const char score_table_F0[] = {
1932 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1933 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1934 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1935 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: ORs the given score bits into ptr->score.
 * NOTE(review): line-sampled listing; any guard around the assignment is
 * not shown.
 */
1938 void set_code_score(struct input_code *ptr, nkf_char score)
1941 ptr->score |= score;
/*
 * clr_code_score: clears the given score bits from ptr->score.
 * NOTE(review): line-sampled listing; any guard around the assignment is
 * not shown.
 */
1945 void clr_code_score(struct input_code *ptr, nkf_char score)
1948 ptr->score &= ~score;
/*
 * code_score: rates the character held in ptr->buf (buf[0] as c2 and, with
 * UTF8_OUTPUT_ENABLE, buf[1] as c1) and records a SCORE_* flag through
 * set_code_score():
 *   - SCORE_ERROR for a first case whose condition is not shown,
 *   - SCORE_KANA when c2 == SSO,
 *   - SCORE_NO_EXIST when e2w_conv(c2, c1) returns 0 (UTF8_OUTPUT_ENABLE),
 *   - score_table_A0[c2 & 0x0f] when (c2 & 0x70) == 0x20,
 *   - score_table_F0[c2 & 0x0f] when (c2 & 0x70) == 0x70,
 *   - SCORE_L2 when (c2 & 0x70) >= 0x50.
 * NOTE(review): line-sampled listing; omitted lines are not shown.
 */
1952 void code_score(struct input_code *ptr)
1954 nkf_char c2 = ptr->buf[0];
1955 #ifdef UTF8_OUTPUT_ENABLE
1956 nkf_char c1 = ptr->buf[1];
1959 set_code_score(ptr, SCORE_ERROR);
1960 }else if (c2 == SSO){
1961 set_code_score(ptr, SCORE_KANA);
1962 #ifdef UTF8_OUTPUT_ENABLE
1963 }else if (!e2w_conv(c2, c1)){
1964 set_code_score(ptr, SCORE_NO_EXIST);
1966 }else if ((c2 & 0x70) == 0x20){
1967 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1968 }else if ((c2 & 0x70) == 0x70){
1969 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1970 }else if ((c2 & 0x70) >= 0x50){
1971 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: if the active converter iconv is the one belonging to
 * this candidate, the selection is dropped with set_iconv(FALSE, 0).
 * NOTE(review): line-sampled listing; the statements before this check
 * (presumably marking the candidate as rejected) are not shown.
 */
1975 void status_disable(struct input_code *ptr)
1980 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: stores c at ptr->buf[ptr->index] and advances index.
 * No bounds check is visible here.
 */
1983 void status_push_ch(struct input_code *ptr, nkf_char c)
1985 ptr->buf[ptr->index++] = c;
/*
 * status_clear: NOTE(review): only the signature is visible in this
 * line-sampled listing; going by the name it presumably resets the
 * per-character state of ptr -- confirm in the full file.
 */
1988 void status_clear(struct input_code *ptr)
/*
 * status_reset: sets ptr->score back to SCORE_INIT.
 * NOTE(review): line-sampled listing; other statements are not shown.
 */
1994 void status_reset(struct input_code *ptr)
1997 ptr->score = SCORE_INIT;
/*
 * status_reinit: zeroes ptr->_file_stat.
 * NOTE(review): line-sampled listing; other statements are not shown.
 */
2000 void status_reinit(struct input_code *ptr)
2003 ptr->_file_stat = 0;
/*
 * status_check: acts only when c is at most DEL (7-bit) and an input code
 * is already established (estab_f).
 * NOTE(review): line-sampled listing; the guarded action is not shown.
 */
2006 void status_check(struct input_code *ptr, nkf_char c)
2008 if (c <= DEL && estab_f){
/*
 * s_status: one step of the Shift_JIS candidate checker for input byte c.
 * Visible branches:
 *   - 0xa1..0xdf: SSO and then c are pushed into ptr->buf,
 *   - 0x81..0x9f or 0xe0..0xef: c is pushed as a first byte,
 *   - is_ibmext_in_sjis(c) (SHIFTJIS_CP932) and x0212_f with 0xf0..0xfc:
 *     c is pushed as well,
 *   - otherwise the candidate is disabled.
 * For a following byte in 0x40..0x7e or 0x80..0xfc the pair is pushed and
 * converted in place with s2e_conv(); under SHIFTJIS_CP932 a zero return
 * from s2e_conv() records SCORE_CP932. Other bytes disable the candidate.
 * NOTE(review): line-sampled listing; the switch/case labels and the state
 * updates between these lines are not shown.
 */
2013 void s_status(struct input_code *ptr, nkf_char c)
2017 status_check(ptr, c);
2022 #ifdef NUMCHAR_OPTION
2023 }else if (is_unicode_capsule(c)){
2026 }else if (0xa1 <= c && c <= 0xdf){
2027 status_push_ch(ptr, SSO);
2028 status_push_ch(ptr, c);
2031 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2033 status_push_ch(ptr, c);
2034 #ifdef SHIFTJIS_CP932
2036 && is_ibmext_in_sjis(c)){
2038 status_push_ch(ptr, c);
2039 #endif /* SHIFTJIS_CP932 */
2041 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2043 status_push_ch(ptr, c);
2044 #endif /* X0212_ENABLE */
2046 status_disable(ptr);
2050 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2051 status_push_ch(ptr, c);
2052 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2056 status_disable(ptr);
2060 #ifdef SHIFTJIS_CP932
2061 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2062 status_push_ch(ptr, c);
2063 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2064 set_code_score(ptr, SCORE_CP932);
2069 #endif /* SHIFTJIS_CP932 */
2070 #ifndef X0212_ENABLE
2071 status_disable(ptr);
/*
 * e_status: one step of the EUC candidate checker for input byte c.
 * Visible branches: SSO or 0xa1..0xfe is pushed as a first byte; 0x8f is
 * pushed as well (the closing #endif names X0212_ENABLE); anything else
 * disables the candidate. Following bytes must lie in 0xa1..0xfe to be
 * pushed, otherwise the candidate is disabled.
 * NOTE(review): line-sampled listing; the switch/case labels and the state
 * updates between these lines are not shown.
 */
2077 void e_status(struct input_code *ptr, nkf_char c)
2081 status_check(ptr, c);
2086 #ifdef NUMCHAR_OPTION
2087 }else if (is_unicode_capsule(c)){
2090 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2092 status_push_ch(ptr, c);
2094 }else if (0x8f == c){
2096 status_push_ch(ptr, c);
2097 #endif /* X0212_ENABLE */
2099 status_disable(ptr);
2103 if (0xa1 <= c && c <= 0xfe){
2104 status_push_ch(ptr, c);
2108 status_disable(ptr);
2113 if (0xa1 <= c && c <= 0xfe){
2115 status_push_ch(ptr, c);
2117 status_disable(ptr);
2119 #endif /* X0212_ENABLE */
2123 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: one step of the UTF-8 candidate checker for input byte c.
 * Visible branches: first bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are
 * pushed into ptr->buf; other values disable the candidate. Following
 * bytes must lie in 0x80..0xbf. Once ptr->index exceeds ptr->stat the
 * buffered bytes are tested for the sequence EF BB BF (variable bom) and
 * converted in place with w2e_conv().
 * NOTE(review): line-sampled listing; the switch/case labels, the use of
 * the bom flag and the state updates are not shown.
 */
2124 void w_status(struct input_code *ptr, nkf_char c)
2128 status_check(ptr, c);
2133 #ifdef NUMCHAR_OPTION
2134 }else if (is_unicode_capsule(c)){
2137 }else if (0xc0 <= c && c <= 0xdf){
2139 status_push_ch(ptr, c);
2140 }else if (0xe0 <= c && c <= 0xef){
2142 status_push_ch(ptr, c);
2143 }else if (0xf0 <= c && c <= 0xf4){
2145 status_push_ch(ptr, c);
2147 status_disable(ptr);
2152 if (0x80 <= c && c <= 0xbf){
2153 status_push_ch(ptr, c);
2154 if (ptr->index > ptr->stat){
2155 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2156 && ptr->buf[2] == 0xbf);
2157 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2158 &ptr->buf[0], &ptr->buf[1]);
2165 status_disable(ptr);
2169 if (0x80 <= c && c <= 0xbf){
2170 if (ptr->index < ptr->stat){
2171 status_push_ch(ptr, c);
2176 status_disable(ptr);
/*
 * code_status: feeds byte c to the status_func of entries in
 * input_code_list. If a result candidate was chosen and no input code is
 * established yet (estab_f unset), its iconv_func is installed with
 * set_iconv(TRUE, ...). Otherwise, when c <= DEL, the list is walked again
 * from input_code_list.
 * NOTE(review): line-sampled listing; the loop structure, the use of
 * action_flag and the body of the second walk are not shown.
 */
2183 void code_status(nkf_char c)
2185 int action_flag = 1;
2186 struct input_code *result = 0;
2187 struct input_code *p = input_code_list;
2189 if (!p->status_func) {
2193 if (!p->status_func)
2195 (p->status_func)(p, c);
2198 }else if(p->stat == 0){
2209 if (result && !estab_f){
2210 set_iconv(TRUE, result->iconv_func);
2211 }else if (c <= DEL){
2212 struct input_code *ptr = input_code_list;
/*
 * std_getc: returns the most recently pushed-back character by popping
 * std_gc_buf (std_gc_ndx is decremented first).
 * NOTE(review): line-sampled listing; the condition guarding the pop and
 * the path that reads from f are not shown.
 */
2222 nkf_char std_getc(FILE *f)
2225 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: pushes c onto std_gc_buf and advances std_gc_ndx. The
 * buffer is treated as full when std_gc_ndx == STD_GC_BUFSIZE.
 * NOTE(review): line-sampled listing; the action taken when the buffer is
 * full and the return statements are not shown.
 */
2231 nkf_char std_ungetc(nkf_char c, FILE *f)
2233 if (std_gc_ndx == STD_GC_BUFSIZE){
2236 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc: NOTE(review): only the signature is visible in this
 * line-sampled listing; presumably writes c to the output -- confirm in
 * the full file.
 */
2241 void std_putc(nkf_char c)
2248 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: sets up the input/output chain with module_connection() and
 * then reads characters from f through i_getc until EOF.
 * NOTE(review): line-sampled listing; the loop body and the return value
 * are not shown.
 */
2249 nkf_char noconvert(FILE *f)
2254 module_connection();
2255 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection: wires up the output and input filter chains.
 * Output side: oconv starts as output_conv; each enabled stage saves the
 * current oconv in its own o_* pointer and installs itself as oconv
 * (base64_conv, nl_conv, rot_conv, iso2022jp_check_conv, hira_conv,
 * fold_conv, z_conv).
 * Input side: i_ungetc starts as std_ungetc; each enabled stage saves the
 * current i_getc/i_ungetc pair and installs its own (cap, url, numchar,
 * nfc, mime, broken).
 * Finally the input converter is forced with set_iconv(-TRUE, ...) based
 * on input_f, or set with set_iconv(FALSE, e_iconv) otherwise.
 * NOTE(review): line-sampled listing; most of the enabling conditions and
 * the initial i_getc assignment are not shown.
 */
2262 void module_connection(void)
2264 oconv = output_conv;
2267 /* replace continuation module, from output side */
2269 /* output redirection */
2271 if (noout_f || guess_f){
2278 if (mimeout_f == TRUE) {
2279 o_base64conv = oconv; oconv = base64_conv;
2281 /* base64_count = 0; */
2285 o_nlconv = oconv; oconv = nl_conv;
2288 o_rot_conv = oconv; oconv = rot_conv;
2291 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2294 o_hira_conv = oconv; oconv = hira_conv;
2297 o_fconv = oconv; oconv = fold_conv;
2300 if (alpha_f || x0201_f) {
2301 o_zconv = oconv; oconv = z_conv;
2305 i_ungetc = std_ungetc;
2306 /* input redirection */
2309 i_cgetc = i_getc; i_getc = cap_getc;
2310 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2313 i_ugetc = i_getc; i_getc = url_getc;
2314 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2317 #ifdef NUMCHAR_OPTION
2319 i_ngetc = i_getc; i_getc = numchar_getc;
2320 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2323 #ifdef UNICODE_NORMALIZATION
2324 if (nfc_f && input_f == UTF8_INPUT){
2325 i_nfc_getc = i_getc; i_getc = nfc_getc;
2326 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2329 if (mime_f && mimebuf_f==FIXED_MIME) {
2330 i_mgetc = i_getc; i_getc = mime_getc;
2331 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2334 i_bgetc = i_getc; i_getc = broken_getc;
2335 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2337 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2338 set_iconv(-TRUE, e_iconv);
2339 } else if (input_f == SJIS_INPUT) {
2340 set_iconv(-TRUE, s_iconv);
2341 #ifdef UTF8_INPUT_ENABLE
2342 } else if (input_f == UTF8_INPUT) {
2343 set_iconv(-TRUE, w_iconv);
2344 } else if (input_f == UTF16_INPUT) {
2345 set_iconv(-TRUE, w_iconv16);
2346 } else if (input_f == UTF32_INPUT) {
2347 set_iconv(-TRUE, w_iconv32);
2350 set_iconv(FALSE, e_iconv);
2354 struct input_code *p = input_code_list;
2362 * Check and Ignore BOM
/*
 * check_bom: peeks at the first bytes of f through i_getc and recognizes a
 * byte order mark. Visible patterns:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * On a match set_iconv(TRUE, ...) is called and input_endian is updated
 * when the matching converter is the active iconv. Bytes read while
 * probing are returned with i_ungetc in reverse order.
 * NOTE(review): line-sampled listing; the case labels, the condition
 * guarding each set_iconv call and the early returns are not shown.
 */
2364 void check_bom(FILE *f)
2367 switch(c2 = (*i_getc)(f)){
2369 if((c2 = (*i_getc)(f)) == 0x00){
2370 if((c2 = (*i_getc)(f)) == 0xFE){
2371 if((c2 = (*i_getc)(f)) == 0xFF){
2373 set_iconv(TRUE, w_iconv32);
2375 if (iconv == w_iconv32) {
2376 input_endian = ENDIAN_BIG;
2379 (*i_ungetc)(0xFF,f);
2380 }else (*i_ungetc)(c2,f);
2381 (*i_ungetc)(0xFE,f);
2382 }else if(c2 == 0xFF){
2383 if((c2 = (*i_getc)(f)) == 0xFE){
2385 set_iconv(TRUE, w_iconv32);
2387 if (iconv == w_iconv32) {
2388 input_endian = ENDIAN_2143;
2391 (*i_ungetc)(0xFF,f);
2392 }else (*i_ungetc)(c2,f);
2393 (*i_ungetc)(0xFF,f);
2394 }else (*i_ungetc)(c2,f);
2395 (*i_ungetc)(0x00,f);
2396 }else (*i_ungetc)(c2,f);
2397 (*i_ungetc)(0x00,f);
2400 if((c2 = (*i_getc)(f)) == 0xBB){
2401 if((c2 = (*i_getc)(f)) == 0xBF){
2403 set_iconv(TRUE, w_iconv);
2405 if (iconv == w_iconv) {
2408 (*i_ungetc)(0xBF,f);
2409 }else (*i_ungetc)(c2,f);
2410 (*i_ungetc)(0xBB,f);
2411 }else (*i_ungetc)(c2,f);
2412 (*i_ungetc)(0xEF,f);
2415 if((c2 = (*i_getc)(f)) == 0xFF){
2416 if((c2 = (*i_getc)(f)) == 0x00){
2417 if((c2 = (*i_getc)(f)) == 0x00){
2419 set_iconv(TRUE, w_iconv32);
2421 if (iconv == w_iconv32) {
2422 input_endian = ENDIAN_3412;
2425 (*i_ungetc)(0x00,f);
2426 }else (*i_ungetc)(c2,f);
2427 (*i_ungetc)(0x00,f);
2428 }else (*i_ungetc)(c2,f);
2430 set_iconv(TRUE, w_iconv16);
2432 if (iconv == w_iconv16) {
2433 input_endian = ENDIAN_BIG;
2436 (*i_ungetc)(0xFF,f);
2437 }else (*i_ungetc)(c2,f);
2438 (*i_ungetc)(0xFE,f);
2441 if((c2 = (*i_getc)(f)) == 0xFE){
2442 if((c2 = (*i_getc)(f)) == 0x00){
2443 if((c2 = (*i_getc)(f)) == 0x00){
2445 set_iconv(TRUE, w_iconv32);
2447 if (iconv == w_iconv32) {
2448 input_endian = ENDIAN_LITTLE;
2451 (*i_ungetc)(0x00,f);
2452 }else (*i_ungetc)(c2,f);
2453 (*i_ungetc)(0x00,f);
2454 }else (*i_ungetc)(c2,f);
2456 set_iconv(TRUE, w_iconv16);
2458 if (iconv == w_iconv16) {
2459 input_endian = ENDIAN_LITTLE;
2462 (*i_ungetc)(0xFE,f);
2463 }else (*i_ungetc)(c2,f);
2464 (*i_ungetc)(0xFF,f);
2473 Conversion main loop. Code detection only.
/*
 * kanji_convert: main conversion loop for one input stream f.
 * After module_connection() it reads bytes through i_getc until EOF. The
 * local macros NEXT, SEND and LAST stand for continue, an empty statement
 * and break. Inside the loop the code
 *   - hands 8-bit data to h_conv() while the input code is not established,
 *   - assembles UTF-16 / UTF-32 units according to input_endian,
 *   - interprets ESC sequences, SI/SO, MIME starts and line ends,
 *   - passes completed characters to (*iconv)() or (*oconv)().
 * After the loop (*iconv)(EOF, 0, 0) is called and, if input_codename is
 * still unset, the input_code_list entry with the lowest score is reported
 * through set_input_codename().
 * NOTE(review): line-sampled listing; many braces, labels, comment openers
 * and statements are not shown, so the exact nesting cannot be read off
 * these lines.
 */
2476 nkf_char kanji_convert(FILE *f)
2478 nkf_char c3, c2=0, c1, c0=0;
2479 int is_8bit = FALSE;
2481 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2482 #ifdef UTF8_INPUT_ENABLE
2483 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2490 output_mode = ASCII;
2493 #define NEXT continue /* no output, get next */
2494 #define SEND ; /* output c1 and c2, get next */
2495 #define LAST break /* end of loop, go closing */
2497 module_connection();
2500 while ((c1 = (*i_getc)(f)) != EOF) {
2501 #ifdef INPUT_CODE_FIX
2507 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2508 /* in case of 8th bit is on */
2509 if (!estab_f&&!mime_decode_mode) {
2510 /* in case of not established yet */
2511 /* It is still ambiguous */
2512 if (h_conv(f, c2, c1)==EOF)
2518 /* in case of already established */
2520 /* ignore bogus code and not CP5022x UCD */
2528 /* second byte, 7 bit code */
2529 /* it might be kanji shifted */
2530 if ((c1 == DEL) || (c1 <= SP)) {
2531 /* ignore bogus first code */
2538 #ifdef UTF8_INPUT_ENABLE
2539 if (iconv == w_iconv16) {
2540 if (input_endian == ENDIAN_BIG) {
2542 if ((c1 = (*i_getc)(f)) != EOF) {
2543 if (0xD8 <= c2 && c2 <= 0xDB) {
2544 if ((c0 = (*i_getc)(f)) != EOF) {
2546 if ((c3 = (*i_getc)(f)) != EOF) {
2553 if ((c2 = (*i_getc)(f)) != EOF) {
2554 if (0xD8 <= c2 && c2 <= 0xDB) {
2555 if ((c3 = (*i_getc)(f)) != EOF) {
2556 if ((c0 = (*i_getc)(f)) != EOF) {
2565 } else if(iconv == w_iconv32){
2567 if((c2 = (*i_getc)(f)) != EOF &&
2568 (c1 = (*i_getc)(f)) != EOF &&
2569 (c0 = (*i_getc)(f)) != EOF){
2570 switch(input_endian){
2572 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2575 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2578 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2581 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2591 #ifdef NUMCHAR_OPTION
2592 if (is_unicode_capsule(c1)){
2596 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2598 if (!estab_f && !iso8859_f) {
2599 /* not established yet */
2602 } else { /* estab_f==TRUE */
2607 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2608 /* SJIS X0201 Case... */
2609 if(iso2022jp_f && x0201_f==NO_X0201) {
2610 (*oconv)(GETA1, GETA2);
2617 } else if (c1==SSO && iconv != s_iconv) {
2618 /* EUC X0201 Case */
2619 c1 = (*i_getc)(f); /* skip SSO */
2621 if (SSP<=c1 && c1<0xe0) {
2622 if(iso2022jp_f && x0201_f==NO_X0201) {
2623 (*oconv)(GETA1, GETA2);
2630 } else { /* bogus code, skip SSO and one byte */
2633 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2634 (c1 == 0xFD || c1 == 0xFE)) {
2640 /* already established */
2645 } else if ((c1 > SP) && (c1 != DEL)) {
2646 /* in case of Roman characters */
2648 /* output 1 shifted byte */
2652 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2653 /* output 1 shifted byte */
2654 if(iso2022jp_f && x0201_f==NO_X0201) {
2655 (*oconv)(GETA1, GETA2);
2662 /* look like bogus code */
2665 } else if (input_mode == X0208 || input_mode == X0212 ||
2666 input_mode == X0213_1 || input_mode == X0213_2) {
2667 /* in case of Kanji shifted */
2670 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2671 /* Check MIME code */
2672 if ((c1 = (*i_getc)(f)) == EOF) {
2675 } else if (c1 == '?') {
2676 /* =? is mime conversion start sequence */
2677 if(mime_f == STRICT_MIME) {
2678 /* check in real detail */
2679 if (mime_begin_strict(f) == EOF)
2683 } else if (mime_begin(f) == EOF)
2693 /* normal ASCII code */
2696 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2699 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2702 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2703 if ((c1 = (*i_getc)(f)) == EOF) {
2704 /* (*oconv)(0, ESC); don't send bogus code */
2706 } else if (c1 == '$') {
2707 if ((c1 = (*i_getc)(f)) == EOF) {
2709 (*oconv)(0, ESC); don't send bogus code
2710 (*oconv)(0, '$'); */
2712 } else if (c1 == '@'|| c1 == 'B') {
2713 /* This is kanji introduction */
2716 set_input_codename("ISO-2022-JP");
2718 debug("ISO-2022-JP");
2721 } else if (c1 == '(') {
2722 if ((c1 = (*i_getc)(f)) == EOF) {
2723 /* don't send bogus code
2729 } else if (c1 == '@'|| c1 == 'B') {
2730 /* This is kanji introduction */
2735 } else if (c1 == 'D'){
2739 #endif /* X0212_ENABLE */
2740 } else if (c1 == (X0213_1&0x7F)){
2741 input_mode = X0213_1;
2744 } else if (c1 == (X0213_2&0x7F)){
2745 input_mode = X0213_2;
2749 /* could be some special code */
2756 } else if (broken_f&0x2) {
2757 /* accept any ESC-(-x as broken code ... */
2767 } else if (c1 == '(') {
2768 if ((c1 = (*i_getc)(f)) == EOF) {
2769 /* don't send bogus code
2771 (*oconv)(0, '('); */
2775 /* This is X0201 kana introduction */
2776 input_mode = X0201; shift_mode = X0201;
2778 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2779 /* This is X0208 kanji introduction */
2780 input_mode = ASCII; shift_mode = FALSE;
2782 } else if (broken_f&0x2) {
2783 input_mode = ASCII; shift_mode = FALSE;
2788 /* maintain various input_mode here */
2792 } else if ( c1 == 'N' || c1 == 'n'){
2794 c3 = (*i_getc)(f); /* skip SS2 */
2795 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2810 } else if (c1 == ESC && iconv == s_iconv) {
2811 /* ESC in Shift_JIS */
2812 if ((c1 = (*i_getc)(f)) == EOF) {
2813 /* (*oconv)(0, ESC); don't send bogus code */
2815 } else if (c1 == '$') {
2817 if ((c1 = (*i_getc)(f)) == EOF) {
2819 (*oconv)(0, ESC); don't send bogus code
2820 (*oconv)(0, '$'); */
2823 if (('E' <= c1 && c1 <= 'G') ||
2824 ('O' <= c1 && c1 <= 'Q')) {
2832 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2833 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2834 while ((c1 = (*i_getc)(f)) != EOF) {
2835 if (SP <= c1 && c1 <= 'z') {
2836 (*oconv)(0, c1 + c0);
2837 } else break; /* c1 == SO */
2841 if (c1 == EOF) LAST;
2848 } else if (c1 == LF || c1 == CR) {
2850 input_mode = ASCII; set_iconv(FALSE, 0);
2852 } else if (mime_decode_f && !mime_decode_mode){
2854 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2862 } else { /* if (c1 == CR)*/
2863 if ((c1=(*i_getc)(f))!=EOF) {
2867 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2882 if (prev_cr && c1 == LF) nlmode_f = CRLF;
2885 } else if (c1 == DEL && input_mode == X0208) {
2895 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2898 if ((c0 = (*i_getc)(f)) != EOF) {
2901 if ((c3 = (*i_getc)(f)) != EOF) {
2903 (*iconv)(c2, c1, c0|c3);
2908 /* 3 bytes EUC or UTF-8 */
2909 if ((c0 = (*i_getc)(f)) != EOF) {
2911 (*iconv)(c2, c1, c0);
2919 0x7F <= c2 && c2 <= 0x92 &&
2920 0x21 <= c1 && c1 <= 0x7E) {
2922 if(c1 == 0x7F) return 0;
2923 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2926 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2930 (*oconv)(PREFIX_EUCG3 | c2, c1);
2932 #endif /* X0212_ENABLE */
2934 (*oconv)(PREFIX_EUCG3 | c2, c1);
2937 (*oconv)(input_mode, c1); /* other special case */
2943 /* goto next_word */
2947 (*iconv)(EOF, 0, 0);
2948 if (!input_codename)
2951 struct input_code *p = input_code_list;
2952 struct input_code *result = p;
2954 if (p->score < result->score) result = p;
2957 set_input_codename(result->name);
2959 debug(result->name);
/*
 * h_conv: handles input whose encoding is still undecided. Starting from
 * the pair (c2, c1) it keeps reading bytes through i_getc and stores them
 * with push_hold_buf() until EOF, until estab_f becomes set, or until the
 * hold buffer is full. If needed, the input_code_list entry that has a
 * status_func and the lowest score is installed with set_iconv(TRUE, ...).
 * The held bytes are then replayed from hold_buf through (*iconv)(); extra
 * bytes for 3- and 4-byte sequences come from hold_buf first and from
 * i_getc once the buffer is exhausted.
 * NOTE(review): line-sampled listing; the return type line, the return
 * statements and several conditions are not shown.
 */
2967 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2969 nkf_char ret, c3, c0;
2973 /** it must NOT be in the kanji shift sequence */
2974 /** it must NOT be written in JIS7 */
2975 /** and it must be after 2 byte 8bit code */
2981 while ((c1 = (*i_getc)(f)) != EOF) {
2987 if (push_hold_buf(c1) == EOF || estab_f){
2993 struct input_code *p = input_code_list;
2994 struct input_code *result = p;
2999 if (p->status_func && p->score < result->score){
3004 set_iconv(TRUE, result->iconv_func);
3009 ** 1) EOF is detected, or
3010 ** 2) Code is established, or
3011 ** 3) Buffer is FULL (but last word is pushed)
3013 ** in 1) and 3) cases, we continue to use
3014 ** Kanji codes by oconv and leave estab_f unchanged.
3019 while (hold_index < hold_count){
3020 c2 = hold_buf[hold_index++];
3022 #ifdef NUMCHAR_OPTION
3023 || is_unicode_capsule(c2)
3028 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3029 (*iconv)(X0201, c2, 0);
3032 if (hold_index < hold_count){
3033 c1 = hold_buf[hold_index++];
3043 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3046 if (hold_index < hold_count){
3047 c0 = hold_buf[hold_index++];
3048 } else if ((c0 = (*i_getc)(f)) == EOF) {
3054 if (hold_index < hold_count){
3055 c3 = hold_buf[hold_index++];
3056 } else if ((c3 = (*i_getc)(f)) == EOF) {
3061 (*iconv)(c2, c1, c0|c3);
3066 /* 3 bytes EUC or UTF-8 */
3067 if (hold_index < hold_count){
3068 c0 = hold_buf[hold_index++];
3069 } else if ((c0 = (*i_getc)(f)) == EOF) {
3075 (*iconv)(c2, c1, c0);
3078 if (c0 == EOF) break;
/*
 * push_hold_buf: appends c2 (truncated to unsigned char) to hold_buf.
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new hold_count.
 * NOTE(review): line-sampled listing; the statement executed when the
 * buffer is already full on entry is not shown (presumably an EOF return).
 */
3083 nkf_char push_hold_buf(nkf_char c2)
3085 if (hold_count >= HOLD_SIZE*2)
3087 hold_buf[hold_count++] = (unsigned char)c2;
3088 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: converts a Shift_JIS byte pair (c2 first byte, c1 second byte)
 * to the internal two-byte form.
 * Visible steps:
 *   - SHIFTJIS_CP932: IBM extension first bytes are looked up in
 *     shiftjis_cp932 (unless cp932inv_f); a second lookup uses cp932inv
 *     for first bytes in CP932INV_TABLE_BEGIN..CP932INV_TABLE_END,
 *   - without x0213_f, IBM extension first bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3,
 *   - with x0213_f and c2 >= 0xF0, the row is derived from
 *     shift_jisx0213_s1a3_table or from c2 * 2 - 0x17B, tagged with
 *     PREFIX_EUCG3,
 *   - otherwise c2 is doubled and offset by SJ0162 or SJ6394, plus one
 *     when c1 > 0x9E; c1 is reduced by SP or 0x1F.
 * NOTE(review): line-sampled listing; the stores through p2/p1 and the
 * return values are not shown -- presumably the results are written there.
 */
3091 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3093 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3096 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3097 #ifdef SHIFTJIS_CP932
3098 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3099 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3106 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3107 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3113 #endif /* SHIFTJIS_CP932 */
3115 if (!x0213_f && is_ibmext_in_sjis(c2)){
3116 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3119 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3132 if(x0213_f && c2 >= 0xF0){
3133 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3134 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3135 }else{ /* 78<=k<=94 */
3136 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3137 if (0x9E < c1) c2++;
3140 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3141 if (0x9E < c1) c2++;
3144 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3151 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter for Shift_JIS.
 * Visible cases: EOF, 0 or control values of c2 get their own branch;
 * without x0213_f, first bytes 0xF0..0xF9 with second bytes 0x40..0xFC are
 * mapped linearly (188 cells per row) to 0xE000 + CLASS_UNICODE, with
 * c1 == 0x7F rejected by returning 0; everything else goes through
 * s2e_conv(), whose non-zero result is returned unchanged.
 * NOTE(review): line-sampled listing; the first branch, the calls to the
 * output converter and the final return are not shown.
 */
3158 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3162 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3164 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3166 if(c1 == 0x7F) return 0;
3167 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3170 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3171 if (ret) return ret;
/*
 * e_iconv: input converter for EUC.
 * Visible cases:
 *   - c2 == 0x8f (three-byte form): without cp51932_f and x0213_f, c1 in
 *     0xF5..0xFE and c0 in 0xA1..0xFE are mapped linearly (94 cells per
 *     row) to 0xE3AC + CLASS_UNICODE; otherwise c2 is combined with c1,
 *     and under SHIFTJIS_CP932 the pair is round-tripped through
 *     e2s_conv() and s2e_conv(),
 *   - c2 == SSO and EOF / 0 / control values have their own branches,
 *   - two-byte form: without cp51932_f but with ms_ucs_map_f, c2 in
 *     0xF5..0xFE and c1 in 0xA1..0xFE are mapped to 0xE000 +
 *     CLASS_UNICODE; with cp51932_f, rows 0x79..0x7c are round-tripped
 *     through e2s_conv() and s2e_conv().
 * NOTE(review): line-sampled listing; masking of the high bits, the calls
 * to the output converter and the return values are not shown.
 */
3177 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3182 }else if (c2 == 0x8f){
3186 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3187 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3188 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3191 c2 = (c2 << 8) | (c1 & 0x7f);
3193 #ifdef SHIFTJIS_CP932
3196 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3197 s2e_conv(s2, s1, &c2, &c1);
3204 #endif /* SHIFTJIS_CP932 */
3206 #endif /* X0212_ENABLE */
3207 } else if (c2 == SSO){
3210 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3213 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3214 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3215 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3220 #ifdef SHIFTJIS_CP932
3221 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3223 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3224 s2e_conv(s2, s1, &c2, &c1);
3231 #endif /* SHIFTJIS_CP932 */
3238 #ifdef UTF8_INPUT_ENABLE
3239 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3246 }else if (0xc0 <= c2 && c2 <= 0xef) {
3247 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3248 #ifdef NUMCHAR_OPTION
3251 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3259 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3262 static const char w_iconv_utf8_1st_byte[] =
3264 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3265 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3266 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3267 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3269 if (c2 < 0 || 0xff < c2) {
3270 }else if (c2 == 0) { /* 0 : 1 byte*/
3272 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3275 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3277 if (c1 < 0x80 || 0xBF < c1) return 0;
3280 if (c0 == 0) return -1;
3281 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3286 if (c0 == 0) return -1;
3287 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3291 if (c0 == 0) return -1;
3292 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3296 if (c0 == 0) return -2;
3297 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3301 if (c0 == 0) return -2;
3302 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3306 if (c0 == 0) return -2;
3307 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3315 if (c2 == 0 || c2 == EOF){
3316 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3317 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3320 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3329 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3330 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3337 }else if (val < 0x800){
3338 *p2 = 0xc0 | (val >> 6);
3339 *p1 = 0x80 | (val & 0x3f);
3341 } else if (val <= NKF_INT32_C(0xFFFF)) {
3342 *p2 = 0xe0 | (val >> 12);
3343 *p1 = 0x80 | ((val >> 6) & 0x3f);
3344 *p0 = 0x80 | (val & 0x3f);
3345 } else if (val <= NKF_INT32_C(0x10FFFF)) { /* supplementary planes: 4-byte UTF-8 */
3346 *p2 = 0xf0 | (val >> 18); /* lead byte 11110xxx carries bits 18-20 (was 0xe0 | (val >> 16), inconsistent with w_oconv and ww16_conv) */
3347 *p1 = 0x80 | ((val >> 12) & 0x3f); /* 2nd byte: bits 12-17 */
3348 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f); /* 3rd byte packed in bits 8-15, 4th byte in bits 0-7 */
3357 #ifdef UTF8_INPUT_ENABLE
3358 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3363 } else if (c2 >= 0xf0){
3364 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3365 val = (c2 & 0x0f) << 18;
3366 val |= (c1 & 0x3f) << 12;
3367 val |= (c0 & 0x3f00) >> 2;
3369 }else if (c2 >= 0xe0){
3370 val = (c2 & 0x0f) << 12;
3371 val |= (c1 & 0x3f) << 6;
3373 }else if (c2 >= 0xc0){
3374 val = (c2 & 0x1f) << 6;
3382 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3384 nkf_char c2, c1, c0;
3391 w16w_conv(val, &c2, &c1, &c0);
3392 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3393 #ifdef NUMCHAR_OPTION
3396 *p1 = CLASS_UNICODE | val;
3405 #ifdef UTF8_INPUT_ENABLE
3406 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3409 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3412 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3413 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3415 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3417 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3422 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3423 if (ret) return ret;
3428 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3432 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3433 } else if (is_unicode_bmp(c1)) {
3434 ret = w16e_conv(c1, &c2, &c1);
3437 c1 = CLASS_UNICODE | c1;
3439 if (ret) return ret;
3444 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3446 const unsigned short *const *pp;
3447 const unsigned short *const *const *ppp;
3448 static const char no_best_fit_chars_table_C2[] =
3449 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3450 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3451 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3452 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3453 static const char no_best_fit_chars_table_C2_ms[] =
3454 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3455 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3456 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3457 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3458 static const char no_best_fit_chars_table_932_C2[] =
3459 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3460 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3461 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3462 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3463 static const char no_best_fit_chars_table_932_C3[] =
3464 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3465 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3466 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3467 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3473 }else if(c2 < 0xe0){
3474 if(no_best_fit_chars_f){
3475 if(ms_ucs_map_f == UCS_MAP_CP932){
3478 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3481 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3484 }else if(!cp932inv_f){
3487 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3490 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3493 }else if(ms_ucs_map_f == UCS_MAP_MS){
3494 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3495 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3513 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3514 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3515 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3517 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3518 }else if(c0 < 0xF0){
3519 if(no_best_fit_chars_f){
3520 if(ms_ucs_map_f == UCS_MAP_CP932){
3521 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3522 }else if(ms_ucs_map_f == UCS_MAP_MS){
3527 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3530 if(c0 == 0x92) return 1;
3535 if(c1 == 0x80 || c0 == 0x9C) return 1;
3538 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3543 if(c0 == 0x94) return 1;
3546 if(c0 == 0xBB) return 1;
3556 if(c0 == 0x95) return 1;
3559 if(c0 == 0xA5) return 1;
3566 if(c0 == 0x8D) return 1;
3569 if(c0 == 0x9E && !cp932inv_f) return 1;
3572 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3580 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3581 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3582 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3584 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3586 #ifdef SHIFTJIS_CP932
3587 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3589 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3590 s2e_conv(s2, s1, p2, p1);
3599 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3602 const unsigned short *p;
3605 if (pp == 0) return 1;
3608 if (c1 < 0 || psize <= c1) return 1;
3610 if (p == 0) return 1;
3613 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3615 if (val == 0) return 1;
3616 if (no_cp932ext_f && (
3617 (val>>8) == 0x2D || /* NEC special characters */
3618 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3626 if (c2 == SO) c2 = X0201;
3633 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3640 (*f)(0, bin2hex(c>>shift));
3650 void encode_fallback_html(nkf_char c)
3655 if(c >= NKF_INT32_C(1000000))
3656 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3657 if(c >= NKF_INT32_C(100000))
3658 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3660 (*oconv)(0, 0x30+(c/10000 )%10);
3662 (*oconv)(0, 0x30+(c/1000 )%10);
3664 (*oconv)(0, 0x30+(c/100 )%10);
3666 (*oconv)(0, 0x30+(c/10 )%10);
3668 (*oconv)(0, 0x30+ c %10);
3673 void encode_fallback_xml(nkf_char c)
3678 nkf_each_char_to_hex(oconv, c);
3683 void encode_fallback_java(nkf_char c)
3687 if(!is_unicode_bmp(c)){
3691 (*oconv)(0, bin2hex(c>>20));
3692 (*oconv)(0, bin2hex(c>>16));
3696 (*oconv)(0, bin2hex(c>>12));
3697 (*oconv)(0, bin2hex(c>> 8));
3698 (*oconv)(0, bin2hex(c>> 4));
3699 (*oconv)(0, bin2hex(c ));
3703 void encode_fallback_perl(nkf_char c)
3708 nkf_each_char_to_hex(oconv, c);
3713 void encode_fallback_subchar(nkf_char c)
3715 c = unicode_subchar;
3716 (*oconv)((c>>8)&0xFF, c&0xFF);
3721 #ifdef UTF8_OUTPUT_ENABLE
3722 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3724 const unsigned short *p;
3727 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3735 p = euc_to_utf8_1byte;
3737 } else if (is_eucg3(c2)){
3738 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3741 c2 = (c2&0x7f) - 0x21;
3742 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3743 p = x0212_to_utf8_2bytes[c2];
3749 c2 = (c2&0x7f) - 0x21;
3750 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3752 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3753 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3754 euc_to_utf8_2bytes_ms[c2];
3759 c1 = (c1 & 0x7f) - 0x21;
3760 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3765 void w_oconv(nkf_char c2, nkf_char c1)
3771 output_bom_f = FALSE;
3782 #ifdef NUMCHAR_OPTION
3783 if (c2 == 0 && is_unicode_capsule(c1)){
3784 val = c1 & VALUE_MASK;
3787 }else if (val < 0x800){
3788 (*o_putc)(0xC0 | (val >> 6));
3789 (*o_putc)(0x80 | (val & 0x3f));
3790 } else if (val <= NKF_INT32_C(0xFFFF)) {
3791 (*o_putc)(0xE0 | (val >> 12));
3792 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3793 (*o_putc)(0x80 | (val & 0x3f));
3794 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3795 (*o_putc)(0xF0 | ( val>>18));
3796 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3797 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3798 (*o_putc)(0x80 | ( val & 0x3f));
3805 output_mode = ASCII;
3807 } else if (c2 == ISO8859_1) {
3808 output_mode = ISO8859_1;
3809 (*o_putc)(c1 | 0x080);
3812 val = e2w_conv(c2, c1);
3814 w16w_conv(val, &c2, &c1, &c0);
3818 if (c0) (*o_putc)(c0);
3824 void w_oconv16(nkf_char c2, nkf_char c1)
3827 output_bom_f = FALSE;
3828 if (output_endian == ENDIAN_LITTLE){
3829 (*o_putc)((unsigned char)'\377');
3833 (*o_putc)((unsigned char)'\377');
3842 if (c2 == ISO8859_1) {
3845 #ifdef NUMCHAR_OPTION
3846 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3847 if (is_unicode_bmp(c1)) {
3848 c2 = (c1 >> 8) & 0xff;
3852 if (c1 <= UNICODE_MAX) {
3853 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3854 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3855 if (output_endian == ENDIAN_LITTLE){
3856 (*o_putc)(c2 & 0xff);
3857 (*o_putc)((c2 >> 8) & 0xff);
3858 (*o_putc)(c1 & 0xff);
3859 (*o_putc)((c1 >> 8) & 0xff);
3861 (*o_putc)((c2 >> 8) & 0xff);
3862 (*o_putc)(c2 & 0xff);
3863 (*o_putc)((c1 >> 8) & 0xff);
3864 (*o_putc)(c1 & 0xff);
3871 nkf_char val = e2w_conv(c2, c1);
3872 c2 = (val >> 8) & 0xff;
3876 if (output_endian == ENDIAN_LITTLE){
3885 void w_oconv32(nkf_char c2, nkf_char c1)
3888 output_bom_f = FALSE;
3889 if (output_endian == ENDIAN_LITTLE){
3890 (*o_putc)((unsigned char)'\377');
3898 (*o_putc)((unsigned char)'\377');
3907 if (c2 == ISO8859_1) {
3909 #ifdef NUMCHAR_OPTION
3910 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3914 c1 = e2w_conv(c2, c1);
3917 if (output_endian == ENDIAN_LITTLE){
3918 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3919 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3920 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3924 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3925 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3926 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3931 void e_oconv(nkf_char c2, nkf_char c1)
3933 #ifdef NUMCHAR_OPTION
3934 if (c2 == 0 && is_unicode_capsule(c1)){
3935 w16e_conv(c1, &c2, &c1);
3936 if (c2 == 0 && is_unicode_capsule(c1)){
3937 c2 = c1 & VALUE_MASK;
3938 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3942 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3943 c1 = 0x21 + c1 % 94;
3946 (*o_putc)((c2 & 0x7f) | 0x080);
3947 (*o_putc)(c1 | 0x080);
3949 (*o_putc)((c2 & 0x7f) | 0x080);
3950 (*o_putc)(c1 | 0x080);
3954 if (encode_fallback) (*encode_fallback)(c1);
3963 } else if (c2 == 0) {
3964 output_mode = ASCII;
3966 } else if (c2 == X0201) {
3967 output_mode = JAPANESE_EUC;
3968 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3969 } else if (c2 == ISO8859_1) {
3970 output_mode = ISO8859_1;
3971 (*o_putc)(c1 | 0x080);
3973 } else if (is_eucg3(c2)){
3974 output_mode = JAPANESE_EUC;
3975 #ifdef SHIFTJIS_CP932
3978 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3979 s2e_conv(s2, s1, &c2, &c1);
3984 output_mode = ASCII;
3986 }else if (is_eucg3(c2)){
3989 (*o_putc)((c2 & 0x7f) | 0x080);
3990 (*o_putc)(c1 | 0x080);
3993 (*o_putc)((c2 & 0x7f) | 0x080);
3994 (*o_putc)(c1 | 0x080);
3998 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3999 set_iconv(FALSE, 0);
4000 return; /* too late to rescue this char */
4002 output_mode = JAPANESE_EUC;
4003 (*o_putc)(c2 | 0x080);
4004 (*o_putc)(c1 | 0x080);
4009 nkf_char x0212_shift(nkf_char c)
4014 if (0x75 <= c && c <= 0x7f){
4015 ret = c + (0x109 - 0x75);
4018 if (0x75 <= c && c <= 0x7f){
4019 ret = c + (0x113 - 0x75);
4026 nkf_char x0212_unshift(nkf_char c)
4029 if (0x7f <= c && c <= 0x88){
4030 ret = c + (0x75 - 0x7f);
4031 }else if (0x89 <= c && c <= 0x92){
4032 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4036 #endif /* X0212_ENABLE */
4038 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4044 if((0x21 <= ndx && ndx <= 0x2F)){
4045 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4046 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4048 }else if(0x6E <= ndx && ndx <= 0x7E){
4049 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4050 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4056 else if(nkf_isgraph(ndx)){
4058 const unsigned short *ptr;
4059 ptr = x0212_shiftjis[ndx - 0x21];
4061 val = ptr[(c1 & 0x7f) - 0x21];
4070 c2 = x0212_shift(c2);
4072 #endif /* X0212_ENABLE */
4074 if(0x7F < c2) return 1;
4075 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4076 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4080 void s_oconv(nkf_char c2, nkf_char c1)
4082 #ifdef NUMCHAR_OPTION
4083 if (c2 == 0 && is_unicode_capsule(c1)){
4084 w16e_conv(c1, &c2, &c1);
4085 if (c2 == 0 && is_unicode_capsule(c1)){
4086 c2 = c1 & VALUE_MASK;
4087 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4090 c2 = c1 / 188 + 0xF0;
4092 c1 += 0x40 + (c1 > 0x3e);
4097 if(encode_fallback)(*encode_fallback)(c1);
4106 } else if (c2 == 0) {
4107 output_mode = ASCII;
4109 } else if (c2 == X0201) {
4110 output_mode = SHIFT_JIS;
4112 } else if (c2 == ISO8859_1) {
4113 output_mode = ISO8859_1;
4114 (*o_putc)(c1 | 0x080);
4116 } else if (is_eucg3(c2)){
4117 output_mode = SHIFT_JIS;
4118 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4124 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4125 set_iconv(FALSE, 0);
4126 return; /* too late to rescue this char */
4128 output_mode = SHIFT_JIS;
4129 e2s_conv(c2, c1, &c2, &c1);
4131 #ifdef SHIFTJIS_CP932
4133 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4134 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4140 #endif /* SHIFTJIS_CP932 */
4143 if (prefix_table[(unsigned char)c1]){
4144 (*o_putc)(prefix_table[(unsigned char)c1]);
4150 void j_oconv(nkf_char c2, nkf_char c1)
4152 #ifdef NUMCHAR_OPTION
4153 if (c2 == 0 && is_unicode_capsule(c1)){
4154 w16e_conv(c1, &c2, &c1);
4155 if (c2 == 0 && is_unicode_capsule(c1)){
4156 c2 = c1 & VALUE_MASK;
4157 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4160 c2 = 0x7F + c1 / 94;
4161 c1 = 0x21 + c1 % 94;
4163 if (encode_fallback) (*encode_fallback)(c1);
4170 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4173 (*o_putc)(ascii_intro);
4174 output_mode = ASCII;
4178 } else if (is_eucg3(c2)){
4180 if(output_mode!=X0213_2){
4181 output_mode = X0213_2;
4185 (*o_putc)(X0213_2&0x7F);
4188 if(output_mode!=X0212){
4189 output_mode = X0212;
4193 (*o_putc)(X0212&0x7F);
4196 (*o_putc)(c2 & 0x7f);
4199 } else if (c2==X0201) {
4200 if (output_mode!=X0201) {
4201 output_mode = X0201;
4207 } else if (c2==ISO8859_1) {
4208 /* iso8859 introduction, or 8th bit on */
4209 /* Can we convert in 7bit form using ESC-'-'-A ?
4211 output_mode = ISO8859_1;
4213 } else if (c2 == 0) {
4214 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4217 (*o_putc)(ascii_intro);
4218 output_mode = ASCII;
4223 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4224 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4226 if (output_mode!=X0213_1) {
4227 output_mode = X0213_1;
4231 (*o_putc)(X0213_1&0x7F);
4233 }else if (output_mode != X0208) {
4234 output_mode = X0208;
4237 (*o_putc)(kanji_intro);
4244 void base64_conv(nkf_char c2, nkf_char c1)
4246 mime_prechar(c2, c1);
4247 (*o_base64conv)(c2,c1);
4251 static nkf_char broken_buf[3];
4252 static int broken_counter = 0;
4253 static int broken_last = 0;
4254 nkf_char broken_getc(FILE *f)
4258 if (broken_counter>0) {
4259 return broken_buf[--broken_counter];
4262 if (c=='$' && broken_last != ESC
4263 && (input_mode==ASCII || input_mode==X0201)) {
4266 if (c1=='@'|| c1=='B') {
4267 broken_buf[0]=c1; broken_buf[1]=c;
4274 } else if (c=='(' && broken_last != ESC
4275 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4278 if (c1=='J'|| c1=='B') {
4279 broken_buf[0]=c1; broken_buf[1]=c;
4292 nkf_char broken_ungetc(nkf_char c, FILE *f)
4294 if (broken_counter<2)
4295 broken_buf[broken_counter++]=c;
4299 void nl_conv(nkf_char c2, nkf_char c1)
4303 if (! (c2==0&&c1==LF)) {
4309 } else if (c1==CR) {
4311 } else if (c1==LF) {
4312 if (nlmode_f==CRLF) {
4314 } else if (nlmode_f==CR) {
4319 } else if (c1!='\032' || nlmode_f!=LF){
4325 Return value of fold_conv()
4327 LF add newline and output char
4328 CR add newline and output nothing
4331 1 (or else) normal output
4333 fold state in prev (previous character)
4335 >0x80 Japanese (X0208/X0201)
4340 This fold algorithm does not preserve leading spaces in a line.
4341 This is the main difference from fmt.
4344 #define char_size(c2,c1) (c2?2:1)
4346 void fold_conv(nkf_char c2, nkf_char c1)
4349 nkf_char fold_state;
4351 if (c1== CR && !fold_preserve_f) {
4352 fold_state=0; /* ignore cr */
4353 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4355 fold_state=0; /* ignore cr */
4356 } else if (c1== BS) {
4357 if (f_line>0) f_line--;
4359 } else if (c2==EOF && f_line != 0) { /* close open last line */
4361 } else if ((c1==LF && !fold_preserve_f)
4362 || ((c1==CR||(c1==LF&&f_prev!=CR))
4363 && fold_preserve_f)) {
4365 if (fold_preserve_f) {
4369 } else if ((f_prev == c1 && !fold_preserve_f)
4370 || (f_prev == LF && fold_preserve_f)
4371 ) { /* duplicate newline */
4374 fold_state = LF; /* output two newline */
4380 if (f_prev&0x80) { /* Japanese? */
4382 fold_state = 0; /* ignore given single newline */
4383 } else if (f_prev==SP) {
4387 if (++f_line<=fold_len)
4391 fold_state = CR; /* fold and output nothing */
4395 } else if (c1=='\f') {
4398 fold_state = LF; /* output newline and clear */
4399 } else if ( (c2==0 && c1==SP)||
4400 (c2==0 && c1==TAB)||
4401 (c2=='!'&& c1=='!')) {
4402 /* X0208 kankaku or ascii space */
4404 fold_state = 0; /* remove duplicate spaces */
4407 if (++f_line<=fold_len)
4408 fold_state = SP; /* output ASCII space only */
4410 f_prev = SP; f_line = 0;
4411 fold_state = CR; /* fold and output nothing */
4415 prev0 = f_prev; /* we still need this one... , but almost done */
4417 if (c2 || c2==X0201)
4418 f_prev |= 0x80; /* this is Japanese */
4419 f_line += char_size(c2,c1);
4420 if (f_line<=fold_len) { /* normal case */
4423 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4424 f_line = char_size(c2,c1);
4425 fold_state = LF; /* We can't wait, do fold now */
4426 } else if (c2==X0201) {
4427 /* simple kinsoku rules return 1 means no folding */
4428 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4429 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4430 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4431 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4432 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4433 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4434 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4436 fold_state = LF;/* add one new f_line before this character */
4439 fold_state = LF;/* add one new f_line before this character */
4442 /* kinsoku point in ASCII */
4443 if ( c1==')'|| /* { [ ( */
4454 /* just after special */
4455 } else if (!is_alnum(prev0)) {
4456 f_line = char_size(c2,c1);
4458 } else if ((prev0==SP) || /* ignored new f_line */
4459 (prev0==LF)|| /* ignored new f_line */
4460 (prev0&0x80)) { /* X0208 - ASCII */
4461 f_line = char_size(c2,c1);
4462 fold_state = LF;/* add one new f_line before this character */
4464 fold_state = 1; /* default no fold in ASCII */
4468 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4469 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4470 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4471 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4472 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4473 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4474 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4475 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4476 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4477 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4478 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4479 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4480 /* default no fold in kinsoku */
4483 f_line = char_size(c2,c1);
4484 /* add one new f_line before this character */
4487 f_line = char_size(c2,c1);
4489 /* add one new f_line before this character */
4494 /* terminator process */
4495 switch(fold_state) {
4514 nkf_char z_prev2=0,z_prev1=0;
4516 void z_conv(nkf_char c2, nkf_char c1)
4519 /* if (c2) c1 &= 0x7f; assertion */
4521 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4527 if (z_prev2 == X0201) {
4529 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4531 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4533 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4535 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4540 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4543 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4544 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4549 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4560 if (alpha_f&1 && c2 == 0x23) {
4561 /* JISX0208 Alphabet */
4563 } else if (c2 == 0x21) {
4564 /* JISX0208 Kigou */
4569 } else if (alpha_f&4) {
4574 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4580 if (alpha_f&8 && c2 == 0) {
4584 case '>': entity = "&gt;"; break;
4585 case '<': entity = "&lt;"; break;
4586 case '\"': entity = "&quot;"; break;
4587 case '&': entity = "&amp;"; break;
4590 while (*entity) (*o_zconv)(0, *entity++);
4596 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4601 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4605 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4609 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4613 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4617 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4621 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4625 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4629 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4634 (*o_zconv)(X0201, c);
4637 } else if (c2 == 0x25) {
4638 /* JISX0208 Katakana */
4639 static const int fullwidth_to_halfwidth[] =
4641 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4642 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4643 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4644 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4645 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4646 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4647 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4648 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4649 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4650 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4651 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4652 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4654 if (fullwidth_to_halfwidth[c1-0x20]){
4655 c2 = fullwidth_to_halfwidth[c1-0x20];
4656 (*o_zconv)(X0201, c2>>8);
4658 (*o_zconv)(X0201, c2&0xFF);
4668 #define rot13(c) ( \
4670 (c <= 'M') ? (c + 13): \
4671 (c <= 'Z') ? (c - 13): \
4673 (c <= 'm') ? (c + 13): \
4674 (c <= 'z') ? (c - 13): \
4678 #define rot47(c) ( \
4680 ( c <= 'O') ? (c + 47) : \
4681 ( c <= '~') ? (c - 47) : \
4685 void rot_conv(nkf_char c2, nkf_char c1)
4687 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4693 (*o_rot_conv)(c2,c1);
4696 void hira_conv(nkf_char c2, nkf_char c1)
4700 if (0x20 < c1 && c1 < 0x74) {
4702 (*o_hira_conv)(c2,c1);
4704 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4706 c1 = CLASS_UNICODE | 0x3094;
4707 (*o_hira_conv)(c2,c1);
4710 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4712 (*o_hira_conv)(c2,c1);
4717 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4720 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4722 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4726 (*o_hira_conv)(c2,c1);
4730 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4732 static const nkf_char range[RANGE_NUM_MAX][2] = {
4753 nkf_char start, end, c;
4755 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4759 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4764 for (i = 0; i < RANGE_NUM_MAX; i++) {
4765 start = range[i][0];
4768 if (c >= start && c <= end) {
4773 (*o_iso2022jp_check_conv)(c2,c1);
4777 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4779 static const unsigned char *mime_pattern[] = {
4780 (const unsigned char *)"\075?EUC-JP?B?",
4781 (const unsigned char *)"\075?SHIFT_JIS?B?",
4782 (const unsigned char *)"\075?ISO-8859-1?Q?",
4783 (const unsigned char *)"\075?ISO-8859-1?B?",
4784 (const unsigned char *)"\075?ISO-2022-JP?B?",
4785 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4786 #if defined(UTF8_INPUT_ENABLE)
4787 (const unsigned char *)"\075?UTF-8?B?",
4788 (const unsigned char *)"\075?UTF-8?Q?",
4790 (const unsigned char *)"\075?US-ASCII?Q?",
4795 /* Markers used to raise the priority of the matching input code */
4796 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4797 e_iconv, s_iconv, 0, 0, 0, 0,
4798 #if defined(UTF8_INPUT_ENABLE)
4804 static const nkf_char mime_encode[] = {
4805 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4806 #if defined(UTF8_INPUT_ENABLE)
4813 static const nkf_char mime_encode_method[] = {
4814 'B', 'B','Q', 'B', 'B', 'Q',
4815 #if defined(UTF8_INPUT_ENABLE)
4823 #define MAXRECOVER 20
4825 void switch_mime_getc(void)
4827 if (i_getc!=mime_getc) {
4828 i_mgetc = i_getc; i_getc = mime_getc;
4829 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4830 if(mime_f==STRICT_MIME) {
4831 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4832 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4837 void unswitch_mime_getc(void)
4839 if(mime_f==STRICT_MIME) {
4840 i_mgetc = i_mgetc_buf;
4841 i_mungetc = i_mungetc_buf;
4844 i_ungetc = i_mungetc;
4845 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4846 mime_iconv_back = NULL;
4849 nkf_char mime_begin_strict(FILE *f)
4853 const unsigned char *p,*q;
4854 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4856 mime_decode_mode = FALSE;
4857 /* =? has been checked */
4859 p = mime_pattern[j];
4862 for(i=2;p[i]>SP;i++) { /* start at =? */
4863 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4864 /* pattern fails, try next one */
4866 while (mime_pattern[++j]) {
4867 p = mime_pattern[j];
4868 for(k=2;k<i;k++) /* assume length(p) > i */
4869 if (p[k]!=q[k]) break;
4870 if (k==i && nkf_toupper(c1)==p[k]) break;
4872 p = mime_pattern[j];
4873 if (p) continue; /* found next one, continue */
4874 /* all fails, output from recovery buffer */
4882 mime_decode_mode = p[i-2];
4884 mime_iconv_back = iconv;
4885 set_iconv(FALSE, mime_priority_func[j]);
4886 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4888 if (mime_decode_mode=='B') {
4889 mimebuf_f = unbuf_f;
4891 /* do MIME integrity check */
4892 return mime_integrity(f,mime_pattern[j]);
4900 nkf_char mime_getc_buf(FILE *f)
4902 /* we don't keep eof of Fifo, because it contains ?= as
4903 a terminator. It was checked in mime_integrity. */
4904 return ((mimebuf_f)?
4905 (*i_mgetc_buf)(f):Fifo(mime_input++));
4908 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4911 (*i_mungetc_buf)(c,f);
4913 Fifo(--mime_input) = (unsigned char)c;
4917 nkf_char mime_begin(FILE *f)
4922 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4923 /* re-read and convert again from mime_buffer. */
4925 /* =? has been checked */
4927 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4928 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4929 /* We accept any character type even if it is broken up by newlines */
4930 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4931 if (c1==LF||c1==SP||c1==CR||
4932 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4934 /* Failed. But this could be another MIME preamble */
4942 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4943 if (!(++i<MAXRECOVER) || c1==EOF) break;
4944 if (c1=='b'||c1=='B') {
4945 mime_decode_mode = 'B';
4946 } else if (c1=='q'||c1=='Q') {
4947 mime_decode_mode = 'Q';
4951 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4952 if (!(++i<MAXRECOVER) || c1==EOF) break;
4954 mime_decode_mode = FALSE;
4960 if (!mime_decode_mode) {
4961 /* false MIME preamble, restart from mime_buffer */
4962 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4963 /* Since we are in MIME mode until buffer becomes empty, */
4964 /* we never go into mime_begin again for a while. */
4967 /* discard MIME preamble, and go to MIME mode */
4969 /* do no MIME integrity check */
4970 return c1; /* used only for checking EOF */
4974 void no_putc(nkf_char c)
4979 void debug(const char *str)
4982 fprintf(stderr, "%s\n", str ? str : "NULL");
4987 void set_input_codename(char *codename)
4989 if (!input_codename) {
4990 input_codename = codename;
4991 } else if (strcmp(codename, input_codename) != 0) {
4992 is_inputcode_mixed = TRUE;
4993 input_codename = "";
4997 #if !defined(PERL_XS) && !defined(WIN32DLL)
4998 void print_guessed_code(char *filename)
5000 char *codename = "BINARY";
5001 char *str_nlmode = NULL;
5002 if (!input_codename || *input_codename) {
5003 if (!input_codename) {
5006 codename = input_codename;
5008 if (nlmode_f == CR) str_nlmode = "CR";
5009 else if (nlmode_f == LF) str_nlmode = "LF";
5010 else if (nlmode_f == CRLF) str_nlmode = "CRLF";
5011 else if (nlmode_f == EOF) str_nlmode = "MIXED NL";
5013 if (filename != NULL) printf("%s: ", filename);
5014 if (str_nlmode != NULL) printf("%s (%s)\n", codename, str_nlmode);
5015 else printf("%s\n", codename);
5021 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5023 nkf_char c1, c2, c3;
5029 if (!nkf_isxdigit(c2)){
5034 if (!nkf_isxdigit(c3)){
5039 return (hex2bin(c2) << 4) | hex2bin(c3);
5042 nkf_char cap_getc(FILE *f)
5044 return hex_getc(':', f, i_cgetc, i_cungetc);
5047 nkf_char cap_ungetc(nkf_char c, FILE *f)
5049 return (*i_cungetc)(c, f);
/* Input filter for '%'-introduced hex escapes: delegates to hex_getc
 * with '%' as the introducer and i_ugetc / i_uungetc as the underlying
 * reader and push-back. */
5052 nkf_char url_getc(FILE *f)
5054 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push-back counterpart of url_getc: forwards c to the hook i_uungetc
 * and returns its result. */
5057 nkf_char url_ungetc(nkf_char c, FILE *f)
5059 return (*i_uungetc)(c, f);
5063 #ifdef NUMCHAR_OPTION
/* Input filter that decodes a numeric character reference read through
 * i_ngetc (push-back through i_nungetc).
 * After an 'x' or 'X' marker up to 7 hexadecimal digits are accepted;
 * otherwise up to 8 decimal digits are accepted.  On success the value
 * is returned tagged with CLASS_UNICODE.
 * NOTE(review): the recognition of the leading characters, the digit
 * accumulation steps (shift / multiply) and the failure path are not
 * visible in this listing. */
5064 nkf_char numchar_getc(FILE *f)
5066 nkf_char (*g)(FILE *) = i_ngetc;
5067 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* hexadecimal form */
5078 if (buf[i] == 'x' || buf[i] == 'X'){
5079 for (j = 0; j < 7; j++){
5081 if (!nkf_isxdigit(buf[i])){
5088 c |= hex2bin(buf[i]);
/* decimal form; hex2bin is also used here to convert a decimal digit */
5091 for (j = 0; j < 8; j++){
5095 if (!nkf_isdigit(buf[i])){
5102 c += hex2bin(buf[i]);
5108 return CLASS_UNICODE | c;
/* Push-back counterpart of numchar_getc: forwards c to the hook
 * i_nungetc and returns its result. */
5117 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5119 return (*i_nungetc)(c, f);
5123 #ifdef UNICODE_NORMALIZATION
5125 /* Normalization Form C */
/* Input filter that replaces a decomposed (NFD) byte sequence with its
 * composed (NFC) form.  Input is read through i_nfc_getc and pushed back
 * through i_nfc_ungetc.
 * The outer loop only runs while the byte at buf[i] is not of the form
 * 10xxxxxx.  Inside it a binary search over normalization_table compares
 * each entry's nfd bytes with buf; on a full match the entry's nfc bytes
 * are copied over the start of buf.
 * NOTE(review): how buf is filled, how unread bytes are pushed back and
 * what is returned are not visible in this listing. */
5126 nkf_char nfc_getc(FILE *f)
5128 nkf_char (*g)(FILE *f) = i_nfc_getc;
5129 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5130 int i=0, j, k=1, lower, upper;
5132 const nkf_nfchar *array;
5135 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5136 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5137 while (upper >= lower) {
5138 j = (lower+upper) / 2;
5139 array = normalization_table[j].nfd;
5140 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5141 if (array[k] != buf[k]){
/* first differing byte decides which half of the table to keep */
5142 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* match: overwrite the head of buf with the composed form */
5149 array = normalization_table[j].nfc;
5150 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5151 buf[i] = (nkf_char)(array[i]);
/* Push-back counterpart of nfc_getc: forwards c to the hook
 * i_nfc_ungetc and returns its result. */
5162 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5164 return (*i_nfc_ungetc)(c, f);
5166 #endif /* UNICODE_NORMALIZATION */
/* MIME decoding reader: returns the next decoded character.
 * NOTE(review): the function header is not visible in this listing;
 * presumably this is the body of mime_getc -- confirm.  Many lines
 * (braces, labels, some statements) are also missing, so the notes
 * below describe only what the visible lines establish.
 *
 * Order of work:
 *   1. If the FIFO holds data (mime_top != mime_last), return from it.
 *   2. If mime_decode_mode is 1 or FALSE, leave MIME mode, call
 *      unswitch_mime_getc() and read straight from i_getc.
 *   3. In 'Q' mode read through i_mgetc: '_' yields SP unless
 *      mimebuf_f is FIXED_MIME, "?=" ends the encoded word, "=XX"
 *      yields (hex2bin(c2)<<4) + hex2bin(c3).
 *   4. Any mode other than 'B' drops out of MIME mode and returns the
 *      next raw character.
 *   5. In 'B' mode read four characters, decode them with base64decode
 *      and queue the resulting bytes in the FIFO.
 * After "?=" the following white space is collected in a malloc-ed
 * buffer (lwsp_buf, grown with realloc) and pushed back with i_ungetc
 * unless the next character is '=' directly after a blank. */
5172 nkf_char c1, c2, c3, c4, cc;
5173 nkf_char t1, t2, t3, t4, mode, exit_mode;
5174 nkf_char lwsp_count;
5177 nkf_char lwsp_size = 128;
5179 if (mime_top != mime_last) { /* Something is in FIFO */
5180 return Fifo(mime_top++);
/* mode 1 means: buffer drained, nothing to decode */
5182 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5183 mime_decode_mode=FALSE;
5184 unswitch_mime_getc();
5185 return (*i_getc)(f);
5188 if (mimebuf_f == FIXED_MIME)
5189 exit_mode = mime_decode_mode;
5192 if (mime_decode_mode == 'Q') {
5193 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5195 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5196 if (c1<=SP || DEL<=c1) {
5197 mime_decode_mode = exit_mode; /* prepare for quit */
5200 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5204 mime_decode_mode = exit_mode; /* prepare for quit */
5205 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5206 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5207 /* end Q encoding */
5208 input_mode = exit_mode;
/* the 5 extra bytes are slack beyond lwsp_size */
5210 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5211 if (lwsp_buf==NULL) {
5212 perror("can't malloc");
5215 while ((c1=(*i_getc)(f))!=EOF) {
5220 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5228 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5229 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5244 lwsp_buf[lwsp_count] = (unsigned char)c1;
5245 if (lwsp_count++>lwsp_size){
5247 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5248 if (lwsp_buf_new==NULL) {
5250 perror("can't realloc");
5253 lwsp_buf = lwsp_buf_new;
5259 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5261 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5262 i_ungetc(lwsp_buf[lwsp_count],f);
/* NOTE(review): the loop condition below and the first statement of its
 * body each read a character, so one character is consumed per pass
 * without being tested by the loop condition -- confirm this is
 * intended. */
5268 if (c1=='='&&c2<SP) { /* this is soft wrap */
5269 while((c1 = (*i_mgetc)(f)) <=SP) {
5270 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5272 mime_decode_mode = 'Q'; /* still in MIME */
5273 goto restart_mime_q;
5276 mime_decode_mode = 'Q'; /* still in MIME */
5280 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5281 if (c2<=SP) return c2;
5282 mime_decode_mode = 'Q'; /* still in MIME */
5283 return ((hex2bin(c2)<<4) + hex2bin(c3));
5286 if (mime_decode_mode != 'B') {
5287 mime_decode_mode = FALSE;
5288 return (*i_mgetc)(f);
5292 /* Base64 encoding */
5294 MIME allows line break in the middle of
5295 Base64, but we are very pessimistic in decoding
5296 in unbuf mode because MIME encoded code may broken by
5297 less or editor's control sequence (such as ESC-[-K in unbuffered
5298 mode. ignore incomplete MIME.
5300 mode = mime_decode_mode;
5301 mime_decode_mode = exit_mode; /* prepare for quit */
5303 while ((c1 = (*i_mgetc)(f))<=SP) {
5308 if ((c2 = (*i_mgetc)(f))<=SP) {
5311 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5312 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5315 if ((c1 == '?') && (c2 == '=')) {
5318 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5319 if (lwsp_buf==NULL) {
5320 perror("can't malloc");
5323 while ((c1=(*i_getc)(f))!=EOF) {
5328 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5336 if ((c1=(*i_getc)(f))!=EOF) {
5340 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5355 lwsp_buf[lwsp_count] = (unsigned char)c1;
5356 if (lwsp_count++>lwsp_size){
5358 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5359 if (lwsp_buf_new==NULL) {
5361 perror("can't realloc");
5364 lwsp_buf = lwsp_buf_new;
5370 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5372 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5373 i_ungetc(lwsp_buf[lwsp_count],f);
5380 if ((c3 = (*i_mgetc)(f))<=SP) {
5383 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5384 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5388 if ((c4 = (*i_mgetc)(f))<=SP) {
5391 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5392 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5396 mime_decode_mode = mode; /* still in MIME sigh... */
5398 /* BASE 64 decoding */
5400 t1 = 0x3f & base64decode(c1);
5401 t2 = 0x3f & base64decode(c2);
5402 t3 = 0x3f & base64decode(c3);
5403 t4 = 0x3f & base64decode(c4);
5404 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5406 Fifo(mime_last++) = (unsigned char)cc;
5407 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5409 Fifo(mime_last++) = (unsigned char)cc;
5410 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5412 Fifo(mime_last++) = (unsigned char)cc;
5417 return Fifo(mime_top++);
/* Push one character back into the MIME FIFO: mime_top is decremented
 * first and c, truncated to unsigned char, is stored at the new
 * position.  f is not used by the visible statement.
 * NOTE(review): the return statement is not visible in this listing. */
5420 nkf_char mime_ungetc(nkf_char c, FILE *f)
5422 Fifo(--mime_top) = (unsigned char)c;
/* Pre-scan a candidate MIME encoded word before committing to decode it.
 *   f: input stream read through the hook i_getc
 *   p: NUL-terminated byte string that is copied into the FIFO first,
 *      starting at mime_top
 * Input is then read until EOF, until the FIFO is full (the distance
 * mime_input - mime_top wraps to 0 under MIME_BUF_MASK), or until the
 * pair c == '=' with d == '?' is seen, which the code treats as
 * "checked, start decode".  A character that is none of '+', '/', '=',
 * '?' or alphanumeric fails the test on the is_alnum line.
 * On the incomplete path the last character is stored, mime_last is set
 * to mime_input, mime_decode_mode becomes 1 (return the buffered bytes
 * undecoded) and switch_mime_getc() is called.
 * NOTE(review): the declaration and update of d, the actions taken
 * inside the tests and the return values are not visible in this
 * listing; d is presumably the previously read character -- confirm. */
5426 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5430 /* In buffered mode, read until =? or NL or buffer full
5432 mime_input = mime_top;
5433 mime_last = mime_top;
5435 while(*p) Fifo(mime_input++) = *p++;
5438 while((c=(*i_getc)(f))!=EOF) {
5439 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5440 break; /* buffer full */
5442 if (c=='=' && d=='?') {
5443 /* checked. skip header, start decode */
5444 Fifo(mime_input++) = (unsigned char)c;
5445 /* mime_last_input = mime_input; */
5450 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5452 /* Should we check length mod 4? */
5453 Fifo(mime_input++) = (unsigned char)c;
5456 /* In case of Incomplete MIME, no MIME decode */
5457 Fifo(mime_input++) = (unsigned char)c;
5458 mime_last = mime_input; /* point undecoded buffer */
5459 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5460 switch_mime_getc(); /* anyway we need buffered getc */
/* Map one Base64 alphabet character to its 6-bit value.
 *   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
 *   '+' -> 62, and 63 in the final branch (the '/' case).
 * The constants are written as character literals: 'G' is 'a' - 26,
 * '4' is 52, '>' is 62 and '?' is 63.
 * NOTE(review): the range tests that select the first two branches and
 * the return statement are not visible in this listing.  Characters
 * above '/' that are not digits also take the digit branch, so callers
 * should not rely on the result for characters outside the alphabet. */
5464 nkf_char base64decode(nkf_char c)
5469 i = c - 'A'; /* A..Z 0-25 */
5471 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5473 } else if (c > '/') {
5474 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5475 } else if (c == '+') {
5476 i = '>' /* 62 */ ; /* + 62 */
5478 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet indexed by 6-bit value; used by the MIME encoder. */
5483 static const char basis_64[] =
5484 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Bits carried over between calls of the Base64 encoder (read as
 * b64c & 0x3 and b64c & 0xF by the encoding routines). */
5486 static nkf_char b64c;
/* Capacity of the pending-output buffer; the array has one extra slot. */
5487 #define MIMEOUT_BUF_LENGTH (60)
5488 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5489 int mimeout_buf_count = 0;
/* When non-zero, open_mime skips its white-space handling and then
 * clears the flag. */
5490 int mimeout_preserve_space = 0;
/* itoh4(c): map a 4-bit value (0..15) to its uppercase hexadecimal digit
 * character ('0'..'9', 'A'..'F').
 * Every use of the argument is parenthesized so that an expression
 * argument such as (x & 0xf) or (x >> 4) is evaluated as a unit before
 * the comparison and the additions.  The argument is still evaluated
 * more than once, so do not pass expressions with side effects. */
#define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
/* Start a MIME encoded word for the given output mode.
 *   mode: output encoding identifier, matched against mime_encode[].
 * The preamble defaults to mime_pattern[0] and is replaced by the entry
 * whose mime_encode[] value equals mode; mimeout_mode is taken from
 * mime_encode_method[] at the same index.
 * White space at the start of mimeout_buf is written out through
 * o_mputc before the encoded word unless mimeout_preserve_space is set
 * (the flag is cleared afterwards).  The buffered characters are then
 * replayed through mime_putc after mimeout_buf_count has been reset.
 * NOTE(review): the statements that emit the preamble and the line
 * folding done when base64_count exceeds 45 are only partly visible in
 * this listing. */
5493 void open_mime(nkf_char mode)
5495 const unsigned char *p;
5498 p = mime_pattern[0];
5499 for(i=0;mime_pattern[i];i++) {
5500 if (mode == mime_encode[i]) {
5501 p = mime_pattern[i];
5505 mimeout_mode = mime_encode_method[i];
5508 if (base64_count>45) {
5509 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5510 (*o_mputc)(mimeout_buf[i]);
5516 if (!mimeout_preserve_space && mimeout_buf_count>0
5517 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5518 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5522 if (!mimeout_preserve_space) {
5523 for (;i<mimeout_buf_count;i++) {
5524 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5525 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5526 (*o_mputc)(mimeout_buf[i]);
5533 mimeout_preserve_space = FALSE;
/* take a snapshot of the count and empty the buffer before replaying,
 * because mime_putc may refill mimeout_buf */
5539 j = mimeout_buf_count;
5540 mimeout_buf_count = 0;
5542 mime_putc(mimeout_buf[i]);
/* Finish the current MIME encoded word.
 * Depending on mimeout_mode the bits still held in b64c are flushed as
 * one final Base64 character: (b64c & 0x3) << 4 when two bits are
 * pending, (b64c & 0xF) << 2 when four bits are pending.
 * NOTE(review): the case labels, the padding and terminator output and
 * the mode reset are not visible in this listing. */
5546 void close_mime(void)
5556 switch(mimeout_mode) {
5561 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5567 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5573 if (mimeout_f!=FIXED_MIME) {
5575 } else if (mimeout_mode != 'Q')
/* Encode one character into the open MIME encoded word and write the
 * result through o_mputc.
 *   c: character to encode.
 * Behaviour depends on mimeout_mode:
 *   - in the hex-escape branch a non-alphanumeric c is written as two
 *     hexadecimal digits produced by itoh4 (high nibble first);
 *   - in the Base64 branches c is combined with the bits saved in b64c:
 *     first character emits c>>2, second emits the 2 saved bits plus the
 *     top 4 bits of c, third emits the 4 saved bits plus the top 2 bits
 *     of c followed by the low 6 bits of c.
 * NOTE(review): the case labels, the escape introducer and the updates
 * of b64c and mimeout_mode are not visible in this listing. */
5580 void mimeout_addchar(nkf_char c)
5582 switch(mimeout_mode) {
5587 } else if(!nkf_isalnum(c)) {
5589 (*o_mputc)(itoh4(((c>>4)&0xf)));
5590 (*o_mputc)(itoh4((c&0xf)));
5599 (*o_mputc)(basis_64[c>>2]);
5604 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5610 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5611 (*o_mputc)(basis_64[c & 0x3F]);
5622 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/* Line-folding check run before a character (c2, c1) is MIME encoded.
 * The projected line length is base64_count plus the Base64 size of the
 * pending buffer (mimeout_buf_count/3*4).  When it exceeds the limit
 * (73 in one branch, 66 in the other) the current encoded word is
 * flushed with (EOF,0) and a line break plus a space is sent through
 * o_base64conv.
 * NOTE(review): the conditions that choose between the two limits are
 * not visible in this listing.  The trailing block that refers to
 * mime_lastchar1 / mime_lastchar2 is commented out in the source. */
5624 void mime_prechar(nkf_char c2, nkf_char c1)
5628 if (base64_count + mimeout_buf_count/3*4> 73){
5629 (*o_base64conv)(EOF,0);
5630 (*o_base64conv)(0,LF);
5631 (*o_base64conv)(0,SP);
5634 if (base64_count + mimeout_buf_count/3*4> 66){
5635 (*o_base64conv)(EOF,0);
5636 (*o_base64conv)(0,LF);
5637 (*o_base64conv)(0,SP);
5639 }/*else if (mime_lastchar2){
5640 if (c1 <=DEL && !nkf_isspace(c1)){
5641 (*o_base64conv)(0,SP);
5645 if (c2 && mime_lastchar2 == 0
5646 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5647 (*o_base64conv)(0,SP);
5650 /*mime_lastchar2 = c2;
5651 mime_lastchar1 = c1;*/
/* Output stage of the MIME encoder: accepts one character (or EOF to
 * flush) and decides whether it is written raw, buffered in mimeout_buf,
 * or encoded through mimeout_addchar.
 *   c: character to output, or EOF.
 * Main paths visible here:
 *   - mimeout_f == FIXED_MIME: line length is checked against 71 and
 *     EOF is handled separately from ordinary characters.
 *   - c == EOF otherwise: the pending buffer is emptied through
 *     mimeout_addchar.
 *   - mimeout_mode == 'Q': characters up to DEL in ASCII / ISO8859_1
 *     output get special handling for CR, LF, other control characters
 *     and blanks.
 *   - no encoded word open (!mimeout_mode): ASCII text is buffered in
 *     mimeout_buf and written raw at white space; when the buffer grows
 *     past MIMEOUT_BUF_LENGTH, or a non-ASCII character arrives,
 *     open_mime(output_mode) starts an encoded word.
 *   - encoded word open: ASCII characters are buffered so that a run of
 *     plain text can end the encoded word; other characters are encoded.
 * NOTE(review): many statements (braces, returns, close_mime calls,
 * line-break output) are missing from this listing, so the exact
 * control flow must be confirmed against the full source. */
5654 void mime_putc(nkf_char c)
5659 if (mimeout_f == FIXED_MIME){
5660 if (mimeout_mode == 'Q'){
5661 if (base64_count > 71){
5662 if (c!=CR && c!=LF) {
5669 if (base64_count > 71){
5674 if (c == EOF) { /* c==EOF */
5678 if (c != EOF) { /* c==EOF */
5684 /* mimeout_f != FIXED_MIME */
5686 if (c == EOF) { /* c==EOF */
5687 j = mimeout_buf_count;
5688 mimeout_buf_count = 0;
5691 if (!nkf_isblank(mimeout_buf[j-1])) {
5693 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5696 mimeout_addchar(mimeout_buf[i]);
5700 mimeout_addchar(mimeout_buf[i]);
5704 mimeout_addchar(mimeout_buf[i]);
5710 mimeout_addchar(mimeout_buf[i]);
5716 if (mimeout_mode=='Q') {
5717 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5718 if (c == CR || c == LF) {
5723 } else if (c <= SP) {
5725 if (base64_count > 70) {
5729 if (!nkf_isblank(c)) {
/* remember the most recently buffered character */
5740 if (mimeout_buf_count > 0){
5741 lastchar = mimeout_buf[mimeout_buf_count - 1];
5746 if (!mimeout_mode) {
5747 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5748 if (nkf_isspace(c)) {
5749 if (c==CR || c==LF) {
5752 for (i=0;i<mimeout_buf_count;i++) {
5753 (*o_mputc)(mimeout_buf[i]);
5754 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5760 mimeout_buf[0] = (char)c;
5761 mimeout_buf_count = 1;
5763 if (base64_count > 1
5764 && base64_count + mimeout_buf_count > 76
5765 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5768 if (!nkf_isspace(mimeout_buf[0])){
5773 mimeout_buf[mimeout_buf_count++] = (char)c;
5774 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5775 open_mime(output_mode);
5780 if (lastchar==CR || lastchar == LF){
5781 for (i=0;i<mimeout_buf_count;i++) {
5782 (*o_mputc)(mimeout_buf[i]);
5785 mimeout_buf_count = 0;
5788 for (i=0;i<mimeout_buf_count-1;i++) {
5789 (*o_mputc)(mimeout_buf[i]);
5792 mimeout_buf[0] = SP;
5793 mimeout_buf_count = 1;
5795 open_mime(output_mode);
5798 /* mimeout_mode == 'B', 1, 2 */
5799 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5800 if (lastchar == CR || lastchar == LF){
5801 if (nkf_isblank(c)) {
5802 for (i=0;i<mimeout_buf_count;i++) {
5803 mimeout_addchar(mimeout_buf[i]);
5805 mimeout_buf_count = 0;
5806 } else if (SP<c && c<DEL) {
5808 for (i=0;i<mimeout_buf_count;i++) {
5809 (*o_mputc)(mimeout_buf[i]);
5812 mimeout_buf_count = 0;
5815 if (c==SP || c==TAB || c==CR || c==LF) {
5816 for (i=0;i<mimeout_buf_count;i++) {
5817 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5819 for (i=0;i<mimeout_buf_count;i++) {
5820 (*o_mputc)(mimeout_buf[i]);
5823 mimeout_buf_count = 0;
5826 mimeout_buf[mimeout_buf_count++] = (char)c;
5827 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5829 for (i=0;i<mimeout_buf_count;i++) {
5830 (*o_mputc)(mimeout_buf[i]);
5833 mimeout_buf_count = 0;
5837 if (mimeout_buf_count>0 && SP<c && c!='=') {
5838 mimeout_buf[mimeout_buf_count++] = (char)c;
5839 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5840 j = mimeout_buf_count;
5841 mimeout_buf_count = 0;
5843 mimeout_addchar(mimeout_buf[i]);
5850 if (mimeout_buf_count>0) {
5851 j = mimeout_buf_count;
5852 mimeout_buf_count = 0;
5854 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5856 mimeout_addchar(mimeout_buf[i]);
5862 (*o_mputc)(mimeout_buf[i]);
5864 open_mime(output_mode);
/* Reset of the global conversion state, compiled only for the PERL_XS
 * and WIN32DLL builds.
 * NOTE(review): the function header and a number of assignments are not
 * visible in this listing.
 * The visible statements restore defaults for: MIME decoding flags, the
 * Unicode mapping / fallback / endianness options, the 256-entry
 * prefix_table, the MIME output buffer count, folding and escape
 * sequence settings, the output converter hooks (all set to
 * no_connection), the input hooks (std_getc / std_ungetc), and the
 * input-code detection state. */
5871 #if defined(PERL_XS) || defined(WIN32DLL)
5875 struct input_code *p = input_code_list;
5888 mime_f = STRICT_MIME;
5889 mime_decode_f = FALSE;
5894 #if defined(MSDOS) || defined(__OS2__)
5899 iso2022jp_f = FALSE;
5900 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5901 ms_ucs_map_f = UCS_MAP_ASCII;
5903 #ifdef UTF8_INPUT_ENABLE
5904 no_cp932ext_f = FALSE;
5905 no_best_fit_chars_f = FALSE;
5906 encode_fallback = NULL;
5907 unicode_subchar = '?';
5908 input_endian = ENDIAN_BIG;
5910 #ifdef UTF8_OUTPUT_ENABLE
5911 output_bom_f = FALSE;
5912 output_endian = ENDIAN_BIG;
5914 #ifdef UNICODE_NORMALIZATION
5927 is_inputcode_mixed = FALSE;
5931 #ifdef SHIFTJIS_CP932
/* clear every entry of the prefix table */
5941 for (i = 0; i < 256; i++){
5942 prefix_table[i] = 0;
5946 mimeout_buf_count = 0;
5951 fold_preserve_f = FALSE;
5954 kanji_intro = DEFAULT_J;
5955 ascii_intro = DEFAULT_R;
5956 fold_margin = FOLD_MARGIN;
5957 output_conv = DEFAULT_CONV;
5958 oconv = DEFAULT_CONV;
/* optional output stages start disconnected */
5959 o_zconv = no_connection;
5960 o_fconv = no_connection;
5961 o_nlconv = no_connection;
5962 o_rot_conv = no_connection;
5963 o_hira_conv = no_connection;
5964 o_base64conv = no_connection;
5965 o_iso2022jp_check_conv = no_connection;
5968 i_ungetc = std_ungetc;
5970 i_bungetc = std_ungetc;
5973 i_mungetc = std_ungetc;
5974 i_mgetc_buf = std_getc;
5975 i_mungetc_buf = std_ungetc;
5976 output_mode = ASCII;
5979 mime_decode_mode = FALSE;
5985 z_prev2=0,z_prev1=0;
5987 iconv_for_check = 0;
5989 input_codename = NULL;
/* Placeholder for an unconnected two-argument converter hook: forwards
 * to no_connection2 with 0 as the third argument and discards the
 * result. */
5996 void no_connection(nkf_char c2, nkf_char c1)
5998 no_connection2(c2,c1,0);
/* Placeholder for an unconnected three-argument converter hook: reports
 * an internal wiring error on stderr.  The arguments are not used by the
 * visible statements.
 * NOTE(review): a statement between the fprintf and the return is
 * missing from this listing; the "LINT" remark suggests the return is
 * not normally reached -- confirm against the full source. */
6001 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6003 fprintf(stderr,"nkf internal module connection failure.\n");
6005 return 0; /* LINT */
/* Command-line help text, written to stderr one line per fprintf call.
 * NOTE(review): the function header and the conditional-compilation
 * lines that close each #ifdef are not visible in this listing; the
 * redefinition of fprintf to dllprintf below is presumably limited to
 * the DLL build -- confirm.
 * The line describing the output codes has four variants selected by
 * the DEFAULT_CODE_* macros; the UTF-8 and numeric-character-reference
 * lines are only compiled when the matching feature macro is defined. */
6010 #define fprintf dllprintf
6014 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6015 fprintf(stderr,"Flags:\n");
6016 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6017 #ifdef DEFAULT_CODE_SJIS
6018 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6020 #ifdef DEFAULT_CODE_JIS
6021 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6023 #ifdef DEFAULT_CODE_EUC
6024 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6026 #ifdef DEFAULT_CODE_UTF8
6027 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6029 #ifdef UTF8_OUTPUT_ENABLE
6030 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6032 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6033 #ifdef UTF8_INPUT_ENABLE
6034 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6036 fprintf(stderr,"t no conversion\n");
6037 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6038 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6039 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6040 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6041 fprintf(stderr,"v Show this usage. V: show version\n");
6042 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6043 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6044 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6045 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6046 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6047 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6048 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6049 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6050 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6052 fprintf(stderr,"T Text mode output\n");
6054 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6055 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6056 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6057 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6058 fprintf(stderr,"\n");
6059 fprintf(stderr,"Long name options\n");
6060 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6061 fprintf(stderr," Specify the input or output codeset\n");
6062 fprintf(stderr," --fj --unix --mac --windows\n");
6063 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6064 fprintf(stderr," Convert for the system or code\n");
6065 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6066 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6067 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6069 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6071 #ifdef NUMCHAR_OPTION
6072 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6074 #ifdef UTF8_INPUT_ENABLE
6075 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6076 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6079 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6080 fprintf(stderr," Overwrite original listed files by filtered result\n");
6081 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6083 fprintf(stderr," -g --guess Guess the input code\n");
6084 fprintf(stderr," --help --version Show this help/the version\n");
6085 fprintf(stderr," For more information, see also man nkf\n");
6086 fprintf(stderr,"\n");
6092 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6093 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6096 #if defined(MSDOS) && defined(__WIN16__)
6099 #if defined(MSDOS) && defined(__WIN32__)
6105 ,NKF_VERSION,NKF_RELEASE_DATE);
6106 fprintf(stderr,"\n%s\n",CopyRight);