1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 * nkf is currently maintained on SourceForge.
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.150 2007/11/30 15:59:05 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-11-30"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #ifndef MIME_DECODE_DEFAULT
44 #define MIME_DECODE_DEFAULT STRICT_MIME
47 #define X0201_DEFAULT TRUE
50 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
52 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
68 #if defined(MSDOS) || defined(__OS2__)
71 #if defined(_MSC_VER) || defined(__WATCOMC__)
72 #define mktemp _mktemp
78 #define setbinmode(fp) fsetbin(fp)
79 #elif defined(__DJGPP__)
80 #include <libc/dosio.h>
81 #define setbinmode(fp) djgpp_setbinmode(fp)
82 #else /* Microsoft C, Turbo C */
83 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
86 #define setbinmode(fp)
89 #if defined(__DJGPP__)
90 void djgpp_setbinmode(FILE *fp)
92 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
95 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
96 __file_handle_set(fd, m);
100 #ifdef _IOFBF /* SysV and MSDOS, Windows */
101 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
103 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
106 /*Borland C++ 4.5 EasyWin*/
107 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
116 /* added by satoru@isoternet.org */
118 #include <sys/types.h>
120 #include <sys/stat.h>
121 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
123 #if defined(__WATCOMC__)
124 #include <sys/utime.h>
128 #else /* defined(MSDOS) */
130 #ifdef __BORLANDC__ /* BCC32 */
132 #else /* !defined(__BORLANDC__) */
133 #include <sys/utime.h>
134 #endif /* (__BORLANDC__) */
135 #else /* !defined(__WIN32__) */
136 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
137 #include <sys/utime.h>
138 #elif defined(__TURBOC__) /* BCC */
140 #elif defined(LSI_C) /* LSI C */
141 #endif /* (__WIN32__) */
149 /* state of output_mode and input_mode
165 #define X0213_1 0x284F
166 #define X0213_2 0x2850
168 /* Input Assumption */
173 #define LATIN1_INPUT 6
175 #define STRICT_MIME 8
180 #define JAPANESE_EUC 10
184 #define UTF8_INPUT 13
185 #define UTF16_INPUT 1015
186 #define UTF32_INPUT 1017
190 #define ENDIAN_BIG 1234
191 #define ENDIAN_LITTLE 4321
192 #define ENDIAN_2143 2143
193 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification helpers and CP932 lead-byte ranges.
 *
 * The classification macros are locale-independent stand-ins for the
 * <ctype.h> functions.  Every parameter is parenthesized so that an
 * expression argument (e.g. `c | 0x20`, `cond ? a : b`, `x ^ y`) expands
 * with the intended precedence.
 *
 * NOTE: the argument is still evaluated more than once, so never pass an
 * expression with side effects (`*p++`, a getc call, ...).
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the high byte of c2 (taken as unsigned short) equals SS3. */
#define is_eucg3(c2) ((((unsigned short)(c2)) >> 8) == SS3)
/* True for CR, LF and for characters strictly between SP and DEL other
 * than  ? = _ ( ) . and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END inclusive. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
241 #define HOLD_SIZE 1024
242 #if defined(INT_IS_SHORT)
243 #define IOBUF_SIZE 2048
245 #define IOBUF_SIZE 16384
248 #define DEFAULT_J 'B'
249 #define DEFAULT_R 'B'
251 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
252 #define SJ6394 0x0161 /* 63 - 94 ku offset */
254 #define RANGE_NUM_MAX 18
259 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
260 #define sizeof_euc_to_utf8_1byte 94
261 #define sizeof_euc_to_utf8_2bytes 94
262 #define sizeof_utf8_to_euc_C2 64
263 #define sizeof_utf8_to_euc_E5B8 64
264 #define sizeof_utf8_to_euc_2bytes 112
265 #define sizeof_utf8_to_euc_3bytes 16
268 /* MIME preprocessor */
270 #ifdef EASYWIN /*Easy Win */
271 extern POINT _BufferSize;
280 void (*status_func)(struct input_code *, nkf_char);
281 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
285 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
288 static const char *CopyRight = COPY_RIGHT;
290 #if !defined(PERL_XS) && !defined(WIN32DLL)
291 static nkf_char noconvert(FILE *f);
293 static void module_connection(void);
294 static nkf_char kanji_convert(FILE *f);
295 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
296 static nkf_char push_hold_buf(nkf_char c2);
297 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
298 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
299 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
300 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
301 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
303 * 0: Shift_JIS, eucJP-ascii
308 #define UCS_MAP_ASCII 0
310 #define UCS_MAP_CP932 2
311 #define UCS_MAP_CP10001 3
312 static int ms_ucs_map_f = UCS_MAP_ASCII;
314 #ifdef UTF8_INPUT_ENABLE
315 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
316 static int no_cp932ext_f = FALSE;
317 /* ignore ZERO WIDTH NO-BREAK SPACE */
318 static int no_best_fit_chars_f = FALSE;
319 static int input_endian = ENDIAN_BIG;
320 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
321 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
322 static void encode_fallback_html(nkf_char c);
323 static void encode_fallback_xml(nkf_char c);
324 static void encode_fallback_java(nkf_char c);
325 static void encode_fallback_perl(nkf_char c);
326 static void encode_fallback_subchar(nkf_char c);
327 static void (*encode_fallback)(nkf_char c) = NULL;
328 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
329 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
330 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
331 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
332 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
333 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
334 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
335 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
336 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
337 static void w_status(struct input_code *, nkf_char);
339 #ifdef UTF8_OUTPUT_ENABLE
340 static int output_bom_f = FALSE;
341 static int output_endian = ENDIAN_BIG;
342 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
343 static void w_oconv(nkf_char c2,nkf_char c1);
344 static void w_oconv16(nkf_char c2,nkf_char c1);
345 static void w_oconv32(nkf_char c2,nkf_char c1);
347 static void e_oconv(nkf_char c2,nkf_char c1);
348 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
349 static void s_oconv(nkf_char c2,nkf_char c1);
350 static void j_oconv(nkf_char c2,nkf_char c1);
351 static void fold_conv(nkf_char c2,nkf_char c1);
352 static void nl_conv(nkf_char c2,nkf_char c1);
353 static void z_conv(nkf_char c2,nkf_char c1);
354 static void rot_conv(nkf_char c2,nkf_char c1);
355 static void hira_conv(nkf_char c2,nkf_char c1);
356 static void base64_conv(nkf_char c2,nkf_char c1);
357 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
358 static void no_connection(nkf_char c2,nkf_char c1);
359 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
361 static void code_score(struct input_code *ptr);
362 static void code_status(nkf_char c);
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_begin(FILE *f);
372 static nkf_char mime_getc(FILE *f);
373 static nkf_char mime_ungetc(nkf_char c,FILE *f);
375 static void switch_mime_getc(void);
376 static void unswitch_mime_getc(void);
377 static nkf_char mime_begin_strict(FILE *f);
378 static nkf_char mime_getc_buf(FILE *f);
379 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
380 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
382 static nkf_char base64decode(nkf_char c);
383 static void mime_prechar(nkf_char c2, nkf_char c1);
384 static void mime_putc(nkf_char c);
385 static void open_mime(nkf_char c);
386 static void close_mime(void);
387 static void eof_mime(void);
388 static void mimeout_addchar(nkf_char c);
390 static void usage(void);
391 static void version(void);
393 static void options(unsigned char *c);
394 static void reinit(void);
398 #if !defined(PERL_XS) && !defined(WIN32DLL)
399 static unsigned char stdibuf[IOBUF_SIZE];
400 static unsigned char stdobuf[IOBUF_SIZE];
402 static unsigned char hold_buf[HOLD_SIZE*2];
403 static int hold_count = 0;
405 /* MIME preprocessor fifo */
/*
 * mime_buf is addressed through Fifo(n), which masks the index with
 * MIME_BUF_MASK (MIME_BUF_SIZE-1).  That wrap-around only works while
 * MIME_BUF_SIZE stays a power of two.
 */
407 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
408 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
409 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
410 static unsigned char mime_buf[MIME_BUF_SIZE];
/* Unsigned counters into the fifo; presumably passed to Fifo() so they may
 * grow past MIME_BUF_SIZE -- TODO confirm against the MIME reader code. */
411 static unsigned int mime_top = 0;
412 static unsigned int mime_last = 0; /* decoded */
413 static unsigned int mime_input = 0; /* undecoded */
/* Converter pointer with the same signature as iconv; starts out NULL.
 * NOTE(review): looks like a saved copy of iconv for MIME decoding -- its
 * use is not visible in this excerpt, confirm. */
414 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/*
 * Global option flags with their start-up defaults.
 * mime_f and x0201_f take their defaults from MIME_DECODE_DEFAULT and
 * X0201_DEFAULT, which can be overridden at build time.
 * input_f is assigned one of the *_INPUT codes by the "ic=" long option
 * handling in options().
 */
417 static int unbuf_f = FALSE;
418 static int estab_f = FALSE;
419 static int nop_f = FALSE;
420 static int binmode_f = TRUE; /* binary mode */
421 static int rot_f = FALSE; /* ROT mode (comment originally read "rot14/43"; presumably ROT13/47 -- TODO confirm) */
422 static int hira_f = FALSE; /* hiragana/katakana conversion */
423 static int input_f = FALSE; /* non fixed input code */
424 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
425 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
426 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
427 static int mimebuf_f = FALSE; /* MIME buffered input */
428 static int broken_f = FALSE; /* convert ESC-less broken JIS */
429 static int iso8859_f = FALSE; /* ISO8859 through */
430 static int mimeout_f = FALSE; /* base64 mode */
431 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
432 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
434 #ifdef UNICODE_NORMALIZATION
435 static int nfc_f = FALSE;
436 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
437 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
438 static nkf_char nfc_getc(FILE *f);
439 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
443 static int cap_f = FALSE;
444 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
445 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
446 static nkf_char cap_getc(FILE *f);
447 static nkf_char cap_ungetc(nkf_char c,FILE *f);
449 static int url_f = FALSE;
450 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
451 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
452 static nkf_char url_getc(FILE *f);
453 static nkf_char url_ungetc(nkf_char c,FILE *f);
456 #if defined(INT_IS_SHORT)
457 #define NKF_INT32_C(n) (n##L)
459 #define NKF_INT32_C(n) (n)
/*
 * Tagged character values.
 *
 * The top 8 bits (CLASS_MASK) hold a class tag and the low 24 bits
 * (VALUE_MASK) hold the value.  CLASS_UNICODE marks a value that carries a
 * Unicode code point; UNICODE_MAX is the highest Unicode code point.
 * NOTE(review): PREFIX_EUCG3 is presumably the tag combined with c2 for
 * EUC G3 characters (cf. is_eucg3) -- confirm against its users.
 *
 * The predicate macros parenthesize their parameter so that expression
 * arguments such as `CLASS_UNICODE | val` are masked as a whole.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* True when the class tag of c is CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value part of c is at most 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
469 #ifdef NUMCHAR_OPTION
470 static int numchar_f = FALSE;
471 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
472 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
473 static nkf_char numchar_getc(FILE *f);
474 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
478 static int noout_f = FALSE;
479 static void no_putc(nkf_char c);
480 static int debug_f = FALSE;
481 static void debug(const char *str);
482 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
485 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
487 static void print_guessed_code(char *filename);
489 static void set_input_codename(char *codename);
492 static int exec_f = 0;
495 #ifdef SHIFTJIS_CP932
496 /* invert IBM extended characters to others */
497 static int cp51932_f = FALSE;
499 /* invert NEC-selected IBM extended characters to IBM extended characters */
500 static int cp932inv_f = TRUE;
502 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
503 #endif /* SHIFTJIS_CP932 */
506 static int x0212_f = FALSE;
507 static nkf_char x0212_shift(nkf_char c);
508 static nkf_char x0212_unshift(nkf_char c);
510 static int x0213_f = FALSE;
512 static unsigned char prefix_table[256];
514 static void set_code_score(struct input_code *ptr, nkf_char score);
515 static void clr_code_score(struct input_code *ptr, nkf_char score);
516 static void status_disable(struct input_code *ptr);
517 static void status_push_ch(struct input_code *ptr, nkf_char c);
518 static void status_clear(struct input_code *ptr);
519 static void status_reset(struct input_code *ptr);
520 static void status_reinit(struct input_code *ptr);
521 static void status_check(struct input_code *ptr, nkf_char c);
522 static void e_status(struct input_code *, nkf_char);
523 static void s_status(struct input_code *, nkf_char);
525 struct input_code input_code_list[] = {
526 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
527 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
528 #ifdef UTF8_INPUT_ENABLE
529 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
530 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
531 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
536 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
537 static int base64_count = 0;
539 /* X0208 -> ASCII converter */
542 static int f_line = 0; /* chars in line */
543 static int f_prev = 0;
544 static int fold_preserve_f = FALSE; /* preserve new lines */
545 static int fold_f = FALSE;
546 static int fold_len = 0;
549 static unsigned char kanji_intro = DEFAULT_J;
550 static unsigned char ascii_intro = DEFAULT_R;
554 #define FOLD_MARGIN 10
555 #define DEFAULT_FOLD 60
557 static int fold_margin = FOLD_MARGIN;
561 #ifdef DEFAULT_CODE_JIS
562 # define DEFAULT_CONV j_oconv
564 #ifdef DEFAULT_CODE_SJIS
565 # define DEFAULT_CONV s_oconv
567 #ifdef DEFAULT_CODE_EUC
568 # define DEFAULT_CONV e_oconv
570 #ifdef DEFAULT_CODE_UTF8
571 # define DEFAULT_CONV w_oconv
/*
 * Function-pointer hooks for conversion and I/O.
 * output_conv starts as DEFAULT_CONV (selected by the DEFAULT_CODE_*
 * build macros) and is reassigned by the "oc=" long option in options().
 * All the other conversion hooks start as the no_connection /
 * no_connection2 placeholders.
 * NOTE(review): presumably module_connection() wires the real stages in
 * -- confirm, that function is not visible here.
 */
574 /* process default */
575 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
577 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
578 /* s_iconv or oconv */
579 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
/* One hook per optional output stage; each has a matching *_conv function
 * declared earlier (z_conv, fold_conv, nl_conv, rot_conv, hira_conv,
 * base64_conv, iso2022jp_check_conv). */
581 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
587 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
/* Byte-level I/O hooks; every one defaults to the std_getc / std_ungetc /
 * std_putc implementations. */
589 /* static redirections */
591 static void (*o_putc)(nkf_char c) = std_putc;
593 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
594 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
/* NOTE(review): the trailing comment below repeats "input of mgetc" from
 * i_mgetc; given the broken_getc prototypes it presumably should read
 * "input of bgetc" -- confirm. */
596 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
597 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
599 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
601 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
602 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
604 /* for strict mime */
605 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
606 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
609 static int output_mode = ASCII, /* output kanji mode */
610 input_mode = ASCII, /* input kanji mode */
611 shift_mode = FALSE; /* TRUE shift out, or X0201 */
612 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
614 /* X0201 / X0208 conversion tables */
616 /* X0201 kana conversion table */
618 static const unsigned char cv[]= {
619 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
620 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
621 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
622 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
623 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
624 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
625 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
626 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
627 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
628 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
629 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
630 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
631 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
632 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
633 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
634 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
638 /* X0201 kana conversion table for dakuten */
640 static const unsigned char dv[]= {
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
646 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
647 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
648 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
649 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
650 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
652 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 /* X0201 kana conversion table for han-dakuten */
661 static const unsigned char ev[]= {
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
673 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
681 /* X0208 kigou conversion table */
682 /* 0x8140 - 0x819e */
683 static const unsigned char fv[] = {
685 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
686 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
687 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
688 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
689 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
690 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
691 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
692 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
693 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
694 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
695 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
696 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
701 static int file_out_f = FALSE;
703 static int overwrite_f = FALSE;
704 static int preserve_time_f = FALSE;
705 static int backup_f = FALSE;
706 static char *backup_suffix = "";
707 static char *get_backup_filename(const char *suffix, const char *filename);
710 static int nlmode_f = 0; /* CR, LF, CRLF */
711 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
712 static nkf_char prev_cr = 0; /* CR or 0 */
713 #ifdef EASYWIN /*Easy Win */
714 static int end_check;
717 #define STD_GC_BUFSIZE (256)
718 nkf_char std_gc_buf[STD_GC_BUFSIZE];
722 #include "nkf32dll.c"
723 #elif defined(PERL_XS)
725 int main(int argc, char **argv)
730 char *outfname = NULL;
733 #ifdef EASYWIN /*Easy Win */
734 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
737 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
738 cp = (unsigned char *)*argv;
742 int debug_f_back = debug_f;
745 int exec_f_back = exec_f;
748 int x0212_f_back = x0212_f;
750 int x0213_f_back = x0213_f;
751 int guess_f_back = guess_f;
753 guess_f = guess_f_back;
756 debug_f = debug_f_back;
759 exec_f = exec_f_back;
762 x0212_f = x0212_f_back;
764 x0213_f = x0213_f_back;
769 if (pipe(fds) < 0 || (pid = fork()) < 0){
780 execvp(argv[1], &argv[1]);
795 if (binmode_f == TRUE)
796 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
797 if (freopen("","wb",stdout) == NULL)
804 setbuf(stdout, (char *) NULL);
806 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
809 if (binmode_f == TRUE)
810 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
811 if (freopen("","rb",stdin) == NULL) return (-1);
815 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
819 kanji_convert(stdin);
820 if (guess_f) print_guessed_code(NULL);
824 int is_argument_error = FALSE;
826 input_codename = NULL;
831 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
834 is_argument_error = TRUE;
842 /* reopen file for stdout */
843 if (file_out_f == TRUE) {
846 outfname = malloc(strlen(origfname)
847 + strlen(".nkftmpXXXXXX")
853 strcpy(outfname, origfname);
857 for (i = strlen(outfname); i; --i){
858 if (outfname[i - 1] == '/'
859 || outfname[i - 1] == '\\'){
865 strcat(outfname, "ntXXXXXX");
867 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
870 strcat(outfname, ".nkftmpXXXXXX");
871 fd = mkstemp(outfname);
874 || (fd_backup = dup(fileno(stdout))) < 0
875 || dup2(fd, fileno(stdout)) < 0
886 outfname = "nkf.out";
889 if(freopen(outfname, "w", stdout) == NULL) {
893 if (binmode_f == TRUE) {
894 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
895 if (freopen("","wb",stdout) == NULL)
902 if (binmode_f == TRUE)
903 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
904 if (freopen("","rb",fin) == NULL)
909 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
913 char *filename = NULL;
915 if (nfiles > 1) filename = origfname;
916 if (guess_f) print_guessed_code(filename);
922 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
930 if (dup2(fd_backup, fileno(stdout)) < 0){
933 if (stat(origfname, &sb)) {
934 fprintf(stderr, "Can't stat %s\n", origfname);
936 /* restore the original file's permissions */
937 if (chmod(outfname, sb.st_mode)) {
938 fprintf(stderr, "Can't set permission %s\n", outfname);
941 /* restore the original file's timestamps */
943 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
944 tb[0] = tb[1] = sb.st_mtime;
945 if (utime(outfname, tb)) {
946 fprintf(stderr, "Can't set timestamp %s\n", outfname);
949 tb.actime = sb.st_atime;
950 tb.modtime = sb.st_mtime;
951 if (utime(outfname, &tb)) {
952 fprintf(stderr, "Can't set timestamp %s\n", outfname);
957 char *backup_filename = get_backup_filename(backup_suffix, origfname);
959 unlink(backup_filename);
961 if (rename(origfname, backup_filename)) {
962 perror(backup_filename);
963 fprintf(stderr, "Can't rename %s to %s\n",
964 origfname, backup_filename);
968 if (unlink(origfname)){
973 if (rename(outfname, origfname)) {
975 fprintf(stderr, "Can't rename %s to %s\n",
976 outfname, origfname);
983 if (is_argument_error)
986 #ifdef EASYWIN /*Easy Win */
987 if (file_out_f == FALSE)
988 scanf("%d",&end_check);
991 #else /* for Other OS */
992 if (file_out_f == TRUE)
997 #endif /* WIN32DLL */
/*
 * Build the name under which the original file is kept as a backup.
 *
 *   suffix   - backup pattern.  If it contains one or more '*', every '*'
 *              is replaced by `filename` and all other characters are
 *              copied literally.  Otherwise the result is `filename`
 *              immediately followed by `suffix`.
 *   filename - name of the file being processed.
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller,
 * or NULL (after reporting through perror) when the allocation fails.
 *
 * The buffer is sized from the exact result length in both branches.  The
 * previous no-asterisk branch allocated a single byte (`malloc( + 1)`) and
 * then copied filename and suffix into it, overflowing the heap.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t result_length;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* each '*' is dropped and one copy of filename takes its place */
	result_length = (suffix_length - asterisk_count)
	    + asterisk_count * filename_length;
    } else {
	result_length = filename_length + suffix_length;
    }

    backup_filename = malloc(result_length + 1);
    if (!backup_filename) {
	perror("Can't malloc backup filename.");
	return NULL;
    }

    if (asterisk_count) {
	for (i = 0, j = 0; i < suffix_length; i++) {
	    if (suffix[i] == '*') {
		memcpy(backup_filename + j, filename, filename_length);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
    } else {
	memcpy(backup_filename, filename, filename_length);
	memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[result_length] = '\0';

    return backup_filename;
}
1040 static const struct {
1064 {"katakana-hiragana","h3"},
1072 #ifdef UTF8_OUTPUT_ENABLE
1082 {"fb-subchar=", ""},
1084 #ifdef UTF8_INPUT_ENABLE
1085 {"utf8-input", "W"},
1086 {"utf16-input", "W16"},
1087 {"no-cp932ext", ""},
1088 {"no-best-fit-chars",""},
1090 #ifdef UNICODE_NORMALIZATION
1091 {"utf8mac-input", ""},
1103 #ifdef NUMCHAR_OPTION
1104 {"numchar-input", ""},
1110 #ifdef SHIFTJIS_CP932
1120 static int option_mode = 0;
1122 void options(unsigned char *cp)
1126 unsigned char *cp_back = NULL;
1131 while(*cp && *cp++!='-');
1132 while (*cp || cp_back) {
1140 case '-': /* literal options */
1141 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1145 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1146 p = (unsigned char *)long_option[i].name;
1147 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1148 if (*p == cp[j] || cp[j] == SP){
1155 fprintf(stderr, "unknown long option: --%s\n", cp);
1158 while(*cp && *cp != SP && cp++);
1159 if (long_option[i].alias[0]){
1161 cp = (unsigned char *)long_option[i].alias;
1163 if (strcmp(long_option[i].name, "ic=") == 0){
1164 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1165 codeset[i] = nkf_toupper(p[i]);
1168 if(strcmp(codeset, "ISO-2022-JP") == 0){
1169 input_f = JIS_INPUT;
1170 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1171 strcmp(codeset, "CP50220") == 0 ||
1172 strcmp(codeset, "CP50221") == 0 ||
1173 strcmp(codeset, "CP50222") == 0){
1174 input_f = JIS_INPUT;
1175 #ifdef SHIFTJIS_CP932
1178 #ifdef UTF8_OUTPUT_ENABLE
1179 ms_ucs_map_f = UCS_MAP_CP932;
1181 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1182 input_f = JIS_INPUT;
1186 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1187 input_f = JIS_INPUT;
1192 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1193 input_f = SJIS_INPUT;
1194 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1195 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1196 strcmp(codeset, "CP932") == 0 ||
1197 strcmp(codeset, "MS932") == 0){
1198 input_f = SJIS_INPUT;
1199 #ifdef SHIFTJIS_CP932
1202 #ifdef UTF8_OUTPUT_ENABLE
1203 ms_ucs_map_f = UCS_MAP_CP932;
1205 }else if(strcmp(codeset, "CP10001") == 0){
1206 input_f = SJIS_INPUT;
1207 #ifdef SHIFTJIS_CP932
1210 #ifdef UTF8_OUTPUT_ENABLE
1211 ms_ucs_map_f = UCS_MAP_CP10001;
1213 }else if(strcmp(codeset, "EUCJP") == 0 ||
1214 strcmp(codeset, "EUC-JP") == 0){
1215 input_f = EUC_INPUT;
1216 }else if(strcmp(codeset, "CP51932") == 0){
1217 input_f = EUC_INPUT;
1218 #ifdef SHIFTJIS_CP932
1221 #ifdef UTF8_OUTPUT_ENABLE
1222 ms_ucs_map_f = UCS_MAP_CP932;
1224 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1225 strcmp(codeset, "EUCJP-MS") == 0 ||
1226 strcmp(codeset, "EUCJPMS") == 0){
1227 input_f = EUC_INPUT;
1228 #ifdef SHIFTJIS_CP932
1231 #ifdef UTF8_OUTPUT_ENABLE
1232 ms_ucs_map_f = UCS_MAP_MS;
1234 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1235 strcmp(codeset, "EUCJP-ASCII") == 0){
1236 input_f = EUC_INPUT;
1237 #ifdef SHIFTJIS_CP932
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_ASCII;
1243 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1244 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1245 input_f = SJIS_INPUT;
1247 #ifdef SHIFTJIS_CP932
1250 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1251 strcmp(codeset, "EUC-JIS-2004") == 0){
1252 input_f = EUC_INPUT;
1254 #ifdef SHIFTJIS_CP932
1257 #ifdef UTF8_INPUT_ENABLE
1258 }else if(strcmp(codeset, "UTF-8") == 0 ||
1259 strcmp(codeset, "UTF-8N") == 0 ||
1260 strcmp(codeset, "UTF-8-BOM") == 0){
1261 input_f = UTF8_INPUT;
1262 #ifdef UNICODE_NORMALIZATION
1263 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1264 strcmp(codeset, "UTF-8-MAC") == 0){
1265 input_f = UTF8_INPUT;
1268 }else if(strcmp(codeset, "UTF-16") == 0 ||
1269 strcmp(codeset, "UTF-16BE") == 0 ||
1270 strcmp(codeset, "UTF-16BE-BOM") == 0){
1271 input_f = UTF16_INPUT;
1272 input_endian = ENDIAN_BIG;
1273 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1274 strcmp(codeset, "UTF-16LE-BOM") == 0){
1275 input_f = UTF16_INPUT;
1276 input_endian = ENDIAN_LITTLE;
1277 }else if(strcmp(codeset, "UTF-32") == 0 ||
1278 strcmp(codeset, "UTF-32BE") == 0 ||
1279 strcmp(codeset, "UTF-32BE-BOM") == 0){
1280 input_f = UTF32_INPUT;
1281 input_endian = ENDIAN_BIG;
1282 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1283 strcmp(codeset, "UTF-32LE-BOM") == 0){
1284 input_f = UTF32_INPUT;
1285 input_endian = ENDIAN_LITTLE;
1288 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1292 if (strcmp(long_option[i].name, "oc=") == 0){
1294 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1295 codeset[i] = nkf_toupper(p[i]);
1298 if(strcmp(codeset, "ISO-2022-JP") == 0){
1299 output_conv = j_oconv;
1300 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1301 output_conv = j_oconv;
1302 no_cp932ext_f = TRUE;
1303 #ifdef SHIFTJIS_CP932
1306 #ifdef UTF8_OUTPUT_ENABLE
1307 ms_ucs_map_f = UCS_MAP_CP932;
1309 }else if(strcmp(codeset, "CP50220") == 0){
1310 output_conv = j_oconv;
1312 #ifdef SHIFTJIS_CP932
1315 #ifdef UTF8_OUTPUT_ENABLE
1316 ms_ucs_map_f = UCS_MAP_CP932;
1318 }else if(strcmp(codeset, "CP50221") == 0){
1319 output_conv = j_oconv;
1320 #ifdef SHIFTJIS_CP932
1323 #ifdef UTF8_OUTPUT_ENABLE
1324 ms_ucs_map_f = UCS_MAP_CP932;
1326 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1327 output_conv = j_oconv;
1331 #ifdef SHIFTJIS_CP932
1334 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1335 output_conv = j_oconv;
1340 #ifdef SHIFTJIS_CP932
1343 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1344 output_conv = s_oconv;
1345 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1346 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1347 strcmp(codeset, "CP932") == 0 ||
1348 strcmp(codeset, "MS932") == 0){
1349 output_conv = s_oconv;
1350 #ifdef UTF8_OUTPUT_ENABLE
1351 ms_ucs_map_f = UCS_MAP_CP932;
1353 }else if(strcmp(codeset, "CP10001") == 0){
1354 output_conv = s_oconv;
1355 #ifdef UTF8_OUTPUT_ENABLE
1356 ms_ucs_map_f = UCS_MAP_CP10001;
1358 }else if(strcmp(codeset, "EUCJP") == 0 ||
1359 strcmp(codeset, "EUC-JP") == 0){
1360 output_conv = e_oconv;
1361 }else if(strcmp(codeset, "CP51932") == 0){
1362 output_conv = e_oconv;
1363 #ifdef SHIFTJIS_CP932
1366 #ifdef UTF8_OUTPUT_ENABLE
1367 ms_ucs_map_f = UCS_MAP_CP932;
1369 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1370 strcmp(codeset, "EUCJP-MS") == 0 ||
1371 strcmp(codeset, "EUCJPMS") == 0){
1372 output_conv = e_oconv;
1376 #ifdef UTF8_OUTPUT_ENABLE
1377 ms_ucs_map_f = UCS_MAP_MS;
1379 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1380 strcmp(codeset, "EUCJP-ASCII") == 0){
1381 output_conv = e_oconv;
1385 #ifdef UTF8_OUTPUT_ENABLE
1386 ms_ucs_map_f = UCS_MAP_ASCII;
1388 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1389 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1390 output_conv = s_oconv;
1392 #ifdef SHIFTJIS_CP932
1395 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1396 strcmp(codeset, "EUC-JIS-2004") == 0){
1397 output_conv = e_oconv;
1402 #ifdef SHIFTJIS_CP932
1405 #ifdef UTF8_OUTPUT_ENABLE
1406 }else if(strcmp(codeset, "UTF-8") == 0){
1407 output_conv = w_oconv;
1408 }else if(strcmp(codeset, "UTF-8N") == 0){
1409 output_conv = w_oconv;
1410 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1411 output_conv = w_oconv;
1412 output_bom_f = TRUE;
1413 }else if(strcmp(codeset, "UTF-16BE") == 0){
1414 output_conv = w_oconv16;
1415 }else if(strcmp(codeset, "UTF-16") == 0 ||
1416 strcmp(codeset, "UTF-16BE-BOM") == 0){
1417 output_conv = w_oconv16;
1418 output_bom_f = TRUE;
1419 }else if(strcmp(codeset, "UTF-16LE") == 0){
1420 output_conv = w_oconv16;
1421 output_endian = ENDIAN_LITTLE;
1422 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1423 output_conv = w_oconv16;
1424 output_endian = ENDIAN_LITTLE;
1425 output_bom_f = TRUE;
1426 }else if(strcmp(codeset, "UTF-32") == 0 ||
1427 strcmp(codeset, "UTF-32BE") == 0){
1428 output_conv = w_oconv32;
1429 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1430 output_conv = w_oconv32;
1431 output_bom_f = TRUE;
1432 }else if(strcmp(codeset, "UTF-32LE") == 0){
1433 output_conv = w_oconv32;
1434 output_endian = ENDIAN_LITTLE;
1435 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1436 output_conv = w_oconv32;
1437 output_endian = ENDIAN_LITTLE;
1438 output_bom_f = TRUE;
1441 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1445 if (strcmp(long_option[i].name, "guess=") == 0){
1454 if (strcmp(long_option[i].name, "overwrite") == 0){
1457 preserve_time_f = TRUE;
1460 if (strcmp(long_option[i].name, "overwrite=") == 0){
1463 preserve_time_f = TRUE;
1465 backup_suffix = malloc(strlen((char *) p) + 1);
1466 strcpy(backup_suffix, (char *) p);
1469 if (strcmp(long_option[i].name, "in-place") == 0){
1472 preserve_time_f = FALSE;
1475 if (strcmp(long_option[i].name, "in-place=") == 0){
1478 preserve_time_f = FALSE;
1480 backup_suffix = malloc(strlen((char *) p) + 1);
1481 strcpy(backup_suffix, (char *) p);
1486 if (strcmp(long_option[i].name, "cap-input") == 0){
1490 if (strcmp(long_option[i].name, "url-input") == 0){
1495 #ifdef NUMCHAR_OPTION
1496 if (strcmp(long_option[i].name, "numchar-input") == 0){
1502 if (strcmp(long_option[i].name, "no-output") == 0){
1506 if (strcmp(long_option[i].name, "debug") == 0){
1511 if (strcmp(long_option[i].name, "cp932") == 0){
1512 #ifdef SHIFTJIS_CP932
1516 #ifdef UTF8_OUTPUT_ENABLE
1517 ms_ucs_map_f = UCS_MAP_CP932;
1521 if (strcmp(long_option[i].name, "no-cp932") == 0){
1522 #ifdef SHIFTJIS_CP932
1526 #ifdef UTF8_OUTPUT_ENABLE
1527 ms_ucs_map_f = UCS_MAP_ASCII;
1531 #ifdef SHIFTJIS_CP932
1532 if (strcmp(long_option[i].name, "cp932inv") == 0){
1539 if (strcmp(long_option[i].name, "x0212") == 0){
1546 if (strcmp(long_option[i].name, "exec-in") == 0){
1550 if (strcmp(long_option[i].name, "exec-out") == 0){
1555 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1556 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1557 no_cp932ext_f = TRUE;
1560 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1561 no_best_fit_chars_f = TRUE;
1564 if (strcmp(long_option[i].name, "fb-skip") == 0){
1565 encode_fallback = NULL;
1568 if (strcmp(long_option[i].name, "fb-html") == 0){
1569 encode_fallback = encode_fallback_html;
1572 if (strcmp(long_option[i].name, "fb-xml") == 0){
1573 encode_fallback = encode_fallback_xml;
1576 if (strcmp(long_option[i].name, "fb-java") == 0){
1577 encode_fallback = encode_fallback_java;
1580 if (strcmp(long_option[i].name, "fb-perl") == 0){
1581 encode_fallback = encode_fallback_perl;
1584 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1585 encode_fallback = encode_fallback_subchar;
1588 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1589 encode_fallback = encode_fallback_subchar;
1590 unicode_subchar = 0;
1592 /* decimal number */
1593 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1594 unicode_subchar *= 10;
1595 unicode_subchar += hex2bin(p[i]);
1597 }else if(p[1] == 'x' || p[1] == 'X'){
1598 /* hexadecimal number */
1599 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1600 unicode_subchar <<= 4;
1601 unicode_subchar |= hex2bin(p[i]);
1605 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1606 unicode_subchar *= 8;
1607 unicode_subchar += hex2bin(p[i]);
1610 w16e_conv(unicode_subchar, &i, &j);
1611 unicode_subchar = i<<8 | j;
1615 #ifdef UTF8_OUTPUT_ENABLE
1616 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1617 ms_ucs_map_f = UCS_MAP_MS;
1621 #ifdef UNICODE_NORMALIZATION
1622 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1623 input_f = UTF8_INPUT;
1628 if (strcmp(long_option[i].name, "prefix=") == 0){
1629 if (nkf_isgraph(p[0])){
1630 for (i = 1; nkf_isgraph(p[i]); i++){
1631 prefix_table[p[i]] = p[0];
1638 case 'b': /* buffered mode */
1641 case 'u': /* non bufferd mode */
1644 case 't': /* transparent mode */
1649 } else if (*cp=='2') {
1653 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1661 case 'j': /* JIS output */
1663 output_conv = j_oconv;
1665 case 'e': /* AT&T EUC output */
1666 output_conv = e_oconv;
1669 case 's': /* SJIS output */
1670 output_conv = s_oconv;
1672 case 'l': /* ISO8859 Latin-1 support, no conversion */
1673 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1674 input_f = LATIN1_INPUT;
1676 case 'i': /* Kanji IN ESC-$-@/B */
1677 if (*cp=='@'||*cp=='B')
1678 kanji_intro = *cp++;
1680 case 'o': /* ASCII IN ESC-(-J/B */
1681 if (*cp=='J'||*cp=='B'||*cp=='H')
1682 ascii_intro = *cp++;
1686 bit:1 katakana->hiragana
1687 bit:2 hiragana->katakana
1689 if ('9'>= *cp && *cp>='0')
1690 hira_f |= (*cp++ -'0');
1697 #if defined(MSDOS) || defined(__OS2__)
1704 show_configuration();
1712 #ifdef UTF8_OUTPUT_ENABLE
1713 case 'w': /* UTF-8 output */
1715 output_conv = w_oconv; cp++;
1719 output_bom_f = TRUE;
1722 if ('1'== cp[0] && '6'==cp[1]) {
1723 output_conv = w_oconv16; cp+=2;
1724 } else if ('3'== cp[0] && '2'==cp[1]) {
1725 output_conv = w_oconv32; cp+=2;
1727 output_conv = w_oconv;
1732 output_endian = ENDIAN_LITTLE;
1733 } else if (cp[0] == 'B') {
1741 output_bom_f = TRUE;
1746 #ifdef UTF8_INPUT_ENABLE
1747 case 'W': /* UTF input */
1750 input_f = UTF8_INPUT;
1752 if ('1'== cp[0] && '6'==cp[1]) {
1754 input_f = UTF16_INPUT;
1755 input_endian = ENDIAN_BIG;
1756 } else if ('3'== cp[0] && '2'==cp[1]) {
1758 input_f = UTF32_INPUT;
1759 input_endian = ENDIAN_BIG;
1761 input_f = UTF8_INPUT;
1766 input_endian = ENDIAN_LITTLE;
1767 } else if (cp[0] == 'B') {
1773 /* Input code assumption */
1774 case 'J': /* JIS input */
1775 input_f = JIS_INPUT;
1777 case 'E': /* AT&T EUC input */
1778 input_f = EUC_INPUT;
1780 case 'S': /* MS Kanji input */
1781 input_f = SJIS_INPUT;
1783 case 'Z': /* Convert X0208 alphabet to asii */
1785 bit:0 Convert JIS X 0208 Alphabet to ASCII
1786 bit:1 Convert Kankaku to one space
1787 bit:2 Convert Kankaku to two spaces
1788 bit:3 Convert HTML Entity
1789 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1791 while ('0'<= *cp && *cp <='9') {
1792 alpha_f |= 1 << (*cp++ - '0');
1794 if (!alpha_f) alpha_f = 1;
1796 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1797 x0201_f = FALSE; /* No X0201->X0208 conversion */
1799 ESC-(-I in JIS, EUC, MS Kanji
1800 SI/SO in JIS, EUC, MS Kanji
1801 SSO in EUC, JIS, not in MS Kanji
1802 MS Kanji (0xa0-0xdf)
1804 ESC-(-I in JIS (0x20-0x5f)
1805 SSO in EUC (0xa0-0xdf)
1806 0xa0-0xd in MS Kanji (0xa0-0xdf)
1809 case 'X': /* Convert X0201 kana to X0208 */
1812 case 'F': /* prserve new lines */
1813 fold_preserve_f = TRUE;
1814 case 'f': /* folding -f60 or -f */
1817 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1819 fold_len += *cp++ - '0';
1821 if (!(0<fold_len && fold_len<BUFSIZ))
1822 fold_len = DEFAULT_FOLD;
1826 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1828 fold_margin += *cp++ - '0';
1832 case 'm': /* MIME support */
1833 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1834 if (*cp=='B'||*cp=='Q') {
1835 mime_decode_mode = *cp++;
1836 mimebuf_f = FIXED_MIME;
1837 } else if (*cp=='N') {
1838 mime_f = TRUE; cp++;
1839 } else if (*cp=='S') {
1840 mime_f = STRICT_MIME; cp++;
1841 } else if (*cp=='0') {
1842 mime_decode_f = FALSE;
1843 mime_f = FALSE; cp++;
1846 case 'M': /* MIME output */
1849 mimeout_f = FIXED_MIME; cp++;
1850 } else if (*cp=='Q') {
1852 mimeout_f = FIXED_MIME; cp++;
1857 case 'B': /* Broken JIS support */
1859 bit:1 allow any x on ESC-(-x or ESC-$-x
1860 bit:2 reset to ascii on NL
1862 if ('9'>= *cp && *cp>='0')
1863 broken_f |= 1<<(*cp++ -'0');
1868 case 'O':/* for Output file */
1872 case 'c':/* add cr code */
1875 case 'd':/* delete cr code */
1878 case 'I': /* ISO-2022-JP output */
1881 case 'L': /* line mode */
1882 if (*cp=='u') { /* unix */
1883 nlmode_f = LF; cp++;
1884 } else if (*cp=='m') { /* mac */
1885 nlmode_f = CR; cp++;
1886 } else if (*cp=='w') { /* windows */
1887 nlmode_f = CRLF; cp++;
1888 } else if (*cp=='0') { /* no conversion */
1897 } else if (*cp == '0') {
1906 /* module muliple options in a string are allowed for Perl moudle */
1907 while(*cp && *cp++!='-');
1910 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1911 /* bogus option but ignored */
/*
 * Look up an entry of input_code_list by its converter function: the walk
 * starts at input_code_list and compares each entry's iconv_func member
 * with the iconv_func argument.
 * NOTE(review): the loop and the return statements are not visible here;
 * presumably the matching entry is returned, or 0 when none matches --
 * confirm.
 */
1917 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1920 struct input_code *p = input_code_list;
1922 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - mode flag; the value -TRUE means "force" (see the
 *                INPUT_CODE_FIX condition below).
 *   iconv_func - candidate converter.
 * When the encoding is established (estab_f) and the active converter
 * differs from iconv_for_check, the name of the matching input_code entry
 * is reported through set_input_codename() and iconv_for_check is updated.
 * NOTE(review): the statements that assign estab_f and iconv are not
 * visible here -- confirm against the full body.
 */
1931 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1933 #ifdef INPUT_CODE_FIX
1941 #ifdef INPUT_CODE_FIX
1942 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1948 if (estab_f && iconv_for_check != iconv){
1949 struct input_code *p = find_inputcode_byfunc(iconv);
1951 set_input_codename(p->name);
1954 iconv_for_check = iconv;
/*
 * Bit flags OR-ed into input_code.score while candidate encodings are
 * being evaluated.  Each flag is the previous one shifted left by one bit.
 */
1959 #define SCORE_L2 (1) /* JIS level-2 kanji */
1960 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1961 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1962 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped by CP932 (IBM extended characters) */
1963 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
1964 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent characters */
1965 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1966 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Score value that status_reset() stores into a candidate. */
1968 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score flags used by code_score() when (c2 & 0x70) == 0x20; the table is
 * indexed with c2 & 0x0f.
 * NOTE(review): only part of the initializer is visible here.
 */
1970 static const char score_table_A0[] = {
1973 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1974 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags used by code_score() when (c2 & 0x70) == 0x70; the table is
 * indexed with c2 & 0x0f (sixteen entries, one per low nibble).
 */
1977 static const char score_table_F0[] = {
1978 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1979 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1980 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1981 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * Set (bitwise OR) the given SCORE_* flag bits in ptr->score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
1984 void set_code_score(struct input_code *ptr, nkf_char score)
1987 ptr->score |= score;
/*
 * Clear the given SCORE_* flag bits in ptr->score (AND with the complement).
 * NOTE(review): any guard around the assignment is not visible here.
 */
1991 void clr_code_score(struct input_code *ptr, nkf_char score)
1994 ptr->score &= ~score;
/*
 * Update ptr->score from the character currently held in ptr->buf:
 * buf[0] is examined as c2 and, when UTF8_OUTPUT_ENABLE is defined, buf[1]
 * as c1.  The first matching branch of the chain below adds one flag.
 * NOTE(review): the condition guarding the SCORE_ERROR branch is not
 * visible here.
 */
1998 void code_score(struct input_code *ptr)
2000 nkf_char c2 = ptr->buf[0];
2001 #ifdef UTF8_OUTPUT_ENABLE
2002 nkf_char c1 = ptr->buf[1];
2005 set_code_score(ptr, SCORE_ERROR);
2006 }else if (c2 == SSO){
2007 set_code_score(ptr, SCORE_KANA);
2008 }else if (c2 == 0x8f){
2009 set_code_score(ptr, SCORE_X0212);
2010 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returning zero is scored as a nonexistent character */
2011 }else if (!e2w_conv(c2, c1)){
2012 set_code_score(ptr, SCORE_NO_EXIST);
/* remaining branches classify by bits 4-6 of c2; tables use the low nibble */
2014 }else if ((c2 & 0x70) == 0x20){
2015 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2016 }else if ((c2 & 0x70) == 0x70){
2017 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2018 }else if ((c2 & 0x70) >= 0x50){
2019 set_code_score(ptr, SCORE_L2);
/*
 * Rule out the candidate encoding described by ptr.  If the currently
 * selected converter (iconv) is this candidate's iconv_func, the selection
 * is withdrawn with set_iconv(FALSE, 0).
 * NOTE(review): the statements that mark ptr itself as disabled are not
 * visible here.
 */
2023 void status_disable(struct input_code *ptr)
2028 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append c to ptr->buf at position ptr->index and advance the index.
 * No bounds check is visible on this line.
 */
2031 void status_push_ch(struct input_code *ptr, nkf_char c)
2033 ptr->buf[ptr->index++] = c;
/*
 * NOTE(review): the body is not visible here; presumably it clears the
 * per-character state (stat/index) of ptr -- confirm.
 */
2036 void status_clear(struct input_code *ptr)
/*
 * Reset the candidate: ptr->score is set back to SCORE_INIT.
 * NOTE(review): other statements of the body are not visible here.
 */
2042 void status_reset(struct input_code *ptr)
2045 ptr->score = SCORE_INIT;
/*
 * Re-initialize the candidate: ptr->_file_stat is zeroed.
 * NOTE(review): other statements of the body are not visible here.
 */
2048 void status_reinit(struct input_code *ptr)
2051 ptr->_file_stat = 0;
/*
 * Called by the s_status/e_status/w_status handlers with the current byte.
 * Acts only when c <= DEL and the encoding is already established
 * (estab_f).
 * NOTE(review): the action taken inside the condition is not visible here.
 */
2054 void status_check(struct input_code *ptr, nkf_char c)
2056 if (c <= DEL && estab_f){
/*
 * Feed one input byte c to the Shift_JIS candidate state in ptr.
 * Lead bytes are classified by range and pushed into ptr->buf; a trail
 * byte must lie in 0x40-0x7e or 0x80-0xfc, after which the pair in buf is
 * converted in place with s2e_conv().  Any other byte disables the
 * candidate through status_disable().
 * NOTE(review): the switch on the state, its case labels and the state
 * assignments are not visible here.
 */
2061 void s_status(struct input_code *ptr, nkf_char c)
2065 status_check(ptr, c);
2070 #ifdef NUMCHAR_OPTION
2071 }else if (is_unicode_capsule(c)){
/* 0xa1-0xdf: single byte, stored with an SSO prefix */
2074 }else if (0xa1 <= c && c <= 0xdf){
2075 status_push_ch(ptr, SSO);
2076 status_push_ch(ptr, c);
/* lead-byte ranges of two-byte characters */
2079 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2081 status_push_ch(ptr, c);
2082 }else if (0xed <= c && c <= 0xee){
2084 status_push_ch(ptr, c);
2085 #ifdef SHIFTJIS_CP932
2086 }else if (is_ibmext_in_sjis(c)){
2088 status_push_ch(ptr, c);
2089 #endif /* SHIFTJIS_CP932 */
2091 }else if (0xf0 <= c && c <= 0xfc){
2093 status_push_ch(ptr, c);
2094 #endif /* X0212_ENABLE */
2096 status_disable(ptr);
/* trail byte of an ordinary two-byte character */
2100 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2101 status_push_ch(ptr, c);
2102 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2106 status_disable(ptr);
2110 #ifdef SHIFTJIS_CP932
/* trail byte where a successful (zero) s2e_conv() result is scored as CP932 */
2111 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2112 status_push_ch(ptr, c);
2113 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2114 set_code_score(ptr, SCORE_CP932);
2119 #endif /* SHIFTJIS_CP932 */
2120 status_disable(ptr);
/* trail byte that is always scored as CP932 */
2123 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2124 status_push_ch(ptr, c);
2125 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2126 set_code_score(ptr, SCORE_CP932);
2129 status_disable(ptr);
/*
 * Feed one input byte c to the EUC candidate state in ptr.
 * A lead byte is SSO or 0xa1-0xfe (and 0x8f under X0212_ENABLE); each
 * following byte must lie in 0xa1-0xfe.  Any other byte disables the
 * candidate through status_disable().
 * NOTE(review): the switch on the state, its case labels and the state
 * assignments are not visible here.
 */
2135 void e_status(struct input_code *ptr, nkf_char c)
2139 status_check(ptr, c);
2144 #ifdef NUMCHAR_OPTION
2145 }else if (is_unicode_capsule(c)){
2148 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2150 status_push_ch(ptr, c);
/* 0x8f introduces a three-byte sequence */
2152 }else if (0x8f == c){
2154 status_push_ch(ptr, c);
2155 #endif /* X0212_ENABLE */
2157 status_disable(ptr);
2161 if (0xa1 <= c && c <= 0xfe){
2162 status_push_ch(ptr, c);
2166 status_disable(ptr);
2171 if (0xa1 <= c && c <= 0xfe){
2173 status_push_ch(ptr, c);
2175 status_disable(ptr);
2177 #endif /* X0212_ENABLE */
2181 #ifdef UTF8_INPUT_ENABLE
/*
 * Feed one input byte c to the UTF-8 candidate state in ptr.
 * Accepted lead bytes are 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4; continuation
 * bytes must lie in 0x80-0xbf.  Once more bytes than ptr->stat have been
 * collected, the sequence is checked for the UTF-8 BOM (EF BB BF) and
 * converted in place with w2e_conv().  Any other byte disables the
 * candidate through status_disable().
 * NOTE(review): the switch on the state, its case labels, the state
 * assignments and the use of the bom flag are not visible here.
 */
2182 void w_status(struct input_code *ptr, nkf_char c)
2186 status_check(ptr, c);
2191 #ifdef NUMCHAR_OPTION
2192 }else if (is_unicode_capsule(c)){
2195 }else if (0xc0 <= c && c <= 0xdf){
2197 status_push_ch(ptr, c);
2198 }else if (0xe0 <= c && c <= 0xef){
2200 status_push_ch(ptr, c);
2201 }else if (0xf0 <= c && c <= 0xf4){
2203 status_push_ch(ptr, c);
2205 status_disable(ptr);
2210 if (0x80 <= c && c <= 0xbf){
2211 status_push_ch(ptr, c);
2212 if (ptr->index > ptr->stat){
2213 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2214 && ptr->buf[2] == 0xbf);
2215 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2216 &ptr->buf[0], &ptr->buf[1]);
2223 status_disable(ptr);
2227 if (0x80 <= c && c <= 0xbf){
2228 if (ptr->index < ptr->stat){
2229 status_push_ch(ptr, c);
2234 status_disable(ptr);
/*
 * Feed one input byte c to the candidates in input_code_list by calling
 * each entry's status_func.  If a result entry was chosen and no encoding
 * is established yet (!estab_f), its converter is selected with
 * set_iconv(TRUE, ...); otherwise, for c <= DEL, the list is walked again
 * from the start.
 * NOTE(review): the loop structure, how result and action_flag are
 * computed, and the body of the second walk are not visible here.
 */
2241 void code_status(nkf_char c)
2243 int action_flag = 1;
2244 struct input_code *result = 0;
2245 struct input_code *p = input_code_list;
2247 if (!p->status_func) {
2251 if (!p->status_func)
2253 (p->status_func)(p, c);
2256 }else if(p->stat == 0){
2267 if (result && !estab_f){
2268 set_iconv(TRUE, result->iconv_func);
2269 }else if (c <= DEL){
2270 struct input_code *ptr = input_code_list;
/*
 * Read one character from f.  The visible return path pops the most
 * recently pushed-back character from std_gc_buf (see std_ungetc()).
 * NOTE(review): the condition selecting this path and the read from f
 * itself are not visible here.
 */
2280 nkf_char std_getc(FILE *f)
2283 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back so that a later std_getc() returns it: c is stored at
 * std_gc_buf[std_gc_ndx] and the index is advanced.  A full buffer
 * (std_gc_ndx == STD_GC_BUFSIZE) is checked first.
 * NOTE(review): the handling of the full-buffer case and the return value
 * are not visible here.
 */
2289 nkf_char std_ungetc(nkf_char c, FILE *f)
2291 if (std_gc_ndx == STD_GC_BUFSIZE){
2294 std_gc_buf[std_gc_ndx++] = c;
/*
 * NOTE(review): the body is not visible here; presumably it writes c to
 * the standard output stream -- confirm.
 */
2299 void std_putc(nkf_char c)
2306 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode: sets up the filter chain with module_connection() and
 * then reads characters from f through i_getc until EOF.
 * NOTE(review): the per-character action and the return value are not
 * visible here.
 */
2307 nkf_char noconvert(FILE *f)
2312 module_connection();
2313 while ((c = (*i_getc)(f)) != EOF)
/*
 * Wire up the conversion pipeline from the current option flags.
 * Output side: starting from output_conv, each enabled stage saves the
 * current oconv in its own o_* pointer and installs itself as oconv.
 * Input side: each enabled stage saves the current i_getc/i_ungetc pair
 * and installs its own.
 * Finally the input converter is chosen from input_f with set_iconv().
 * NOTE(review): several of the guarding conditions are not visible here.
 */
2320 void module_connection(void)
2322 oconv = output_conv;
2325 /* replace continuation module, from output side */
2327 /* output redirection */
2329 if (noout_f || guess_f){
2336 if (mimeout_f == TRUE) {
2337 o_base64conv = oconv; oconv = base64_conv;
2339 /* base64_count = 0; */
2342 if (nlmode_f || guess_f) {
2343 o_nlconv = oconv; oconv = nl_conv;
2346 o_rot_conv = oconv; oconv = rot_conv;
2349 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2352 o_hira_conv = oconv; oconv = hira_conv;
2355 o_fconv = oconv; oconv = fold_conv;
2358 if (alpha_f || x0201_f) {
2359 o_zconv = oconv; oconv = z_conv;
2363 i_ungetc = std_ungetc;
2364 /* input redirection */
2367 i_cgetc = i_getc; i_getc = cap_getc;
2368 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2371 i_ugetc = i_getc; i_getc = url_getc;
2372 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2375 #ifdef NUMCHAR_OPTION
2377 i_ngetc = i_getc; i_getc = numchar_getc;
2378 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2381 #ifdef UNICODE_NORMALIZATION
2382 if (nfc_f && input_f == UTF8_INPUT){
2383 i_nfc_getc = i_getc; i_getc = nfc_getc;
2384 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2387 if (mime_f && mimebuf_f==FIXED_MIME) {
2388 i_mgetc = i_getc; i_getc = mime_getc;
2389 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2392 i_bgetc = i_getc; i_getc = broken_getc;
2393 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicitly requested input encoding forces its converter (-TRUE) */
2395 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2396 set_iconv(-TRUE, e_iconv);
2397 } else if (input_f == SJIS_INPUT) {
2398 set_iconv(-TRUE, s_iconv);
2399 #ifdef UTF8_INPUT_ENABLE
2400 } else if (input_f == UTF8_INPUT) {
2401 set_iconv(-TRUE, w_iconv);
2402 } else if (input_f == UTF16_INPUT) {
2403 set_iconv(-TRUE, w_iconv16);
2404 } else if (input_f == UTF32_INPUT) {
2405 set_iconv(-TRUE, w_iconv32);
2408 set_iconv(FALSE, e_iconv);
2412 struct input_code *p = input_code_list;
2420 * Check and Ignore BOM
/*
 * Inspect the first bytes of f for a Unicode byte order mark.
 * Byte sequences handled by the visible code (from the reads and the
 * matching push-backs):
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv (UTF-8)
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * On a mismatch the bytes already read are pushed back through i_ungetc in
 * reverse order, so the stream is left as it was.
 * NOTE(review): the case labels, the conditions around the set_iconv()
 * calls and the returns are not visible here.
 */
2422 void check_bom(FILE *f)
2425 switch(c2 = (*i_getc)(f)){
2427 if((c2 = (*i_getc)(f)) == 0x00){
2428 if((c2 = (*i_getc)(f)) == 0xFE){
2429 if((c2 = (*i_getc)(f)) == 0xFF){
2431 set_iconv(TRUE, w_iconv32);
2433 if (iconv == w_iconv32) {
2434 input_endian = ENDIAN_BIG;
2437 (*i_ungetc)(0xFF,f);
2438 }else (*i_ungetc)(c2,f);
2439 (*i_ungetc)(0xFE,f);
2440 }else if(c2 == 0xFF){
2441 if((c2 = (*i_getc)(f)) == 0xFE){
2443 set_iconv(TRUE, w_iconv32);
2445 if (iconv == w_iconv32) {
2446 input_endian = ENDIAN_2143;
2449 (*i_ungetc)(0xFF,f);
2450 }else (*i_ungetc)(c2,f);
2451 (*i_ungetc)(0xFF,f);
2452 }else (*i_ungetc)(c2,f);
2453 (*i_ungetc)(0x00,f);
2454 }else (*i_ungetc)(c2,f);
2455 (*i_ungetc)(0x00,f);
2458 if((c2 = (*i_getc)(f)) == 0xBB){
2459 if((c2 = (*i_getc)(f)) == 0xBF){
2461 set_iconv(TRUE, w_iconv);
2463 if (iconv == w_iconv) {
2466 (*i_ungetc)(0xBF,f);
2467 }else (*i_ungetc)(c2,f);
2468 (*i_ungetc)(0xBB,f);
2469 }else (*i_ungetc)(c2,f);
2470 (*i_ungetc)(0xEF,f);
2473 if((c2 = (*i_getc)(f)) == 0xFF){
2474 if((c2 = (*i_getc)(f)) == 0x00){
2475 if((c2 = (*i_getc)(f)) == 0x00){
2477 set_iconv(TRUE, w_iconv32);
2479 if (iconv == w_iconv32) {
2480 input_endian = ENDIAN_3412;
2483 (*i_ungetc)(0x00,f);
2484 }else (*i_ungetc)(c2,f);
2485 (*i_ungetc)(0x00,f);
2486 }else (*i_ungetc)(c2,f);
2488 set_iconv(TRUE, w_iconv16);
2490 if (iconv == w_iconv16) {
2491 input_endian = ENDIAN_BIG;
2494 (*i_ungetc)(0xFF,f);
2495 }else (*i_ungetc)(c2,f);
2496 (*i_ungetc)(0xFE,f);
2499 if((c2 = (*i_getc)(f)) == 0xFE){
2500 if((c2 = (*i_getc)(f)) == 0x00){
2501 if((c2 = (*i_getc)(f)) == 0x00){
2503 set_iconv(TRUE, w_iconv32);
2505 if (iconv == w_iconv32) {
2506 input_endian = ENDIAN_LITTLE;
2509 (*i_ungetc)(0x00,f);
2510 }else (*i_ungetc)(c2,f);
2511 (*i_ungetc)(0x00,f);
2512 }else (*i_ungetc)(c2,f);
2514 set_iconv(TRUE, w_iconv16);
2516 if (iconv == w_iconv16) {
2517 input_endian = ENDIAN_LITTLE;
2520 (*i_ungetc)(0xFE,f);
2521 }else (*i_ungetc)(c2,f);
2522 (*i_ungetc)(0xFF,f);
2531 Conversion main loop. Code detection only.
/*
 * Main conversion loop: reads f one character at a time through i_getc,
 * tracks the input state (c2 holds a pending first byte, c1 the current
 * byte, c0/c3 further bytes of longer sequences), and hands complete
 * characters to the selected input converter (*iconv) or directly to
 * (*oconv).  After the loop (*iconv)(EOF, 0, 0) is called and, when no
 * input code name has been recorded, the candidate with the lowest score
 * in input_code_list is reported via set_input_codename().
 * The NEXT / SEND / LAST macros defined below are shorthands for
 * "continue", an empty statement and "break" inside the loop.
 * NOTE(review): many statements of this function are not visible here
 * (labels, state assignments, returns); the remarks above cover only the
 * visible lines.
 */
2534 nkf_char kanji_convert(FILE *f)
2536 nkf_char c3, c2=0, c1, c0=0;
2537 int is_8bit = FALSE;
2539 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2540 #ifdef UTF8_INPUT_ENABLE
2541 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2548 output_mode = ASCII;
2551 #define NEXT continue /* no output, get next */
2552 #define SEND ; /* output c1 and c2, get next */
2553 #define LAST break /* end of loop, go closing */
2555 module_connection();
2558 while ((c1 = (*i_getc)(f)) != EOF) {
2559 #ifdef INPUT_CODE_FIX
2565 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2566 /* in case of 8th bit is on */
2567 if (!estab_f&&!mime_decode_mode) {
2568 /* in case of not established yet */
2569 /* It is still ambiguous */
2570 if (h_conv(f, c2, c1)==EOF)
2576 /* in case of already established */
2578 /* ignore bogus code and not CP5022x UCD */
2586 /* second byte, 7 bit code */
2587 /* it might be kanji shifted */
2588 if ((c1 == DEL) || (c1 <= SP)) {
2589 /* ignore bogus first code */
2596 #ifdef UTF8_INPUT_ENABLE
2597 if (iconv == w_iconv16) {
2598 if (input_endian == ENDIAN_BIG) {
2600 if ((c1 = (*i_getc)(f)) != EOF) {
2601 if (0xD8 <= c2 && c2 <= 0xDB) {
2602 if ((c0 = (*i_getc)(f)) != EOF) {
2604 if ((c3 = (*i_getc)(f)) != EOF) {
2611 if ((c2 = (*i_getc)(f)) != EOF) {
2612 if (0xD8 <= c2 && c2 <= 0xDB) {
2613 if ((c3 = (*i_getc)(f)) != EOF) {
2614 if ((c0 = (*i_getc)(f)) != EOF) {
2623 } else if(iconv == w_iconv32){
2625 if((c2 = (*i_getc)(f)) != EOF &&
2626 (c1 = (*i_getc)(f)) != EOF &&
2627 (c0 = (*i_getc)(f)) != EOF){
2628 switch(input_endian){
2630 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2633 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2636 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2639 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2649 #ifdef NUMCHAR_OPTION
2650 if (is_unicode_capsule(c1)){
2654 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2656 if (!estab_f && !iso8859_f) {
2657 /* not established yet */
2660 } else { /* estab_f==TRUE */
2665 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2666 /* SJIS X0201 Case... */
2667 if (iso2022jp_f && !x0201_f) {
2668 (*oconv)(GETA1, GETA2);
2675 } else if (c1==SSO && iconv != s_iconv) {
2676 /* EUC X0201 Case */
2677 c1 = (*i_getc)(f); /* skip SSO */
2679 if (SSP<=c1 && c1<0xe0) {
2680 if (iso2022jp_f && !x0201_f) {
2681 (*oconv)(GETA1, GETA2);
2688 } else { /* bogus code, skip SSO and one byte */
2691 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2692 (c1 == 0xFD || c1 == 0xFE)) {
2698 /* already established */
2703 } else if ((c1 > SP) && (c1 != DEL)) {
2704 /* in case of Roman characters */
2706 /* output 1 shifted byte */
2710 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2711 /* output 1 shifted byte */
2712 if (iso2022jp_f && !x0201_f) {
2713 (*oconv)(GETA1, GETA2);
2720 /* look like bogus code */
2723 } else if (input_mode == X0208 || input_mode == X0212 ||
2724 input_mode == X0213_1 || input_mode == X0213_2) {
2725 /* in case of Kanji shifted */
2728 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2729 /* Check MIME code */
2730 if ((c1 = (*i_getc)(f)) == EOF) {
2733 } else if (c1 == '?') {
2734 /* =? is mime conversion start sequence */
2735 if(mime_f == STRICT_MIME) {
2736 /* check in real detail */
2737 if (mime_begin_strict(f) == EOF)
2741 } else if (mime_begin(f) == EOF)
2751 /* normal ASCII code */
2754 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2757 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2760 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2761 if ((c1 = (*i_getc)(f)) == EOF) {
2762 /* (*oconv)(0, ESC); don't send bogus code */
2764 } else if (c1 == '$') {
2765 if ((c1 = (*i_getc)(f)) == EOF) {
2767 (*oconv)(0, ESC); don't send bogus code
2768 (*oconv)(0, '$'); */
2770 } else if (c1 == '@'|| c1 == 'B') {
2771 /* This is kanji introduction */
2774 set_input_codename("ISO-2022-JP");
2776 debug("ISO-2022-JP");
2779 } else if (c1 == '(') {
2780 if ((c1 = (*i_getc)(f)) == EOF) {
2781 /* don't send bogus code
2787 } else if (c1 == '@'|| c1 == 'B') {
2788 /* This is kanji introduction */
2793 } else if (c1 == 'D'){
2797 #endif /* X0212_ENABLE */
2798 } else if (c1 == (X0213_1&0x7F)){
2799 input_mode = X0213_1;
2802 } else if (c1 == (X0213_2&0x7F)){
2803 input_mode = X0213_2;
2807 /* could be some special code */
2814 } else if (broken_f&0x2) {
2815 /* accept any ESC-(-x as broken code ... */
2825 } else if (c1 == '(') {
2826 if ((c1 = (*i_getc)(f)) == EOF) {
2827 /* don't send bogus code
2829 (*oconv)(0, '('); */
2833 /* This is X0201 kana introduction */
2834 input_mode = X0201; shift_mode = X0201;
2836 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2837 /* This is X0208 kanji introduction */
2838 input_mode = ASCII; shift_mode = FALSE;
2840 } else if (broken_f&0x2) {
2841 input_mode = ASCII; shift_mode = FALSE;
2846 /* maintain various input_mode here */
2850 } else if ( c1 == 'N' || c1 == 'n'){
2852 c3 = (*i_getc)(f); /* skip SS2 */
2853 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2868 } else if (c1 == ESC && iconv == s_iconv) {
2869 /* ESC in Shift_JIS */
2870 if ((c1 = (*i_getc)(f)) == EOF) {
2871 /* (*oconv)(0, ESC); don't send bogus code */
2873 } else if (c1 == '$') {
2875 if ((c1 = (*i_getc)(f)) == EOF) {
2877 (*oconv)(0, ESC); don't send bogus code
2878 (*oconv)(0, '$'); */
2881 if (('E' <= c1 && c1 <= 'G') ||
2882 ('O' <= c1 && c1 <= 'Q')) {
2890 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2891 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2892 while ((c1 = (*i_getc)(f)) != EOF) {
2893 if (SP <= c1 && c1 <= 'z') {
2894 (*oconv)(0, c1 + c0);
2895 } else break; /* c1 == SO */
2899 if (c1 == EOF) LAST;
2906 } else if (c1 == LF || c1 == CR) {
2908 input_mode = ASCII; set_iconv(FALSE, 0);
2910 } else if (mime_decode_f && !mime_decode_mode){
2912 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2920 } else { /* if (c1 == CR)*/
2921 if ((c1=(*i_getc)(f))!=EOF) {
2925 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2939 } else if (c1 == DEL && input_mode == X0208) {
2949 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2952 if ((c0 = (*i_getc)(f)) != EOF) {
2955 if ((c3 = (*i_getc)(f)) != EOF) {
2957 (*iconv)(c2, c1, c0|c3);
2962 /* 3 bytes EUC or UTF-8 */
2963 if ((c0 = (*i_getc)(f)) != EOF) {
2965 (*iconv)(c2, c1, c0);
2973 0x7F <= c2 && c2 <= 0x92 &&
2974 0x21 <= c1 && c1 <= 0x7E) {
2976 if(c1 == 0x7F) return 0;
2977 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2980 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2984 (*oconv)(PREFIX_EUCG3 | c2, c1);
2986 #endif /* X0212_ENABLE */
2988 (*oconv)(PREFIX_EUCG3 | c2, c1);
2991 (*oconv)(input_mode, c1); /* other special case */
2997 /* goto next_word */
3001 (*iconv)(EOF, 0, 0);
3002 if (!input_codename)
3005 struct input_code *p = input_code_list;
3006 struct input_code *result = p;
3008 if (p->score < result->score) result = p;
3011 set_input_codename(result->name);
3013 debug(result->name);
/*
 * Called from kanji_convert() while the input encoding is still ambiguous.
 * Further bytes are read from f and stored with push_hold_buf() until the
 * buffer is full or the encoding becomes established (estab_f).  The
 * candidate with the lowest score among the entries that have a
 * status_func is selected with set_iconv(TRUE, ...).  The held bytes are
 * then replayed through (*iconv), taking additional bytes from the hold
 * buffer first and from f once the buffer is exhausted.
 * NOTE(review): the return type line, several conditions and the return
 * statements are not visible here.
 */
3021 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3023 nkf_char ret, c3, c0;
3027 /** it must NOT be in the kanji shifted sequence */
3028 /** it must NOT be written in JIS7 */
3029 /** and it must be after 2 byte 8bit code */
3035 while ((c1 = (*i_getc)(f)) != EOF) {
3041 if (push_hold_buf(c1) == EOF || estab_f){
3047 struct input_code *p = input_code_list;
3048 struct input_code *result = p;
3053 if (p->status_func && p->score < result->score){
3058 set_iconv(TRUE, result->iconv_func);
3063 ** 1) EOF is detected, or
3064 ** 2) Code is established, or
3065 ** 3) Buffer is FULL (but last word is pushed)
3067 ** in 1) and 3) cases, we continue to use
3068 ** Kanji codes by oconv and leave estab_f unchanged.
3073 while (hold_index < hold_count){
3074 c2 = hold_buf[hold_index++];
3076 #ifdef NUMCHAR_OPTION
3077 || is_unicode_capsule(c2)
3082 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3083 (*iconv)(X0201, c2, 0);
3086 if (hold_index < hold_count){
3087 c1 = hold_buf[hold_index++];
3097 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3100 if (hold_index < hold_count){
3101 c0 = hold_buf[hold_index++];
3102 } else if ((c0 = (*i_getc)(f)) == EOF) {
3108 if (hold_index < hold_count){
3109 c3 = hold_buf[hold_index++];
3110 } else if ((c3 = (*i_getc)(f)) == EOF) {
3115 (*iconv)(c2, c1, c0|c3);
3120 /* 3 bytes EUC or UTF-8 */
3121 if (hold_index < hold_count){
3122 c0 = hold_buf[hold_index++];
3123 } else if ((c0 = (*i_getc)(f)) == EOF) {
3129 (*iconv)(c2, c1, c0);
3132 if (c0 == EOF) break;
/*
 * Append c2 (truncated to unsigned char) to hold_buf.
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new value of hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible here.
 */
3137 nkf_char push_hold_buf(nkf_char c2)
3139 if (hold_count >= HOLD_SIZE*2)
3141 hold_buf[hold_count++] = (unsigned char)c2;
3142 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2 = first byte, c1 = second byte); the
 * result is delivered through the p2/p1 output pointers.
 * Visible steps:
 *  - with SHIFTJIS_CP932, IBM extension lead bytes are remapped through
 *    the shiftjis_cp932 table, and (in a branch whose full condition is
 *    not visible) through the cp932inv table;
 *  - IBM extension lead bytes are looked up in shiftjis_x0212 when
 *    x0213_f is off, yielding a PREFIX_EUCG3-tagged value;
 *  - with x0213_f and c2 >= 0xF0 the row is computed with
 *    shift_jisx0213_s1a3_table or arithmetically, tagged PREFIX_EUCG3;
 *  - otherwise c2 is doubled and rebased by SJ0162 / SJ6394, incremented
 *    when c1 > 0x9E, and c1 is rebased by SP or 0x1F.
 * NOTE(review): the return statements and several conditions are not
 * visible here; callers in this file treat a zero return as success.
 */
3145 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3147 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3150 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3151 #ifdef SHIFTJIS_CP932
3152 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3153 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3160 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3161 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3167 #endif /* SHIFTJIS_CP932 */
3169 if (!x0213_f && is_ibmext_in_sjis(c2)){
3170 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3173 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3186 if(x0213_f && c2 >= 0xF0){
3187 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3188 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3189 }else{ /* 78<=k<=94 */
3190 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3191 if (0x9E < c1) c2++;
3194 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3195 if (0x9E < c1) c2++;
3198 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3205 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.
 * Visible branches:
 *  - c2 equal to EOF or 0, or below SP, is handled separately;
 *  - without x0213_f, lead bytes 0xF0-0xF9 with trail bytes 0x40-0xFC are
 *    mapped to CLASS_UNICODE code points starting at 0xE000, 188 per lead
 *    byte (trail byte 0x7F makes the function return 0);
 *  - other pairs go through s2e_conv(), whose non-zero result is returned
 *    to the caller.
 * NOTE(review): the first branch, the output calls and the final return
 * are not visible here.
 */
3212 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3216 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3218 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3220 if(c1 == 0x7F) return 0;
3221 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3224 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3225 if (ret) return ret;
3231 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3236 }else if (c2 == 0x8f){
3240 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3241 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3242 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3245 c2 = (c2 << 8) | (c1 & 0x7f);
3247 #ifdef SHIFTJIS_CP932
3250 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3251 s2e_conv(s2, s1, &c2, &c1);
3258 #endif /* SHIFTJIS_CP932 */
3260 #endif /* X0212_ENABLE */
3261 } else if (c2 == SSO){
3264 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3267 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3268 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3269 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3274 #ifdef SHIFTJIS_CP932
3275 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3277 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3278 s2e_conv(s2, s1, &c2, &c1);
3285 #endif /* SHIFTJIS_CP932 */
3292 #ifdef UTF8_INPUT_ENABLE
3293 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3300 }else if (0xc0 <= c2 && c2 <= 0xef) {
3301 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3302 #ifdef NUMCHAR_OPTION
3305 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3313 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3316 static const char w_iconv_utf8_1st_byte[] =
3318 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3319 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3320 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3321 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3323 if (c2 < 0 || 0xff < c2) {
3324 }else if (c2 == 0) { /* 0 : 1 byte*/
3326 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3329 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3331 if (c1 < 0x80 || 0xBF < c1) return 0;
3334 if (c0 == 0) return -1;
3335 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3340 if (c0 == 0) return -1;
3341 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3345 if (c0 == 0) return -1;
3346 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3350 if (c0 == 0) return -2;
3351 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3355 if (c0 == 0) return -2;
3356 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3360 if (c0 == 0) return -2;
3361 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3369 if (c2 == 0 || c2 == EOF){
3370 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3371 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3374 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3383 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3384 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3391 }else if (val < 0x800){
3392 *p2 = 0xc0 | (val >> 6);
3393 *p1 = 0x80 | (val & 0x3f);
3395 } else if (val <= NKF_INT32_C(0xFFFF)) {
3396 *p2 = 0xe0 | (val >> 12);
3397 *p1 = 0x80 | ((val >> 6) & 0x3f);
3398 *p0 = 0x80 | (val & 0x3f);
3399 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3400 *p2 = 0xe0 | (val >> 16);
3401 *p1 = 0x80 | ((val >> 12) & 0x3f);
3402 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3411 #ifdef UTF8_INPUT_ENABLE
3412 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3417 } else if (c2 >= 0xf0){
3418 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3419 val = (c2 & 0x0f) << 18;
3420 val |= (c1 & 0x3f) << 12;
3421 val |= (c0 & 0x3f00) >> 2;
3423 }else if (c2 >= 0xe0){
3424 val = (c2 & 0x0f) << 12;
3425 val |= (c1 & 0x3f) << 6;
3427 }else if (c2 >= 0xc0){
3428 val = (c2 & 0x1f) << 6;
3436 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3438 nkf_char c2, c1, c0;
3445 w16w_conv(val, &c2, &c1, &c0);
3446 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3447 #ifdef NUMCHAR_OPTION
3450 *p1 = CLASS_UNICODE | val;
3459 #ifdef UTF8_INPUT_ENABLE
3460 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3463 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3466 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3467 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3469 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3471 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3476 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3477 if (ret) return ret;
3482 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3486 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3487 } else if (is_unicode_bmp(c1)) {
3488 ret = w16e_conv(c1, &c2, &c1);
3491 c1 = CLASS_UNICODE | c1;
3493 if (ret) return ret;
3498 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3500 const unsigned short *const *pp;
3501 const unsigned short *const *const *ppp;
3502 static const char no_best_fit_chars_table_C2[] =
3503 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3504 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3505 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3506 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3507 static const char no_best_fit_chars_table_C2_ms[] =
3508 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3509 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3510 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3511 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3512 static const char no_best_fit_chars_table_932_C2[] =
3513 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3514 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3515 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3516 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3517 static const char no_best_fit_chars_table_932_C3[] =
3518 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3519 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3520 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3527 }else if(c2 < 0xe0){
3528 if(no_best_fit_chars_f){
3529 if(ms_ucs_map_f == UCS_MAP_CP932){
3532 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3535 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3538 }else if(!cp932inv_f){
3541 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3544 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3547 }else if(ms_ucs_map_f == UCS_MAP_MS){
3548 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3549 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3567 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3568 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3569 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3571 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3572 }else if(c0 < 0xF0){
3573 if(no_best_fit_chars_f){
3574 if(ms_ucs_map_f == UCS_MAP_CP932){
3575 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3576 }else if(ms_ucs_map_f == UCS_MAP_MS){
3581 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3584 if(c0 == 0x92) return 1;
3589 if(c1 == 0x80 || c0 == 0x9C) return 1;
3592 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3597 if(c0 == 0x94) return 1;
3600 if(c0 == 0xBB) return 1;
3610 if(c0 == 0x95) return 1;
3613 if(c0 == 0xA5) return 1;
3620 if(c0 == 0x8D) return 1;
3623 if(c0 == 0x9E && !cp932inv_f) return 1;
3626 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3634 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3635 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3636 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3638 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3640 #ifdef SHIFTJIS_CP932
3641 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3643 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3644 s2e_conv(s2, s1, p2, p1);
3653 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3656 const unsigned short *p;
3659 if (pp == 0) return 1;
3662 if (c1 < 0 || psize <= c1) return 1;
3664 if (p == 0) return 1;
3667 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3669 if (val == 0) return 1;
3670 if (no_cp932ext_f && (
3671 (val>>8) == 0x2D || /* NEC special characters */
3672 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3680 if (c2 == SO) c2 = X0201;
3687 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3694 (*f)(0, bin2hex(c>>shift));
3704 void encode_fallback_html(nkf_char c)
3709 if(c >= NKF_INT32_C(1000000))
3710 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3711 if(c >= NKF_INT32_C(100000))
3712 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3714 (*oconv)(0, 0x30+(c/10000 )%10);
3716 (*oconv)(0, 0x30+(c/1000 )%10);
3718 (*oconv)(0, 0x30+(c/100 )%10);
3720 (*oconv)(0, 0x30+(c/10 )%10);
3722 (*oconv)(0, 0x30+ c %10);
3727 void encode_fallback_xml(nkf_char c)
3732 nkf_each_char_to_hex(oconv, c);
3737 void encode_fallback_java(nkf_char c)
3741 if(!is_unicode_bmp(c)){
3745 (*oconv)(0, bin2hex(c>>20));
3746 (*oconv)(0, bin2hex(c>>16));
3750 (*oconv)(0, bin2hex(c>>12));
3751 (*oconv)(0, bin2hex(c>> 8));
3752 (*oconv)(0, bin2hex(c>> 4));
3753 (*oconv)(0, bin2hex(c ));
3757 void encode_fallback_perl(nkf_char c)
3762 nkf_each_char_to_hex(oconv, c);
3767 void encode_fallback_subchar(nkf_char c)
3769 c = unicode_subchar;
3770 (*oconv)((c>>8)&0xFF, c&0xFF);
3775 #ifdef UTF8_OUTPUT_ENABLE
3776 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3778 const unsigned short *p;
3781 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3789 p = euc_to_utf8_1byte;
3791 } else if (is_eucg3(c2)){
3792 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3795 c2 = (c2&0x7f) - 0x21;
3796 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3797 p = x0212_to_utf8_2bytes[c2];
3803 c2 = (c2&0x7f) - 0x21;
3804 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3806 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3807 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3808 euc_to_utf8_2bytes_ms[c2];
3813 c1 = (c1 & 0x7f) - 0x21;
3814 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3819 void w_oconv(nkf_char c2, nkf_char c1)
3825 output_bom_f = FALSE;
3836 #ifdef NUMCHAR_OPTION
3837 if (c2 == 0 && is_unicode_capsule(c1)){
3838 val = c1 & VALUE_MASK;
3841 }else if (val < 0x800){
3842 (*o_putc)(0xC0 | (val >> 6));
3843 (*o_putc)(0x80 | (val & 0x3f));
3844 } else if (val <= NKF_INT32_C(0xFFFF)) {
3845 (*o_putc)(0xE0 | (val >> 12));
3846 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3847 (*o_putc)(0x80 | (val & 0x3f));
3848 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3849 (*o_putc)(0xF0 | ( val>>18));
3850 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3851 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3852 (*o_putc)(0x80 | ( val & 0x3f));
3859 output_mode = ASCII;
3861 } else if (c2 == ISO8859_1) {
3863 (*o_putc)(c1 | 0x080);
3866 val = e2w_conv(c2, c1);
3868 w16w_conv(val, &c2, &c1, &c0);
3872 if (c0) (*o_putc)(c0);
3878 void w_oconv16(nkf_char c2, nkf_char c1)
3881 output_bom_f = FALSE;
3882 if (output_endian == ENDIAN_LITTLE){
3883 (*o_putc)((unsigned char)'\377');
3887 (*o_putc)((unsigned char)'\377');
3896 if (c2 == ISO8859_1) {
3899 #ifdef NUMCHAR_OPTION
3900 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3901 if (is_unicode_bmp(c1)) {
3902 c2 = (c1 >> 8) & 0xff;
3906 if (c1 <= UNICODE_MAX) {
3907 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3908 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3909 if (output_endian == ENDIAN_LITTLE){
3910 (*o_putc)(c2 & 0xff);
3911 (*o_putc)((c2 >> 8) & 0xff);
3912 (*o_putc)(c1 & 0xff);
3913 (*o_putc)((c1 >> 8) & 0xff);
3915 (*o_putc)((c2 >> 8) & 0xff);
3916 (*o_putc)(c2 & 0xff);
3917 (*o_putc)((c1 >> 8) & 0xff);
3918 (*o_putc)(c1 & 0xff);
3925 nkf_char val = e2w_conv(c2, c1);
3926 c2 = (val >> 8) & 0xff;
3930 if (output_endian == ENDIAN_LITTLE){
3939 void w_oconv32(nkf_char c2, nkf_char c1)
3942 output_bom_f = FALSE;
3943 if (output_endian == ENDIAN_LITTLE){
3944 (*o_putc)((unsigned char)'\377');
3952 (*o_putc)((unsigned char)'\377');
3961 if (c2 == ISO8859_1) {
3963 #ifdef NUMCHAR_OPTION
3964 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3968 c1 = e2w_conv(c2, c1);
3971 if (output_endian == ENDIAN_LITTLE){
3972 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3973 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3974 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3978 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3979 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3980 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3985 void e_oconv(nkf_char c2, nkf_char c1)
3987 #ifdef NUMCHAR_OPTION
3988 if (c2 == 0 && is_unicode_capsule(c1)){
3989 w16e_conv(c1, &c2, &c1);
3990 if (c2 == 0 && is_unicode_capsule(c1)){
3991 c2 = c1 & VALUE_MASK;
3992 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3996 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3997 c1 = 0x21 + c1 % 94;
4000 (*o_putc)((c2 & 0x7f) | 0x080);
4001 (*o_putc)(c1 | 0x080);
4003 (*o_putc)((c2 & 0x7f) | 0x080);
4004 (*o_putc)(c1 | 0x080);
4008 if (encode_fallback) (*encode_fallback)(c1);
4017 } else if (c2 == 0) {
4018 output_mode = ASCII;
4020 } else if (c2 == X0201) {
4021 output_mode = JAPANESE_EUC;
4022 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4023 } else if (c2 == ISO8859_1) {
4024 output_mode = ISO8859_1;
4025 (*o_putc)(c1 | 0x080);
4027 } else if (is_eucg3(c2)){
4028 output_mode = JAPANESE_EUC;
4029 #ifdef SHIFTJIS_CP932
4032 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4033 s2e_conv(s2, s1, &c2, &c1);
4038 output_mode = ASCII;
4040 }else if (is_eucg3(c2)){
4043 (*o_putc)((c2 & 0x7f) | 0x080);
4044 (*o_putc)(c1 | 0x080);
4047 (*o_putc)((c2 & 0x7f) | 0x080);
4048 (*o_putc)(c1 | 0x080);
4052 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4053 set_iconv(FALSE, 0);
4054 return; /* too late to rescue this char */
4056 output_mode = JAPANESE_EUC;
4057 (*o_putc)(c2 | 0x080);
4058 (*o_putc)(c1 | 0x080);
4063 nkf_char x0212_shift(nkf_char c)
4068 if (0x75 <= c && c <= 0x7f){
4069 ret = c + (0x109 - 0x75);
4072 if (0x75 <= c && c <= 0x7f){
4073 ret = c + (0x113 - 0x75);
4080 nkf_char x0212_unshift(nkf_char c)
4083 if (0x7f <= c && c <= 0x88){
4084 ret = c + (0x75 - 0x7f);
4085 }else if (0x89 <= c && c <= 0x92){
4086 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4090 #endif /* X0212_ENABLE */
4092 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4098 if((0x21 <= ndx && ndx <= 0x2F)){
4099 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4100 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4102 }else if(0x6E <= ndx && ndx <= 0x7E){
4103 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4104 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4110 else if(nkf_isgraph(ndx)){
4112 const unsigned short *ptr;
4113 ptr = x0212_shiftjis[ndx - 0x21];
4115 val = ptr[(c1 & 0x7f) - 0x21];
4124 c2 = x0212_shift(c2);
4126 #endif /* X0212_ENABLE */
4128 if(0x7F < c2) return 1;
4129 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4130 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4134 void s_oconv(nkf_char c2, nkf_char c1)
4136 #ifdef NUMCHAR_OPTION
4137 if (c2 == 0 && is_unicode_capsule(c1)){
4138 w16e_conv(c1, &c2, &c1);
4139 if (c2 == 0 && is_unicode_capsule(c1)){
4140 c2 = c1 & VALUE_MASK;
4141 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4144 c2 = c1 / 188 + 0xF0;
4146 c1 += 0x40 + (c1 > 0x3e);
4151 if(encode_fallback)(*encode_fallback)(c1);
4160 } else if (c2 == 0) {
4161 output_mode = ASCII;
4163 } else if (c2 == X0201) {
4164 output_mode = SHIFT_JIS;
4166 } else if (c2 == ISO8859_1) {
4167 output_mode = ISO8859_1;
4168 (*o_putc)(c1 | 0x080);
4170 } else if (is_eucg3(c2)){
4171 output_mode = SHIFT_JIS;
4172 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4178 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4179 set_iconv(FALSE, 0);
4180 return; /* too late to rescue this char */
4182 output_mode = SHIFT_JIS;
4183 e2s_conv(c2, c1, &c2, &c1);
4185 #ifdef SHIFTJIS_CP932
4187 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4188 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4194 #endif /* SHIFTJIS_CP932 */
4197 if (prefix_table[(unsigned char)c1]){
4198 (*o_putc)(prefix_table[(unsigned char)c1]);
4204 void j_oconv(nkf_char c2, nkf_char c1)
4206 #ifdef NUMCHAR_OPTION
4207 if (c2 == 0 && is_unicode_capsule(c1)){
4208 w16e_conv(c1, &c2, &c1);
4209 if (c2 == 0 && is_unicode_capsule(c1)){
4210 c2 = c1 & VALUE_MASK;
4211 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4214 c2 = 0x7F + c1 / 94;
4215 c1 = 0x21 + c1 % 94;
4217 if (encode_fallback) (*encode_fallback)(c1);
4224 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4227 (*o_putc)(ascii_intro);
4228 output_mode = ASCII;
4232 } else if (is_eucg3(c2)){
4234 if(output_mode!=X0213_2){
4235 output_mode = X0213_2;
4239 (*o_putc)(X0213_2&0x7F);
4242 if(output_mode!=X0212){
4243 output_mode = X0212;
4247 (*o_putc)(X0212&0x7F);
4250 (*o_putc)(c2 & 0x7f);
4253 } else if (c2==X0201) {
4254 if (output_mode!=X0201) {
4255 output_mode = X0201;
4261 } else if (c2==ISO8859_1) {
4262 /* iso8859 introduction, or 8th bit on */
4263 /* Can we convert in 7bit form using ESC-'-'-A ?
4265 output_mode = ISO8859_1;
4267 } else if (c2 == 0) {
4268 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4271 (*o_putc)(ascii_intro);
4272 output_mode = ASCII;
4277 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4278 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4280 if (output_mode!=X0213_1) {
4281 output_mode = X0213_1;
4285 (*o_putc)(X0213_1&0x7F);
4287 }else if (output_mode != X0208) {
4288 output_mode = X0208;
4291 (*o_putc)(kanji_intro);
4298 void base64_conv(nkf_char c2, nkf_char c1)
4300 mime_prechar(c2, c1);
4301 (*o_base64conv)(c2,c1);
4305 static nkf_char broken_buf[3];
4306 static int broken_counter = 0;
4307 static int broken_last = 0;
4308 nkf_char broken_getc(FILE *f)
4312 if (broken_counter>0) {
4313 return broken_buf[--broken_counter];
4316 if (c=='$' && broken_last != ESC
4317 && (input_mode==ASCII || input_mode==X0201)) {
4320 if (c1=='@'|| c1=='B') {
4321 broken_buf[0]=c1; broken_buf[1]=c;
4328 } else if (c=='(' && broken_last != ESC
4329 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4332 if (c1=='J'|| c1=='B') {
4333 broken_buf[0]=c1; broken_buf[1]=c;
4346 nkf_char broken_ungetc(nkf_char c, FILE *f)
4348 if (broken_counter<2)
4349 broken_buf[broken_counter++]=c;
4353 void nl_conv(nkf_char c2, nkf_char c1)
4355 if (guess_f && input_newline != EOF) {
4356 if (c2 == 0 && c1 == LF) {
4357 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4358 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4359 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4361 else if (!input_newline) input_newline = CR;
4362 else if (input_newline != CR) input_newline = EOF;
4364 if (prev_cr || c2 == 0 && c1 == LF) {
4366 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4367 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4369 if (c2 == 0 && c1 == CR) prev_cr = CR;
4370 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4374 Return value of fold_conv()
4376 LF add newline and output char
4377 CR add newline and output nothing
4380 1 (or else) normal output
4382 fold state in prev (previous character)
4384 >0x80 Japanese (X0208/X0201)
4389 This fold algorithm does not preserve leading spaces in a line.
4390 This is the main difference from fmt.
4393 #define char_size(c2,c1) (c2?2:1)
4395 void fold_conv(nkf_char c2, nkf_char c1)
4398 nkf_char fold_state;
4400 if (c1== CR && !fold_preserve_f) {
4401 fold_state=0; /* ignore cr */
4402 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4404 fold_state=0; /* ignore cr */
4405 } else if (c1== BS) {
4406 if (f_line>0) f_line--;
4408 } else if (c2==EOF && f_line != 0) { /* close open last line */
4410 } else if ((c1==LF && !fold_preserve_f)
4411 || ((c1==CR||(c1==LF&&f_prev!=CR))
4412 && fold_preserve_f)) {
4414 if (fold_preserve_f) {
4418 } else if ((f_prev == c1 && !fold_preserve_f)
4419 || (f_prev == LF && fold_preserve_f)
4420 ) { /* duplicate newline */
4423 fold_state = LF; /* output two newline */
4429 if (f_prev&0x80) { /* Japanese? */
4431 fold_state = 0; /* ignore given single newline */
4432 } else if (f_prev==SP) {
4436 if (++f_line<=fold_len)
4440 fold_state = CR; /* fold and output nothing */
4444 } else if (c1=='\f') {
4447 fold_state = LF; /* output newline and clear */
4448 } else if ( (c2==0 && c1==SP)||
4449 (c2==0 && c1==TAB)||
4450 (c2=='!'&& c1=='!')) {
4451 /* X0208 kankaku or ascii space */
4453 fold_state = 0; /* remove duplicate spaces */
4456 if (++f_line<=fold_len)
4457 fold_state = SP; /* output ASCII space only */
4459 f_prev = SP; f_line = 0;
4460 fold_state = CR; /* fold and output nothing */
4464 prev0 = f_prev; /* we still need this one... , but almost done */
4466 if (c2 || c2==X0201)
4467 f_prev |= 0x80; /* this is Japanese */
4468 f_line += char_size(c2,c1);
4469 if (f_line<=fold_len) { /* normal case */
4472 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4473 f_line = char_size(c2,c1);
4474 fold_state = LF; /* We can't wait, do fold now */
4475 } else if (c2==X0201) {
4476 /* simple kinsoku rules return 1 means no folding */
4477 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4478 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4479 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4480 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4481 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4482 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4483 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4485 fold_state = LF;/* add one new f_line before this character */
4488 fold_state = LF;/* add one new f_line before this character */
4491 /* kinsoku point in ASCII */
4492 if ( c1==')'|| /* { [ ( */
4503 /* just after special */
4504 } else if (!is_alnum(prev0)) {
4505 f_line = char_size(c2,c1);
4507 } else if ((prev0==SP) || /* ignored new f_line */
4508 (prev0==LF)|| /* ignored new f_line */
4509 (prev0&0x80)) { /* X0208 - ASCII */
4510 f_line = char_size(c2,c1);
4511 fold_state = LF;/* add one new f_line before this character */
4513 fold_state = 1; /* default no fold in ASCII */
4517 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4518 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4519 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4520 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4521 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4522 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4523 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4524 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4525 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4526 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4527 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4528 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4529 /* default no fold in kinsoku */
4532 f_line = char_size(c2,c1);
4533 /* add one new f_line before this character */
4536 f_line = char_size(c2,c1);
4538 /* add one new f_line before this character */
4543 /* terminator process */
4544 switch(fold_state) {
4563 nkf_char z_prev2=0,z_prev1=0;
4565 void z_conv(nkf_char c2, nkf_char c1)
4568 /* if (c2) c1 &= 0x7f; assertion */
4570 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4576 if (z_prev2 == X0201) {
4578 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4580 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4582 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4584 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4589 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4592 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4593 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4598 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4609 if (alpha_f&1 && c2 == 0x23) {
4610 /* JISX0208 Alphabet */
4612 } else if (c2 == 0x21) {
4613 /* JISX0208 Kigou */
4618 } else if (alpha_f&4) {
4623 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4629 if (alpha_f&8 && c2 == 0) {
4633 case '>': entity = ">"; break;
4634 case '<': entity = "<"; break;
4635 case '\"': entity = """; break;
4636 case '&': entity = "&"; break;
4639 while (*entity) (*o_zconv)(0, *entity++);
4645 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4650 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4654 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4658 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4662 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4666 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4670 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4674 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4678 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4683 (*o_zconv)(X0201, c);
4686 } else if (c2 == 0x25) {
4687 /* JISX0208 Katakana */
4688 static const int fullwidth_to_halfwidth[] =
4690 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4691 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4692 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4693 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4694 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4695 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4696 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4697 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4698 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4699 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4700 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4701 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4703 if (fullwidth_to_halfwidth[c1-0x20]){
4704 c2 = fullwidth_to_halfwidth[c1-0x20];
4705 (*o_zconv)(X0201, c2>>8);
4707 (*o_zconv)(X0201, c2&0xFF);
4717 #define rot13(c) ( \
4719 (c <= 'M') ? (c + 13): \
4720 (c <= 'Z') ? (c - 13): \
4722 (c <= 'm') ? (c + 13): \
4723 (c <= 'z') ? (c - 13): \
4727 #define rot47(c) ( \
4729 ( c <= 'O') ? (c + 47) : \
4730 ( c <= '~') ? (c - 47) : \
4734 void rot_conv(nkf_char c2, nkf_char c1)
4736 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4742 (*o_rot_conv)(c2,c1);
4745 void hira_conv(nkf_char c2, nkf_char c1)
4749 if (0x20 < c1 && c1 < 0x74) {
4751 (*o_hira_conv)(c2,c1);
4753 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4755 c1 = CLASS_UNICODE | 0x3094;
4756 (*o_hira_conv)(c2,c1);
4759 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4761 (*o_hira_conv)(c2,c1);
4766 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4769 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4771 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4775 (*o_hira_conv)(c2,c1);
4779 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4781 static const nkf_char range[RANGE_NUM_MAX][2] = {
4802 nkf_char start, end, c;
4804 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4808 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4813 for (i = 0; i < RANGE_NUM_MAX; i++) {
4814 start = range[i][0];
4817 if (c >= start && c <= end) {
4822 (*o_iso2022jp_check_conv)(c2,c1);
4826 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4828 static const unsigned char *mime_pattern[] = {
4829 (const unsigned char *)"\075?EUC-JP?B?",
4830 (const unsigned char *)"\075?SHIFT_JIS?B?",
4831 (const unsigned char *)"\075?ISO-8859-1?Q?",
4832 (const unsigned char *)"\075?ISO-8859-1?B?",
4833 (const unsigned char *)"\075?ISO-2022-JP?B?",
4834 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4835 #if defined(UTF8_INPUT_ENABLE)
4836 (const unsigned char *)"\075?UTF-8?B?",
4837 (const unsigned char *)"\075?UTF-8?Q?",
4839 (const unsigned char *)"\075?US-ASCII?Q?",
4844 /* Marker used to raise the priority of the corresponding code */
4845 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4846 e_iconv, s_iconv, 0, 0, 0, 0,
4847 #if defined(UTF8_INPUT_ENABLE)
4853 static const nkf_char mime_encode[] = {
4854 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4855 #if defined(UTF8_INPUT_ENABLE)
4862 static const nkf_char mime_encode_method[] = {
4863 'B', 'B','Q', 'B', 'B', 'Q',
4864 #if defined(UTF8_INPUT_ENABLE)
4872 #define MAXRECOVER 20
4874 void switch_mime_getc(void)
4876 if (i_getc!=mime_getc) {
4877 i_mgetc = i_getc; i_getc = mime_getc;
4878 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4879 if(mime_f==STRICT_MIME) {
4880 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4881 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4886 void unswitch_mime_getc(void)
4888 if(mime_f==STRICT_MIME) {
4889 i_mgetc = i_mgetc_buf;
4890 i_mungetc = i_mungetc_buf;
4893 i_ungetc = i_mungetc;
4894 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4895 mime_iconv_back = NULL;
4898 nkf_char mime_begin_strict(FILE *f)
4902 const unsigned char *p,*q;
4903 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4905 mime_decode_mode = FALSE;
4906 /* =? has been checked */
4908 p = mime_pattern[j];
4911 for(i=2;p[i]>SP;i++) { /* start at =? */
4912 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4913 /* pattern fails, try next one */
4915 while (mime_pattern[++j]) {
4916 p = mime_pattern[j];
4917 for(k=2;k<i;k++) /* assume length(p) > i */
4918 if (p[k]!=q[k]) break;
4919 if (k==i && nkf_toupper(c1)==p[k]) break;
4921 p = mime_pattern[j];
4922 if (p) continue; /* found next one, continue */
4923 /* all fails, output from recovery buffer */
4931 mime_decode_mode = p[i-2];
4933 mime_iconv_back = iconv;
4934 set_iconv(FALSE, mime_priority_func[j]);
4935 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4937 if (mime_decode_mode=='B') {
4938 mimebuf_f = unbuf_f;
4940 /* do MIME integrity check */
4941 return mime_integrity(f,mime_pattern[j]);
4949 nkf_char mime_getc_buf(FILE *f)
4951 /* we don't keep eof of Fifo, because it contains ?= as
4952 a terminator. It was checked in mime_integrity. */
4953 return ((mimebuf_f)?
4954 (*i_mgetc_buf)(f):Fifo(mime_input++));
4957 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4960 (*i_mungetc_buf)(c,f);
4962 Fifo(--mime_input) = (unsigned char)c;
4966 nkf_char mime_begin(FILE *f)
4971 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4972 /* re-read and convert again from mime_buffer. */
4974 /* =? has been checked */
4976 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4977 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4978 /* We accept any character type even if it is broken up by newlines */
4979 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4980 if (c1==LF||c1==SP||c1==CR||
4981 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4983 /* Failed. But this could be another MIME preamble */
4991 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4992 if (!(++i<MAXRECOVER) || c1==EOF) break;
4993 if (c1=='b'||c1=='B') {
4994 mime_decode_mode = 'B';
4995 } else if (c1=='q'||c1=='Q') {
4996 mime_decode_mode = 'Q';
5000 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5001 if (!(++i<MAXRECOVER) || c1==EOF) break;
5003 mime_decode_mode = FALSE;
5009 if (!mime_decode_mode) {
5010 /* false MIME preamble, restart from mime_buffer */
5011 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5012 /* Since we are in MIME mode until buffer becomes empty, */
5013 /* we never go into mime_begin again for a while. */
5016 /* discard mime preamble, and go to MIME mode */
5018 /* do no MIME integrity check */
5019 return c1; /* used only for checking EOF */
/*
 * no_putc: character output hook taking a single nkf_char.
 * NOTE(review): the name suggests the character is simply discarded --
 * confirm against the body.
 */
5023 void no_putc(nkf_char c)
/*
 * debug: write the message str, followed by a newline, to stderr.
 * A NULL str is printed as the literal text "NULL" instead of being
 * passed to the %s conversion.
 * NOTE(review): whether the output is gated by a debug flag should be
 * confirmed.
 */
5028 void debug(const char *str)
5031 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: record the name of the detected input encoding.
 * The first call stores codename in input_codename. A later call with a
 * different name (compared with strcmp) replaces it by the empty string,
 * which print_guessed_code tests for separately.
 * The pointer is stored as-is; the string is not copied.
 */
5036 void set_input_codename(char *codename)
5038 if (!input_codename) {
5039 input_codename = codename;
5040 } else if (strcmp(codename, input_codename) != 0) {
5041 input_codename = "";
5045 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the guessed input encoding to stdout.
 * filename, when not NULL, is printed first as a "filename: " prefix.
 * The recorded input_codename is refined using the score bits of the
 * input_code entry found for the current iconv function:
 *   Shift_JIS   -> CP932    (SCORE_DEPEND or SCORE_CP932)
 *   EUC-JP      -> EUCJP-MS (SCORE_X0212), else CP51932
 *                  (SCORE_DEPEND or SCORE_CP932)
 *   ISO-2022-JP -> CP50221  (SCORE_KANA), else CP50220
 *                  (SCORE_DEPEND or SCORE_CP932)
 * A missing input_codename is reported as "ASCII".
 * A newline-style suffix is chosen from input_newline.
 */
5046 void print_guessed_code(char *filename)
5048 char *codename = "BINARY";
5049 char *str_nlmode = NULL;
5050 if (filename != NULL) printf("%s: ", filename);
/* an empty (non-NULL) name is the marker left by set_input_codename on conflict */
5051 if (input_codename && !*input_codename) {
5054 struct input_code *p = find_inputcode_byfunc(iconv);
5056 printf("%s\n", input_codename ? input_codename : "ASCII");
5058 if (!input_codename) {
5059 input_codename = "ASCII";
5060 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5061 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5062 input_codename = "CP932";
5063 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5064 if (p->score & (SCORE_X0212))
5065 input_codename = "EUCJP-MS";
5066 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5067 input_codename = "CP51932";
5068 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5069 if (p->score & (SCORE_KANA))
5070 input_codename = "CP50221";
5071 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5072 input_codename = "CP50220";
5076 input_newline == CR ? " (CR)" :
5077 input_newline == LF ? " (LF)" :
5078 input_newline == CRLF ? " (CRLF)" :
5079 input_newline == EOF ? " (MIXED NL)" :
/*
 * hex_getc: decode an escape made of the character ch followed by two
 * hexadecimal digits.
 *   ch - escape introducer (the callers in this file pass ':' and '%')
 *   f  - input stream
 *   g  - function used to read a character from f
 *   u  - function used to push a character back onto f
 * When both following characters (c2, c3) are hex digits the result is
 * (hex2bin(c2) << 4) | hex2bin(c3).
 * NOTE(review): the non-hex branches presumably push the characters back
 * through u and return the unmodified input -- confirm.
 */
5088 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5090 nkf_char c1, c2, c3;
5096 if (!nkf_isxdigit(c2)){
5101 if (!nkf_isxdigit(c3)){
5106 return (hex2bin(c2) << 4) | hex2bin(c3);
/*
 * cap_getc: read a character, decoding ':' + two hex digits through
 * hex_getc, using i_cgetc / i_cungetc as the underlying reader and
 * push-back functions.
 */
5109 nkf_char cap_getc(FILE *f)
5111 return hex_getc(':', f, i_cgetc, i_cungetc);
/*
 * cap_ungetc: push c back by forwarding to (*i_cungetc)(c, f) and
 * returning its result.
 */
5114 nkf_char cap_ungetc(nkf_char c, FILE *f)
5116 return (*i_cungetc)(c, f);
/*
 * url_getc: read a character, decoding '%' + two hex digits through
 * hex_getc, using i_ugetc / i_uungetc as the underlying reader and
 * push-back functions.
 */
5119 nkf_char url_getc(FILE *f)
5121 return hex_getc('%', f, i_ugetc, i_uungetc);
/*
 * url_ungetc: push c back by forwarding to (*i_uungetc)(c, f) and
 * returning its result.
 */
5124 nkf_char url_ungetc(nkf_char c, FILE *f)
5126 return (*i_uungetc)(c, f);
5130 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: read a character, converting a numeric character
 * reference into a code point.
 * Input is read through i_ngetc and pushed back through i_nungetc.
 * Two digit forms are handled: after 'x' or 'X', up to 7 hexadecimal
 * digits; otherwise up to 8 decimal digits. Each digit is folded into c
 * with hex2bin. On success the result is CLASS_UNICODE | c.
 * NOTE(review): the expected "&#" prefix / ";" terminator and the
 * fallback path when parsing fails should be confirmed.
 */
5131 nkf_char numchar_getc(FILE *f)
5133 nkf_char (*g)(FILE *) = i_ngetc;
5134 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* hexadecimal form */
5145 if (buf[i] == 'x' || buf[i] == 'X'){
5146 for (j = 0; j < 7; j++){
5148 if (!nkf_isxdigit(buf[i])){
5155 c |= hex2bin(buf[i]);
/* decimal form */
5158 for (j = 0; j < 8; j++){
5162 if (!nkf_isdigit(buf[i])){
5169 c += hex2bin(buf[i]);
5175 return CLASS_UNICODE | c;
/*
 * numchar_ungetc: push c back by forwarding to (*i_nungetc)(c, f) and
 * returning its result.
 */
5184 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5186 return (*i_nungetc)(c, f);
5190 #ifdef UNICODE_NORMALIZATION
5192 /* Normalization Form C */
/*
 * nfc_getc: read a character, replacing a decomposed (NFD) sequence by
 * its composed (NFC) form.
 * Input is read through i_nfc_getc and pushed back through i_nfc_ungetc.
 * The bytes collected in buf are compared against the nfd entries of
 * normalization_table with a binary search (lower/upper bounds, midpoint
 * j). On a match the corresponding nfc entry is copied into buf.
 */
5193 nkf_char nfc_getc(FILE *f)
5195 nkf_char (*g)(FILE *f) = i_nfc_getc;
5196 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5197 int i=0, j, k=1, lower, upper;
5199 const nkf_nfchar *array;
/* loop while the last comparison made progress and buf[i] is not a 10xxxxxx byte */
5202 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5203 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5204 while (upper >= lower) {
5205 j = (lower+upper) / 2;
5206 array = normalization_table[j].nfd;
/* compare entry j element by element; the first mismatch narrows the range */
5207 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5208 if (array[k] != buf[k]){
5209 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* copy the composed form of entry j into buf */
5216 array = normalization_table[j].nfc;
5217 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5218 buf[i] = (nkf_char)(array[i]);
/*
 * nfc_ungetc: push c back by forwarding to (*i_nfc_ungetc)(c, f) and
 * returning its result.
 */
5229 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5231 return (*i_nfc_ungetc)(c, f);
5233 #endif /* UNICODE_NORMALIZATION */
/*
 * MIME-decoding reader (referred to as mime_getc elsewhere in this file).
 * Order of work:
 *   1. If decoded bytes are pending in Fifo (mime_top != mime_last),
 *      return the next one.
 *   2. If mime_decode_mode is 1 or FALSE, leave MIME mode
 *      (unswitch_mime_getc) and read directly through i_getc.
 *   3. mime_decode_mode 'Q': quoted-printable style decoding ('_' maps to
 *      SP unless FIXED_MIME, "=XX" maps to a byte, "?=" ends the word).
 *   4. mime_decode_mode 'B': read four base64 characters, decode them to
 *      up to three bytes queued in Fifo, and return the first.
 * After an encoded word ends, following white space is collected in a
 * heap buffer (lwsp_buf) so it can be pushed back with i_ungetc.
 */
5239 nkf_char c1, c2, c3, c4, cc;
5240 nkf_char t1, t2, t3, t4, mode, exit_mode;
5241 nkf_char lwsp_count;
5244 nkf_char lwsp_size = 128;
5246 if (mime_top != mime_last) { /* Something is in FIFO */
5247 return Fifo(mime_top++);
5249 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5250 mime_decode_mode=FALSE;
5251 unswitch_mime_getc();
5252 return (*i_getc)(f);
5255 if (mimebuf_f == FIXED_MIME)
5256 exit_mode = mime_decode_mode;
/* ---- Q encoding ---- */
5259 if (mime_decode_mode == 'Q') {
5260 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5262 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5263 if (c1<=SP || DEL<=c1) {
5264 mime_decode_mode = exit_mode; /* prepare for quit */
5267 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5271 mime_decode_mode = exit_mode; /* prepare for quit */
5272 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5273 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5274 /* end Q encoding */
5275 input_mode = exit_mode;
/* buffer for the white space that follows the encoded word */
5277 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5278 if (lwsp_buf==NULL) {
5279 perror("can't malloc");
5282 while ((c1=(*i_getc)(f))!=EOF) {
5287 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5295 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5296 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5311 lwsp_buf[lwsp_count] = (unsigned char)c1;
/* grow the buffer once the count passes lwsp_size; the +5 slack covers the write above */
5312 if (lwsp_count++>lwsp_size){
5314 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5315 if (lwsp_buf_new==NULL) {
5317 perror("can't realloc");
5320 lwsp_buf = lwsp_buf_new;
/* push the collected white space back, last character first, down to index 1 */
5326 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5328 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5329 i_ungetc(lwsp_buf[lwsp_count],f);
5335 if (c1=='='&&c2<SP) { /* this is soft wrap */
/* NOTE(review): the loop condition already reads a character into c1, and the
   body immediately reads another one over it, so every iteration consumes two
   characters and the EOF test only sees the second -- confirm this is intended. */
5336 while((c1 = (*i_mgetc)(f)) <=SP) {
5337 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5339 mime_decode_mode = 'Q'; /* still in MIME */
5340 goto restart_mime_q;
5343 mime_decode_mode = 'Q'; /* still in MIME */
5347 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5348 if (c2<=SP) return c2;
5349 mime_decode_mode = 'Q'; /* still in MIME */
/* "=XX": combine the two hex digits into one byte */
5350 return ((hex2bin(c2)<<4) + hex2bin(c3));
5353 if (mime_decode_mode != 'B') {
5354 mime_decode_mode = FALSE;
5355 return (*i_mgetc)(f);
5359 /* Base64 encoding */
5361 MIME allows line break in the middle of
5362 Base64, but we are very pessimistic in decoding
5363 in unbuf mode because MIME encoded code may broken by
5364 less or editor's control sequence (such as ESC-[-K in unbuffered
5365 mode. ignore incomplete MIME.
5367 mode = mime_decode_mode;
5368 mime_decode_mode = exit_mode; /* prepare for quit */
5370 while ((c1 = (*i_mgetc)(f))<=SP) {
5375 if ((c2 = (*i_mgetc)(f))<=SP) {
5378 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5379 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5382 if ((c1 == '?') && (c2 == '=')) {
5385 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5386 if (lwsp_buf==NULL) {
5387 perror("can't malloc");
5390 while ((c1=(*i_getc)(f))!=EOF) {
5395 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5403 if ((c1=(*i_getc)(f))!=EOF) {
5407 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5422 lwsp_buf[lwsp_count] = (unsigned char)c1;
5423 if (lwsp_count++>lwsp_size){
5425 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5426 if (lwsp_buf_new==NULL) {
5428 perror("can't realloc");
5431 lwsp_buf = lwsp_buf_new;
5437 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5439 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5440 i_ungetc(lwsp_buf[lwsp_count],f);
5447 if ((c3 = (*i_mgetc)(f))<=SP) {
5450 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5451 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5455 if ((c4 = (*i_mgetc)(f))<=SP) {
5458 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5459 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5463 mime_decode_mode = mode; /* still in MIME sigh... */
5465 /* BASE 64 decoding */
/* four 6-bit values t1..t4 are repacked into three 8-bit bytes */
5467 t1 = 0x3f & base64decode(c1);
5468 t2 = 0x3f & base64decode(c2);
5469 t3 = 0x3f & base64decode(c3);
5470 t4 = 0x3f & base64decode(c4);
5471 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5473 Fifo(mime_last++) = (unsigned char)cc;
5474 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5476 Fifo(mime_last++) = (unsigned char)cc;
5477 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5479 Fifo(mime_last++) = (unsigned char)cc;
5484 return Fifo(mime_top++);
/*
 * mime_ungetc: push c back into the MIME FIFO by stepping mime_top back
 * one slot and storing c there as an unsigned char. The stream f is not
 * used by the visible statement.
 */
5487 nkf_char mime_ungetc(nkf_char c, FILE *f)
5489 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-read a MIME encoded word into Fifo and check that
 * it is well formed before decoding starts.
 *   f - input stream, read through (*i_getc)
 *   p - preamble pattern; its bytes are copied into Fifo first
 * mime_input and mime_last are reset to mime_top, then characters are
 * appended at mime_input until EOF or until the buffer is full
 * ((mime_input - mime_top) & MIME_BUF_MASK wraps to 0).
 * Only '+', '/', '=', '?' and alphanumerics are accepted as body
 * characters. A '=' seen while d is '?' is treated as the checked end
 * marker.
 * NOTE(review): d presumably holds the previously read character --
 * confirm.
 * On an incomplete word, mime_last is set to mime_input,
 * mime_decode_mode is set to 1 (no decoding) and switch_mime_getc() is
 * called so that the buffered text is still replayed.
 */
5493 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5497 /* In buffered mode, read until =? or NL or buffer full
5499 mime_input = mime_top;
5500 mime_last = mime_top;
5502 while(*p) Fifo(mime_input++) = *p++;
5505 while((c=(*i_getc)(f))!=EOF) {
5506 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5507 break; /* buffer full */
5509 if (c=='=' && d=='?') {
5510 /* checked. skip header, start decode */
5511 Fifo(mime_input++) = (unsigned char)c;
5512 /* mime_last_input = mime_input; */
5517 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5519 /* Should we check length mod 4? */
5520 Fifo(mime_input++) = (unsigned char)c;
5523 /* In case of Incomplete MIME, no MIME decode */
5524 Fifo(mime_input++) = (unsigned char)c;
5525 mime_last = mime_input; /* point undecoded buffer */
5526 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5527 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one base64 alphabet character to its 6-bit value.
 * The arithmetic uses character constants so that the offsets are
 * spelled in terms of the alphabet itself:
 *   'A'..'Z' -> 0..25     (c - 'A')
 *   'a'..'z' -> 26..51    (c - 'G', i.e. c - 'a' + 26)
 *   '0'..'9' -> 52..61    (c - '0' + '4', i.e. c - '0' + 52)
 *   '+', '-' -> 62        ('>')
 *   '_'      -> 63        ('?')
 *   anything else in the final branch, including '/', -> 63 ('?')
 * No error value is produced for characters outside the alphabet.
 */
5531 nkf_char base64decode(nkf_char c)
5536 i = c - 'A'; /* A..Z 0-25 */
5537 } else if (c == '_') {
5538 i = '?' /* 63 */ ; /* _ 63 */
5540 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5542 } else if (c > '/') {
5543 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5544 } else if (c == '+' || c == '-') {
5545 i = '>' /* 62 */ ; /* + and - 62 */
5547 i = '?' /* 63 */ ; /* / 63 */
/* The 64-character base64 alphabet, indexed by 6-bit value. */
5552 static const char basis_64[] =
5553 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Byte carried between base64 output steps; its low bits are combined
   with the next byte in mimeout_addchar and flushed in close_mime. */
5555 static nkf_char b64c;
/* Capacity of the pending-output buffer below (one extra slot is allocated). */
5556 #define MIMEOUT_BUF_LENGTH (60)
/* Characters held back by the MIME encoder before they are emitted. */
5557 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently stored in mimeout_buf. */
5558 int mimeout_buf_count = 0;
/*
 * open_mime: start a MIME encoded word for the output encoding `mode`.
 * The preamble p defaults to mime_pattern[0] and is replaced by the
 * entry whose mime_encode[] value equals mode; mimeout_mode is taken
 * from mime_encode_method[] at the same index.
 * White space (SP, TAB, CR, LF) held at the front of mimeout_buf is
 * written through o_mputc rather than encoded, with separate handling
 * when base64_count exceeds 45.
 * Finally the buffered characters are re-submitted through mime_putc
 * after mimeout_buf_count has been reset to 0.
 */
5560 void open_mime(nkf_char mode)
5562 const unsigned char *p;
5565 p = mime_pattern[0];
5566 for(i=0;mime_pattern[i];i++) {
5567 if (mode == mime_encode[i]) {
5568 p = mime_pattern[i];
5572 mimeout_mode = mime_encode_method[i];
5574 if (base64_count>45) {
5575 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5576 (*o_mputc)(mimeout_buf[i]);
5582 if (mimeout_buf_count>0
5583 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5584 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5588 for (;i<mimeout_buf_count;i++) {
5589 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5590 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5591 (*o_mputc)(mimeout_buf[i]);
/* remember the old count in j and empty the buffer before replaying it */
5601 j = mimeout_buf_count;
5602 mimeout_buf_count = 0;
5604 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the current MIME encoded word.
 * Depending on mimeout_mode, the bits still held in b64c are flushed as
 * a final base64 character: the low 2 bits shifted left by 4, or the
 * low 4 bits shifted left by 2.
 * NOTE(review): the padding and trailer output that follows is expected
 * here but should be confirmed.
 * Further handling depends on mimeout_mode > 0, on mimeout_f not being
 * FIXED_MIME, and on the mode not being 'Q'.
 */
5608 void close_mime(void)
5618 switch(mimeout_mode) {
5623 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5629 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5634 if (mimeout_mode > 0) {
5635 if (mimeout_f!=FIXED_MIME) {
5637 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write
 * the result through o_mputc.
 * In the hex-escape branch a non-alphanumeric c is written as two hex
 * digits (high nibble, then low nibble) via bin2hex.
 * In the base64 branches the output is built from basis_64:
 *   - first byte of a group: c >> 2
 *   - second byte: low 2 bits of b64c combined with the high 4 bits of c
 *   - third byte: low 4 bits of b64c combined with the high 2 bits of c,
 *     followed by the low 6 bits of c
 * NOTE(review): b64c is presumably updated with c after each step --
 * confirm.
 */
5642 void mimeout_addchar(nkf_char c)
5644 switch(mimeout_mode) {
5649 } else if(!nkf_isalnum(c)) {
5651 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5652 (*o_mputc)(bin2hex((c&0xf)));
5661 (*o_mputc)(basis_64[c>>2]);
5666 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5672 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5673 (*o_mputc)(basis_64[c & 0x3F]);
5684 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * mime_prechar: called with the next character pair (c2, c1) to decide
 * whether the output line must be folded first.
 * The projected line length is base64_count + mimeout_buf_count/3*4
 * (three buffered bytes become four base64 characters).
 * When it exceeds the threshold for the current state (73, 66 or 60 in
 * the visible branches), the sequence (EOF,0), (0,LF), (0,SP) is sent
 * through o_base64conv.
 * In the last branch (c2 != EOF) mimeout_mode is chosen first -- 'Q' for
 * ASCII or ISO8859_1 output, 'B' otherwise -- and open_mime is called.
 */
5686 void mime_prechar(nkf_char c2, nkf_char c1)
5688 if (mimeout_mode > 0){
5690 if (base64_count + mimeout_buf_count/3*4> 73){
5691 (*o_base64conv)(EOF,0);
5692 (*o_base64conv)(0,LF);
5693 (*o_base64conv)(0,SP);
5697 if (base64_count + mimeout_buf_count/3*4> 66) {
5698 (*o_base64conv)(EOF,0);
5699 (*o_base64conv)(0,LF);
5700 (*o_base64conv)(0,SP);
5706 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5707 mimeout_mode = (output_mode==ASCII ||output_mode == ISO8859_1) ? 'Q' : 'B';
5708 open_mime(output_mode);
5709 (*o_base64conv)(EOF,0);
5710 (*o_base64conv)(0,LF);
5711 (*o_base64conv)(0,SP);
/*
 * mime_putc: output character c through the MIME encoder (c == EOF
 * flushes).
 * Two regimes are visible:
 *   - mimeout_f == FIXED_MIME: line-length checks against
 *     base64_count > 71, with separate handling for 'Q' mode and for
 *     EOF / non-EOF input.
 *   - otherwise: characters are held in mimeout_buf so that white space
 *     and plain ASCII runs can be emitted unencoded (o_mputc) while other
 *     text goes through mimeout_addchar; open_mime is called when
 *     encoding has to start.
 * lastchar is the most recently buffered character, when one exists.
 */
5718 void mime_putc(nkf_char c)
5723 if (mimeout_f == FIXED_MIME){
5724 if (mimeout_mode == 'Q'){
5725 if (base64_count > 71){
5726 if (c!=CR && c!=LF) {
5733 if (base64_count > 71){
5738 if (c == EOF) { /* c==EOF */
5742 if (c != EOF) { /* c!=EOF */
5748 /* mimeout_f != FIXED_MIME */
/* EOF: flush whatever is still buffered */
5750 if (c == EOF) { /* c==EOF */
5751 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
5752 j = mimeout_buf_count;
5753 mimeout_buf_count = 0;
5755 if (mimeout_mode > 0) {
5756 if (!nkf_isblank(mimeout_buf[j-1])) {
5758 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5761 mimeout_addchar(mimeout_buf[i]);
5765 mimeout_addchar(mimeout_buf[i]);
5769 mimeout_addchar(mimeout_buf[i]);
5775 mimeout_addchar(mimeout_buf[i]);
5781 if (mimeout_buf_count > 0){
5782 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* ---- currently inside a Q-encoded word ---- */
5787 if (mimeout_mode=='Q') {
5788 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5789 if (c == CR || c == LF) {
5794 } else if (c <= SP) {
5796 if (base64_count > 70) {
5800 if (!nkf_isblank(c)) {
5805 if (base64_count > 70) {
5810 open_mime(output_mode);
5812 if (!nkf_noescape_mime(c)) {
/* ---- not inside an encoded word (mimeout_mode <= 0) ---- */
5823 if (mimeout_mode <= 0) {
5824 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5825 if (nkf_isspace(c)) {
5827 if (mimeout_mode == -1) {
5830 if (c==CR || c==LF) {
5832 open_mime(output_mode);
5838 for (i=0;i<mimeout_buf_count;i++) {
5839 (*o_mputc)(mimeout_buf[i]);
5840 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5851 mimeout_buf[0] = (char)c;
5852 mimeout_buf_count = 1;
5854 if (base64_count > 1
5855 && base64_count + mimeout_buf_count > 76
5856 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5859 if (!nkf_isspace(mimeout_buf[0])){
5864 mimeout_buf[mimeout_buf_count++] = (char)c;
/* buffer overflow: the held-back run is too long to stay unencoded */
5865 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5866 open_mime(output_mode);
5871 if (lastchar==CR || lastchar == LF){
5872 for (i=0;i<mimeout_buf_count;i++) {
5873 (*o_mputc)(mimeout_buf[i]);
5876 mimeout_buf_count = 0;
5879 for (i=0;i<mimeout_buf_count-1;i++) {
5880 (*o_mputc)(mimeout_buf[i]);
5883 mimeout_buf[0] = SP;
5884 mimeout_buf_count = 1;
5886 open_mime(output_mode);
5889 /* mimeout_mode == 'B', 1, 2 */
5890 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5891 if (lastchar == CR || lastchar == LF){
5892 if (nkf_isblank(c)) {
5893 for (i=0;i<mimeout_buf_count;i++) {
5894 mimeout_addchar(mimeout_buf[i]);
5896 mimeout_buf_count = 0;
5897 } else if (SP<c && c<DEL) {
5899 for (i=0;i<mimeout_buf_count;i++) {
5900 (*o_mputc)(mimeout_buf[i]);
5903 mimeout_buf_count = 0;
5905 mimeout_buf[mimeout_buf_count++] = (char)c;
5908 if (c==SP || c==TAB || c==CR || c==LF) {
5909 for (i=0;i<mimeout_buf_count;i++) {
5910 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5912 for (i=0;i<mimeout_buf_count;i++) {
5913 (*o_mputc)(mimeout_buf[i]);
5916 mimeout_buf_count = 0;
5919 mimeout_buf[mimeout_buf_count++] = (char)c;
5920 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5922 for (i=0;i<mimeout_buf_count;i++) {
5923 (*o_mputc)(mimeout_buf[i]);
5926 mimeout_buf_count = 0;
5930 if (mimeout_buf_count>0 && SP<c && c!='=') {
5931 mimeout_buf[mimeout_buf_count++] = (char)c;
5932 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5933 j = mimeout_buf_count;
5934 mimeout_buf_count = 0;
5936 mimeout_addchar(mimeout_buf[i]);
5943 if (mimeout_buf_count>0) {
5944 j = mimeout_buf_count;
5945 mimeout_buf_count = 0;
5947 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5949 mimeout_addchar(mimeout_buf[i]);
5955 (*o_mputc)(mimeout_buf[i]);
5957 open_mime(output_mode);
/*
 * Re-initialisation of the converter state: every option flag, table and
 * function hook assigned below is put back to its default value.
 * The conversion hooks (o_zconv, o_fconv, ...) are pointed at
 * no_connection, and the unget hooks at std_ungetc.
 */
5967 struct input_code *p = input_code_list;
5980 mime_f = MIME_DECODE_DEFAULT;
5981 mime_decode_f = FALSE;
5986 x0201_f = X0201_DEFAULT;
5987 iso2022jp_f = FALSE;
5988 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5989 ms_ucs_map_f = UCS_MAP_ASCII;
5991 #ifdef UTF8_INPUT_ENABLE
5992 no_cp932ext_f = FALSE;
5993 no_best_fit_chars_f = FALSE;
5994 encode_fallback = NULL;
5995 unicode_subchar = '?';
5996 input_endian = ENDIAN_BIG;
5998 #ifdef UTF8_OUTPUT_ENABLE
5999 output_bom_f = FALSE;
6000 output_endian = ENDIAN_BIG;
6002 #ifdef UNICODE_NORMALIZATION
6018 #ifdef SHIFTJIS_CP932
/* clear all 256 entries of the prefix table */
6028 for (i = 0; i < 256; i++){
6029 prefix_table[i] = 0;
6033 mimeout_buf_count = 0;
6038 fold_preserve_f = FALSE;
6041 kanji_intro = DEFAULT_J;
6042 ascii_intro = DEFAULT_R;
6043 fold_margin = FOLD_MARGIN;
6044 output_conv = DEFAULT_CONV;
6045 oconv = DEFAULT_CONV;
/* unconnected output stages report an error through no_connection */
6046 o_zconv = no_connection;
6047 o_fconv = no_connection;
6048 o_nlconv = no_connection;
6049 o_rot_conv = no_connection;
6050 o_hira_conv = no_connection;
6051 o_base64conv = no_connection;
6052 o_iso2022jp_check_conv = no_connection;
6055 i_ungetc = std_ungetc;
6057 i_bungetc = std_ungetc;
6060 i_mungetc = std_ungetc;
6061 i_mgetc_buf = std_getc;
6062 i_mungetc_buf = std_ungetc;
6063 output_mode = ASCII;
6066 mime_decode_mode = FALSE;
6074 z_prev2=0,z_prev1=0;
6076 iconv_for_check = 0;
6078 input_codename = NULL;
/*
 * no_connection: default target for two-argument conversion hooks that
 * have not been connected; forwards to no_connection2 with c0 = 0 and
 * discards its result.
 */
6084 void no_connection(nkf_char c2, nkf_char c1)
6086 no_connection2(c2,c1,0);
/*
 * no_connection2: report that a conversion hook was invoked without
 * having been connected. Writes a diagnostic to stderr; the arguments
 * are not used by the visible statements.
 * NOTE(review): the process is presumably terminated after the message,
 * which would explain the "LINT" note on the return -- confirm.
 */
6089 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6091 fprintf(stderr,"nkf internal module connection failure.\n");
6093 return 0; /* LINT */
/* Route the fprintf calls below through dllprintf.
   NOTE(review): presumably only in the DLL build -- confirm the enclosing condition. */
6098 #define fprintf dllprintf
/*
 * Usage text: one fprintf to stderr per help line. The "Output code"
 * line is selected by whichever DEFAULT_CODE_* macro is defined, and
 * several option lines are compiled in only when the matching feature
 * macro (UTF8_OUTPUT_ENABLE, UTF8_INPUT_ENABLE, NUMCHAR_OPTION) is set.
 */
6102 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6103 fprintf(stderr,"Flags:\n");
6104 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6105 #ifdef DEFAULT_CODE_SJIS
6106 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6108 #ifdef DEFAULT_CODE_JIS
6109 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6111 #ifdef DEFAULT_CODE_EUC
6112 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6114 #ifdef DEFAULT_CODE_UTF8
6115 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6117 #ifdef UTF8_OUTPUT_ENABLE
6118 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6120 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6121 #ifdef UTF8_INPUT_ENABLE
6122 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6124 fprintf(stderr,"t no conversion\n");
6125 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6126 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6127 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6128 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6129 fprintf(stderr,"v Show this usage. V: show version\n");
6130 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6131 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6132 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6133 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6134 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6135 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6136 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6137 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6138 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6140 fprintf(stderr,"T Text mode output\n");
6142 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6143 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6144 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6145 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6146 fprintf(stderr,"\n");
/* long-option section */
6147 fprintf(stderr,"Long name options\n");
6148 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6149 fprintf(stderr," Specify the input or output codeset\n");
6150 fprintf(stderr," --fj --unix --mac --windows\n");
6151 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6152 fprintf(stderr," Convert for the system or code\n");
6153 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6154 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6155 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6157 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6159 #ifdef NUMCHAR_OPTION
6160 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6162 #ifdef UTF8_INPUT_ENABLE
6163 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6164 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6167 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6168 fprintf(stderr," Overwrite original listed files by filtered result\n");
6169 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6171 fprintf(stderr," -g --guess Guess the input code\n");
6172 fprintf(stderr," --help --version Show this help/the version\n");
6173 fprintf(stderr," For more information, see also man nkf\n");
6174 fprintf(stderr,"\n");
/*
 * show_configuration: print a summary of the build configuration to
 * stderr: the version and release date (NKF_VERSION, NKF_RELEASE_DATE),
 * the default encoding selected by the DEFAULT_CODE_* macros, and
 * whether MIME decoding and JIS X 0201 Katakana conversion default to
 * ON or OFF (from MIME_DECODE_DEFAULT and X0201_DEFAULT).
 */
6178 void show_configuration(void)
6180 fprintf(stderr, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6181 fprintf(stderr, " Compile-time options:\n");
6182 fprintf(stderr, " Default encoding: "
6183 #if defined(DEFAULT_CODE_JIS)
6185 #elif defined(DEFAULT_CODE_SJIS)
6187 #elif defined(DEFAULT_CODE_EUC)
6189 #elif defined(DEFAULT_CODE_UTF8)
6195 fprintf(stderr, " Decode MIME encoded string: %s\n", MIME_DECODE_DEFAULT ? "ON" : "OFF");
6196 fprintf(stderr, " Convert JIS X 0201 Katakana: %s\n", X0201_DEFAULT ? "ON" : "OFF");
6202 fprintf(stderr,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");