1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SorceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.149 2007/11/18 12:05:18 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-11-18"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #ifndef MIME_DECODE_DEFAULT
44 #define MIME_DECODE_DEFAULT STRICT_MIME
47 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
49 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
65 #if defined(MSDOS) || defined(__OS2__)
68 #if defined(_MSC_VER) || defined(__WATCOMC__)
69 #define mktemp _mktemp
75 #define setbinmode(fp) fsetbin(fp)
76 #elif defined(__DJGPP__)
77 #include <libc/dosio.h>
78 #define setbinmode(fp) djgpp_setbinmode(fp)
79 #else /* Microsoft C, Turbo C */
80 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
83 #define setbinmode(fp)
86 #if defined(__DJGPP__)
87 void djgpp_setbinmode(FILE *fp)
89 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
92 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
93 __file_handle_set(fd, m);
97 #ifdef _IOFBF /* SysV and MSDOS, Windows */
98 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
100 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
103 /*Borland C++ 4.5 EasyWin*/
104 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
113 /* added by satoru@isoternet.org */
115 #include <sys/types.h>
117 #include <sys/stat.h>
118 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
120 #if defined(__WATCOMC__)
121 #include <sys/utime.h>
125 #else /* defined(MSDOS) */
127 #ifdef __BORLANDC__ /* BCC32 */
129 #else /* !defined(__BORLANDC__) */
130 #include <sys/utime.h>
131 #endif /* (__BORLANDC__) */
132 #else /* !defined(__WIN32__) */
133 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
134 #include <sys/utime.h>
135 #elif defined(__TURBOC__) /* BCC */
137 #elif defined(LSI_C) /* LSI C */
138 #endif /* (__WIN32__) */
146 /* state of output_mode and input_mode
163 #define X0213_1 0x284F
164 #define X0213_2 0x2850
166 /* Input Assumption */
171 #define LATIN1_INPUT 6
173 #define STRICT_MIME 8
178 #define JAPANESE_EUC 10
182 #define UTF8_INPUT 13
183 #define UTF16_INPUT 1015
184 #define UTF32_INPUT 1017
188 #define ENDIAN_BIG 1234
189 #define ENDIAN_LITTLE 4321
190 #define ENDIAN_2143 2143
191 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification / conversion helpers.
 *
 * All of these are function-like macros: the argument can be evaluated
 * several times, so never pass an expression with side effects (c++, getc()).
 * Every use of the parameter is parenthesized so that an expression argument
 * (e.g. bin2hex(a | b)) is evaluated as a unit.
 *
 * SP, TAB, CR, LF, DEL and SS3 are not defined in this block; they must be
 * in scope wherever the macros that use them are expanded.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* value of one hexadecimal digit character; yields 0 for any other character */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* upper-case hexadecimal digit for the low 4 bits of c */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* true when bits 8..15 of c2 (taken as unsigned short) equal SS3 */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* true for CR, LF, or a character strictly between SP and DEL other than ? = _ ( ) . " */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/* Inclusive bounds of the CP932 and CP932INV table ranges. */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * The parameter is parenthesized so expression arguments are evaluated as a
 * unit; it is still evaluated twice, so avoid side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
241 #define HOLD_SIZE 1024
242 #if defined(INT_IS_SHORT)
243 #define IOBUF_SIZE 2048
245 #define IOBUF_SIZE 16384
248 #define DEFAULT_J 'B'
249 #define DEFAULT_R 'B'
251 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
252 #define SJ6394 0x0161 /* 63 - 94 ku offset */
254 #define RANGE_NUM_MAX 18
259 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
260 #define sizeof_euc_to_utf8_1byte 94
261 #define sizeof_euc_to_utf8_2bytes 94
262 #define sizeof_utf8_to_euc_C2 64
263 #define sizeof_utf8_to_euc_E5B8 64
264 #define sizeof_utf8_to_euc_2bytes 112
265 #define sizeof_utf8_to_euc_3bytes 16
268 /* MIME preprocessor */
270 #ifdef EASYWIN /*Easy Win */
271 extern POINT _BufferSize;
280 void (*status_func)(struct input_code *, nkf_char);
281 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
285 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
288 static const char *CopyRight = COPY_RIGHT;
290 #if !defined(PERL_XS) && !defined(WIN32DLL)
291 static nkf_char noconvert(FILE *f);
293 static void module_connection(void);
294 static nkf_char kanji_convert(FILE *f);
295 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
296 static nkf_char push_hold_buf(nkf_char c2);
297 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
298 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
299 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
300 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
301 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
303 * 0: Shift_JIS, eucJP-ascii
308 #define UCS_MAP_ASCII 0
310 #define UCS_MAP_CP932 2
311 #define UCS_MAP_CP10001 3
312 static int ms_ucs_map_f = UCS_MAP_ASCII;
314 #ifdef UTF8_INPUT_ENABLE
315 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
316 static int no_cp932ext_f = FALSE;
317 /* ignore ZERO WIDTH NO-BREAK SPACE */
318 static int no_best_fit_chars_f = FALSE;
319 static int input_endian = ENDIAN_BIG;
320 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
321 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
322 static void encode_fallback_html(nkf_char c);
323 static void encode_fallback_xml(nkf_char c);
324 static void encode_fallback_java(nkf_char c);
325 static void encode_fallback_perl(nkf_char c);
326 static void encode_fallback_subchar(nkf_char c);
327 static void (*encode_fallback)(nkf_char c) = NULL;
328 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
329 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
330 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
331 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
332 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
333 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
334 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
335 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
336 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
337 static void w_status(struct input_code *, nkf_char);
339 #ifdef UTF8_OUTPUT_ENABLE
340 static int output_bom_f = FALSE;
341 static int output_endian = ENDIAN_BIG;
342 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
343 static void w_oconv(nkf_char c2,nkf_char c1);
344 static void w_oconv16(nkf_char c2,nkf_char c1);
345 static void w_oconv32(nkf_char c2,nkf_char c1);
347 static void e_oconv(nkf_char c2,nkf_char c1);
348 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
349 static void s_oconv(nkf_char c2,nkf_char c1);
350 static void j_oconv(nkf_char c2,nkf_char c1);
351 static void fold_conv(nkf_char c2,nkf_char c1);
352 static void nl_conv(nkf_char c2,nkf_char c1);
353 static void z_conv(nkf_char c2,nkf_char c1);
354 static void rot_conv(nkf_char c2,nkf_char c1);
355 static void hira_conv(nkf_char c2,nkf_char c1);
356 static void base64_conv(nkf_char c2,nkf_char c1);
357 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
358 static void no_connection(nkf_char c2,nkf_char c1);
359 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
361 static void code_score(struct input_code *ptr);
362 static void code_status(nkf_char c);
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_begin(FILE *f);
372 static nkf_char mime_getc(FILE *f);
373 static nkf_char mime_ungetc(nkf_char c,FILE *f);
375 static void switch_mime_getc(void);
376 static void unswitch_mime_getc(void);
377 static nkf_char mime_begin_strict(FILE *f);
378 static nkf_char mime_getc_buf(FILE *f);
379 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
380 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
382 static nkf_char base64decode(nkf_char c);
383 static void mime_prechar(nkf_char c2, nkf_char c1);
384 static void mime_putc(nkf_char c);
385 static void open_mime(nkf_char c);
386 static void close_mime(void);
387 static void eof_mime(void);
388 static void mimeout_addchar(nkf_char c);
390 static void usage(void);
391 static void version(void);
393 static void options(unsigned char *c);
394 static void reinit(void);
398 #if !defined(PERL_XS) && !defined(WIN32DLL)
399 static unsigned char stdibuf[IOBUF_SIZE];
400 static unsigned char stdobuf[IOBUF_SIZE];
402 static unsigned char hold_buf[HOLD_SIZE*2];
403 static int hold_count = 0;
405 /* MIME preprocessor fifo */
407 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
408 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
409 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
410 static unsigned char mime_buf[MIME_BUF_SIZE];
411 static unsigned int mime_top = 0;
412 static unsigned int mime_last = 0; /* decoded */
413 static unsigned int mime_input = 0; /* undecoded */
414 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
417 static int unbuf_f = FALSE;
418 static int estab_f = FALSE;
419 static int nop_f = FALSE;
420 static int binmode_f = TRUE; /* binary mode */
421 static int rot_f = FALSE; /* rot14/43 mode -- NOTE(review): presumably ROT13/47; confirm against rot_conv */
422 static int hira_f = FALSE; /* hiragana/katakana conversion */
423 static int input_f = FALSE; /* non fixed input code */
424 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
425 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
426 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
427 static int mimebuf_f = FALSE; /* MIME buffered input */
428 static int broken_f = FALSE; /* convert ESC-less broken JIS */
429 static int iso8859_f = FALSE; /* ISO8859 through */
430 static int mimeout_f = FALSE; /* base64 mode */
431 #if defined(MSDOS) || defined(__OS2__)
432 static int x0201_f = TRUE; /* Assume JISX0201 kana */
434 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
436 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
438 #ifdef UNICODE_NORMALIZATION
439 static int nfc_f = FALSE;
440 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
441 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
442 static nkf_char nfc_getc(FILE *f);
443 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
447 static int cap_f = FALSE;
448 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
449 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
450 static nkf_char cap_getc(FILE *f);
451 static nkf_char cap_ungetc(nkf_char c,FILE *f);
453 static int url_f = FALSE;
454 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
455 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
456 static nkf_char url_getc(FILE *f);
457 static nkf_char url_ungetc(nkf_char c,FILE *f);
460 #if defined(INT_IS_SHORT)
461 #define NKF_INT32_C(n) (n##L)
463 #define NKF_INT32_C(n) (n)
/*
 * Constants and predicates for values that carry a class tag in their top
 * byte.  CLASS_MASK selects the top 8 bits, VALUE_MASK the low 24 bits.
 * NKF_INT32_C is not defined in this block and must already be in scope.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* true when the class byte of c equals CLASS_UNICODE */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* true when the low 24 bits of c do not exceed 0xFFFF */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
473 #ifdef NUMCHAR_OPTION
474 static int numchar_f = FALSE;
475 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
476 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
477 static nkf_char numchar_getc(FILE *f);
478 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
482 static int noout_f = FALSE;
483 static void no_putc(nkf_char c);
484 static int debug_f = FALSE;
485 static void debug(const char *str);
486 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
489 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
491 static void print_guessed_code(char *filename);
493 static void set_input_codename(char *codename);
496 static int exec_f = 0;
499 #ifdef SHIFTJIS_CP932
500 /* invert IBM extended characters to others */
501 static int cp51932_f = FALSE;
503 /* invert NEC-selected IBM extended characters to IBM extended characters */
504 static int cp932inv_f = TRUE;
506 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
507 #endif /* SHIFTJIS_CP932 */
510 static int x0212_f = FALSE;
511 static nkf_char x0212_shift(nkf_char c);
512 static nkf_char x0212_unshift(nkf_char c);
514 static int x0213_f = FALSE;
516 static unsigned char prefix_table[256];
518 static void set_code_score(struct input_code *ptr, nkf_char score);
519 static void clr_code_score(struct input_code *ptr, nkf_char score);
520 static void status_disable(struct input_code *ptr);
521 static void status_push_ch(struct input_code *ptr, nkf_char c);
522 static void status_clear(struct input_code *ptr);
523 static void status_reset(struct input_code *ptr);
524 static void status_reinit(struct input_code *ptr);
525 static void status_check(struct input_code *ptr, nkf_char c);
526 static void e_status(struct input_code *, nkf_char);
527 static void s_status(struct input_code *, nkf_char);
529 struct input_code input_code_list[] = {
530 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
531 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
532 #ifdef UTF8_INPUT_ENABLE
533 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
534 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
535 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
540 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
541 static int base64_count = 0;
543 /* X0208 -> ASCII converter */
546 static int f_line = 0; /* chars in line */
547 static int f_prev = 0;
548 static int fold_preserve_f = FALSE; /* preserve new lines */
549 static int fold_f = FALSE;
550 static int fold_len = 0;
553 static unsigned char kanji_intro = DEFAULT_J;
554 static unsigned char ascii_intro = DEFAULT_R;
558 #define FOLD_MARGIN 10
559 #define DEFAULT_FOLD 60
561 static int fold_margin = FOLD_MARGIN;
565 #ifdef DEFAULT_CODE_JIS
566 # define DEFAULT_CONV j_oconv
568 #ifdef DEFAULT_CODE_SJIS
569 # define DEFAULT_CONV s_oconv
571 #ifdef DEFAULT_CODE_EUC
572 # define DEFAULT_CONV e_oconv
574 #ifdef DEFAULT_CODE_UTF8
575 # define DEFAULT_CONV w_oconv
578 /* process default */
579 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
581 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
582 /* s_iconv or oconv */
583 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
585 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
587 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
588 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
589 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
590 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
591 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
593 /* static redirections */
595 static void (*o_putc)(nkf_char c) = std_putc;
597 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
598 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
600 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
601 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
603 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
605 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
606 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
608 /* for strict mime */
609 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
610 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
613 static int output_mode = ASCII, /* output kanji mode */
614 input_mode = ASCII, /* input kanji mode */
615 shift_mode = FALSE; /* TRUE shift out, or X0201 */
616 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
618 /* X0201 / X0208 conversion tables */
620 /* X0201 kana conversion table */
622 static const unsigned char cv[]= {
623 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
624 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
625 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
626 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
627 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
628 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
629 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
630 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
631 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
632 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
633 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
634 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
635 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
636 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
637 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
638 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
642 /* X0201 kana conversion table for dakuten */
644 static const unsigned char dv[]= {
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
650 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
651 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
652 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
653 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
654 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
656 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 /* X0201 kana conversion table for han-dakuten */
665 static const unsigned char ev[]= {
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
677 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
681 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
685 /* X0208 kigou conversion table */
686 /* 0x8140 - 0x819e */
687 static const unsigned char fv[] = {
689 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
690 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
691 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
692 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
693 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
694 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
695 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
696 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
697 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
705 static int file_out_f = FALSE;
707 static int overwrite_f = FALSE;
708 static int preserve_time_f = FALSE;
709 static int backup_f = FALSE;
710 static char *backup_suffix = "";
711 static char *get_backup_filename(const char *suffix, const char *filename);
714 static int nlmode_f = 0; /* CR, LF, CRLF */
715 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
716 static nkf_char prev_cr = 0; /* CR or 0 */
717 #ifdef EASYWIN /*Easy Win */
718 static int end_check;
721 #define STD_GC_BUFSIZE (256)
722 nkf_char std_gc_buf[STD_GC_BUFSIZE];
726 #include "nkf32dll.c"
727 #elif defined(PERL_XS)
729 int main(int argc, char **argv)
734 char *outfname = NULL;
737 #ifdef EASYWIN /*Easy Win */
738 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
741 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
742 cp = (unsigned char *)*argv;
746 int debug_f_back = debug_f;
749 int exec_f_back = exec_f;
752 int x0212_f_back = x0212_f;
755 int x0213_f_back = x0213_f;
757 int guess_f_back = guess_f;
759 guess_f = guess_f_back;
762 debug_f = debug_f_back;
765 exec_f = exec_f_back;
768 x0212_f = x0212_f_back;
771 x0213_f = x0213_f_back;
777 if (pipe(fds) < 0 || (pid = fork()) < 0){
788 execvp(argv[1], &argv[1]);
802 if(x0201_f == WISH_TRUE)
803 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
805 if (binmode_f == TRUE)
806 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
807 if (freopen("","wb",stdout) == NULL)
814 setbuf(stdout, (char *) NULL);
816 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
819 if (binmode_f == TRUE)
820 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
821 if (freopen("","rb",stdin) == NULL) return (-1);
825 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
829 kanji_convert(stdin);
830 if (guess_f) print_guessed_code(NULL);
834 int is_argument_error = FALSE;
836 input_codename = NULL;
841 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
844 is_argument_error = TRUE;
852 /* reopen file for stdout */
853 if (file_out_f == TRUE) {
856 outfname = malloc(strlen(origfname)
857 + strlen(".nkftmpXXXXXX")
863 strcpy(outfname, origfname);
867 for (i = strlen(outfname); i; --i){
868 if (outfname[i - 1] == '/'
869 || outfname[i - 1] == '\\'){
875 strcat(outfname, "ntXXXXXX");
877 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
880 strcat(outfname, ".nkftmpXXXXXX");
881 fd = mkstemp(outfname);
884 || (fd_backup = dup(fileno(stdout))) < 0
885 || dup2(fd, fileno(stdout)) < 0
896 outfname = "nkf.out";
899 if(freopen(outfname, "w", stdout) == NULL) {
903 if (binmode_f == TRUE) {
904 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
905 if (freopen("","wb",stdout) == NULL)
912 if (binmode_f == TRUE)
913 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
914 if (freopen("","rb",fin) == NULL)
919 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
923 char *filename = NULL;
925 if (nfiles > 1) filename = origfname;
926 if (guess_f) print_guessed_code(filename);
932 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
940 if (dup2(fd_backup, fileno(stdout)) < 0){
943 if (stat(origfname, &sb)) {
944 fprintf(stderr, "Can't stat %s\n", origfname);
946 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
947 if (chmod(outfname, sb.st_mode)) {
948 fprintf(stderr, "Can't set permission %s\n", outfname);
951 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
953 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
954 tb[0] = tb[1] = sb.st_mtime;
955 if (utime(outfname, tb)) {
956 fprintf(stderr, "Can't set timestamp %s\n", outfname);
959 tb.actime = sb.st_atime;
960 tb.modtime = sb.st_mtime;
961 if (utime(outfname, &tb)) {
962 fprintf(stderr, "Can't set timestamp %s\n", outfname);
967 char *backup_filename = get_backup_filename(backup_suffix, origfname);
969 unlink(backup_filename);
971 if (rename(origfname, backup_filename)) {
972 perror(backup_filename);
973 fprintf(stderr, "Can't rename %s to %s\n",
974 origfname, backup_filename);
978 if (unlink(origfname)){
983 if (rename(outfname, origfname)) {
985 fprintf(stderr, "Can't rename %s to %s\n",
986 outfname, origfname);
993 if (is_argument_error)
996 #ifdef EASYWIN /*Easy Win */
997 if (file_out_f == FALSE)
998 scanf("%d",&end_check);
1001 #else /* for Other OS */
1002 if (file_out_f == TRUE)
1004 #endif /*Easy Win */
1007 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for FILENAME according to SUFFIX.
 *
 * If SUFFIX contains one or more '*' characters, every '*' is replaced by
 * FILENAME and all other characters of SUFFIX are copied unchanged
 * (suffix "old_*~" with filename "a.txt" gives "old_a.txt~").
 * If SUFFIX contains no '*', SUFFIX is appended to FILENAME.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or NULL
 * (after reporting through perror()) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t size;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' disappears and is replaced by one copy of filename */
        size = (suffix_length - asterisk_count)
             + asterisk_count * filename_length + 1;
    } else {
        /* The original allocated only one byte here and then copied the
         * whole name into it; reserve room for filename + suffix + NUL. */
        size = filename_length + suffix_length + 1;
    }

    backup_filename = malloc(size);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        j = filename_length + suffix_length;
    }
    backup_filename[j] = '\0';

    return backup_filename;
}
1050 static const struct {
1074 {"katakana-hiragana","h3"},
1082 #ifdef UTF8_OUTPUT_ENABLE
1092 {"fb-subchar=", ""},
1094 #ifdef UTF8_INPUT_ENABLE
1095 {"utf8-input", "W"},
1096 {"utf16-input", "W16"},
1097 {"no-cp932ext", ""},
1098 {"no-best-fit-chars",""},
1100 #ifdef UNICODE_NORMALIZATION
1101 {"utf8mac-input", ""},
1113 #ifdef NUMCHAR_OPTION
1114 {"numchar-input", ""},
1120 #ifdef SHIFTJIS_CP932
1130 static int option_mode = 0;
1132 void options(unsigned char *cp)
1136 unsigned char *cp_back = NULL;
1141 while(*cp && *cp++!='-');
1142 while (*cp || cp_back) {
1150 case '-': /* literal options */
1151 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1155 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1156 p = (unsigned char *)long_option[i].name;
1157 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1158 if (*p == cp[j] || cp[j] == SP){
1165 fprintf(stderr, "unknown long option: --%s\n", cp);
1168 while(*cp && *cp != SP && cp++);
1169 if (long_option[i].alias[0]){
1171 cp = (unsigned char *)long_option[i].alias;
1173 if (strcmp(long_option[i].name, "ic=") == 0){
1174 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1175 codeset[i] = nkf_toupper(p[i]);
1178 if(strcmp(codeset, "ISO-2022-JP") == 0){
1179 input_f = JIS_INPUT;
1180 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1181 strcmp(codeset, "CP50220") == 0 ||
1182 strcmp(codeset, "CP50221") == 0 ||
1183 strcmp(codeset, "CP50222") == 0){
1184 input_f = JIS_INPUT;
1185 #ifdef SHIFTJIS_CP932
1188 #ifdef UTF8_OUTPUT_ENABLE
1189 ms_ucs_map_f = UCS_MAP_CP932;
1191 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1192 input_f = JIS_INPUT;
1196 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1197 input_f = JIS_INPUT;
1202 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1203 input_f = SJIS_INPUT;
1204 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1205 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1206 strcmp(codeset, "CP932") == 0 ||
1207 strcmp(codeset, "MS932") == 0){
1208 input_f = SJIS_INPUT;
1209 #ifdef SHIFTJIS_CP932
1212 #ifdef UTF8_OUTPUT_ENABLE
1213 ms_ucs_map_f = UCS_MAP_CP932;
1215 }else if(strcmp(codeset, "CP10001") == 0){
1216 input_f = SJIS_INPUT;
1217 #ifdef SHIFTJIS_CP932
1220 #ifdef UTF8_OUTPUT_ENABLE
1221 ms_ucs_map_f = UCS_MAP_CP10001;
1223 }else if(strcmp(codeset, "EUCJP") == 0 ||
1224 strcmp(codeset, "EUC-JP") == 0){
1225 input_f = EUC_INPUT;
1226 }else if(strcmp(codeset, "CP51932") == 0){
1227 input_f = EUC_INPUT;
1228 #ifdef SHIFTJIS_CP932
1231 #ifdef UTF8_OUTPUT_ENABLE
1232 ms_ucs_map_f = UCS_MAP_CP932;
1234 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1235 strcmp(codeset, "EUCJP-MS") == 0 ||
1236 strcmp(codeset, "EUCJPMS") == 0){
1237 input_f = EUC_INPUT;
1238 #ifdef SHIFTJIS_CP932
1241 #ifdef UTF8_OUTPUT_ENABLE
1242 ms_ucs_map_f = UCS_MAP_MS;
1244 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1245 strcmp(codeset, "EUCJP-ASCII") == 0){
1246 input_f = EUC_INPUT;
1247 #ifdef SHIFTJIS_CP932
1250 #ifdef UTF8_OUTPUT_ENABLE
1251 ms_ucs_map_f = UCS_MAP_ASCII;
1253 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1254 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1255 input_f = SJIS_INPUT;
1257 #ifdef SHIFTJIS_CP932
1260 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1261 strcmp(codeset, "EUC-JIS-2004") == 0){
1262 input_f = EUC_INPUT;
1264 #ifdef SHIFTJIS_CP932
1267 #ifdef UTF8_INPUT_ENABLE
1268 }else if(strcmp(codeset, "UTF-8") == 0 ||
1269 strcmp(codeset, "UTF-8N") == 0 ||
1270 strcmp(codeset, "UTF-8-BOM") == 0){
1271 input_f = UTF8_INPUT;
1272 #ifdef UNICODE_NORMALIZATION
1273 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1274 strcmp(codeset, "UTF-8-MAC") == 0){
1275 input_f = UTF8_INPUT;
1278 }else if(strcmp(codeset, "UTF-16") == 0 ||
1279 strcmp(codeset, "UTF-16BE") == 0 ||
1280 strcmp(codeset, "UTF-16BE-BOM") == 0){
1281 input_f = UTF16_INPUT;
1282 input_endian = ENDIAN_BIG;
1283 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1284 strcmp(codeset, "UTF-16LE-BOM") == 0){
1285 input_f = UTF16_INPUT;
1286 input_endian = ENDIAN_LITTLE;
1287 }else if(strcmp(codeset, "UTF-32") == 0 ||
1288 strcmp(codeset, "UTF-32BE") == 0 ||
1289 strcmp(codeset, "UTF-32BE-BOM") == 0){
1290 input_f = UTF32_INPUT;
1291 input_endian = ENDIAN_BIG;
1292 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1293 strcmp(codeset, "UTF-32LE-BOM") == 0){
1294 input_f = UTF32_INPUT;
1295 input_endian = ENDIAN_LITTLE;
1298 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1302 if (strcmp(long_option[i].name, "oc=") == 0){
1304 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1305 codeset[i] = nkf_toupper(p[i]);
1308 if(strcmp(codeset, "ISO-2022-JP") == 0){
1309 output_conv = j_oconv;
1310 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1311 output_conv = j_oconv;
1312 no_cp932ext_f = TRUE;
1313 #ifdef SHIFTJIS_CP932
1316 #ifdef UTF8_OUTPUT_ENABLE
1317 ms_ucs_map_f = UCS_MAP_CP932;
1319 }else if(strcmp(codeset, "CP50220") == 0){
1320 output_conv = j_oconv;
1322 #ifdef SHIFTJIS_CP932
1325 #ifdef UTF8_OUTPUT_ENABLE
1326 ms_ucs_map_f = UCS_MAP_CP932;
1328 }else if(strcmp(codeset, "CP50221") == 0){
1329 output_conv = j_oconv;
1330 #ifdef SHIFTJIS_CP932
1333 #ifdef UTF8_OUTPUT_ENABLE
1334 ms_ucs_map_f = UCS_MAP_CP932;
1336 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1337 output_conv = j_oconv;
1341 #ifdef SHIFTJIS_CP932
1344 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1345 output_conv = j_oconv;
1350 #ifdef SHIFTJIS_CP932
1353 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1354 output_conv = s_oconv;
1355 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1356 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1357 strcmp(codeset, "CP932") == 0 ||
1358 strcmp(codeset, "MS932") == 0){
1359 output_conv = s_oconv;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP932;
1363 }else if(strcmp(codeset, "CP10001") == 0){
1364 output_conv = s_oconv;
1365 #ifdef UTF8_OUTPUT_ENABLE
1366 ms_ucs_map_f = UCS_MAP_CP10001;
1368 }else if(strcmp(codeset, "EUCJP") == 0 ||
1369 strcmp(codeset, "EUC-JP") == 0){
1370 output_conv = e_oconv;
1371 }else if(strcmp(codeset, "CP51932") == 0){
1372 output_conv = e_oconv;
1373 #ifdef SHIFTJIS_CP932
1376 #ifdef UTF8_OUTPUT_ENABLE
1377 ms_ucs_map_f = UCS_MAP_CP932;
1379 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1380 strcmp(codeset, "EUCJP-MS") == 0 ||
1381 strcmp(codeset, "EUCJPMS") == 0){
1382 output_conv = e_oconv;
1386 #ifdef UTF8_OUTPUT_ENABLE
1387 ms_ucs_map_f = UCS_MAP_MS;
1389 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1390 strcmp(codeset, "EUCJP-ASCII") == 0){
1391 output_conv = e_oconv;
1395 #ifdef UTF8_OUTPUT_ENABLE
1396 ms_ucs_map_f = UCS_MAP_ASCII;
1398 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1399 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1400 output_conv = s_oconv;
1402 #ifdef SHIFTJIS_CP932
1405 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1406 strcmp(codeset, "EUC-JIS-2004") == 0){
1407 output_conv = e_oconv;
1412 #ifdef SHIFTJIS_CP932
1415 #ifdef UTF8_OUTPUT_ENABLE
1416 }else if(strcmp(codeset, "UTF-8") == 0){
1417 output_conv = w_oconv;
1418 }else if(strcmp(codeset, "UTF-8N") == 0){
1419 output_conv = w_oconv;
1420 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1421 output_conv = w_oconv;
1422 output_bom_f = TRUE;
1423 }else if(strcmp(codeset, "UTF-16BE") == 0){
1424 output_conv = w_oconv16;
1425 }else if(strcmp(codeset, "UTF-16") == 0 ||
1426 strcmp(codeset, "UTF-16BE-BOM") == 0){
1427 output_conv = w_oconv16;
1428 output_bom_f = TRUE;
1429 }else if(strcmp(codeset, "UTF-16LE") == 0){
1430 output_conv = w_oconv16;
1431 output_endian = ENDIAN_LITTLE;
1432 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1433 output_conv = w_oconv16;
1434 output_endian = ENDIAN_LITTLE;
1435 output_bom_f = TRUE;
1436 }else if(strcmp(codeset, "UTF-32") == 0 ||
1437 strcmp(codeset, "UTF-32BE") == 0){
1438 output_conv = w_oconv32;
1439 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1440 output_conv = w_oconv32;
1441 output_bom_f = TRUE;
1442 }else if(strcmp(codeset, "UTF-32LE") == 0){
1443 output_conv = w_oconv32;
1444 output_endian = ENDIAN_LITTLE;
1445 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1446 output_conv = w_oconv32;
1447 output_endian = ENDIAN_LITTLE;
1448 output_bom_f = TRUE;
1451 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1455 if (strcmp(long_option[i].name, "guess=") == 0){
1464 if (strcmp(long_option[i].name, "overwrite") == 0){
1467 preserve_time_f = TRUE;
1470 if (strcmp(long_option[i].name, "overwrite=") == 0){
1473 preserve_time_f = TRUE;
1475 backup_suffix = malloc(strlen((char *) p) + 1);
1476 strcpy(backup_suffix, (char *) p);
1479 if (strcmp(long_option[i].name, "in-place") == 0){
1482 preserve_time_f = FALSE;
1485 if (strcmp(long_option[i].name, "in-place=") == 0){
1488 preserve_time_f = FALSE;
1490 backup_suffix = malloc(strlen((char *) p) + 1);
1491 strcpy(backup_suffix, (char *) p);
1496 if (strcmp(long_option[i].name, "cap-input") == 0){
1500 if (strcmp(long_option[i].name, "url-input") == 0){
1505 #ifdef NUMCHAR_OPTION
1506 if (strcmp(long_option[i].name, "numchar-input") == 0){
1512 if (strcmp(long_option[i].name, "no-output") == 0){
1516 if (strcmp(long_option[i].name, "debug") == 0){
1521 if (strcmp(long_option[i].name, "cp932") == 0){
1522 #ifdef SHIFTJIS_CP932
1526 #ifdef UTF8_OUTPUT_ENABLE
1527 ms_ucs_map_f = UCS_MAP_CP932;
1531 if (strcmp(long_option[i].name, "no-cp932") == 0){
1532 #ifdef SHIFTJIS_CP932
1536 #ifdef UTF8_OUTPUT_ENABLE
1537 ms_ucs_map_f = UCS_MAP_ASCII;
1541 #ifdef SHIFTJIS_CP932
1542 if (strcmp(long_option[i].name, "cp932inv") == 0){
1549 if (strcmp(long_option[i].name, "x0212") == 0){
1556 if (strcmp(long_option[i].name, "exec-in") == 0){
1560 if (strcmp(long_option[i].name, "exec-out") == 0){
1565 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1566 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1567 no_cp932ext_f = TRUE;
1570 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1571 no_best_fit_chars_f = TRUE;
1574 if (strcmp(long_option[i].name, "fb-skip") == 0){
1575 encode_fallback = NULL;
1578 if (strcmp(long_option[i].name, "fb-html") == 0){
1579 encode_fallback = encode_fallback_html;
1582 if (strcmp(long_option[i].name, "fb-xml") == 0){
1583 encode_fallback = encode_fallback_xml;
1586 if (strcmp(long_option[i].name, "fb-java") == 0){
1587 encode_fallback = encode_fallback_java;
1590 if (strcmp(long_option[i].name, "fb-perl") == 0){
1591 encode_fallback = encode_fallback_perl;
1594 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1595 encode_fallback = encode_fallback_subchar;
1598 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1599 encode_fallback = encode_fallback_subchar;
1600 unicode_subchar = 0;
1602 /* decimal number */
1603 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1604 unicode_subchar *= 10;
1605 unicode_subchar += hex2bin(p[i]);
1607 }else if(p[1] == 'x' || p[1] == 'X'){
1608 /* hexadecimal number */
1609 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1610 unicode_subchar <<= 4;
1611 unicode_subchar |= hex2bin(p[i]);
1615 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1616 unicode_subchar *= 8;
1617 unicode_subchar += hex2bin(p[i]);
1620 w16e_conv(unicode_subchar, &i, &j);
1621 unicode_subchar = i<<8 | j;
1625 #ifdef UTF8_OUTPUT_ENABLE
1626 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1627 ms_ucs_map_f = UCS_MAP_MS;
1631 #ifdef UNICODE_NORMALIZATION
1632 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1633 input_f = UTF8_INPUT;
1638 if (strcmp(long_option[i].name, "prefix=") == 0){
1639 if (nkf_isgraph(p[0])){
1640 for (i = 1; nkf_isgraph(p[i]); i++){
1641 prefix_table[p[i]] = p[0];
1648 case 'b': /* buffered mode */
1651 case 'u': /* non bufferd mode */
1654 case 't': /* transparent mode */
1659 } else if (*cp=='2') {
1663 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1671 case 'j': /* JIS output */
1673 output_conv = j_oconv;
1675 case 'e': /* AT&T EUC output */
1676 output_conv = e_oconv;
1679 case 's': /* SJIS output */
1680 output_conv = s_oconv;
1682 case 'l': /* ISO8859 Latin-1 support, no conversion */
1683 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1684 input_f = LATIN1_INPUT;
1686 case 'i': /* Kanji IN ESC-$-@/B */
1687 if (*cp=='@'||*cp=='B')
1688 kanji_intro = *cp++;
1690 case 'o': /* ASCII IN ESC-(-J/B */
1691 if (*cp=='J'||*cp=='B'||*cp=='H')
1692 ascii_intro = *cp++;
1696 bit:1 katakana->hiragana
1697 bit:2 hiragana->katakana
1699 if ('9'>= *cp && *cp>='0')
1700 hira_f |= (*cp++ -'0');
1707 #if defined(MSDOS) || defined(__OS2__)
1722 #ifdef UTF8_OUTPUT_ENABLE
1723 case 'w': /* UTF-8 output */
1725 output_conv = w_oconv; cp++;
1729 output_bom_f = TRUE;
1732 if ('1'== cp[0] && '6'==cp[1]) {
1733 output_conv = w_oconv16; cp+=2;
1734 } else if ('3'== cp[0] && '2'==cp[1]) {
1735 output_conv = w_oconv32; cp+=2;
1737 output_conv = w_oconv;
1742 output_endian = ENDIAN_LITTLE;
1743 } else if (cp[0] == 'B') {
1751 output_bom_f = TRUE;
1756 #ifdef UTF8_INPUT_ENABLE
1757 case 'W': /* UTF input */
1760 input_f = UTF8_INPUT;
1762 if ('1'== cp[0] && '6'==cp[1]) {
1764 input_f = UTF16_INPUT;
1765 input_endian = ENDIAN_BIG;
1766 } else if ('3'== cp[0] && '2'==cp[1]) {
1768 input_f = UTF32_INPUT;
1769 input_endian = ENDIAN_BIG;
1771 input_f = UTF8_INPUT;
1776 input_endian = ENDIAN_LITTLE;
1777 } else if (cp[0] == 'B') {
1783 /* Input code assumption */
1784 case 'J': /* JIS input */
1785 input_f = JIS_INPUT;
1787 case 'E': /* AT&T EUC input */
1788 input_f = EUC_INPUT;
1790 case 'S': /* MS Kanji input */
1791 input_f = SJIS_INPUT;
1792 if (x0201_f==NO_X0201) x0201_f=TRUE;
1794 case 'Z': /* Convert X0208 alphabet to asii */
1796 bit:0 Convert JIS X 0208 Alphabet to ASCII
1797 bit:1 Convert Kankaku to one space
1798 bit:2 Convert Kankaku to two spaces
1799 bit:3 Convert HTML Entity
1800 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1802 while ('0'<= *cp && *cp <='9') {
1803 alpha_f |= 1 << (*cp++ - '0');
1805 if (!alpha_f) alpha_f = 1;
1807 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1808 x0201_f = FALSE; /* No X0201->X0208 conversion */
1810 ESC-(-I in JIS, EUC, MS Kanji
1811 SI/SO in JIS, EUC, MS Kanji
1812 SSO in EUC, JIS, not in MS Kanji
1813 MS Kanji (0xa0-0xdf)
1815 ESC-(-I in JIS (0x20-0x5f)
1816 SSO in EUC (0xa0-0xdf)
1817 0xa0-0xd in MS Kanji (0xa0-0xdf)
1820 case 'X': /* Assume X0201 kana */
1821 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1824 case 'F': /* prserve new lines */
1825 fold_preserve_f = TRUE;
1826 case 'f': /* folding -f60 or -f */
1829 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1831 fold_len += *cp++ - '0';
1833 if (!(0<fold_len && fold_len<BUFSIZ))
1834 fold_len = DEFAULT_FOLD;
1838 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1840 fold_margin += *cp++ - '0';
1844 case 'm': /* MIME support */
1845 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1846 if (*cp=='B'||*cp=='Q') {
1847 mime_decode_mode = *cp++;
1848 mimebuf_f = FIXED_MIME;
1849 } else if (*cp=='N') {
1850 mime_f = TRUE; cp++;
1851 } else if (*cp=='S') {
1852 mime_f = STRICT_MIME; cp++;
1853 } else if (*cp=='0') {
1854 mime_decode_f = FALSE;
1855 mime_f = FALSE; cp++;
1858 case 'M': /* MIME output */
1861 mimeout_f = FIXED_MIME; cp++;
1862 } else if (*cp=='Q') {
1864 mimeout_f = FIXED_MIME; cp++;
1869 case 'B': /* Broken JIS support */
1871 bit:1 allow any x on ESC-(-x or ESC-$-x
1872 bit:2 reset to ascii on NL
1874 if ('9'>= *cp && *cp>='0')
1875 broken_f |= 1<<(*cp++ -'0');
1880 case 'O':/* for Output file */
1884 case 'c':/* add cr code */
1887 case 'd':/* delete cr code */
1890 case 'I': /* ISO-2022-JP output */
1893 case 'L': /* line mode */
1894 if (*cp=='u') { /* unix */
1895 nlmode_f = LF; cp++;
1896 } else if (*cp=='m') { /* mac */
1897 nlmode_f = CR; cp++;
1898 } else if (*cp=='w') { /* windows */
1899 nlmode_f = CRLF; cp++;
1900 } else if (*cp=='0') { /* no conversion */
1909 } else if (*cp == '0') {
1918 /* multiple options in a string are allowed for the Perl module */
1919 while(*cp && *cp++!='-');
1922 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1923 /* bogus option but ignored */
/*
 * Search input_code_list for the entry whose iconv_func equals the given
 * converter function pointer.
 * NOTE(review): presumably yields the matching entry, or a null pointer when
 * nothing matches -- confirm against the full loop.
 */
1929 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1932 struct input_code *p = input_code_list;
1934 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - establishment flag; the value -TRUE means "FORCE".
 *   iconv_func - converter function to install.
 * Once the code is established (estab_f) and the active converter differs
 * from iconv_for_check, the matching input_code entry is looked up with
 * find_inputcode_byfunc(), its name is recorded with set_input_codename(),
 * and iconv_for_check is updated to the active converter.
 */
1943 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1945 #ifdef INPUT_CODE_FIX
1953 #ifdef INPUT_CODE_FIX
1954 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1960 if (estab_f && iconv_for_check != iconv){
1961 struct input_code *p = find_inputcode_byfunc(iconv);
1963 set_input_codename(p->name);
1966 iconv_for_check = iconv;
/*
 * Bit flags accumulated in input_code.score.  When candidate encodings are
 * compared, the entry with the lower score is preferred.
 */
1971 #define SCORE_L2 (1) /* JIS level-2 kanji */
1972 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1973 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1974 #define SCORE_CP932 (SCORE_DEPEND << 1) /* replacement reading via CP932 (IBM extended characters) */
1975 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
1976 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent character */
1977 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1978 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1980 #define SCORE_INIT (SCORE_iMIME)
/* Score lookup indexed by (c2 & 0x0f); code_score() consults it when (c2 & 0x70) == 0x20. */
1982 static const char score_table_A0[] = {
1985 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1986 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Score lookup indexed by (c2 & 0x0f); code_score() consults it when (c2 & 0x70) == 0x70. */
1989 static const char score_table_F0[] = {
1990 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1991 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1992 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1993 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given SCORE_* bit(s) into ptr->score. */
1996 void set_code_score(struct input_code *ptr, nkf_char score)
1999 ptr->score |= score;
/* Clear the given SCORE_* bit(s) from ptr->score. */
2003 void clr_code_score(struct input_code *ptr, nkf_char score)
2006 ptr->score &= ~score;
/*
 * Fold the character currently held in ptr->buf into ptr->score.
 * c2 is ptr->buf[0]; c1 (ptr->buf[1]) is read only when UTF8_OUTPUT_ENABLE
 * is defined, for the e2w_conv() check.
 * SSO scores SCORE_KANA, 0x8f scores SCORE_X0212, and a pair for which
 * e2w_conv() returns zero scores SCORE_NO_EXIST; otherwise the score comes
 * from score_table_A0, score_table_F0 or SCORE_L2 depending on (c2 & 0x70).
 */
2010 void code_score(struct input_code *ptr)
2012 nkf_char c2 = ptr->buf[0];
2013 #ifdef UTF8_OUTPUT_ENABLE
2014 nkf_char c1 = ptr->buf[1];
2017 set_code_score(ptr, SCORE_ERROR);
2018 }else if (c2 == SSO){
2019 set_code_score(ptr, SCORE_KANA);
2020 }else if (c2 == 0x8f){
2021 set_code_score(ptr, SCORE_X0212);
2022 #ifdef UTF8_OUTPUT_ENABLE
2023 }else if (!e2w_conv(c2, c1)){
2024 set_code_score(ptr, SCORE_NO_EXIST);
2026 }else if ((c2 & 0x70) == 0x20){
2027 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2028 }else if ((c2 & 0x70) == 0x70){
2029 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2030 }else if ((c2 & 0x70) >= 0x50){
2031 set_code_score(ptr, SCORE_L2);
/*
 * Disable the candidate ptr.  If ptr's converter is the one currently
 * selected (iconv), the selection is reset with set_iconv(FALSE, 0).
 */
2035 void status_disable(struct input_code *ptr)
2040 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to ptr->buf at ptr->index and advance the index. */
2043 void status_push_ch(struct input_code *ptr, nkf_char c)
2045 ptr->buf[ptr->index++] = c;
2048 void status_clear(struct input_code *ptr)
/* Reset the candidate; its score goes back to SCORE_INIT. */
2054 void status_reset(struct input_code *ptr)
2057 ptr->score = SCORE_INIT;
/* Re-initialise the candidate; its _file_stat field is zeroed. */
2060 void status_reinit(struct input_code *ptr)
2063 ptr->_file_stat = 0;
/*
 * Acts only when c is at most DEL and the input code is already
 * established (estab_f).
 * NOTE(review): presumably resets the candidate in that case -- confirm.
 */
2066 void status_check(struct input_code *ptr, nkf_char c)
2068 if (c <= DEL && estab_f){
/*
 * Feed one input byte c to the Shift_JIS detection candidate ptr.
 * Bytes 0xa1-0xdf are pushed with an SSO prefix; lead bytes in the accepted
 * ranges (0x81-0x9f, 0xe0-0xea, 0xed-0xee, the IBM extension range under
 * SHIFTJIS_CP932, and 0xf0-0xfc) are pushed into ptr->buf.  A following byte
 * in 0x40-0x7e or 0x80-0xfc is pushed and the pair is converted in place
 * with s2e_conv(); bytes outside the accepted ranges lead to
 * status_disable().
 */
2073 void s_status(struct input_code *ptr, nkf_char c)
2077 status_check(ptr, c);
2082 #ifdef NUMCHAR_OPTION
2083 }else if (is_unicode_capsule(c)){
2086 }else if (0xa1 <= c && c <= 0xdf){
2087 status_push_ch(ptr, SSO);
2088 status_push_ch(ptr, c);
2091 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2093 status_push_ch(ptr, c);
2094 }else if (0xed <= c && c <= 0xee){
2096 status_push_ch(ptr, c);
2097 #ifdef SHIFTJIS_CP932
2098 }else if (is_ibmext_in_sjis(c)){
2100 status_push_ch(ptr, c);
2101 #endif /* SHIFTJIS_CP932 */
2103 }else if (0xf0 <= c && c <= 0xfc){
2105 status_push_ch(ptr, c);
2106 #endif /* X0212_ENABLE */
2108 status_disable(ptr);
2112 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2113 status_push_ch(ptr, c);
2114 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2118 status_disable(ptr);
2122 #ifdef SHIFTJIS_CP932
2123 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2124 status_push_ch(ptr, c);
2125 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2126 set_code_score(ptr, SCORE_CP932);
2131 #endif /* SHIFTJIS_CP932 */
2132 status_disable(ptr);
2135 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2136 status_push_ch(ptr, c);
2137 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2138 set_code_score(ptr, SCORE_CP932);
2141 status_disable(ptr);
/*
 * Feed one input byte c to the EUC detection candidate ptr.
 * SSO or a byte in 0xa1-0xfe is pushed as a lead byte; under X0212_ENABLE
 * 0x8f is accepted as a lead byte as well.  Following bytes must lie in
 * 0xa1-0xfe to be pushed; bytes outside the accepted ranges lead to
 * status_disable().
 */
2147 void e_status(struct input_code *ptr, nkf_char c)
2151 status_check(ptr, c);
2156 #ifdef NUMCHAR_OPTION
2157 }else if (is_unicode_capsule(c)){
2160 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2162 status_push_ch(ptr, c);
2164 }else if (0x8f == c){
2166 status_push_ch(ptr, c);
2167 #endif /* X0212_ENABLE */
2169 status_disable(ptr);
2173 if (0xa1 <= c && c <= 0xfe){
2174 status_push_ch(ptr, c);
2178 status_disable(ptr);
2183 if (0xa1 <= c && c <= 0xfe){
2185 status_push_ch(ptr, c);
2187 status_disable(ptr);
2189 #endif /* X0212_ENABLE */
2193 #ifdef UTF8_INPUT_ENABLE
/*
 * Feed one input byte c to the UTF-8 detection candidate ptr.
 * Lead bytes 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are pushed into ptr->buf;
 * following bytes must lie in 0x80-0xbf.  Once the buffered sequence is
 * complete (ptr->index > ptr->stat) it is checked against the UTF-8 BOM
 * (EF BB BF) and converted in place with w2e_conv().  Bytes outside the
 * accepted ranges lead to status_disable().
 */
2194 void w_status(struct input_code *ptr, nkf_char c)
2198 status_check(ptr, c);
2203 #ifdef NUMCHAR_OPTION
2204 }else if (is_unicode_capsule(c)){
2207 }else if (0xc0 <= c && c <= 0xdf){
2209 status_push_ch(ptr, c);
2210 }else if (0xe0 <= c && c <= 0xef){
2212 status_push_ch(ptr, c);
2213 }else if (0xf0 <= c && c <= 0xf4){
2215 status_push_ch(ptr, c);
2217 status_disable(ptr);
2222 if (0x80 <= c && c <= 0xbf){
2223 status_push_ch(ptr, c);
2224 if (ptr->index > ptr->stat){
2225 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2226 && ptr->buf[2] == 0xbf);
2227 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2228 &ptr->buf[0], &ptr->buf[1]);
2235 status_disable(ptr);
2239 if (0x80 <= c && c <= 0xbf){
2240 if (ptr->index < ptr->stat){
2241 status_push_ch(ptr, c);
2246 status_disable(ptr);
/*
 * Feed byte c to the status_func of the entries in input_code_list.
 * When a result candidate has been picked and the input code is not yet
 * established (estab_f is false), its converter is selected with
 * set_iconv(TRUE, ...).  Otherwise, for c <= DEL, the list is walked again
 * from its head.
 */
2253 void code_status(nkf_char c)
2255 int action_flag = 1;
2256 struct input_code *result = 0;
2257 struct input_code *p = input_code_list;
2259 if (!p->status_func) {
2263 if (!p->status_func)
2265 (p->status_func)(p, c);
2268 }else if(p->stat == 0){
2279 if (result && !estab_f){
2280 set_iconv(TRUE, result->iconv_func);
2281 }else if (c <= DEL){
2282 struct input_code *ptr = input_code_list;
/*
 * Read one character.  Characters previously pushed back into std_gc_buf
 * are returned first, most recently pushed first.
 * NOTE(review): presumably reads from f when the push-back buffer is
 * empty -- confirm.
 */
2292 nkf_char std_getc(FILE *f)
2295 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back onto std_gc_buf.  The buffer-full condition
 * (std_gc_ndx == STD_GC_BUFSIZE) is tested before the store.
 */
2301 nkf_char std_ungetc(nkf_char c, FILE *f)
2303 if (std_gc_ndx == STD_GC_BUFSIZE){
2306 std_gc_buf[std_gc_ndx++] = c;
2311 void std_putc(nkf_char c)
2318 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Calls module_connection() and then reads from f through i_getc until EOF.
 * NOTE(review): each character is presumably passed to the output without
 * conversion -- confirm.
 */
2319 nkf_char noconvert(FILE *f)
2324 module_connection();
2325 while ((c = (*i_getc)(f)) != EOF)
/*
 * Wire up the conversion pipeline from the current option flags.
 * Output side: starting from output_conv, each enabled filter saves the
 * current oconv in its own o_* pointer and installs itself as oconv.
 * Input side: each enabled reader saves i_getc / i_ungetc in its own
 * pointers and installs itself in their place.
 * Finally the input converter is chosen from input_f: forced with
 * set_iconv(-TRUE, ...) when input_f names an encoding, otherwise
 * set_iconv(FALSE, e_iconv).
 */
2332 void module_connection(void)
2334 oconv = output_conv;
2337 /* replace continuation module, from output side */
2339 /* output redirection */
2341 if (noout_f || guess_f){
2348 if (mimeout_f == TRUE) {
2349 o_base64conv = oconv; oconv = base64_conv;
2351 /* base64_count = 0; */
2354 if (nlmode_f || guess_f) {
2355 o_nlconv = oconv; oconv = nl_conv;
2358 o_rot_conv = oconv; oconv = rot_conv;
2361 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2364 o_hira_conv = oconv; oconv = hira_conv;
2367 o_fconv = oconv; oconv = fold_conv;
2370 if (alpha_f || x0201_f) {
2371 o_zconv = oconv; oconv = z_conv;
2375 i_ungetc = std_ungetc;
2376 /* input redirection */
2379 i_cgetc = i_getc; i_getc = cap_getc;
2380 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2383 i_ugetc = i_getc; i_getc = url_getc;
2384 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2387 #ifdef NUMCHAR_OPTION
2389 i_ngetc = i_getc; i_getc = numchar_getc;
2390 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2393 #ifdef UNICODE_NORMALIZATION
2394 if (nfc_f && input_f == UTF8_INPUT){
2395 i_nfc_getc = i_getc; i_getc = nfc_getc;
2396 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2399 if (mime_f && mimebuf_f==FIXED_MIME) {
2400 i_mgetc = i_getc; i_getc = mime_getc;
2401 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2404 i_bgetc = i_getc; i_getc = broken_getc;
2405 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2407 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2408 set_iconv(-TRUE, e_iconv);
2409 } else if (input_f == SJIS_INPUT) {
2410 set_iconv(-TRUE, s_iconv);
2411 #ifdef UTF8_INPUT_ENABLE
2412 } else if (input_f == UTF8_INPUT) {
2413 set_iconv(-TRUE, w_iconv);
2414 } else if (input_f == UTF16_INPUT) {
2415 set_iconv(-TRUE, w_iconv16);
2416 } else if (input_f == UTF32_INPUT) {
2417 set_iconv(-TRUE, w_iconv32);
2420 set_iconv(FALSE, e_iconv);
2424 struct input_code *p = input_code_list;
2432 * Check and Ignore BOM
/*
 * Inspect the first bytes of the stream for a Unicode byte-order mark.
 * Signatures handled, with the converter and input_endian they imply:
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv (UTF-8)
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * Bytes are read with i_getc; on the non-matching paths they are pushed
 * back with i_ungetc in reverse order.
 */
2434 void check_bom(FILE *f)
2437 switch(c2 = (*i_getc)(f)){
2439 if((c2 = (*i_getc)(f)) == 0x00){
2440 if((c2 = (*i_getc)(f)) == 0xFE){
2441 if((c2 = (*i_getc)(f)) == 0xFF){
2443 set_iconv(TRUE, w_iconv32);
2445 if (iconv == w_iconv32) {
2446 input_endian = ENDIAN_BIG;
2449 (*i_ungetc)(0xFF,f);
2450 }else (*i_ungetc)(c2,f);
2451 (*i_ungetc)(0xFE,f);
2452 }else if(c2 == 0xFF){
2453 if((c2 = (*i_getc)(f)) == 0xFE){
2455 set_iconv(TRUE, w_iconv32);
2457 if (iconv == w_iconv32) {
2458 input_endian = ENDIAN_2143;
2461 (*i_ungetc)(0xFF,f);
2462 }else (*i_ungetc)(c2,f);
2463 (*i_ungetc)(0xFF,f);
2464 }else (*i_ungetc)(c2,f);
2465 (*i_ungetc)(0x00,f);
2466 }else (*i_ungetc)(c2,f);
2467 (*i_ungetc)(0x00,f);
2470 if((c2 = (*i_getc)(f)) == 0xBB){
2471 if((c2 = (*i_getc)(f)) == 0xBF){
2473 set_iconv(TRUE, w_iconv);
2475 if (iconv == w_iconv) {
2478 (*i_ungetc)(0xBF,f);
2479 }else (*i_ungetc)(c2,f);
2480 (*i_ungetc)(0xBB,f);
2481 }else (*i_ungetc)(c2,f);
2482 (*i_ungetc)(0xEF,f);
2485 if((c2 = (*i_getc)(f)) == 0xFF){
2486 if((c2 = (*i_getc)(f)) == 0x00){
2487 if((c2 = (*i_getc)(f)) == 0x00){
2489 set_iconv(TRUE, w_iconv32);
2491 if (iconv == w_iconv32) {
2492 input_endian = ENDIAN_3412;
2495 (*i_ungetc)(0x00,f);
2496 }else (*i_ungetc)(c2,f);
2497 (*i_ungetc)(0x00,f);
2498 }else (*i_ungetc)(c2,f);
2500 set_iconv(TRUE, w_iconv16);
2502 if (iconv == w_iconv16) {
2503 input_endian = ENDIAN_BIG;
2506 (*i_ungetc)(0xFF,f);
2507 }else (*i_ungetc)(c2,f);
2508 (*i_ungetc)(0xFE,f);
2511 if((c2 = (*i_getc)(f)) == 0xFE){
2512 if((c2 = (*i_getc)(f)) == 0x00){
2513 if((c2 = (*i_getc)(f)) == 0x00){
2515 set_iconv(TRUE, w_iconv32);
2517 if (iconv == w_iconv32) {
2518 input_endian = ENDIAN_LITTLE;
2521 (*i_ungetc)(0x00,f);
2522 }else (*i_ungetc)(c2,f);
2523 (*i_ungetc)(0x00,f);
2524 }else (*i_ungetc)(c2,f);
2526 set_iconv(TRUE, w_iconv16);
2528 if (iconv == w_iconv16) {
2529 input_endian = ENDIAN_LITTLE;
2532 (*i_ungetc)(0xFE,f);
2533 }else (*i_ungetc)(c2,f);
2534 (*i_ungetc)(0xFF,f);
2543 Conversion main loop. Code detection only.
/*
 * Main conversion loop for one input stream f.
 * After module_connection() the loop reads through i_getc until EOF,
 * tracking 8-bit input, JIS escape sequences (ESC $ ..., ESC ( ...),
 * SI/SO shifts and MIME introducers ("=?"), reading the extra bytes that
 * UTF-16 / UTF-32 input needs, and handing completed characters to
 * iconv / oconv.  While the code is not yet established, 8-bit input is
 * handed to h_conv().  At the end iconv is called with EOF and, if no input
 * code name has been recorded, the input_code_list entry with the lowest
 * score is reported via set_input_codename().
 * The NEXT / SEND / LAST macros are shorthand for continue / fall through /
 * break inside the loop.
 */
2546 nkf_char kanji_convert(FILE *f)
2548 nkf_char c3, c2=0, c1, c0=0;
2549 int is_8bit = FALSE;
2551 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2552 #ifdef UTF8_INPUT_ENABLE
2553 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2560 output_mode = ASCII;
2563 #define NEXT continue /* no output, get next */
2564 #define SEND ; /* output c1 and c2, get next */
2565 #define LAST break /* end of loop, go closing */
2567 module_connection();
2570 while ((c1 = (*i_getc)(f)) != EOF) {
2571 #ifdef INPUT_CODE_FIX
2577 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2578 /* in case of 8th bit is on */
2579 if (!estab_f&&!mime_decode_mode) {
2580 /* in case of not established yet */
2581 /* It is still ambiguous */
2582 if (h_conv(f, c2, c1)==EOF)
2588 /* in case of already established */
2590 /* ignore bogus code and not CP5022x UCD */
2598 /* second byte, 7 bit code */
2599 /* it might be kanji shifted */
2600 if ((c1 == DEL) || (c1 <= SP)) {
2601 /* ignore bogus first code */
2608 #ifdef UTF8_INPUT_ENABLE
2609 if (iconv == w_iconv16) {
2610 if (input_endian == ENDIAN_BIG) {
2612 if ((c1 = (*i_getc)(f)) != EOF) {
2613 if (0xD8 <= c2 && c2 <= 0xDB) {
2614 if ((c0 = (*i_getc)(f)) != EOF) {
2616 if ((c3 = (*i_getc)(f)) != EOF) {
2623 if ((c2 = (*i_getc)(f)) != EOF) {
2624 if (0xD8 <= c2 && c2 <= 0xDB) {
2625 if ((c3 = (*i_getc)(f)) != EOF) {
2626 if ((c0 = (*i_getc)(f)) != EOF) {
2635 } else if(iconv == w_iconv32){
2637 if((c2 = (*i_getc)(f)) != EOF &&
2638 (c1 = (*i_getc)(f)) != EOF &&
2639 (c0 = (*i_getc)(f)) != EOF){
2640 switch(input_endian){
2642 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2645 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2648 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2651 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2661 #ifdef NUMCHAR_OPTION
2662 if (is_unicode_capsule(c1)){
2666 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2668 if (!estab_f && !iso8859_f) {
2669 /* not established yet */
2672 } else { /* estab_f==TRUE */
2677 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2678 /* SJIS X0201 Case... */
2679 if(iso2022jp_f && x0201_f==NO_X0201) {
2680 (*oconv)(GETA1, GETA2);
2687 } else if (c1==SSO && iconv != s_iconv) {
2688 /* EUC X0201 Case */
2689 c1 = (*i_getc)(f); /* skip SSO */
2691 if (SSP<=c1 && c1<0xe0) {
2692 if(iso2022jp_f && x0201_f==NO_X0201) {
2693 (*oconv)(GETA1, GETA2);
2700 } else { /* bogus code, skip SSO and one byte */
2703 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2704 (c1 == 0xFD || c1 == 0xFE)) {
2710 /* already established */
2715 } else if ((c1 > SP) && (c1 != DEL)) {
2716 /* in case of Roman characters */
2718 /* output 1 shifted byte */
2722 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2723 /* output 1 shifted byte */
2724 if(iso2022jp_f && x0201_f==NO_X0201) {
2725 (*oconv)(GETA1, GETA2);
2732 /* look like bogus code */
2735 } else if (input_mode == X0208 || input_mode == X0212 ||
2736 input_mode == X0213_1 || input_mode == X0213_2) {
2737 /* in case of Kanji shifted */
2740 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2741 /* Check MIME code */
2742 if ((c1 = (*i_getc)(f)) == EOF) {
2745 } else if (c1 == '?') {
2746 /* =? is mime conversion start sequence */
2747 if(mime_f == STRICT_MIME) {
2748 /* check in real detail */
2749 if (mime_begin_strict(f) == EOF)
2753 } else if (mime_begin(f) == EOF)
2763 /* normal ASCII code */
2766 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2769 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2772 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2773 if ((c1 = (*i_getc)(f)) == EOF) {
2774 /* (*oconv)(0, ESC); don't send bogus code */
2776 } else if (c1 == '$') {
2777 if ((c1 = (*i_getc)(f)) == EOF) {
2779 (*oconv)(0, ESC); don't send bogus code
2780 (*oconv)(0, '$'); */
2782 } else if (c1 == '@'|| c1 == 'B') {
2783 /* This is kanji introduction */
2786 set_input_codename("ISO-2022-JP");
2788 debug("ISO-2022-JP");
2791 } else if (c1 == '(') {
2792 if ((c1 = (*i_getc)(f)) == EOF) {
2793 /* don't send bogus code
2799 } else if (c1 == '@'|| c1 == 'B') {
2800 /* This is kanji introduction */
2805 } else if (c1 == 'D'){
2809 #endif /* X0212_ENABLE */
2810 } else if (c1 == (X0213_1&0x7F)){
2811 input_mode = X0213_1;
2814 } else if (c1 == (X0213_2&0x7F)){
2815 input_mode = X0213_2;
2819 /* could be some special code */
2826 } else if (broken_f&0x2) {
2827 /* accept any ESC-(-x as broken code ... */
2837 } else if (c1 == '(') {
2838 if ((c1 = (*i_getc)(f)) == EOF) {
2839 /* don't send bogus code
2841 (*oconv)(0, '('); */
2845 /* This is X0201 kana introduction */
2846 input_mode = X0201; shift_mode = X0201;
2848 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2849 /* This is X0208 kanji introduction */
2850 input_mode = ASCII; shift_mode = FALSE;
2852 } else if (broken_f&0x2) {
2853 input_mode = ASCII; shift_mode = FALSE;
2858 /* maintain various input_mode here */
2862 } else if ( c1 == 'N' || c1 == 'n'){
2864 c3 = (*i_getc)(f); /* skip SS2 */
2865 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2880 } else if (c1 == ESC && iconv == s_iconv) {
2881 /* ESC in Shift_JIS */
2882 if ((c1 = (*i_getc)(f)) == EOF) {
2883 /* (*oconv)(0, ESC); don't send bogus code */
2885 } else if (c1 == '$') {
2887 if ((c1 = (*i_getc)(f)) == EOF) {
2889 (*oconv)(0, ESC); don't send bogus code
2890 (*oconv)(0, '$'); */
2893 if (('E' <= c1 && c1 <= 'G') ||
2894 ('O' <= c1 && c1 <= 'Q')) {
2902 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2903 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2904 while ((c1 = (*i_getc)(f)) != EOF) {
2905 if (SP <= c1 && c1 <= 'z') {
2906 (*oconv)(0, c1 + c0);
2907 } else break; /* c1 == SO */
2911 if (c1 == EOF) LAST;
2918 } else if (c1 == LF || c1 == CR) {
2920 input_mode = ASCII; set_iconv(FALSE, 0);
2922 } else if (mime_decode_f && !mime_decode_mode){
2924 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2932 } else { /* if (c1 == CR)*/
2933 if ((c1=(*i_getc)(f))!=EOF) {
2937 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2951 } else if (c1 == DEL && input_mode == X0208) {
2961 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2964 if ((c0 = (*i_getc)(f)) != EOF) {
2967 if ((c3 = (*i_getc)(f)) != EOF) {
2969 (*iconv)(c2, c1, c0|c3);
2974 /* 3 bytes EUC or UTF-8 */
2975 if ((c0 = (*i_getc)(f)) != EOF) {
2977 (*iconv)(c2, c1, c0);
2985 0x7F <= c2 && c2 <= 0x92 &&
2986 0x21 <= c1 && c1 <= 0x7E) {
2988 if(c1 == 0x7F) return 0;
2989 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2992 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2996 (*oconv)(PREFIX_EUCG3 | c2, c1);
2998 #endif /* X0212_ENABLE */
3000 (*oconv)(PREFIX_EUCG3 | c2, c1);
3003 (*oconv)(input_mode, c1); /* other special case */
3009 /* goto next_word */
3013 (*iconv)(EOF, 0, 0);
3014 if (!input_codename)
3017 struct input_code *p = input_code_list;
3018 struct input_code *result = p;
3020 if (p->score < result->score) result = p;
3023 set_input_codename(result->name);
3025 debug(result->name);
/*
 * Resolve an ambiguous 8-bit sequence using the hold buffer.
 * Further bytes are read via i_getc and stored with push_hold_buf() until
 * EOF, the buffer is full, or the code becomes established (estab_f).
 * A candidate from input_code_list with a status_func and the lowest score
 * is then selected with set_iconv(TRUE, ...), and the held bytes are
 * replayed through iconv, pulling extra bytes from the buffer or from f
 * for three- and four-byte sequences.
 */
3033 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3035 nkf_char ret, c3, c0;
3039 /** it must NOT be in the kanji shifted sequence */
3040 /** it must NOT be written in JIS7 */
3041 /** and it must be after 2 byte 8bit code */
3047 while ((c1 = (*i_getc)(f)) != EOF) {
3053 if (push_hold_buf(c1) == EOF || estab_f){
3059 struct input_code *p = input_code_list;
3060 struct input_code *result = p;
3065 if (p->status_func && p->score < result->score){
3070 set_iconv(TRUE, result->iconv_func);
3075 ** 1) EOF is detected, or
3076 ** 2) Code is established, or
3077 ** 3) Buffer is FULL (but last word is pushed)
3079 ** in 1) and 3) cases, we continue to use
3080 ** Kanji codes by oconv and leave estab_f unchanged.
3085 while (hold_index < hold_count){
3086 c2 = hold_buf[hold_index++];
3088 #ifdef NUMCHAR_OPTION
3089 || is_unicode_capsule(c2)
3094 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3095 (*iconv)(X0201, c2, 0);
3098 if (hold_index < hold_count){
3099 c1 = hold_buf[hold_index++];
3109 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3112 if (hold_index < hold_count){
3113 c0 = hold_buf[hold_index++];
3114 } else if ((c0 = (*i_getc)(f)) == EOF) {
3120 if (hold_index < hold_count){
3121 c3 = hold_buf[hold_index++];
3122 } else if ((c3 = (*i_getc)(f)) == EOF) {
3127 (*iconv)(c2, c1, c0|c3);
3132 /* 3 bytes EUC or UTF-8 */
3133 if (hold_index < hold_count){
3134 c0 = hold_buf[hold_index++];
3135 } else if ((c0 = (*i_getc)(f)) == EOF) {
3141 (*iconv)(c2, c1, c0);
3144 if (c0 == EOF) break;
/*
 * Append one byte (c2 truncated to unsigned char) to hold_buf.
 * The capacity is HOLD_SIZE*2 entries; a full buffer is checked before the
 * store.  Returns EOF when the buffer has become full after the store,
 * otherwise the new hold_count.
 */
3149 nkf_char push_hold_buf(nkf_char c2)
3151 if (hold_count >= HOLD_SIZE*2)
3153 hold_buf[hold_count++] = (unsigned char)c2;
3154 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2 lead, c1 trail) toward its EUC form.
 * Handles, in order: CP932 IBM-extension lookups (shiftjis_cp932) and the
 * inverse table (cp932inv) under SHIFTJIS_CP932; the shiftjis_x0212 mapping,
 * whose result is tagged with PREFIX_EUCG3; lead bytes >= 0xF0 when x0213_f
 * is set; and otherwise the arithmetic mapping based on SJ0162 / SJ6394.
 * NOTE(review): the result is presumably stored through p2 / p1, with a
 * return of 0 meaning success -- confirm.
 */
3157 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3159 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3162 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3163 #ifdef SHIFTJIS_CP932
3164 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3165 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3172 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3173 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3179 #endif /* SHIFTJIS_CP932 */
3181 if (!x0213_f && is_ibmext_in_sjis(c2)){
3182 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3185 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3198 if(x0213_f && c2 >= 0xF0){
3199 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3200 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3201 }else{ /* 78<=k<=94 */
3202 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3203 if (0x9E < c1) c2++;
3206 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3207 if (0x9E < c1) c2++;
3210 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3217 c2 = x0212_unshift(c2);
3224 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3228 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3230 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3232 if(c1 == 0x7F) return 0;
3233 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3236 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3237 if (ret) return ret;
3243 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3248 }else if (c2 == 0x8f){
3252 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3253 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3254 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3257 c2 = (c2 << 8) | (c1 & 0x7f);
3259 #ifdef SHIFTJIS_CP932
3262 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3263 s2e_conv(s2, s1, &c2, &c1);
3270 #endif /* SHIFTJIS_CP932 */
3272 #endif /* X0212_ENABLE */
3273 } else if (c2 == SSO){
3276 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3279 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3280 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3281 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3286 #ifdef SHIFTJIS_CP932
3287 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3289 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3290 s2e_conv(s2, s1, &c2, &c1);
3297 #endif /* SHIFTJIS_CP932 */
3304 #ifdef UTF8_INPUT_ENABLE
3305 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3312 }else if (0xc0 <= c2 && c2 <= 0xef) {
3313 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3314 #ifdef NUMCHAR_OPTION
3317 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3325 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3328 static const char w_iconv_utf8_1st_byte[] =
3330 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3331 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3332 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3333 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3335 if (c2 < 0 || 0xff < c2) {
3336 }else if (c2 == 0) { /* 0 : 1 byte*/
3338 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3341 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3343 if (c1 < 0x80 || 0xBF < c1) return 0;
3346 if (c0 == 0) return -1;
3347 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3352 if (c0 == 0) return -1;
3353 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3357 if (c0 == 0) return -1;
3358 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3362 if (c0 == 0) return -2;
3363 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3367 if (c0 == 0) return -2;
3368 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3372 if (c0 == 0) return -2;
3373 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3381 if (c2 == 0 || c2 == EOF){
3382 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3383 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3386 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3395 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: split the code point val into UTF-8 bytes.
 *   *p2 receives the lead byte, *p1 the second byte, *p0 the rest.
 *   In the 4-byte branch *p0 carries two trail bytes packed as
 *   (3rd byte << 8) | 4th byte.
 * (The handling of val < 0x80 and of out-of-range values is not visible
 *  in this excerpt.)
 */
3396 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
/* 2-byte form: 110xxxxx 10xxxxxx */
3403 }else if (val < 0x800){
3404 *p2 = 0xc0 | (val >> 6);
3405 *p1 = 0x80 | (val & 0x3f);
/* 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx */
3407 } else if (val <= NKF_INT32_C(0xFFFF)) {
3408 *p2 = 0xe0 | (val >> 12);
3409 *p1 = 0x80 | ((val >> 6) & 0x3f);
3410 *p0 = 0x80 | (val & 0x3f);
/* 4-byte form (code points above the BMP) */
3411 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/*
 * NOTE(review): w_oconv emits the lead byte for this same range as
 * 0xF0 | (val >> 18), and ww16_conv decodes a lead byte >= 0xf0 with
 * (c2 & 0x0f) << 18.  The expression below (0xe0 | (val >> 16)) agrees
 * with neither; e.g. val == 0x10000 yields 0xe1 here.  Confirm whether
 * this should be 0xf0 | (val >> 18).
 */
3412 *p2 = 0xe0 | (val >> 16);
3413 *p1 = 0x80 | ((val >> 12) & 0x3f);
/* (val << 2) & 0x3f00 places bits 6..11 of val in the high byte of *p0 */
3414 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3423 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: rebuild a code point from UTF-8 bytes.
 *   c2  lead byte
 *   c1  second byte
 *   c0  remaining byte(s); for a 4-byte sequence the 3rd byte sits in
 *       bits 8..15 (see the (c0 & 0x3f00) >> 2 term below).
 * Only the contributions visible in this excerpt are annotated; the
 * statements that merge the final trail byte and return the result are
 * not visible here.
 */
3424 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
/* 4-byte sequence */
3429 } else if (c2 >= 0xf0){
3430 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3431 val = (c2 & 0x0f) << 18;
3432 val |= (c1 & 0x3f) << 12;
3433 val |= (c0 & 0x3f00) >> 2;
/* 3-byte sequence */
3435 }else if (c2 >= 0xe0){
3436 val = (c2 & 0x0f) << 12;
3437 val |= (c1 & 0x3f) << 6;
/* 2-byte sequence */
3439 }else if (c2 >= 0xc0){
3440 val = (c2 & 0x1f) << 6;
3448 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3450 nkf_char c2, c1, c0;
3457 w16w_conv(val, &c2, &c1, &c0);
3458 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3459 #ifdef NUMCHAR_OPTION
3462 *p1 = CLASS_UNICODE | val;
3471 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input converter for UTF-16.
 *   c2, c1  high and low byte of one UTF-16 code unit
 *   c0      the following code unit, used when c2/c1 is a high surrogate
 * Returns the non-zero status of w16e_conv when that conversion fails;
 * other return paths are not visible in this excerpt.
 */
3472 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
/* ASCII-range unit or EOF */
3475 if ((c2==0 && c1 < 0x80) || c2==EOF) {
/* High surrogate (0xD800..0xDBFF): c0 must be a low surrogate */
3478 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3479 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* (c2 << 18) + (c1 << 10) is the high surrogate shifted left by 10;
   0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000 */
3481 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
/* c2 >> 3 == 27 matches 0xD8..0xDF; 0xD8..0xDB was handled above */
3483 }else if ((c2>>3) == 27) { /* unpaired surrogate */
/* Any other BMP unit goes through w16e_conv */
3488 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3489 if (ret) return ret;
/*
 * w_iconv32: input converter for UTF-32.
 *   c1 carries the code point (it is passed whole to w16e_conv and to
 *   is_unicode_bmp below); c0 is not used in the visible lines.
 * Returns the non-zero status of w16e_conv when that conversion fails;
 * other return paths are not visible in this excerpt.
 */
3494 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/* ASCII-range value or EOF */
3498 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
/* BMP code point: convert through the Unicode -> JIS tables */
3499 } else if (is_unicode_bmp(c1)) {
3500 ret = w16e_conv(c1, &c2, &c1);
/* Otherwise the raw code point is tagged with CLASS_UNICODE */
3503 c1 = CLASS_UNICODE | c1;
3505 if (ret) return ret;
3510 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3512 const unsigned short *const *pp;
3513 const unsigned short *const *const *ppp;
3514 static const char no_best_fit_chars_table_C2[] =
3515 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3517 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3518 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3519 static const char no_best_fit_chars_table_C2_ms[] =
3520 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3523 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3524 static const char no_best_fit_chars_table_932_C2[] =
3525 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3526 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3527 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3528 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3529 static const char no_best_fit_chars_table_932_C3[] =
3530 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3531 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3532 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3533 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3539 }else if(c2 < 0xe0){
3540 if(no_best_fit_chars_f){
3541 if(ms_ucs_map_f == UCS_MAP_CP932){
3544 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3547 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3550 }else if(!cp932inv_f){
3553 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3556 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3559 }else if(ms_ucs_map_f == UCS_MAP_MS){
3560 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3561 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3579 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3580 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3581 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3583 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3584 }else if(c0 < 0xF0){
3585 if(no_best_fit_chars_f){
3586 if(ms_ucs_map_f == UCS_MAP_CP932){
3587 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3588 }else if(ms_ucs_map_f == UCS_MAP_MS){
3593 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3596 if(c0 == 0x92) return 1;
3601 if(c1 == 0x80 || c0 == 0x9C) return 1;
3604 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3609 if(c0 == 0x94) return 1;
3612 if(c0 == 0xBB) return 1;
3622 if(c0 == 0x95) return 1;
3625 if(c0 == 0xA5) return 1;
3632 if(c0 == 0x8D) return 1;
3635 if(c0 == 0x9E && !cp932inv_f) return 1;
3638 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3646 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3647 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3648 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3650 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3652 #ifdef SHIFTJIS_CP932
3653 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3655 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3656 s2e_conv(s2, s1, p2, p1);
3665 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3668 const unsigned short *p;
3671 if (pp == 0) return 1;
3674 if (c1 < 0 || psize <= c1) return 1;
3676 if (p == 0) return 1;
3679 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3681 if (val == 0) return 1;
3682 if (no_cp932ext_f && (
3683 (val>>8) == 0x2D || /* NEC special characters */
3684 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3692 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: feed hexadecimal digits of c to the callback f.
 * Each call passes 0 as c2 and bin2hex(c >> shift) as c1.
 * (The code that steps shift is not visible in this excerpt.)
 */
3699 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3706 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: write the decimal digits of c through oconv,
 * most significant digit first.  0x30 is ASCII '0', so each call emits
 * one digit character with c2 == 0.
 * The two highest digits are emitted only when c is large enough to need
 * them; the guards for the lower digits and any surrounding literal text
 * are not visible in this excerpt.
 */
3716 void encode_fallback_html(nkf_char c)
3721 if(c >= NKF_INT32_C(1000000))
3722 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3723 if(c >= NKF_INT32_C(100000))
3724 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3726 (*oconv)(0, 0x30+(c/10000 )%10);
3728 (*oconv)(0, 0x30+(c/1000 )%10);
3730 (*oconv)(0, 0x30+(c/100 )%10);
3732 (*oconv)(0, 0x30+(c/10 )%10);
3734 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: emit c as hexadecimal digits by handing oconv to
 * nkf_each_char_to_hex.  Any literal prefix/suffix written around the
 * digits is not visible in this excerpt.
 */
3739 void encode_fallback_xml(nkf_char c)
3744 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: write c as hexadecimal digits through oconv,
 * most significant nibble first.  Four digits (shifts 12, 8, 4, 0) are
 * always written; code points outside the BMP get two more leading
 * digits (shifts 20 and 16).  Any literal escape prefix is not visible
 * in this excerpt.
 */
3749 void encode_fallback_java(nkf_char c)
/* Non-BMP: two extra leading hex digits */
3753 if(!is_unicode_bmp(c)){
3757 (*oconv)(0, bin2hex(c>>20));
3758 (*oconv)(0, bin2hex(c>>16));
/* Low 16 bits as four hex digits */
3762 (*oconv)(0, bin2hex(c>>12));
3763 (*oconv)(0, bin2hex(c>> 8));
3764 (*oconv)(0, bin2hex(c>> 4));
3765 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl: emit c as hexadecimal digits by handing oconv to
 * nkf_each_char_to_hex.  Any literal prefix/suffix written around the
 * digits is not visible in this excerpt.
 */
3769 void encode_fallback_perl(nkf_char c)
3774 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: output the substitution character instead of c.
 * The incoming c is discarded and replaced by unicode_subchar, whose
 * bits 8..15 and bits 0..7 are passed to oconv as c2 and c1.
 */
3779 void encode_fallback_subchar(nkf_char c)
3781 c = unicode_subchar;
3782 (*oconv)((c>>8)&0xFF, c&0xFF);
3787 #ifdef UTF8_OUTPUT_ENABLE
3788 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3790 const unsigned short *p;
3793 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3801 p = euc_to_utf8_1byte;
3803 } else if (is_eucg3(c2)){
3804 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3807 c2 = (c2&0x7f) - 0x21;
3808 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3809 p = x0212_to_utf8_2bytes[c2];
3815 c2 = (c2&0x7f) - 0x21;
3816 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3818 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3819 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3820 euc_to_utf8_2bytes_ms[c2];
3825 c1 = (c1 & 0x7f) - 0x21;
3826 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output converter producing UTF-8 through o_putc.
 *   c2, c1  character in nkf's internal two-value form; c2 == 0 with a
 *           "unicode capsule" c1 carries a raw code point.
 * Several branches (EOF handling, BOM output, ASCII output) are only
 * partly visible in this excerpt.
 */
3831 void w_oconv(nkf_char c2, nkf_char c1)
3837 output_bom_f = FALSE;
3848 #ifdef NUMCHAR_OPTION
/* Raw code point: strip the class tag and encode it directly */
3849 if (c2 == 0 && is_unicode_capsule(c1)){
3850 val = c1 & VALUE_MASK;
/* 2-byte UTF-8 */
3853 }else if (val < 0x800){
3854 (*o_putc)(0xC0 | (val >> 6));
3855 (*o_putc)(0x80 | (val & 0x3f));
/* 3-byte UTF-8 */
3856 } else if (val <= NKF_INT32_C(0xFFFF)) {
3857 (*o_putc)(0xE0 | (val >> 12));
3858 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3859 (*o_putc)(0x80 | (val & 0x3f));
/* 4-byte UTF-8 */
3860 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3861 (*o_putc)(0xF0 | ( val>>18));
3862 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3863 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3864 (*o_putc)(0x80 | ( val & 0x3f));
3871 output_mode = ASCII;
3873 } else if (c2 == ISO8859_1) {
/* NOTE(review): this writes a single byte with the high bit set into
   the UTF-8 stream -- confirm that is intended. */
3875 (*o_putc)(c1 | 0x080);
/* Table-mapped character: EUC pair -> code point -> UTF-8 bytes */
3878 val = e2w_conv(c2, c1);
3880 w16w_conv(val, &c2, &c1, &c0);
3884 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output converter producing UTF-16 through o_putc, in the
 * byte order selected by output_endian.
 *   c2, c1  character in nkf's internal two-value form; c2 == 0 with a
 *           "unicode capsule" c1 carries a raw code point.
 * The BOM output and the final two-byte write are only partly visible in
 * this excerpt.
 */
3890 void w_oconv16(nkf_char c2, nkf_char c1)
3893 output_bom_f = FALSE;
/* '\377' is the 0xFF byte of the byte-order mark */
3894 if (output_endian == ENDIAN_LITTLE){
3895 (*o_putc)((unsigned char)'\377');
3899 (*o_putc)((unsigned char)'\377');
3908 if (c2 == ISO8859_1) {
3911 #ifdef NUMCHAR_OPTION
3912 } else if (c2 == 0 && is_unicode_capsule(c1)) {
/* BMP code point: c2 becomes the high byte of the code unit */
3913 if (is_unicode_bmp(c1)) {
3914 c2 = (c1 >> 8) & 0xff;
/* Above the BMP: split into a surrogate pair.
   0xD7C0 == 0xD800 - (0x10000 >> 10), so the 0x10000 offset is folded
   into the constant.
   NOTE(review): assumes the class tag has been masked off c1 on a line
   not visible here -- confirm. */
3918 if (c1 <= UNICODE_MAX) {
3919 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3920 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
/* Little-endian: low byte first for each surrogate */
3921 if (output_endian == ENDIAN_LITTLE){
3922 (*o_putc)(c2 & 0xff);
3923 (*o_putc)((c2 >> 8) & 0xff);
3924 (*o_putc)(c1 & 0xff);
3925 (*o_putc)((c1 >> 8) & 0xff);
/* Big-endian: high byte first for each surrogate */
3927 (*o_putc)((c2 >> 8) & 0xff);
3928 (*o_putc)(c2 & 0xff);
3929 (*o_putc)((c1 >> 8) & 0xff);
3930 (*o_putc)(c1 & 0xff);
/* Table-mapped character: EUC pair -> code point */
3937 nkf_char val = e2w_conv(c2, c1);
3938 c2 = (val >> 8) & 0xff;
3942 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter producing UTF-32 through o_putc, in the
 * byte order selected by output_endian.
 *   c2, c1  character in nkf's internal two-value form.
 * The BOM output and the most significant output byte are only partly
 * visible in this excerpt.
 */
3951 void w_oconv32(nkf_char c2, nkf_char c1)
3954 output_bom_f = FALSE;
/* '\377' is the 0xFF byte of the byte-order mark */
3955 if (output_endian == ENDIAN_LITTLE){
3956 (*o_putc)((unsigned char)'\377');
3964 (*o_putc)((unsigned char)'\377');
3973 if (c2 == ISO8859_1) {
3975 #ifdef NUMCHAR_OPTION
3976 } else if (c2 == 0 && is_unicode_capsule(c1)) {
/* Table-mapped character: EUC pair -> code point, kept in c1 */
3980 c1 = e2w_conv(c2, c1);
/* Little-endian: least significant byte first */
3983 if (output_endian == ENDIAN_LITTLE){
3984 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3985 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3986 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
/* Big-endian: the same three bytes in the opposite order */
3990 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3991 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3992 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3997 void e_oconv(nkf_char c2, nkf_char c1)
3999 #ifdef NUMCHAR_OPTION
4000 if (c2 == 0 && is_unicode_capsule(c1)){
4001 w16e_conv(c1, &c2, &c1);
4002 if (c2 == 0 && is_unicode_capsule(c1)){
4003 c2 = c1 & VALUE_MASK;
4004 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4008 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4009 c1 = 0x21 + c1 % 94;
4012 (*o_putc)((c2 & 0x7f) | 0x080);
4013 (*o_putc)(c1 | 0x080);
4015 (*o_putc)((c2 & 0x7f) | 0x080);
4016 (*o_putc)(c1 | 0x080);
4020 if (encode_fallback) (*encode_fallback)(c1);
4029 } else if (c2 == 0) {
4030 output_mode = ASCII;
4032 } else if (c2 == X0201) {
4033 output_mode = JAPANESE_EUC;
4034 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4035 } else if (c2 == ISO8859_1) {
4036 output_mode = ISO8859_1;
4037 (*o_putc)(c1 | 0x080);
4039 } else if (is_eucg3(c2)){
4040 output_mode = JAPANESE_EUC;
4041 #ifdef SHIFTJIS_CP932
4044 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4045 s2e_conv(s2, s1, &c2, &c1);
4050 output_mode = ASCII;
4052 }else if (is_eucg3(c2)){
4055 (*o_putc)((c2 & 0x7f) | 0x080);
4056 (*o_putc)(c1 | 0x080);
4059 (*o_putc)((c2 & 0x7f) | 0x080);
4060 (*o_putc)(c1 | 0x080);
4064 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4065 set_iconv(FALSE, 0);
4066 return; /* too late to rescue this char */
4068 output_mode = JAPANESE_EUC;
4069 (*o_putc)(c2 | 0x080);
4070 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: relocate row values 0x75..0x7f.
 * Two cases are visible: one moves the range to start at 0x109, the
 * other to start at 0x113.  The conditions selecting between them and
 * the default result are not visible in this excerpt.
 */
4075 nkf_char x0212_shift(nkf_char c)
4080 if (0x75 <= c && c <= 0x7f){
4081 ret = c + (0x109 - 0x75);
4084 if (0x75 <= c && c <= 0x7f){
4085 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map shifted row values back to 0x75..0x7e.
 *   0x7f..0x88 -> 0x75..0x7e
 *   0x89..0x92 -> 0x75..0x7e, tagged with PREFIX_EUCG3 | 0x80
 * The result for other inputs is not visible in this excerpt.
 */
4092 nkf_char x0212_unshift(nkf_char c)
4095 if (0x7f <= c && c <= 0x88){
4096 ret = c + (0x75 - 0x7f);
4097 }else if (0x89 <= c && c <= 0x92){
4098 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4102 #endif /* X0212_ENABLE */
4104 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4110 if((0x21 <= ndx && ndx <= 0x2F)){
4111 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4112 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4114 }else if(0x6E <= ndx && ndx <= 0x7E){
4115 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4116 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4122 else if(nkf_isgraph(ndx)){
4124 const unsigned short *ptr;
4125 ptr = x0212_shiftjis[ndx - 0x21];
4127 val = ptr[(c1 & 0x7f) - 0x21];
4136 c2 = x0212_shift(c2);
4138 #endif /* X0212_ENABLE */
4140 if(0x7F < c2) return 1;
4141 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4142 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4146 void s_oconv(nkf_char c2, nkf_char c1)
4148 #ifdef NUMCHAR_OPTION
4149 if (c2 == 0 && is_unicode_capsule(c1)){
4150 w16e_conv(c1, &c2, &c1);
4151 if (c2 == 0 && is_unicode_capsule(c1)){
4152 c2 = c1 & VALUE_MASK;
4153 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4156 c2 = c1 / 188 + 0xF0;
4158 c1 += 0x40 + (c1 > 0x3e);
4163 if(encode_fallback)(*encode_fallback)(c1);
4172 } else if (c2 == 0) {
4173 output_mode = ASCII;
4175 } else if (c2 == X0201) {
4176 output_mode = SHIFT_JIS;
4178 } else if (c2 == ISO8859_1) {
4179 output_mode = ISO8859_1;
4180 (*o_putc)(c1 | 0x080);
4182 } else if (is_eucg3(c2)){
4183 output_mode = SHIFT_JIS;
4184 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4190 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4191 set_iconv(FALSE, 0);
4192 return; /* too late to rescue this char */
4194 output_mode = SHIFT_JIS;
4195 e2s_conv(c2, c1, &c2, &c1);
4197 #ifdef SHIFTJIS_CP932
4199 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4200 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4206 #endif /* SHIFTJIS_CP932 */
4209 if (prefix_table[(unsigned char)c1]){
4210 (*o_putc)(prefix_table[(unsigned char)c1]);
4216 void j_oconv(nkf_char c2, nkf_char c1)
4218 #ifdef NUMCHAR_OPTION
4219 if (c2 == 0 && is_unicode_capsule(c1)){
4220 w16e_conv(c1, &c2, &c1);
4221 if (c2 == 0 && is_unicode_capsule(c1)){
4222 c2 = c1 & VALUE_MASK;
4223 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4226 c2 = 0x7F + c1 / 94;
4227 c1 = 0x21 + c1 % 94;
4229 if (encode_fallback) (*encode_fallback)(c1);
4236 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4239 (*o_putc)(ascii_intro);
4240 output_mode = ASCII;
4244 } else if (is_eucg3(c2)){
4246 if(output_mode!=X0213_2){
4247 output_mode = X0213_2;
4251 (*o_putc)(X0213_2&0x7F);
4254 if(output_mode!=X0212){
4255 output_mode = X0212;
4259 (*o_putc)(X0212&0x7F);
4262 (*o_putc)(c2 & 0x7f);
4265 } else if (c2==X0201) {
4266 if (output_mode!=X0201) {
4267 output_mode = X0201;
4273 } else if (c2==ISO8859_1) {
4274 /* iso8859 introduction, or 8th bit on */
4275 /* Can we convert in 7bit form using ESC-'-'-A ?
4277 output_mode = ISO8859_1;
4279 } else if (c2 == 0) {
4280 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4283 (*o_putc)(ascii_intro);
4284 output_mode = ASCII;
4289 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4290 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4292 if (output_mode!=X0213_1) {
4293 output_mode = X0213_1;
4297 (*o_putc)(X0213_1&0x7F);
4299 }else if (output_mode != X0208) {
4300 output_mode = X0208;
4303 (*o_putc)(kanji_intro);
/*
 * base64_conv: filter stage that calls mime_prechar(c2, c1) and then
 * forwards the same character to the next stage, o_base64conv.
 */
4310 void base64_conv(nkf_char c2, nkf_char c1)
4312 mime_prechar(c2, c1);
4313 (*o_base64conv)(c2,c1);
4317 static nkf_char broken_buf[3];
4318 static int broken_counter = 0;
4319 static int broken_last = 0;
/*
 * broken_getc: input filter that looks for "$@" / "$B" and "(J" / "(B"
 * sequences that are NOT preceded by ESC.
 * Pending characters in broken_buf are returned first, last-in first-out.
 * When a sequence is recognised, its two characters are queued in
 * broken_buf in reverse order so that later calls return them in their
 * original order.  What is returned at that point is not visible in this
 * excerpt (presumably the missing ESC -- TODO confirm).
 */
4320 nkf_char broken_getc(FILE *f)
/* Drain pushed-back characters before reading new input */
4324 if (broken_counter>0) {
4325 return broken_buf[--broken_counter];
/* '$' outside an escape sequence, while in ASCII or X0201 mode */
4328 if (c=='$' && broken_last != ESC
4329 && (input_mode==ASCII || input_mode==X0201)) {
4332 if (c1=='@'|| c1=='B') {
4333 broken_buf[0]=c1; broken_buf[1]=c;
/* '(' outside an escape sequence, while in X0208 or X0201 mode */
4340 } else if (c=='(' && broken_last != ESC
4341 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4344 if (c1=='J'|| c1=='B') {
4345 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto broken_buf so that broken_getc returns
 * it next.  The FILE argument is not used in the visible lines.
 * The push is performed only while fewer than two characters are
 * pending; the return value is not visible in this excerpt.
 */
4358 nkf_char broken_ungetc(nkf_char c, FILE *f)
4360 if (broken_counter<2)
4361 broken_buf[broken_counter++]=c;
/*
 * nl_conv: newline filter stage.
 * 1. When guess_f is set, records the newline style seen so far in
 *    input_newline: 0 = none yet, CR / LF / CRLF = consistent so far,
 *    EOF = styles were mixed (tracking stops once it is EOF).
 * 2. Rewrites newlines on output according to nlmode_f and forwards
 *    every other character unchanged to o_nlconv.
 * A CR is not forwarded immediately; it is remembered in prev_cr so that
 * a following LF can be treated as one CRLF.
 */
4365 void nl_conv(nkf_char c2, nkf_char c1)
4367 if (guess_f && input_newline != EOF) {
4368 if (c2 == 0 && c1 == LF) {
4369 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4370 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4371 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4373 else if (!input_newline) input_newline = CR;
4374 else if (input_newline != CR) input_newline = EOF;
/* A pending CR or a bare LF ends a line.  && binds tighter than ||, so
   this reads as: prev_cr || (c2 == 0 && c1 == LF). */
4376 if (prev_cr || c2 == 0 && c1 == LF) {
/* Emit CR unless the target style is LF-only, and LF unless it is CR-only */
4378 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4379 if (nlmode_f != CR) (*o_nlconv)(0, LF);
/* Hold back a CR; swallow the LF already handled above; pass the rest */
4381 if (c2 == 0 && c1 == CR) prev_cr = CR;
4382 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4386 Return value of fold_conv()
4388 LF add newline and output char
4389 CR add newline and output nothing
4392 1 (or else) normal output
4394 fold state in prev (previous character)
4396 >0x80 Japanese (X0208/X0201)
4401 This fold algorithm does not preserve leading spaces in a line.
4402 This is the main difference from fmt.
4405 #define char_size(c2,c1) (c2?2:1)
4407 void fold_conv(nkf_char c2, nkf_char c1)
4410 nkf_char fold_state;
4412 if (c1== CR && !fold_preserve_f) {
4413 fold_state=0; /* ignore cr */
4414 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4416 fold_state=0; /* ignore cr */
4417 } else if (c1== BS) {
4418 if (f_line>0) f_line--;
4420 } else if (c2==EOF && f_line != 0) { /* close open last line */
4422 } else if ((c1==LF && !fold_preserve_f)
4423 || ((c1==CR||(c1==LF&&f_prev!=CR))
4424 && fold_preserve_f)) {
4426 if (fold_preserve_f) {
4430 } else if ((f_prev == c1 && !fold_preserve_f)
4431 || (f_prev == LF && fold_preserve_f)
4432 ) { /* duplicate newline */
4435 fold_state = LF; /* output two newline */
4441 if (f_prev&0x80) { /* Japanese? */
4443 fold_state = 0; /* ignore given single newline */
4444 } else if (f_prev==SP) {
4448 if (++f_line<=fold_len)
4452 fold_state = CR; /* fold and output nothing */
4456 } else if (c1=='\f') {
4459 fold_state = LF; /* output newline and clear */
4460 } else if ( (c2==0 && c1==SP)||
4461 (c2==0 && c1==TAB)||
4462 (c2=='!'&& c1=='!')) {
4463 /* X0208 kankaku or ascii space */
4465 fold_state = 0; /* remove duplicate spaces */
4468 if (++f_line<=fold_len)
4469 fold_state = SP; /* output ASCII space only */
4471 f_prev = SP; f_line = 0;
4472 fold_state = CR; /* fold and output nothing */
4476 prev0 = f_prev; /* we still need this one... , but almost done */
4478 if (c2 || c2==X0201)
4479 f_prev |= 0x80; /* this is Japanese */
4480 f_line += char_size(c2,c1);
4481 if (f_line<=fold_len) { /* normal case */
4484 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4485 f_line = char_size(c2,c1);
4486 fold_state = LF; /* We can't wait, do fold now */
4487 } else if (c2==X0201) {
4488 /* simple kinsoku rules return 1 means no folding */
4489 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4490 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4491 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4492 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4493 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4494 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4495 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4497 fold_state = LF;/* add one new f_line before this character */
4500 fold_state = LF;/* add one new f_line before this character */
4503 /* kinsoku point in ASCII */
4504 if ( c1==')'|| /* { [ ( */
4515 /* just after special */
4516 } else if (!is_alnum(prev0)) {
4517 f_line = char_size(c2,c1);
4519 } else if ((prev0==SP) || /* ignored new f_line */
4520 (prev0==LF)|| /* ignored new f_line */
4521 (prev0&0x80)) { /* X0208 - ASCII */
4522 f_line = char_size(c2,c1);
4523 fold_state = LF;/* add one new f_line before this character */
4525 fold_state = 1; /* default no fold in ASCII */
4529 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4530 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4531 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4532 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4533 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4534 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4535 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4536 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4537 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4538 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4539 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4540 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4541 /* default no fold in kinsoku */
4544 f_line = char_size(c2,c1);
4545 /* add one new f_line before this character */
4548 f_line = char_size(c2,c1);
4550 /* add one new f_line before this character */
4555 /* terminator process */
4556 switch(fold_state) {
4575 nkf_char z_prev2=0,z_prev1=0;
4577 void z_conv(nkf_char c2, nkf_char c1)
4580 /* if (c2) c1 &= 0x7f; assertion */
4582 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4588 if (z_prev2 == X0201) {
4590 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4592 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4594 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4596 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4601 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4604 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4605 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4610 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4621 if (alpha_f&1 && c2 == 0x23) {
4622 /* JISX0208 Alphabet */
4624 } else if (c2 == 0x21) {
4625 /* JISX0208 Kigou */
4630 } else if (alpha_f&4) {
4635 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4641 if (alpha_f&8 && c2 == 0) {
4645 case '>': entity = ">"; break;
4646 case '<': entity = "<"; break;
4647 case '\"': entity = """; break;
4648 case '&': entity = "&"; break;
4651 while (*entity) (*o_zconv)(0, *entity++);
4657 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4662 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4666 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4670 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4674 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4678 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4682 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4686 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4690 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4695 (*o_zconv)(X0201, c);
4698 } else if (c2 == 0x25) {
4699 /* JISX0208 Katakana */
4700 static const int fullwidth_to_halfwidth[] =
4702 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4703 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4704 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4705 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4706 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4707 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4708 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4709 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4710 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4711 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4712 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4713 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4715 if (fullwidth_to_halfwidth[c1-0x20]){
4716 c2 = fullwidth_to_halfwidth[c1-0x20];
4717 (*o_zconv)(X0201, c2>>8);
4719 (*o_zconv)(X0201, c2&0xFF);
4729 #define rot13(c) ( \
4731 (c <= 'M') ? (c + 13): \
4732 (c <= 'Z') ? (c - 13): \
4734 (c <= 'm') ? (c + 13): \
4735 (c <= 'z') ? (c - 13): \
4739 #define rot47(c) ( \
4741 ( c <= 'O') ? (c + 47) : \
4742 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: rotation filter stage; the (possibly modified) character is
 * forwarded to o_rot_conv.
 * Single-byte classes (c2 == 0, X0201, ISO8859_1) are distinguished from
 * the rest; the transformations applied in each branch are not visible
 * in this excerpt.
 */
4746 void rot_conv(nkf_char c2, nkf_char c1)
4748 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4754 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: hiragana/katakana conversion filter stage; results are
 * forwarded to o_hira_conv.
 * The visible tests select characters by row (c2) and cell (c1); the
 * assignments that rewrite c2/c1 and the option checks choosing the
 * direction are mostly not visible in this excerpt.
 */
4757 void hira_conv(nkf_char c2, nkf_char c1)
4761 if (0x20 < c1 && c1 < 0x74) {
4763 (*o_hira_conv)(c2,c1);
/* Cell 0x74 is sent as the raw code point U+3094, and only when the
   output converter is w_oconv or w_oconv16.
   NOTE(review): w_oconv32 is not in this test -- confirm whether UTF-32
   output should take this branch too. */
4765 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4767 c1 = CLASS_UNICODE | 0x3094;
4768 (*o_hira_conv)(c2,c1);
4771 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4773 (*o_hira_conv)(c2,c1);
/* Opposite direction: the same character sets are matched from the
   other side (raw U+3094, row 0x24, row 0x21 cells 0x35/0x36) */
4778 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4781 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4783 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4787 (*o_hira_conv)(c2,c1);
4791 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4793 static const nkf_char range[RANGE_NUM_MAX][2] = {
4814 nkf_char start, end, c;
4816 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4820 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4825 for (i = 0; i < RANGE_NUM_MAX; i++) {
4826 start = range[i][0];
4829 if (c >= start && c <= end) {
4834 (*o_iso2022jp_check_conv)(c2,c1);
4838 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4840 static const unsigned char *mime_pattern[] = {
4841 (const unsigned char *)"\075?EUC-JP?B?",
4842 (const unsigned char *)"\075?SHIFT_JIS?B?",
4843 (const unsigned char *)"\075?ISO-8859-1?Q?",
4844 (const unsigned char *)"\075?ISO-8859-1?B?",
4845 (const unsigned char *)"\075?ISO-2022-JP?B?",
4846 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4847 #if defined(UTF8_INPUT_ENABLE)
4848 (const unsigned char *)"\075?UTF-8?B?",
4849 (const unsigned char *)"\075?UTF-8?Q?",
4851 (const unsigned char *)"\075?US-ASCII?Q?",
4856 /* Markers used to raise the priority of the matching input code */
4857 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4858 e_iconv, s_iconv, 0, 0, 0, 0,
4859 #if defined(UTF8_INPUT_ENABLE)
4865 static const nkf_char mime_encode[] = {
4866 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4867 #if defined(UTF8_INPUT_ENABLE)
4874 static const nkf_char mime_encode_method[] = {
4875 'B', 'B','Q', 'B', 'B', 'Q',
4876 #if defined(UTF8_INPUT_ENABLE)
4884 #define MAXRECOVER 20
/*
 * switch_mime_getc: install the MIME-decoding reader in the input chain.
 * The current i_getc / i_ungetc are saved in i_mgetc / i_mungetc and
 * replaced by mime_getc / mime_ungetc.  In STRICT_MIME mode a second
 * layer is inserted: the saved pair moves to i_mgetc_buf / i_mungetc_buf
 * and mime_getc_buf / mime_ungetc_buf take its place.
 * Does nothing if mime_getc is already installed.
 */
4886 void switch_mime_getc(void)
4888 if (i_getc!=mime_getc) {
4889 i_mgetc = i_getc; i_getc = mime_getc;
4890 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4891 if(mime_f==STRICT_MIME) {
4892 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4893 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc.
 * In STRICT_MIME mode the buffered layer is removed first, then the
 * saved unget function is restored (the matching restore of i_getc is
 * not visible in this excerpt).  If an input converter was saved in
 * mime_iconv_back it is reinstated through set_iconv and the saved
 * pointer is cleared.
 */
4898 void unswitch_mime_getc(void)
4900 if(mime_f==STRICT_MIME) {
4901 i_mgetc = i_mgetc_buf;
4902 i_mungetc = i_mungetc_buf;
4905 i_ungetc = i_mungetc;
4906 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4907 mime_iconv_back = NULL;
/*
 * mime_begin_strict: called after "=?" has been read; matches the rest
 * of the input against the entries of mime_pattern, case-insensitively
 * on the input side (nkf_toupper).
 * Characters read are kept in r[] so they can be replayed if no pattern
 * matches.  On a match, mime_decode_mode is set from the pattern, the
 * current input converter is saved in mime_iconv_back and replaced by
 * mime_priority_func[j], and the result of mime_integrity is returned.
 * Other return paths are not visible in this excerpt.
 */
4910 nkf_char mime_begin_strict(FILE *f)
4914 const unsigned char *p,*q;
4915 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4917 mime_decode_mode = FALSE;
4918 /* =? has been checked */
4920 p = mime_pattern[j];
/* Compare input with the current pattern until its terminator (<= SP) */
4923 for(i=2;p[i]>SP;i++) { /* start at =? */
4924 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4925 /* pattern fails, try next one */
/* Look for a later pattern sharing the first i characters already
   matched (q presumably holds the failed pattern -- its assignment is
   not visible here) and whose next character equals the one just read */
4927 while (mime_pattern[++j]) {
4928 p = mime_pattern[j];
4929 for(k=2;k<i;k++) /* assume length(p) > i */
4930 if (p[k]!=q[k]) break;
4931 if (k==i && nkf_toupper(c1)==p[k]) break;
4933 p = mime_pattern[j];
4934 if (p) continue; /* found next one, continue */
4935 /* all fails, output from recovery buffer */
/* p[i-2] is the encoding letter that precedes the pattern's final '?' */
4943 mime_decode_mode = p[i-2];
4945 mime_iconv_back = iconv;
4946 set_iconv(FALSE, mime_priority_func[j]);
4947 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4949 if (mime_decode_mode=='B') {
4950 mimebuf_f = unbuf_f;
4952 /* do MIME integrity check */
4953 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: read one character for the strict MIME decoder.
 * When mimebuf_f is set the character comes from the saved underlying
 * reader i_mgetc_buf; otherwise it is taken from the Fifo at mime_input,
 * which is then advanced.
 */
4961 nkf_char mime_getc_buf(FILE *f)
4963 /* we don't keep eof of Fifo, because it contains ?= as
4964 a terminator. It was checked in mime_integrity. */
4965 return ((mimebuf_f)?
4966 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: push c back for the strict MIME decoder, either
 * through the saved underlying unget function i_mungetc_buf or by
 * stepping mime_input back and storing c (as unsigned char) in the Fifo.
 * The condition choosing between the two and the return value are not
 * visible in this excerpt (presumably mimebuf_f, mirroring
 * mime_getc_buf -- TODO confirm).
 */
4969 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4972 (*i_mungetc_buf)(c,f);
4974 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict MIME preamble scanner, called after "=?" has
 * been read.  Every character read is also stored in the Fifo at
 * mime_last so the text can be re-read if it turns out not to be a MIME
 * encoded word.  At most MAXRECOVER characters are examined.
 * On success mime_decode_mode is 'B' or 'Q'; on failure it is set to 1,
 * meaning "no decoding, but keep reading from the MIME buffer".
 * Returns the last character read (the visible return is used only for
 * EOF checking, per its comment).
 * NOTE(review): c1 is stored into the Fifo (cast to unsigned char)
 * before the visible EOF tests, so an EOF would be stored as a data
 * byte -- confirm this is harmless.
 */
4978 nkf_char mime_begin(FILE *f)
4983 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4984 /* re-read and convert again from mime_buffer. */
4986 /* =? has been checked */
4988 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4989 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4990 /* We accept any character type even if it is broken by new lines */
4991 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4992 if (c1==LF||c1==SP||c1==CR||
4993 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4995 /* Failed. But this could be another MIME preamble */
/* Encoding letter: B/b selects base64, Q/q selects quoted-printable */
5003 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5004 if (!(++i<MAXRECOVER) || c1==EOF) break;
5005 if (c1=='b'||c1=='B') {
5006 mime_decode_mode = 'B';
5007 } else if (c1=='q'||c1=='Q') {
5008 mime_decode_mode = 'Q';
5012 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5013 if (!(++i<MAXRECOVER) || c1==EOF) break;
5015 mime_decode_mode = FALSE;
5021 if (!mime_decode_mode) {
5022 /* false MIME preamble, restart from mime_buffer */
5023 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5024 /* Since we are in MIME mode until buffer becomes empty, */
5025 /* we never go into mime_begin again for a while. */
5028 /* discard mime preamble, and goto MIME mode */
5030 /* do no MIME integrity check */
5031 return c1; /* used only for checking EOF */
/*
 * no_putc: output routine taking one character.
 * NOTE(review): the body is not present in this listing; from the name it
 * presumably discards c -- confirm.
 */
5035 void no_putc(nkf_char c)
/*
 * debug: write str and a newline to stderr; a NULL str is printed as the
 * text "NULL".
 * NOTE(review): any flag that gates this output is on a line not shown here.
 */
5040 void debug(const char *str)
5043 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: record the name of the detected input encoding.
 * The first name reported is stored in input_codename.  If a later call
 * reports a different name, input_codename becomes the empty string, which
 * marks conflicting detections.  The pointer is stored, not copied.
 */
5048 void set_input_codename(char *codename)
5050 if (!input_codename) {
5051 input_codename = codename;
5052 } else if (strcmp(codename, input_codename) != 0) {
5053 input_codename = "";
5057 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: report the guessed input encoding on stdout.
 * filename: when non-NULL it is printed first as "filename: ".
 * An empty input_codename (conflicting detections) takes a separate branch
 * whose body is not shown in this listing.  Otherwise the name is refined
 * using the score flags of the input_code entry for the current iconv:
 *   Shift_JIS   -> CP932    if SCORE_DEPEND or SCORE_CP932 is set
 *   EUC-JP      -> EUCJP-MS if SCORE_X0212 is set, else
 *                  CP51932  if SCORE_DEPEND or SCORE_CP932 is set
 *   ISO-2022-JP -> CP50221  if SCORE_KANA is set, else
 *                  CP50220  if SCORE_DEPEND or SCORE_CP932 is set
 * A missing name is treated as "ASCII".  The trailing conditional chain
 * picks a newline label ((CR), (LF), (CRLF), (MIXED NL)) from input_newline.
 * NOTE(review): p is dereferenced without a visible NULL check -- confirm
 * find_inputcode_byfunc cannot return NULL on this path.
 */
5058 void print_guessed_code(char *filename)
5060 char *codename = "BINARY";
5061 char *str_nlmode = NULL;
5062 if (filename != NULL) printf("%s: ", filename);
5063 if (input_codename && !*input_codename) {
5066 struct input_code *p = find_inputcode_byfunc(iconv);
5068 printf("%s\n", input_codename ? input_codename : "ASCII");
5070 if (!input_codename) {
5071 input_codename = "ASCII";
5072 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5073 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5074 input_codename = "CP932";
5075 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5076 if (p->score & (SCORE_X0212))
5077 input_codename = "EUCJP-MS";
5078 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5079 input_codename = "CP51932";
5080 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5081 if (p->score & (SCORE_KANA))
5082 input_codename = "CP50221";
5083 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5084 input_codename = "CP50220";
5088 input_newline == CR ? " (CR)" :
5089 input_newline == LF ? " (LF)" :
5090 input_newline == CRLF ? " (CRLF)" :
5091 input_newline == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared reader for two-hex-digit escapes.
 * ch: the escape introducer (cap_getc passes ':', url_getc passes '%').
 * f:  input stream.
 * g:  getc routine used to read from f.
 * u:  ungetc routine paired with g.
 * When both following characters c2 and c3 are hex digits, the decoded byte
 * (hex2bin(c2) << 4) | hex2bin(c3) is returned.  The handling of a non-hex
 * c2 or c3 is on lines not present in this listing.
 */
5100 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5102 nkf_char c1, c2, c3;
5108 if (!nkf_isxdigit(c2)){
5113 if (!nkf_isxdigit(c3)){
5118 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: read one character, decoding ":XX" hex escapes via hex_getc
 * with the i_cgetc / i_cungetc pair. */
5121 nkf_char cap_getc(FILE *f)
5123 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: push c back by forwarding to (*i_cungetc); returns its result. */
5126 nkf_char cap_ungetc(nkf_char c, FILE *f)
5128 return (*i_cungetc)(c, f);
/* url_getc: read one character, decoding "%XX" hex escapes via hex_getc
 * with the i_ugetc / i_uungetc pair. */
5131 nkf_char url_getc(FILE *f)
5133 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: push c back by forwarding to (*i_uungetc); returns its result. */
5136 nkf_char url_ungetc(nkf_char c, FILE *f)
5138 return (*i_uungetc)(c, f);
5142 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: reader that decodes numeric character references (the
 * --numchar-input option: "Convert Unicode Character Reference").
 * Input is read through i_ngetc and pushed back through i_nungetc.
 * Two forms are visible: after an 'x' or 'X', up to 7 hex digits are folded
 * into c; otherwise up to 8 decimal digits are accumulated.  A successful
 * decode returns CLASS_UNICODE | c.
 * NOTE(review): the prefix check, the terminator check and the fallback
 * path are on lines not present in this listing.
 */
5143 nkf_char numchar_getc(FILE *f)
5145 nkf_char (*g)(FILE *) = i_ngetc;
5146 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5157 if (buf[i] == 'x' || buf[i] == 'X'){
5158 for (j = 0; j < 7; j++){
5160 if (!nkf_isxdigit(buf[i])){
5167 c |= hex2bin(buf[i]);
5170 for (j = 0; j < 8; j++){
5174 if (!nkf_isdigit(buf[i])){
5181 c += hex2bin(buf[i]);
5187 return CLASS_UNICODE | c;
/* numchar_ungetc: push c back by forwarding to (*i_nungetc); returns its result. */
5196 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5198 return (*i_nungetc)(c, f);
5202 #ifdef UNICODE_NORMALIZATION
5204 /* Normalization Form C */
/*
 * nfc_getc: reader that rewrites decomposed sequences using
 * normalization_table.
 * Input is read through i_nfc_getc and pushed back through i_nfc_ungetc.
 * The table is binary-searched (lower/upper/j) by comparing each entry's
 * .nfd sequence with the buffered characters in buf.  A mismatch at
 * position k moves lower or upper according to which side is smaller.
 * On a match the entry's .nfc sequence is copied over buf.
 * The outer loop runs only while buf[i] is not of the form 10xxxxxx
 * ((buf[i] & 0xc0) != 0x80).
 * NOTE(review): how buf is filled and which character is returned are on
 * lines not present in this listing.
 */
5205 nkf_char nfc_getc(FILE *f)
5207 nkf_char (*g)(FILE *f) = i_nfc_getc;
5208 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5209 int i=0, j, k=1, lower, upper;
5211 const nkf_nfchar *array;
5214 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5215 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5216 while (upper >= lower) {
5217 j = (lower+upper) / 2;
5218 array = normalization_table[j].nfd;
5219 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5220 if (array[k] != buf[k]){
5221 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5228 array = normalization_table[j].nfc;
5229 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5230 buf[i] = (nkf_char)(array[i]);
/* nfc_ungetc: push c back by forwarding to (*i_nfc_ungetc); returns its result. */
5241 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5243 return (*i_nfc_ungetc)(c, f);
5245 #endif /* UNICODE_NORMALIZATION */
/*
 * MIME decoder read routine.  The function header is on a line absent from
 * this listing; NOTE(review): presumably mime_getc(FILE *f) -- confirm.
 *
 * Order of work, as far as the visible lines show:
 *  - If the Fifo still holds characters (mime_top != mime_last), return the
 *    next one.
 *  - If mime_decode_mode is 1 or FALSE, leave MIME mode (unswitch_mime_getc)
 *    and read straight from (*i_getc).
 *  - 'Q' mode: quoted-printable.  '_' yields SP outside FIXED_MIME, "?="
 *    ends the encoded word, "=" followed by a control character is a soft
 *    line break, and "=XY" yields (hex2bin(X)<<4) + hex2bin(Y).
 *  - 'B' mode: four Base64 characters c1..c4 are decoded into up to three
 *    bytes that are appended to the Fifo; the first is returned.
 * After "?=" the following linear white space is collected in lwsp_buf
 * (malloc'ed, grown with realloc) and pushed back through i_ungetc when it
 * does not lead into another encoded word.
 * exit_mode is the mode restored when decoding stops: the current mode
 * under FIXED_MIME.
 */
5251 nkf_char c1, c2, c3, c4, cc;
5252 nkf_char t1, t2, t3, t4, mode, exit_mode;
5253 nkf_char lwsp_count;
5256 nkf_char lwsp_size = 128;
5258 if (mime_top != mime_last) { /* Something is in FIFO */
5259 return Fifo(mime_top++);
5261 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5262 mime_decode_mode=FALSE;
5263 unswitch_mime_getc();
5264 return (*i_getc)(f);
5267 if (mimebuf_f == FIXED_MIME)
5268 exit_mode = mime_decode_mode;
5271 if (mime_decode_mode == 'Q') {
5272 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5274 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5275 if (c1<=SP || DEL<=c1) {
5276 mime_decode_mode = exit_mode; /* prepare for quit */
5279 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5283 mime_decode_mode = exit_mode; /* prepare for quit */
5284 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5285 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5286 /* end Q encoding */
5287 input_mode = exit_mode;
5289 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5290 if (lwsp_buf==NULL) {
5291 perror("can't malloc");
5294 while ((c1=(*i_getc)(f))!=EOF) {
5299 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5307 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5308 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5323 lwsp_buf[lwsp_count] = (unsigned char)c1;
5324 if (lwsp_count++>lwsp_size){
5326 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5327 if (lwsp_buf_new==NULL) {
5329 perror("can't realloc");
5332 lwsp_buf = lwsp_buf_new;
5338 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5340 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5341 i_ungetc(lwsp_buf[lwsp_count],f);
5347 if (c1=='='&&c2<SP) { /* this is soft wrap */
/* Skip the rest of the line break.  The loop condition has already read
 * the next character into c1, so test that character for EOF.  Reading
 * again here would throw it away unexamined and swallow one extra input
 * character per iteration (e.g. the first character after "=" CR LF). */
5348 while((c1 = (*i_mgetc)(f)) <=SP) {
5349 if (c1 == EOF) return (EOF);
5351 mime_decode_mode = 'Q'; /* still in MIME */
5352 goto restart_mime_q;
5355 mime_decode_mode = 'Q'; /* still in MIME */
5359 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5360 if (c2<=SP) return c2;
5361 mime_decode_mode = 'Q'; /* still in MIME */
5362 return ((hex2bin(c2)<<4) + hex2bin(c3));
5365 if (mime_decode_mode != 'B') {
5366 mime_decode_mode = FALSE;
5367 return (*i_mgetc)(f);
5371 /* Base64 encoding */
5373 MIME allows line break in the middle of
5374 Base64, but we are very pessimistic in decoding
5375 in unbuf mode because MIME encoded code may broken by
5376 less or editor's control sequence (such as ESC-[-K in unbuffered
5377 mode. ignore incomplete MIME.
5379 mode = mime_decode_mode;
5380 mime_decode_mode = exit_mode; /* prepare for quit */
5382 while ((c1 = (*i_mgetc)(f))<=SP) {
5387 if ((c2 = (*i_mgetc)(f))<=SP) {
5390 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5391 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5394 if ((c1 == '?') && (c2 == '=')) {
5397 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5398 if (lwsp_buf==NULL) {
5399 perror("can't malloc");
5402 while ((c1=(*i_getc)(f))!=EOF) {
5407 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5415 if ((c1=(*i_getc)(f))!=EOF) {
5419 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5434 lwsp_buf[lwsp_count] = (unsigned char)c1;
5435 if (lwsp_count++>lwsp_size){
5437 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5438 if (lwsp_buf_new==NULL) {
5440 perror("can't realloc");
5443 lwsp_buf = lwsp_buf_new;
5449 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5451 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5452 i_ungetc(lwsp_buf[lwsp_count],f);
5459 if ((c3 = (*i_mgetc)(f))<=SP) {
5462 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5463 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5467 if ((c4 = (*i_mgetc)(f))<=SP) {
5470 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5471 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5475 mime_decode_mode = mode; /* still in MIME sigh... */
5477 /* BASE 64 decoding */
5479 t1 = 0x3f & base64decode(c1);
5480 t2 = 0x3f & base64decode(c2);
5481 t3 = 0x3f & base64decode(c3);
5482 t4 = 0x3f & base64decode(c4);
5483 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5485 Fifo(mime_last++) = (unsigned char)cc;
5486 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5488 Fifo(mime_last++) = (unsigned char)cc;
5489 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5491 Fifo(mime_last++) = (unsigned char)cc;
5496 return Fifo(mime_top++);
/*
 * mime_ungetc: push c back onto the front of the decoder Fifo by moving
 * mime_top back one slot and storing c there as an unsigned char.
 * f is not used by the visible statement.
 */
5499 nkf_char mime_ungetc(nkf_char c, FILE *f)
5501 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: look ahead over a candidate encoded word before decoding.
 * f: input stream, read through (*i_getc).
 * p: the preamble text already matched; it is copied into the Fifo first.
 * mime_input and mime_last are reset to mime_top, then input characters are
 * appended to the Fifo until "?=" is seen (c == '=' with previous d == '?'),
 * the Fifo is full, or a character outside '+', '/', '=', '?' and
 * alphanumerics appears.  When the word turns out to be incomplete,
 * mime_last is set to mime_input, mime_decode_mode to 1 (replay undecoded)
 * and the buffered reader is switched in with switch_mime_getc().
 * NOTE(review): the return statements are on lines not present here.
 */
5505 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5509 /* In buffered mode, read until =? or NL or buffer full
5511 mime_input = mime_top;
5512 mime_last = mime_top;
5514 while(*p) Fifo(mime_input++) = *p++;
5517 while((c=(*i_getc)(f))!=EOF) {
5518 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5519 break; /* buffer full */
5521 if (c=='=' && d=='?') {
5522 /* checked. skip header, start decode */
5523 Fifo(mime_input++) = (unsigned char)c;
5524 /* mime_last_input = mime_input; */
5529 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5531 /* Should we check length mod 4? */
5532 Fifo(mime_input++) = (unsigned char)c;
5535 /* In case of Incomplete MIME, no MIME decode */
5536 Fifo(mime_input++) = (unsigned char)c;
5537 mime_last = mime_input; /* point undecoded buffer */
5538 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5539 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 * Besides the standard '+' (62) and '/' (63), '-' is accepted for 62 and
 * '_' for 63.  Character constants stand in for small integers, as the
 * inline comments note: '>' is 62, '?' is 63, c - 'G' equals c - 'a' + 26,
 * and c - '0' + '4' equals c - '0' + 52.
 * The final branch maps every remaining character to 63.
 */
5543 nkf_char base64decode(nkf_char c)
5548 i = c - 'A'; /* A..Z 0-25 */
5549 } else if (c == '_') {
5550 i = '?' /* 63 */ ; /* _ 63 */
5552 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5554 } else if (c > '/') {
5555 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5556 } else if (c == '+' || c == '-') {
5557 i = '>' /* 62 */ ; /* + and - 62 */
5559 i = '?' /* 63 */ ; /* / 63 */
/* Base64 output alphabet, indexed by 6-bit value. */
5564 static const char basis_64[] =
5565 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Previous input byte of the Base64 encoder; its low bits are combined with
 * the next byte in mimeout_addchar and flushed in close_mime. */
5567 static nkf_char b64c;
/* Pending characters held back by the MIME encoder: capacity and storage
 * (one spare slot beyond MIMEOUT_BUF_LENGTH) plus the current fill count. */
5568 #define MIMEOUT_BUF_LENGTH (60)
5569 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5570 int mimeout_buf_count = 0;
/*
 * open_mime: start a MIME encoded word for output mode `mode`.
 * mime_encode[] is searched for mode; the matching mime_pattern[] entry
 * becomes the preamble p (mime_pattern[0] when nothing matches) and
 * mimeout_mode is taken from mime_encode_method[] at the same index.
 * Leading blanks / line-break characters held in mimeout_buf are written
 * raw through (*o_mputc).  When base64_count exceeds 45 only one leading
 * blank is considered.  The buffer is then emptied
 * (mimeout_buf_count = 0) and the saved characters are replayed through
 * mime_putc.
 * NOTE(review): emission of the preamble and any line folding are on lines
 * not present in this listing.
 */
5572 void open_mime(nkf_char mode)
5574 const unsigned char *p;
5577 p = mime_pattern[0];
5578 for(i=0;mime_pattern[i];i++) {
5579 if (mode == mime_encode[i]) {
5580 p = mime_pattern[i];
5584 mimeout_mode = mime_encode_method[i];
5586 if (base64_count>45) {
5587 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5588 (*o_mputc)(mimeout_buf[i]);
5594 if (mimeout_buf_count>0
5595 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5596 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5600 for (;i<mimeout_buf_count;i++) {
5601 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5602 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5603 (*o_mputc)(mimeout_buf[i]);
5613 j = mimeout_buf_count;
5614 mimeout_buf_count = 0;
5616 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the current MIME encoded word.
 * Depending on mimeout_mode, the bits still pending in b64c are flushed as
 * one final Base64 character: the low 2 bits shifted left by 4, or the low
 * 4 bits shifted left by 2.  Further closing output depends on mimeout_f
 * and on whether the mode is 'Q'.
 * NOTE(review): the case labels, padding and terminator output are on
 * lines not present in this listing.
 */
5620 void close_mime(void)
5630 switch(mimeout_mode) {
5635 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5641 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5646 if (mimeout_mode > 0) {
5647 if (mimeout_f!=FIXED_MIME) {
5649 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c in the current mimeout_mode and write
 * the result through (*o_mputc).
 * Quoted-printable path: a non-alphanumeric byte is written as its two hex
 * digits (high nibble, then low nibble) via bin2hex.
 * Base64 path: three successive states emit basis_64[c>>2], then the low
 * 2 bits of the previous byte (b64c) joined with the high 4 bits of c,
 * then the low 4 bits of b64c joined with the high 2 bits of c followed by
 * basis_64[c & 0x3F].
 * NOTE(review): the case labels and state updates are on lines not
 * present in this listing.
 */
5654 void mimeout_addchar(nkf_char c)
5656 switch(mimeout_mode) {
5661 } else if(!nkf_isalnum(c)) {
5663 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5664 (*o_mputc)(bin2hex((c&0xf)));
5673 (*o_mputc)(basis_64[c>>2]);
5678 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5684 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5685 (*o_mputc)(basis_64[c & 0x3F]);
5696 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * mime_prechar: decide, before a character (c2, c1) is encoded, whether the
 * output line must be folded.
 * The projected width is base64_count + mimeout_buf_count/3*4.  When it
 * exceeds the limit for the current situation (73, 66 or 60 in the visible
 * tests) the encoded word is ended and the line folded by sending (EOF,0),
 * (0,LF) and (0,SP) to (*o_base64conv).  In the last case, reached only
 * when c2 != EOF, a new encoded word is first opened with open_mime, using
 * 'Q' for ASCII / ISO8859_1 output and 'B' otherwise.
 * NOTE(review): the conditions choosing between the three limits are on
 * lines not present in this listing.
 */
5698 void mime_prechar(nkf_char c2, nkf_char c1)
5700 if (mimeout_mode > 0){
5702 if (base64_count + mimeout_buf_count/3*4> 73){
5703 (*o_base64conv)(EOF,0);
5704 (*o_base64conv)(0,LF);
5705 (*o_base64conv)(0,SP);
5709 if (base64_count + mimeout_buf_count/3*4> 66) {
5710 (*o_base64conv)(EOF,0);
5711 (*o_base64conv)(0,LF);
5712 (*o_base64conv)(0,SP);
5718 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5719 mimeout_mode = (output_mode==ASCII ||output_mode == ISO8859_1) ? 'Q' : 'B';
5720 open_mime(output_mode);
5721 (*o_base64conv)(EOF,0);
5722 (*o_base64conv)(0,LF);
5723 (*o_base64conv)(0,SP);
/*
 * mime_putc: MIME-encoding output stage; c is the next output byte, or EOF
 * to flush.
 * Two regimes are visible:
 *  - mimeout_f == FIXED_MIME: the whole stream is encoded; the tests on
 *    base64_count > 71 concern line length.
 *  - otherwise: characters are held in mimeout_buf so that plain ASCII runs
 *    can be written raw through (*o_mputc), while other text is wrapped in
 *    encoded words (open_mime) and encoded with mimeout_addchar.  On EOF
 *    the buffer is drained.
 * lastchar is the most recently buffered character, when there is one.
 * The buffer is flushed once mimeout_buf_count exceeds MIMEOUT_BUF_LENGTH.
 * NOTE(review): many branch bodies are on lines not present in this
 * listing, so the exact folding rules cannot be confirmed from here.
 */
5730 void mime_putc(nkf_char c)
5735 if (mimeout_f == FIXED_MIME){
5736 if (mimeout_mode == 'Q'){
5737 if (base64_count > 71){
5738 if (c!=CR && c!=LF) {
5745 if (base64_count > 71){
5750 if (c == EOF) { /* c==EOF */
5754 if (c != EOF) { /* c!=EOF */
5760 /* mimeout_f != FIXED_MIME */
5762 if (c == EOF) { /* c==EOF */
5763 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
5764 j = mimeout_buf_count;
5765 mimeout_buf_count = 0;
5767 if (mimeout_mode > 0) {
5768 if (!nkf_isblank(mimeout_buf[j-1])) {
5770 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5773 mimeout_addchar(mimeout_buf[i]);
5777 mimeout_addchar(mimeout_buf[i]);
5781 mimeout_addchar(mimeout_buf[i]);
5787 mimeout_addchar(mimeout_buf[i]);
5793 if (mimeout_buf_count > 0){
5794 lastchar = mimeout_buf[mimeout_buf_count - 1];
5799 if (mimeout_mode=='Q') {
5800 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5801 if (c == CR || c == LF) {
5806 } else if (c <= SP) {
5808 if (base64_count > 70) {
5812 if (!nkf_isblank(c)) {
5817 if (base64_count > 70) {
5822 open_mime(output_mode);
5824 if (!nkf_noescape_mime(c)) {
5835 if (mimeout_mode <= 0) {
5836 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5837 if (nkf_isspace(c)) {
5839 if (mimeout_mode == -1) {
5842 if (c==CR || c==LF) {
5844 open_mime(output_mode);
5850 for (i=0;i<mimeout_buf_count;i++) {
5851 (*o_mputc)(mimeout_buf[i]);
5852 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5863 mimeout_buf[0] = (char)c;
5864 mimeout_buf_count = 1;
5866 if (base64_count > 1
5867 && base64_count + mimeout_buf_count > 76
5868 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5871 if (!nkf_isspace(mimeout_buf[0])){
5876 mimeout_buf[mimeout_buf_count++] = (char)c;
5877 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5878 open_mime(output_mode);
5883 if (lastchar==CR || lastchar == LF){
5884 for (i=0;i<mimeout_buf_count;i++) {
5885 (*o_mputc)(mimeout_buf[i]);
5888 mimeout_buf_count = 0;
5891 for (i=0;i<mimeout_buf_count-1;i++) {
5892 (*o_mputc)(mimeout_buf[i]);
5895 mimeout_buf[0] = SP;
5896 mimeout_buf_count = 1;
5898 open_mime(output_mode);
5901 /* mimeout_mode == 'B', 1, 2 */
5902 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5903 if (lastchar == CR || lastchar == LF){
5904 if (nkf_isblank(c)) {
5905 for (i=0;i<mimeout_buf_count;i++) {
5906 mimeout_addchar(mimeout_buf[i]);
5908 mimeout_buf_count = 0;
5909 } else if (SP<c && c<DEL) {
5911 for (i=0;i<mimeout_buf_count;i++) {
5912 (*o_mputc)(mimeout_buf[i]);
5915 mimeout_buf_count = 0;
5917 mimeout_buf[mimeout_buf_count++] = (char)c;
5920 if (c==SP || c==TAB || c==CR || c==LF) {
5921 for (i=0;i<mimeout_buf_count;i++) {
5922 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5924 for (i=0;i<mimeout_buf_count;i++) {
5925 (*o_mputc)(mimeout_buf[i]);
5928 mimeout_buf_count = 0;
5931 mimeout_buf[mimeout_buf_count++] = (char)c;
5932 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5934 for (i=0;i<mimeout_buf_count;i++) {
5935 (*o_mputc)(mimeout_buf[i]);
5938 mimeout_buf_count = 0;
5942 if (mimeout_buf_count>0 && SP<c && c!='=') {
5943 mimeout_buf[mimeout_buf_count++] = (char)c;
5944 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5945 j = mimeout_buf_count;
5946 mimeout_buf_count = 0;
5948 mimeout_addchar(mimeout_buf[i]);
5955 if (mimeout_buf_count>0) {
5956 j = mimeout_buf_count;
5957 mimeout_buf_count = 0;
5959 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5961 mimeout_addchar(mimeout_buf[i]);
5967 (*o_mputc)(mimeout_buf[i]);
5969 open_mime(output_mode);
/*
 * Option/state reset body.  The function header is on a line absent from
 * this listing; NOTE(review): presumably reinit() -- confirm.
 * Every assignment below restores a global to its start-up value: option
 * flags to FALSE or their DEFAULT_* / *_DEFAULT constants, the o_* output
 * conversion hooks to no_connection, the visible i_* input hooks to
 * std_getc / std_ungetc, prefix_table to all zeros, and the MIME encoder /
 * decoder state (mimeout_buf_count, mime_decode_mode) to idle.
 * Sections guarded by #if / #ifdef apply only when that feature is built.
 */
5979 struct input_code *p = input_code_list;
5992 mime_f = MIME_DECODE_DEFAULT;
5993 mime_decode_f = FALSE;
5998 #if defined(MSDOS) || defined(__OS2__)
6003 iso2022jp_f = FALSE;
6004 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6005 ms_ucs_map_f = UCS_MAP_ASCII;
6007 #ifdef UTF8_INPUT_ENABLE
6008 no_cp932ext_f = FALSE;
6009 no_best_fit_chars_f = FALSE;
6010 encode_fallback = NULL;
6011 unicode_subchar = '?';
6012 input_endian = ENDIAN_BIG;
6014 #ifdef UTF8_OUTPUT_ENABLE
6015 output_bom_f = FALSE;
6016 output_endian = ENDIAN_BIG;
6018 #ifdef UNICODE_NORMALIZATION
6034 #ifdef SHIFTJIS_CP932
6044 for (i = 0; i < 256; i++){
6045 prefix_table[i] = 0;
6049 mimeout_buf_count = 0;
6054 fold_preserve_f = FALSE;
6057 kanji_intro = DEFAULT_J;
6058 ascii_intro = DEFAULT_R;
6059 fold_margin = FOLD_MARGIN;
6060 output_conv = DEFAULT_CONV;
6061 oconv = DEFAULT_CONV;
6062 o_zconv = no_connection;
6063 o_fconv = no_connection;
6064 o_nlconv = no_connection;
6065 o_rot_conv = no_connection;
6066 o_hira_conv = no_connection;
6067 o_base64conv = no_connection;
6068 o_iso2022jp_check_conv = no_connection;
6071 i_ungetc = std_ungetc;
6073 i_bungetc = std_ungetc;
6076 i_mungetc = std_ungetc;
6077 i_mgetc_buf = std_getc;
6078 i_mungetc_buf = std_ungetc;
6079 output_mode = ASCII;
6082 mime_decode_mode = FALSE;
6090 z_prev2=0,z_prev1=0;
6092 iconv_for_check = 0;
6094 input_codename = NULL;
/*
 * no_connection: default value for output conversion hooks that have not
 * been connected; forwards to no_connection2 with c0 = 0.
 */
6100 void no_connection(nkf_char c2, nkf_char c1)
6102 no_connection2(c2,c1,0);
/*
 * no_connection2: report that an unconnected conversion hook was called by
 * printing "nkf internal module connection failure." to stderr.
 * The arguments are not used by the visible statements.  The return of 0
 * is marked as being there for lint.
 */
6105 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6107 fprintf(stderr,"nkf internal module connection failure.\n");
6109 return 0; /* LINT */
6114 #define fprintf dllprintf
6118 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6119 fprintf(stderr,"Flags:\n");
6120 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6121 #ifdef DEFAULT_CODE_SJIS
6122 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6124 #ifdef DEFAULT_CODE_JIS
6125 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6127 #ifdef DEFAULT_CODE_EUC
6128 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6130 #ifdef DEFAULT_CODE_UTF8
6131 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6133 #ifdef UTF8_OUTPUT_ENABLE
6134 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6136 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6137 #ifdef UTF8_INPUT_ENABLE
6138 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6140 fprintf(stderr,"t no conversion\n");
6141 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6142 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6143 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6144 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6145 fprintf(stderr,"v Show this usage. V: show version\n");
6146 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6147 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6148 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6149 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6150 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6151 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6152 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6153 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6154 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6156 fprintf(stderr,"T Text mode output\n");
6158 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6159 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6160 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6161 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6162 fprintf(stderr,"\n");
6163 fprintf(stderr,"Long name options\n");
6164 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6165 fprintf(stderr," Specify the input or output codeset\n");
6166 fprintf(stderr," --fj --unix --mac --windows\n");
6167 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6168 fprintf(stderr," Convert for the system or code\n");
6169 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6170 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6171 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6173 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6175 #ifdef NUMCHAR_OPTION
6176 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6178 #ifdef UTF8_INPUT_ENABLE
6179 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6180 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6183 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6184 fprintf(stderr," Overwrite original listed files by filtered result\n");
6185 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6187 fprintf(stderr," -g --guess Guess the input code\n");
6188 fprintf(stderr," --help --version Show this help/the version\n");
6189 fprintf(stderr," For more information, see also man nkf\n");
6190 fprintf(stderr,"\n");
6196 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6197 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6200 #if defined(MSDOS) && defined(__WIN16__)
6203 #if defined(MSDOS) && defined(__WIN32__)
6209 ,NKF_VERSION,NKF_RELEASE_DATE);
6210 fprintf(stderr,"\n%s\n",CopyRight);