1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** About UTF-8 support
31 ** This version can be used as a drop-in replacement for the previous nkf.
32 ** When it is invoked as, e.g., "nkf -e" and auto-detection judges the
33 ** input to be UTF-8, the input is converted to euc-jp as is.
35 ** It is still quite likely to contain bugs
36 ** (especially in auto-detection, mixed encodings and error handling).
38 ** If you find any problems, please report them to
39 ** E-Mail: furukawa@tcp-ip.or.jp
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.130 2007/08/31 14:06:08 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2007-08-31"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification helpers.
 *
 * Every macro parameter is parenthesized so that an expression argument
 * (e.g. `cond ? a : b` or `x | y`) is classified as a whole.  Arguments may
 * still be evaluated more than once, so do not pass expressions with side
 * effects such as `*p++`.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' '<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when bits 8-15 of c2 equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]; presumably c2 is
 * a Shift_JIS lead byte of an IBM extended character -- confirm at call sites. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
356 #define UCS_MAP_ASCII 0
358 #define UCS_MAP_CP932 2
359 #define UCS_MAP_CP10001 3
360 static int ms_ucs_map_f = UCS_MAP_ASCII;
362 #ifdef UTF8_INPUT_ENABLE
363 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
364 static int no_cp932ext_f = FALSE;
365 /* ignore ZERO WIDTH NO-BREAK SPACE */
366 static int no_best_fit_chars_f = FALSE;
367 static int input_endian = ENDIAN_BIG;
368 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
369 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
370 static void encode_fallback_html(nkf_char c);
371 static void encode_fallback_xml(nkf_char c);
372 static void encode_fallback_java(nkf_char c);
373 static void encode_fallback_perl(nkf_char c);
374 static void encode_fallback_subchar(nkf_char c);
375 static void (*encode_fallback)(nkf_char c) = NULL;
376 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
377 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
379 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
380 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
381 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
382 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
383 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
384 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
385 static void w_status(struct input_code *, nkf_char);
387 #ifdef UTF8_OUTPUT_ENABLE
388 static int output_bom_f = FALSE;
389 static int output_endian = ENDIAN_BIG;
390 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
391 static void w_oconv(nkf_char c2,nkf_char c1);
392 static void w_oconv16(nkf_char c2,nkf_char c1);
393 static void w_oconv32(nkf_char c2,nkf_char c1);
395 static void e_oconv(nkf_char c2,nkf_char c1);
396 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
397 static void s_oconv(nkf_char c2,nkf_char c1);
398 static void j_oconv(nkf_char c2,nkf_char c1);
399 static void fold_conv(nkf_char c2,nkf_char c1);
400 static void cr_conv(nkf_char c2,nkf_char c1);
401 static void z_conv(nkf_char c2,nkf_char c1);
402 static void rot_conv(nkf_char c2,nkf_char c1);
403 static void hira_conv(nkf_char c2,nkf_char c1);
404 static void base64_conv(nkf_char c2,nkf_char c1);
405 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
406 static void no_connection(nkf_char c2,nkf_char c1);
407 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
409 static void code_score(struct input_code *ptr);
410 static void code_status(nkf_char c);
412 static void std_putc(nkf_char c);
413 static nkf_char std_getc(FILE *f);
414 static nkf_char std_ungetc(nkf_char c,FILE *f);
416 static nkf_char broken_getc(FILE *f);
417 static nkf_char broken_ungetc(nkf_char c,FILE *f);
419 static nkf_char mime_begin(FILE *f);
420 static nkf_char mime_getc(FILE *f);
421 static nkf_char mime_ungetc(nkf_char c,FILE *f);
423 static void switch_mime_getc(void);
424 static void unswitch_mime_getc(void);
425 static nkf_char mime_begin_strict(FILE *f);
426 static nkf_char mime_getc_buf(FILE *f);
427 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
428 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
430 static nkf_char base64decode(nkf_char c);
431 static void mime_prechar(nkf_char c2, nkf_char c1);
432 static void mime_putc(nkf_char c);
433 static void open_mime(nkf_char c);
434 static void close_mime(void);
435 static void eof_mime(void);
436 static void mimeout_addchar(nkf_char c);
438 static void usage(void);
439 static void version(void);
441 static void options(unsigned char *c);
442 #if defined(PERL_XS) || defined(WIN32DLL)
443 static void reinit(void);
448 #if !defined(PERL_XS) && !defined(WIN32DLL)
449 static unsigned char stdibuf[IOBUF_SIZE];
450 static unsigned char stdobuf[IOBUF_SIZE];
452 static unsigned char hold_buf[HOLD_SIZE*2];
453 static int hold_count = 0;
455 /* MIME preprocessor fifo */
457 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
458 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
459 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
460 static unsigned char mime_buf[MIME_BUF_SIZE];
461 static unsigned int mime_top = 0;
462 static unsigned int mime_last = 0; /* decoded */
463 static unsigned int mime_input = 0; /* undecoded */
464 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
467 static int unbuf_f = FALSE;
468 static int estab_f = FALSE;
469 static int nop_f = FALSE;
470 static int binmode_f = TRUE; /* binary mode */
471 static int rot_f = FALSE; /* rot13/47 mode */
472 static int hira_f = FALSE; /* hiragana/katakana conversion */
473 static int input_f = FALSE; /* non fixed input code */
474 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
475 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
476 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
477 static int mimebuf_f = FALSE; /* MIME buffered input */
478 static int broken_f = FALSE; /* convert ESC-less broken JIS */
479 static int iso8859_f = FALSE; /* ISO8859 through */
480 static int mimeout_f = FALSE; /* base64 mode */
481 #if defined(MSDOS) || defined(__OS2__)
482 static int x0201_f = TRUE; /* Assume JISX0201 kana */
484 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
486 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
488 #ifdef UNICODE_NORMALIZATION
489 static int nfc_f = FALSE;
490 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc -- NOTE(review): comment used to say "ugetc", apparently copy-pasted from the url hooks; confirm */
491 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
492 static nkf_char nfc_getc(FILE *f);
493 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
497 static int cap_f = FALSE;
498 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
499 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
500 static nkf_char cap_getc(FILE *f);
501 static nkf_char cap_ungetc(nkf_char c,FILE *f);
503 static int url_f = FALSE;
504 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
505 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
506 static nkf_char url_getc(FILE *f);
507 static nkf_char url_ungetc(nkf_char c,FILE *f);
510 #if defined(INT_IS_SHORT)
511 #define NKF_INT32_C(n) (n##L)
513 #define NKF_INT32_C(n) (n)
/*
 * Masks and tags applied to nkf_char values.  The top byte (CLASS_MASK) is
 * compared against CLASS_UNICODE and the low 24 bits (VALUE_MASK) hold the
 * value.  The predicate arguments are parenthesized so that an expression
 * argument is masked as a whole.
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
/* True when the class byte of c is CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the low 24 bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
523 #ifdef NUMCHAR_OPTION
524 static int numchar_f = FALSE;
525 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of numchar_getc -- NOTE(review): comment used to say "ugetc", apparently copy-pasted from the url hooks; confirm */
526 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
527 static nkf_char numchar_getc(FILE *f);
528 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
532 static int noout_f = FALSE;
533 static void no_putc(nkf_char c);
534 static nkf_char debug_f = FALSE;
535 static void debug(const char *str);
536 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
539 static int guess_f = FALSE;
541 static void print_guessed_code(char *filename);
543 static void set_input_codename(char *codename);
544 static int is_inputcode_mixed = FALSE;
545 static int is_inputcode_set = FALSE;
548 static int exec_f = 0;
551 #ifdef SHIFTJIS_CP932
552 /* invert IBM extended characters to others */
553 static int cp51932_f = FALSE;
555 /* invert NEC-selected IBM extended characters to IBM extended characters */
556 static int cp932inv_f = TRUE;
558 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
559 #endif /* SHIFTJIS_CP932 */
562 static int x0212_f = FALSE;
563 static nkf_char x0212_shift(nkf_char c);
564 static nkf_char x0212_unshift(nkf_char c);
566 static int x0213_f = FALSE;
568 static unsigned char prefix_table[256];
570 static void set_code_score(struct input_code *ptr, nkf_char score);
571 static void clr_code_score(struct input_code *ptr, nkf_char score);
572 static void status_disable(struct input_code *ptr);
573 static void status_push_ch(struct input_code *ptr, nkf_char c);
574 static void status_clear(struct input_code *ptr);
575 static void status_reset(struct input_code *ptr);
576 static void status_reinit(struct input_code *ptr);
577 static void status_check(struct input_code *ptr, nkf_char c);
578 static void e_status(struct input_code *, nkf_char);
579 static void s_status(struct input_code *, nkf_char);
581 struct input_code input_code_list[] = {
582 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
583 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
584 #ifdef UTF8_INPUT_ENABLE
585 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
586 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
587 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
592 static int mimeout_mode = 0;
593 static int base64_count = 0;
595 /* X0208 -> ASCII converter */
598 static int f_line = 0; /* chars in line */
599 static int f_prev = 0;
600 static int fold_preserve_f = FALSE; /* preserve new lines */
601 static int fold_f = FALSE;
602 static int fold_len = 0;
605 static unsigned char kanji_intro = DEFAULT_J;
606 static unsigned char ascii_intro = DEFAULT_R;
610 #define FOLD_MARGIN 10
611 #define DEFAULT_FOLD 60
613 static int fold_margin = FOLD_MARGIN;
617 #ifdef DEFAULT_CODE_JIS
618 # define DEFAULT_CONV j_oconv
620 #ifdef DEFAULT_CODE_SJIS
621 # define DEFAULT_CONV s_oconv
623 #ifdef DEFAULT_CODE_EUC
624 # define DEFAULT_CONV e_oconv
626 #ifdef DEFAULT_CODE_UTF8
627 # define DEFAULT_CONV w_oconv
630 /* process default */
631 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
633 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
634 /* s_iconv or oconv */
635 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
637 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
642 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
643 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
645 /* static redirections */
647 static void (*o_putc)(nkf_char c) = std_putc;
649 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
650 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
652 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
653 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
655 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
657 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
658 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
660 /* for strict mime */
661 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
662 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
665 static int output_mode = ASCII, /* output kanji mode */
666 input_mode = ASCII, /* input kanji mode */
667 shift_mode = FALSE; /* TRUE shift out, or X0201 */
668 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
670 /* X0201 / X0208 conversion tables */
672 /* X0201 kana conversion table */
675 unsigned char cv[]= {
676 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
677 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
678 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
679 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
680 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
681 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
682 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
683 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
684 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
685 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
686 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
687 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
688 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
689 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
690 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
691 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
695 /* X0201 kana conversion table for dakuten */
698 unsigned char dv[]= {
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
702 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
703 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
704 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
705 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
706 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
707 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
708 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
710 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
713 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
714 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
717 /* X0201 kana conversion table for han-dakuten */
720 unsigned char ev[]= {
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
730 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
732 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
735 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
736 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
740 /* X0208 kigou conversion table */
741 /* 0x8140 - 0x819e */
743 unsigned char fv[] = {
745 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
746 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
747 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
748 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
749 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
750 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
751 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
753 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
755 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
756 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
762 static int file_out_f = FALSE;
764 static int overwrite_f = FALSE;
765 static int preserve_time_f = FALSE;
766 static int backup_f = FALSE;
767 static char *backup_suffix = "";
768 static char *get_backup_filename(const char *suffix, const char *filename);
771 static int crmode_f = 0; /* CR, NL, CRLF */
772 static nkf_char prev_cr = 0;
773 #ifdef EASYWIN /*Easy Win */
774 static int end_check;
777 #define STD_GC_BUFSIZE (256)
778 nkf_char std_gc_buf[STD_GC_BUFSIZE];
782 #include "nkf32dll.c"
783 #elif defined(PERL_XS)
785 int main(int argc, char **argv)
790 char *outfname = NULL;
793 #ifdef EASYWIN /*Easy Win */
794 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
797 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
798 cp = (unsigned char *)*argv;
803 if (pipe(fds) < 0 || (pid = fork()) < 0){
814 execvp(argv[1], &argv[1]);
828 if(x0201_f == WISH_TRUE)
829 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
831 if (binmode_f == TRUE)
832 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
833 if (freopen("","wb",stdout) == NULL)
840 setbuf(stdout, (char *) NULL);
842 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
845 if (binmode_f == TRUE)
846 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
847 if (freopen("","rb",stdin) == NULL) return (-1);
851 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
855 kanji_convert(stdin);
856 if (guess_f) print_guessed_code(NULL);
860 int is_argument_error = FALSE;
862 is_inputcode_mixed = FALSE;
863 is_inputcode_set = FALSE;
868 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
871 is_argument_error = TRUE;
879 /* reopen file for stdout */
880 if (file_out_f == TRUE) {
883 outfname = malloc(strlen(origfname)
884 + strlen(".nkftmpXXXXXX")
890 strcpy(outfname, origfname);
894 for (i = strlen(outfname); i; --i){
895 if (outfname[i - 1] == '/'
896 || outfname[i - 1] == '\\'){
902 strcat(outfname, "ntXXXXXX");
904 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
907 strcat(outfname, ".nkftmpXXXXXX");
908 fd = mkstemp(outfname);
911 || (fd_backup = dup(fileno(stdout))) < 0
912 || dup2(fd, fileno(stdout)) < 0
923 outfname = "nkf.out";
926 if(freopen(outfname, "w", stdout) == NULL) {
930 if (binmode_f == TRUE) {
931 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
932 if (freopen("","wb",stdout) == NULL)
939 if (binmode_f == TRUE)
940 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
941 if (freopen("","rb",fin) == NULL)
946 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
950 char *filename = NULL;
952 if (nfiles > 1) filename = origfname;
953 if (guess_f) print_guessed_code(filename);
959 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
967 if (dup2(fd_backup, fileno(stdout)) < 0){
970 if (stat(origfname, &sb)) {
971 fprintf(stderr, "Can't stat %s\n", origfname);
973 /* restore the permissions */
974 if (chmod(outfname, sb.st_mode)) {
975 fprintf(stderr, "Can't set permission %s\n", outfname);
978 /* restore the timestamp */
980 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
981 tb[0] = tb[1] = sb.st_mtime;
982 if (utime(outfname, tb)) {
983 fprintf(stderr, "Can't set timestamp %s\n", outfname);
986 tb.actime = sb.st_atime;
987 tb.modtime = sb.st_mtime;
988 if (utime(outfname, &tb)) {
989 fprintf(stderr, "Can't set timestamp %s\n", outfname);
994 char *backup_filename = get_backup_filename(backup_suffix, origfname);
996 unlink(backup_filename);
998 if (rename(origfname, backup_filename)) {
999 perror(backup_filename);
1000 fprintf(stderr, "Can't rename %s to %s\n",
1001 origfname, backup_filename);
1005 if (unlink(origfname)){
1010 if (rename(outfname, origfname)) {
1012 fprintf(stderr, "Can't rename %s to %s\n",
1013 outfname, origfname);
1020 if (is_argument_error)
1023 #ifdef EASYWIN /*Easy Win */
1024 if (file_out_f == FALSE)
1025 scanf("%d",&end_check);
1028 #else /* for Other OS */
1029 if (file_out_f == TRUE)
1031 #endif /*Easy Win */
1034 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters are copied verbatim (for example
 * "*.orig" with "a.txt" yields "a.txt.orig").  Otherwise `suffix` is simply
 * appended to `filename`.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller owns, or
 * NULL (after reporting through perror) when the allocation fails.
 *
 * The previous no-asterisk branch called `malloc( + 1)`, i.e. it allocated a
 * single byte and then copied the whole name into it; the buffer is now sized
 * from the actual lengths and the allocation is checked in both branches.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t size;
    size_t i, j;

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* every one-byte '*' expands to filename_length bytes */
        size = (suffix_length - asterisk_count)
             + asterisk_count * filename_length + 1;
    } else {
        size = filename_length + suffix_length + 1;
    }

    backup_filename = malloc(size);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        /* + 1 copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1102 {"katakana-hiragana","h3"},
1109 #ifdef UTF8_OUTPUT_ENABLE
1119 {"fb-subchar=", ""},
1121 #ifdef UTF8_INPUT_ENABLE
1122 {"utf8-input", "W"},
1123 {"utf16-input", "W16"},
1124 {"no-cp932ext", ""},
1125 {"no-best-fit-chars",""},
1127 #ifdef UNICODE_NORMALIZATION
1128 {"utf8mac-input", ""},
1140 #ifdef NUMCHAR_OPTION
1141 {"numchar-input", ""},
1147 #ifdef SHIFTJIS_CP932
1157 static int option_mode = 0;
1159 void options(unsigned char *cp)
1163 unsigned char *cp_back = NULL;
1168 while(*cp && *cp++!='-');
1169 while (*cp || cp_back) {
1177 case '-': /* literal options */
1178 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1182 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1183 p = (unsigned char *)long_option[i].name;
1184 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1185 if (*p == cp[j] || cp[j] == ' '){
1192 while(*cp && *cp != SPACE && cp++);
1193 if (long_option[i].alias[0]){
1195 cp = (unsigned char *)long_option[i].alias;
1197 if (strcmp(long_option[i].name, "ic=") == 0){
1198 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1199 codeset[i] = nkf_toupper(p[i]);
1202 if(strcmp(codeset, "ISO-2022-JP") == 0){
1203 input_f = JIS_INPUT;
1204 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1205 strcmp(codeset, "CP50220") == 0 ||
1206 strcmp(codeset, "CP50221") == 0 ||
1207 strcmp(codeset, "CP50222") == 0){
1208 input_f = JIS_INPUT;
1209 #ifdef SHIFTJIS_CP932
1212 #ifdef UTF8_OUTPUT_ENABLE
1213 ms_ucs_map_f = UCS_MAP_CP932;
1215 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1216 input_f = JIS_INPUT;
1220 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1221 input_f = JIS_INPUT;
1226 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1227 input_f = SJIS_INPUT;
1228 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1229 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1230 strcmp(codeset, "CP932") == 0 ||
1231 strcmp(codeset, "MS932") == 0){
1232 input_f = SJIS_INPUT;
1233 #ifdef SHIFTJIS_CP932
1236 #ifdef UTF8_OUTPUT_ENABLE
1237 ms_ucs_map_f = UCS_MAP_CP932;
1239 }else if(strcmp(codeset, "CP10001") == 0){
1240 input_f = SJIS_INPUT;
1241 #ifdef SHIFTJIS_CP932
1244 #ifdef UTF8_OUTPUT_ENABLE
1245 ms_ucs_map_f = UCS_MAP_CP10001;
1247 }else if(strcmp(codeset, "EUCJP") == 0 ||
1248 strcmp(codeset, "EUC-JP") == 0){
1249 input_f = EUC_INPUT;
1250 }else if(strcmp(codeset, "CP51932") == 0){
1251 input_f = EUC_INPUT;
1252 #ifdef SHIFTJIS_CP932
1255 #ifdef UTF8_OUTPUT_ENABLE
1256 ms_ucs_map_f = UCS_MAP_CP932;
1258 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1259 strcmp(codeset, "EUCJP-MS") == 0 ||
1260 strcmp(codeset, "EUCJPMS") == 0){
1261 input_f = EUC_INPUT;
1262 #ifdef SHIFTJIS_CP932
1265 #ifdef UTF8_OUTPUT_ENABLE
1266 ms_ucs_map_f = UCS_MAP_MS;
1268 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1269 strcmp(codeset, "EUCJP-ASCII") == 0){
1270 input_f = EUC_INPUT;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_ASCII;
1277 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1278 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1279 input_f = SJIS_INPUT;
1281 #ifdef SHIFTJIS_CP932
1284 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1285 strcmp(codeset, "EUC-JIS-2004") == 0){
1286 input_f = EUC_INPUT;
1288 #ifdef SHIFTJIS_CP932
1291 #ifdef UTF8_INPUT_ENABLE
1292 }else if(strcmp(codeset, "UTF-8") == 0 ||
1293 strcmp(codeset, "UTF-8N") == 0 ||
1294 strcmp(codeset, "UTF-8-BOM") == 0){
1295 input_f = UTF8_INPUT;
1296 #ifdef UNICODE_NORMALIZATION
1297 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1298 strcmp(codeset, "UTF-8-MAC") == 0){
1299 input_f = UTF8_INPUT;
1302 }else if(strcmp(codeset, "UTF-16") == 0 ||
1303 strcmp(codeset, "UTF-16BE") == 0 ||
1304 strcmp(codeset, "UTF-16BE-BOM") == 0){
1305 input_f = UTF16_INPUT;
1306 input_endian = ENDIAN_BIG;
1307 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1308 strcmp(codeset, "UTF-16LE-BOM") == 0){
1309 input_f = UTF16_INPUT;
1310 input_endian = ENDIAN_LITTLE;
1311 }else if(strcmp(codeset, "UTF-32") == 0 ||
1312 strcmp(codeset, "UTF-32BE") == 0 ||
1313 strcmp(codeset, "UTF-32BE-BOM") == 0){
1314 input_f = UTF32_INPUT;
1315 input_endian = ENDIAN_BIG;
1316 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1317 strcmp(codeset, "UTF-32LE-BOM") == 0){
1318 input_f = UTF32_INPUT;
1319 input_endian = ENDIAN_LITTLE;
1324 if (strcmp(long_option[i].name, "oc=") == 0){
1326 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1327 codeset[i] = nkf_toupper(p[i]);
1330 if(strcmp(codeset, "ISO-2022-JP") == 0){
1331 output_conv = j_oconv;
1332 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1333 output_conv = j_oconv;
1334 no_cp932ext_f = TRUE;
1335 #ifdef SHIFTJIS_CP932
1338 #ifdef UTF8_OUTPUT_ENABLE
1339 ms_ucs_map_f = UCS_MAP_CP932;
1341 }else if(strcmp(codeset, "CP50220") == 0){
1342 output_conv = j_oconv;
1344 #ifdef SHIFTJIS_CP932
1347 #ifdef UTF8_OUTPUT_ENABLE
1348 ms_ucs_map_f = UCS_MAP_CP932;
1350 }else if(strcmp(codeset, "CP50221") == 0){
1351 output_conv = j_oconv;
1352 #ifdef SHIFTJIS_CP932
1355 #ifdef UTF8_OUTPUT_ENABLE
1356 ms_ucs_map_f = UCS_MAP_CP932;
1358 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1359 output_conv = j_oconv;
1363 #ifdef SHIFTJIS_CP932
1366 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1367 output_conv = j_oconv;
1372 #ifdef SHIFTJIS_CP932
1375 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1376 output_conv = s_oconv;
1377 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1378 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1379 strcmp(codeset, "CP932") == 0 ||
1380 strcmp(codeset, "MS932") == 0){
1381 output_conv = s_oconv;
1382 #ifdef UTF8_OUTPUT_ENABLE
1383 ms_ucs_map_f = UCS_MAP_CP932;
1385 }else if(strcmp(codeset, "CP10001") == 0){
1386 output_conv = s_oconv;
1387 #ifdef UTF8_OUTPUT_ENABLE
1388 ms_ucs_map_f = UCS_MAP_CP10001;
1390 }else if(strcmp(codeset, "EUCJP") == 0 ||
1391 strcmp(codeset, "EUC-JP") == 0){
1392 output_conv = e_oconv;
1393 }else if(strcmp(codeset, "CP51932") == 0){
1394 output_conv = e_oconv;
1395 #ifdef SHIFTJIS_CP932
1398 #ifdef UTF8_OUTPUT_ENABLE
1399 ms_ucs_map_f = UCS_MAP_CP932;
1401 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1402 strcmp(codeset, "EUCJP-MS") == 0 ||
1403 strcmp(codeset, "EUCJPMS") == 0){
1404 output_conv = e_oconv;
1408 #ifdef UTF8_OUTPUT_ENABLE
1409 ms_ucs_map_f = UCS_MAP_MS;
1411 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1412 strcmp(codeset, "EUCJP-ASCII") == 0){
1413 output_conv = e_oconv;
1417 #ifdef UTF8_OUTPUT_ENABLE
1418 ms_ucs_map_f = UCS_MAP_ASCII;
1420 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1421 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1422 output_conv = s_oconv;
1424 #ifdef SHIFTJIS_CP932
1427 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1428 strcmp(codeset, "EUC-JIS-2004") == 0){
1429 output_conv = e_oconv;
1434 #ifdef SHIFTJIS_CP932
1437 #ifdef UTF8_OUTPUT_ENABLE
1438 }else if(strcmp(codeset, "UTF-8") == 0){
1439 output_conv = w_oconv;
1440 }else if(strcmp(codeset, "UTF-8N") == 0){
1441 output_conv = w_oconv;
1442 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1443 output_conv = w_oconv;
1444 output_bom_f = TRUE;
1445 }else if(strcmp(codeset, "UTF-16BE") == 0){
1446 output_conv = w_oconv16;
1447 }else if(strcmp(codeset, "UTF-16") == 0 ||
1448 strcmp(codeset, "UTF-16BE-BOM") == 0){
1449 output_conv = w_oconv16;
1450 output_bom_f = TRUE;
1451 }else if(strcmp(codeset, "UTF-16LE") == 0){
1452 output_conv = w_oconv16;
1453 output_endian = ENDIAN_LITTLE;
1454 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1455 output_conv = w_oconv16;
1456 output_endian = ENDIAN_LITTLE;
1457 output_bom_f = TRUE;
1458 }else if(strcmp(codeset, "UTF-32") == 0 ||
1459 strcmp(codeset, "UTF-32BE") == 0){
1460 output_conv = w_oconv32;
1461 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1462 output_conv = w_oconv32;
1463 output_bom_f = TRUE;
1464 }else if(strcmp(codeset, "UTF-32LE") == 0){
1465 output_conv = w_oconv32;
1466 output_endian = ENDIAN_LITTLE;
1467 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1468 output_conv = w_oconv32;
1469 output_endian = ENDIAN_LITTLE;
1470 output_bom_f = TRUE;
1476 if (strcmp(long_option[i].name, "overwrite") == 0){
1479 preserve_time_f = TRUE;
1482 if (strcmp(long_option[i].name, "overwrite=") == 0){
1485 preserve_time_f = TRUE;
1487 backup_suffix = malloc(strlen((char *) p) + 1);
1488 strcpy(backup_suffix, (char *) p);
1491 if (strcmp(long_option[i].name, "in-place") == 0){
1494 preserve_time_f = FALSE;
1497 if (strcmp(long_option[i].name, "in-place=") == 0){
1500 preserve_time_f = FALSE;
1502 backup_suffix = malloc(strlen((char *) p) + 1);
1503 strcpy(backup_suffix, (char *) p);
1508 if (strcmp(long_option[i].name, "cap-input") == 0){
1512 if (strcmp(long_option[i].name, "url-input") == 0){
1517 #ifdef NUMCHAR_OPTION
1518 if (strcmp(long_option[i].name, "numchar-input") == 0){
1524 if (strcmp(long_option[i].name, "no-output") == 0){
1528 if (strcmp(long_option[i].name, "debug") == 0){
1533 if (strcmp(long_option[i].name, "cp932") == 0){
1534 #ifdef SHIFTJIS_CP932
1538 #ifdef UTF8_OUTPUT_ENABLE
1539 ms_ucs_map_f = UCS_MAP_CP932;
1543 if (strcmp(long_option[i].name, "no-cp932") == 0){
1544 #ifdef SHIFTJIS_CP932
1548 #ifdef UTF8_OUTPUT_ENABLE
1549 ms_ucs_map_f = UCS_MAP_ASCII;
1553 #ifdef SHIFTJIS_CP932
1554 if (strcmp(long_option[i].name, "cp932inv") == 0){
1561 if (strcmp(long_option[i].name, "x0212") == 0){
1568 if (strcmp(long_option[i].name, "exec-in") == 0){
1572 if (strcmp(long_option[i].name, "exec-out") == 0){
1577 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1578 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1579 no_cp932ext_f = TRUE;
1582 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1583 no_best_fit_chars_f = TRUE;
1586 if (strcmp(long_option[i].name, "fb-skip") == 0){
1587 encode_fallback = NULL;
1590 if (strcmp(long_option[i].name, "fb-html") == 0){
1591 encode_fallback = encode_fallback_html;
1594 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1595 encode_fallback = encode_fallback_xml;
1598 if (strcmp(long_option[i].name, "fb-java") == 0){
1599 encode_fallback = encode_fallback_java;
1602 if (strcmp(long_option[i].name, "fb-perl") == 0){
1603 encode_fallback = encode_fallback_perl;
1606 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1607 encode_fallback = encode_fallback_subchar;
1610 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1611 encode_fallback = encode_fallback_subchar;
1612 unicode_subchar = 0;
1614 /* decimal number */
1615 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1616 unicode_subchar *= 10;
1617 unicode_subchar += hex2bin(p[i]);
1619 }else if(p[1] == 'x' || p[1] == 'X'){
1620 /* hexadecimal number */
1621 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1622 unicode_subchar <<= 4;
1623 unicode_subchar |= hex2bin(p[i]);
1627 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1628 unicode_subchar *= 8;
1629 unicode_subchar += hex2bin(p[i]);
1632 w16e_conv(unicode_subchar, &i, &j);
1633 unicode_subchar = i<<8 | j;
1637 #ifdef UTF8_OUTPUT_ENABLE
1638 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1639 ms_ucs_map_f = UCS_MAP_MS;
1643 #ifdef UNICODE_NORMALIZATION
1644 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1645 input_f = UTF8_INPUT;
1650 if (strcmp(long_option[i].name, "prefix=") == 0){
1651 if (nkf_isgraph(p[0])){
1652 for (i = 1; nkf_isgraph(p[i]); i++){
1653 prefix_table[p[i]] = p[0];
1660 case 'b': /* buffered mode */
1663 case 'u': /* non-buffered mode */
1666 case 't': /* transparent mode */
1671 } else if (*cp=='2') {
1675 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1683 case 'j': /* JIS output */
1685 output_conv = j_oconv;
1687 case 'e': /* AT&T EUC output */
1688 output_conv = e_oconv;
1691 case 's': /* SJIS output */
1692 output_conv = s_oconv;
1694 case 'l': /* ISO8859 Latin-1 support, no conversion */
1695 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1696 input_f = LATIN1_INPUT;
1698 case 'i': /* Kanji IN ESC-$-@/B */
1699 if (*cp=='@'||*cp=='B')
1700 kanji_intro = *cp++;
1702 case 'o': /* ASCII IN ESC-(-J/B */
1703 if (*cp=='J'||*cp=='B'||*cp=='H')
1704 ascii_intro = *cp++;
1708 bit:1 katakana->hiragana
1709 bit:2 hiragana->katakana
1711 if ('9'>= *cp && *cp>='0')
1712 hira_f |= (*cp++ -'0');
1719 #if defined(MSDOS) || defined(__OS2__)
1734 #ifdef UTF8_OUTPUT_ENABLE
1735 case 'w': /* UTF-8 output */
1737 output_conv = w_oconv; cp++;
1741 output_bom_f = TRUE;
1744 if ('1'== cp[0] && '6'==cp[1]) {
1745 output_conv = w_oconv16; cp+=2;
1746 } else if ('3'== cp[0] && '2'==cp[1]) {
1747 output_conv = w_oconv32; cp+=2;
1749 output_conv = w_oconv;
1754 output_endian = ENDIAN_LITTLE;
1755 } else if (cp[0] == 'B') {
1763 output_bom_f = TRUE;
1768 #ifdef UTF8_INPUT_ENABLE
1769 case 'W': /* UTF input */
1772 input_f = UTF8_INPUT;
1774 if ('1'== cp[0] && '6'==cp[1]) {
1776 input_f = UTF16_INPUT;
1777 input_endian = ENDIAN_BIG;
1778 } else if ('3'== cp[0] && '2'==cp[1]) {
1780 input_f = UTF32_INPUT;
1781 input_endian = ENDIAN_BIG;
1783 input_f = UTF8_INPUT;
1788 input_endian = ENDIAN_LITTLE;
1789 } else if (cp[0] == 'B') {
1795 /* Input code assumption */
1796 case 'J': /* JIS input */
1797 input_f = JIS_INPUT;
1799 case 'E': /* AT&T EUC input */
1800 input_f = EUC_INPUT;
1802 case 'S': /* MS Kanji input */
1803 input_f = SJIS_INPUT;
1804 if (x0201_f==NO_X0201) x0201_f=TRUE;
1806 case 'Z': /* Convert X0208 alphabet to ASCII */
1808 bit:0 Convert JIS X 0208 Alphabet to ASCII
1809 bit:1 Convert Kankaku to one space
1810 bit:2 Convert Kankaku to two spaces
1811 bit:3 Convert HTML Entity
1812 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1814 if ('9'>= *cp && *cp>='0')
1815 alpha_f |= 1<<(*cp++ -'0');
1819 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1820 x0201_f = FALSE; /* No X0201->X0208 conversion */
1822 ESC-(-I in JIS, EUC, MS Kanji
1823 SI/SO in JIS, EUC, MS Kanji
1824 SSO in EUC, JIS, not in MS Kanji
1825 MS Kanji (0xa0-0xdf)
1827 ESC-(-I in JIS (0x20-0x5f)
1828 SSO in EUC (0xa0-0xdf)
1829 0xa0-0xdf in MS Kanji (0xa0-0xdf)
1832 case 'X': /* Assume X0201 kana */
1833 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1836 case 'F': /* preserve new lines */
1837 fold_preserve_f = TRUE;
1838 case 'f': /* folding -f60 or -f */
1841 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1843 fold_len += *cp++ - '0';
1845 if (!(0<fold_len && fold_len<BUFSIZ))
1846 fold_len = DEFAULT_FOLD;
1850 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1852 fold_margin += *cp++ - '0';
1856 case 'm': /* MIME support */
1857 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1858 if (*cp=='B'||*cp=='Q') {
1859 mime_decode_mode = *cp++;
1860 mimebuf_f = FIXED_MIME;
1861 } else if (*cp=='N') {
1862 mime_f = TRUE; cp++;
1863 } else if (*cp=='S') {
1864 mime_f = STRICT_MIME; cp++;
1865 } else if (*cp=='0') {
1866 mime_decode_f = FALSE;
1867 mime_f = FALSE; cp++;
1870 case 'M': /* MIME output */
1873 mimeout_f = FIXED_MIME; cp++;
1874 } else if (*cp=='Q') {
1876 mimeout_f = FIXED_MIME; cp++;
1881 case 'B': /* Broken JIS support */
1883 bit:1 allow any x on ESC-(-x or ESC-$-x
1884 bit:2 reset to ascii on NL
1886 if ('9'>= *cp && *cp>='0')
1887 broken_f |= 1<<(*cp++ -'0');
1892 case 'O':/* for Output file */
1896 case 'c':/* add cr code */
1899 case 'd':/* delete cr code */
1902 case 'I': /* ISO-2022-JP output */
1905 case 'L': /* line mode */
1906 if (*cp=='u') { /* unix */
1907 crmode_f = NL; cp++;
1908 } else if (*cp=='m') { /* mac */
1909 crmode_f = CR; cp++;
1910 } else if (*cp=='w') { /* windows */
1911 crmode_f = CRLF; cp++;
1912 } else if (*cp=='0') { /* no conversion */
1922 /* multiple options in a string are allowed for the Perl module */
1923 while(*cp && *cp++!='-');
1926 /* bogus option but ignored */
1932 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1935 struct input_code *p = input_code_list;
1937 if (iconv_func == p->iconv_func){
1946 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1948 #ifdef INPUT_CODE_FIX
1956 #ifdef INPUT_CODE_FIX
1957 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1963 if (estab_f && iconv_for_check != iconv){
1964 struct input_code *p = find_inputcode_byfunc(iconv);
1966 set_input_codename(p->name);
1967 debug(input_codename);
1969 iconv_for_check = iconv;
1974 #define SCORE_L2 (1) /* JIS level-2 kanji */
1975 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1976 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1977 #ifdef SHIFTJIS_CP932
1978 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character reinterpreted via CP932 */
1979 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1981 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1983 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1984 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1986 #define SCORE_INIT (SCORE_iMIME)
1988 const nkf_char score_table_A0[] = {
1991 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1992 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1995 const nkf_char score_table_F0[] = {
1996 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1997 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1998 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1999 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2002 void set_code_score(struct input_code *ptr, nkf_char score)
2005 ptr->score |= score;
2009 void clr_code_score(struct input_code *ptr, nkf_char score)
2012 ptr->score &= ~score;
2016 void code_score(struct input_code *ptr)
2018 nkf_char c2 = ptr->buf[0];
2019 #ifdef UTF8_OUTPUT_ENABLE
2020 nkf_char c1 = ptr->buf[1];
2023 set_code_score(ptr, SCORE_ERROR);
2024 }else if (c2 == SSO){
2025 set_code_score(ptr, SCORE_KANA);
2026 #ifdef UTF8_OUTPUT_ENABLE
2027 }else if (!e2w_conv(c2, c1)){
2028 set_code_score(ptr, SCORE_NO_EXIST);
2030 }else if ((c2 & 0x70) == 0x20){
2031 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2032 }else if ((c2 & 0x70) == 0x70){
2033 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2034 }else if ((c2 & 0x70) >= 0x50){
2035 set_code_score(ptr, SCORE_L2);
2039 void status_disable(struct input_code *ptr)
2044 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2047 void status_push_ch(struct input_code *ptr, nkf_char c)
2049 ptr->buf[ptr->index++] = c;
2052 void status_clear(struct input_code *ptr)
2058 void status_reset(struct input_code *ptr)
2061 ptr->score = SCORE_INIT;
2064 void status_reinit(struct input_code *ptr)
2067 ptr->_file_stat = 0;
2070 void status_check(struct input_code *ptr, nkf_char c)
2072 if (c <= DEL && estab_f){
2077 void s_status(struct input_code *ptr, nkf_char c)
2081 status_check(ptr, c);
2086 #ifdef NUMCHAR_OPTION
2087 }else if (is_unicode_capsule(c)){
2090 }else if (0xa1 <= c && c <= 0xdf){
2091 status_push_ch(ptr, SSO);
2092 status_push_ch(ptr, c);
2095 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2097 status_push_ch(ptr, c);
2098 #ifdef SHIFTJIS_CP932
2100 && is_ibmext_in_sjis(c)){
2102 status_push_ch(ptr, c);
2103 #endif /* SHIFTJIS_CP932 */
2105 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2107 status_push_ch(ptr, c);
2108 #endif /* X0212_ENABLE */
2110 status_disable(ptr);
2114 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2115 status_push_ch(ptr, c);
2116 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2120 status_disable(ptr);
2124 #ifdef SHIFTJIS_CP932
2125 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2126 status_push_ch(ptr, c);
2127 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2128 set_code_score(ptr, SCORE_CP932);
2133 #endif /* SHIFTJIS_CP932 */
2134 #ifndef X0212_ENABLE
2135 status_disable(ptr);
2141 void e_status(struct input_code *ptr, nkf_char c)
2145 status_check(ptr, c);
2150 #ifdef NUMCHAR_OPTION
2151 }else if (is_unicode_capsule(c)){
2154 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2156 status_push_ch(ptr, c);
2158 }else if (0x8f == c){
2160 status_push_ch(ptr, c);
2161 #endif /* X0212_ENABLE */
2163 status_disable(ptr);
2167 if (0xa1 <= c && c <= 0xfe){
2168 status_push_ch(ptr, c);
2172 status_disable(ptr);
2177 if (0xa1 <= c && c <= 0xfe){
2179 status_push_ch(ptr, c);
2181 status_disable(ptr);
2183 #endif /* X0212_ENABLE */
2187 #ifdef UTF8_INPUT_ENABLE
2188 void w_status(struct input_code *ptr, nkf_char c)
2192 status_check(ptr, c);
2197 #ifdef NUMCHAR_OPTION
2198 }else if (is_unicode_capsule(c)){
2201 }else if (0xc0 <= c && c <= 0xdf){
2203 status_push_ch(ptr, c);
2204 }else if (0xe0 <= c && c <= 0xef){
2206 status_push_ch(ptr, c);
2207 }else if (0xf0 <= c && c <= 0xf4){
2209 status_push_ch(ptr, c);
2211 status_disable(ptr);
2216 if (0x80 <= c && c <= 0xbf){
2217 status_push_ch(ptr, c);
2218 if (ptr->index > ptr->stat){
2219 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2220 && ptr->buf[2] == 0xbf);
2221 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2222 &ptr->buf[0], &ptr->buf[1]);
2229 status_disable(ptr);
2233 if (0x80 <= c && c <= 0xbf){
2234 if (ptr->index < ptr->stat){
2235 status_push_ch(ptr, c);
2240 status_disable(ptr);
2247 void code_status(nkf_char c)
2249 int action_flag = 1;
2250 struct input_code *result = 0;
2251 struct input_code *p = input_code_list;
2253 if (!p->status_func) {
2257 if (!p->status_func)
2259 (p->status_func)(p, c);
2262 }else if(p->stat == 0){
2273 if (result && !estab_f){
2274 set_iconv(TRUE, result->iconv_func);
2275 }else if (c <= DEL){
2276 struct input_code *ptr = input_code_list;
2286 nkf_char std_getc(FILE *f)
2289 return std_gc_buf[--std_gc_ndx];
2295 nkf_char std_ungetc(nkf_char c, FILE *f)
2297 if (std_gc_ndx == STD_GC_BUFSIZE){
2300 std_gc_buf[std_gc_ndx++] = c;
2305 void std_putc(nkf_char c)
2312 #if !defined(PERL_XS) && !defined(WIN32DLL)
2313 nkf_char noconvert(FILE *f)
2318 module_connection();
2319 while ((c = (*i_getc)(f)) != EOF)
2326 void module_connection(void)
2328 oconv = output_conv;
2331 /* replace continuation module, from output side */
2333 /* output redirection */
2335 if (noout_f || guess_f){
2342 if (mimeout_f == TRUE) {
2343 o_base64conv = oconv; oconv = base64_conv;
2345 /* base64_count = 0; */
2349 o_crconv = oconv; oconv = cr_conv;
2352 o_rot_conv = oconv; oconv = rot_conv;
2355 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2358 o_hira_conv = oconv; oconv = hira_conv;
2361 o_fconv = oconv; oconv = fold_conv;
2364 if (alpha_f || x0201_f) {
2365 o_zconv = oconv; oconv = z_conv;
2369 i_ungetc = std_ungetc;
2370 /* input redirection */
2373 i_cgetc = i_getc; i_getc = cap_getc;
2374 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2377 i_ugetc = i_getc; i_getc = url_getc;
2378 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2381 #ifdef NUMCHAR_OPTION
2383 i_ngetc = i_getc; i_getc = numchar_getc;
2384 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2387 #ifdef UNICODE_NORMALIZATION
2388 if (nfc_f && input_f == UTF8_INPUT){
2389 i_nfc_getc = i_getc; i_getc = nfc_getc;
2390 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2393 if (mime_f && mimebuf_f==FIXED_MIME) {
2394 i_mgetc = i_getc; i_getc = mime_getc;
2395 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2398 i_bgetc = i_getc; i_getc = broken_getc;
2399 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2401 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2402 set_iconv(-TRUE, e_iconv);
2403 } else if (input_f == SJIS_INPUT) {
2404 set_iconv(-TRUE, s_iconv);
2405 #ifdef UTF8_INPUT_ENABLE
2406 } else if (input_f == UTF8_INPUT) {
2407 set_iconv(-TRUE, w_iconv);
2408 } else if (input_f == UTF16_INPUT) {
2409 set_iconv(-TRUE, w_iconv16);
2410 } else if (input_f == UTF32_INPUT) {
2411 set_iconv(-TRUE, w_iconv32);
2414 set_iconv(FALSE, e_iconv);
2418 struct input_code *p = input_code_list;
2426 * Check and Ignore BOM
2428 void check_bom(FILE *f)
2431 switch(c2 = (*i_getc)(f)){
2433 if((c2 = (*i_getc)(f)) == 0x00){
2434 if((c2 = (*i_getc)(f)) == 0xFE){
2435 if((c2 = (*i_getc)(f)) == 0xFF){
2437 set_iconv(TRUE, w_iconv32);
2439 if (iconv == w_iconv32) {
2440 input_endian = ENDIAN_BIG;
2443 (*i_ungetc)(0xFF,f);
2444 }else (*i_ungetc)(c2,f);
2445 (*i_ungetc)(0xFE,f);
2446 }else if(c2 == 0xFF){
2447 if((c2 = (*i_getc)(f)) == 0xFE){
2449 set_iconv(TRUE, w_iconv32);
2451 if (iconv == w_iconv32) {
2452 input_endian = ENDIAN_2143;
2455 (*i_ungetc)(0xFF,f);
2456 }else (*i_ungetc)(c2,f);
2457 (*i_ungetc)(0xFF,f);
2458 }else (*i_ungetc)(c2,f);
2459 (*i_ungetc)(0x00,f);
2460 }else (*i_ungetc)(c2,f);
2461 (*i_ungetc)(0x00,f);
2464 if((c2 = (*i_getc)(f)) == 0xBB){
2465 if((c2 = (*i_getc)(f)) == 0xBF){
2467 set_iconv(TRUE, w_iconv);
2469 if (iconv == w_iconv) {
2472 (*i_ungetc)(0xBF,f);
2473 }else (*i_ungetc)(c2,f);
2474 (*i_ungetc)(0xBB,f);
2475 }else (*i_ungetc)(c2,f);
2476 (*i_ungetc)(0xEF,f);
2479 if((c2 = (*i_getc)(f)) == 0xFF){
2480 if((c2 = (*i_getc)(f)) == 0x00){
2481 if((c2 = (*i_getc)(f)) == 0x00){
2483 set_iconv(TRUE, w_iconv32);
2485 if (iconv == w_iconv32) {
2486 input_endian = ENDIAN_3412;
2489 (*i_ungetc)(0x00,f);
2490 }else (*i_ungetc)(c2,f);
2491 (*i_ungetc)(0x00,f);
2492 }else (*i_ungetc)(c2,f);
2494 set_iconv(TRUE, w_iconv16);
2496 if (iconv == w_iconv16) {
2497 input_endian = ENDIAN_BIG;
2500 (*i_ungetc)(0xFF,f);
2501 }else (*i_ungetc)(c2,f);
2502 (*i_ungetc)(0xFE,f);
2505 if((c2 = (*i_getc)(f)) == 0xFE){
2506 if((c2 = (*i_getc)(f)) == 0x00){
2507 if((c2 = (*i_getc)(f)) == 0x00){
2509 set_iconv(TRUE, w_iconv32);
2511 if (iconv == w_iconv32) {
2512 input_endian = ENDIAN_LITTLE;
2515 (*i_ungetc)(0x00,f);
2516 }else (*i_ungetc)(c2,f);
2517 (*i_ungetc)(0x00,f);
2518 }else (*i_ungetc)(c2,f);
2520 set_iconv(TRUE, w_iconv16);
2522 if (iconv == w_iconv16) {
2523 input_endian = ENDIAN_LITTLE;
2526 (*i_ungetc)(0xFE,f);
2527 }else (*i_ungetc)(c2,f);
2528 (*i_ungetc)(0xFF,f);
2537 Conversion main loop. Code detection only.
2540 nkf_char kanji_convert(FILE *f)
2542 nkf_char c3, c2=0, c1, c0=0;
2543 int is_8bit = FALSE;
2545 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2546 #ifdef UTF8_INPUT_ENABLE
2547 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2554 output_mode = ASCII;
2557 #define NEXT continue /* no output, get next */
2558 #define SEND ; /* output c1 and c2, get next */
2559 #define LAST break /* end of loop, go closing */
2561 module_connection();
2564 while ((c1 = (*i_getc)(f)) != EOF) {
2565 #ifdef INPUT_CODE_FIX
2571 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2572 /* in case of 8th bit is on */
2573 if (!estab_f&&!mime_decode_mode) {
2574 /* in case of not established yet */
2575 /* It is still ambiguous */
2576 if (h_conv(f, c2, c1)==EOF)
2582 /* in case of already established */
2584 /* ignore bogus code and not CP5022x UCD */
2592 /* second byte, 7 bit code */
2593 /* it might be kanji shifted */
2594 if ((c1 == DEL) || (c1 <= SPACE)) {
2595 /* ignore bogus first code */
2602 #ifdef UTF8_INPUT_ENABLE
2603 if (iconv == w_iconv16) {
2604 if (input_endian == ENDIAN_BIG) {
2606 if ((c1 = (*i_getc)(f)) != EOF) {
2607 if (0xD8 <= c2 && c2 <= 0xDB) {
2608 if ((c0 = (*i_getc)(f)) != EOF) {
2610 if ((c3 = (*i_getc)(f)) != EOF) {
2617 if ((c2 = (*i_getc)(f)) != EOF) {
2618 if (0xD8 <= c2 && c2 <= 0xDB) {
2619 if ((c3 = (*i_getc)(f)) != EOF) {
2620 if ((c0 = (*i_getc)(f)) != EOF) {
2629 } else if(iconv == w_iconv32){
2631 if((c2 = (*i_getc)(f)) != EOF &&
2632 (c1 = (*i_getc)(f)) != EOF &&
2633 (c0 = (*i_getc)(f)) != EOF){
2634 switch(input_endian){
2636 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2639 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2642 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2645 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2655 #ifdef NUMCHAR_OPTION
2656 if (is_unicode_capsule(c1)){
2660 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2662 if (!estab_f && !iso8859_f) {
2663 /* not established yet */
2666 } else { /* estab_f==TRUE */
2671 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2672 /* SJIS X0201 Case... */
2673 if(iso2022jp_f && x0201_f==NO_X0201) {
2674 (*oconv)(GETA1, GETA2);
2681 } else if (c1==SSO && iconv != s_iconv) {
2682 /* EUC X0201 Case */
2683 c1 = (*i_getc)(f); /* skip SSO */
2685 if (SSP<=c1 && c1<0xe0) {
2686 if(iso2022jp_f && x0201_f==NO_X0201) {
2687 (*oconv)(GETA1, GETA2);
2694 } else { /* bogus code, skip SSO and one byte */
2697 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2698 (c1 == 0xFD || c1 == 0xFE)) {
2704 /* already established */
2709 } else if ((c1 > SPACE) && (c1 != DEL)) {
2710 /* in case of Roman characters */
2712 /* output 1 shifted byte */
2716 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2717 /* output 1 shifted byte */
2718 if(iso2022jp_f && x0201_f==NO_X0201) {
2719 (*oconv)(GETA1, GETA2);
2726 /* look like bogus code */
2729 } else if (input_mode == X0208 || input_mode == X0212 ||
2730 input_mode == X0213_1 || input_mode == X0213_2) {
2731 /* in case of Kanji shifted */
2734 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2735 /* Check MIME code */
2736 if ((c1 = (*i_getc)(f)) == EOF) {
2739 } else if (c1 == '?') {
2740 /* =? is mime conversion start sequence */
2741 if(mime_f == STRICT_MIME) {
2742 /* check in real detail */
2743 if (mime_begin_strict(f) == EOF)
2747 } else if (mime_begin(f) == EOF)
2757 /* normal ASCII code */
2760 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
\r
2763 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
\r
2766 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
\r
2767 if ((c1 = (*i_getc)(f)) == EOF) {
2768 /* (*oconv)(0, ESC); don't send bogus code */
2770 } else if (c1 == '$') {
2771 if ((c1 = (*i_getc)(f)) == EOF) {
2773 (*oconv)(0, ESC); don't send bogus code
2774 (*oconv)(0, '$'); */
2776 } else if (c1 == '@'|| c1 == 'B') {
2777 /* This is kanji introduction */
2780 set_input_codename("ISO-2022-JP");
2782 debug(input_codename);
2785 } else if (c1 == '(') {
2786 if ((c1 = (*i_getc)(f)) == EOF) {
2787 /* don't send bogus code
2793 } else if (c1 == '@'|| c1 == 'B') {
2794 /* This is kanji introduction */
2799 } else if (c1 == 'D'){
2803 #endif /* X0212_ENABLE */
2804 } else if (c1 == (X0213_1&0x7F)){
2805 input_mode = X0213_1;
2808 } else if (c1 == (X0213_2&0x7F)){
2809 input_mode = X0213_2;
2813 /* could be some special code */
2820 } else if (broken_f&0x2) {
2821 /* accept any ESC-(-x as broken code ... */
2831 } else if (c1 == '(') {
2832 if ((c1 = (*i_getc)(f)) == EOF) {
2833 /* don't send bogus code
2835 (*oconv)(0, '('); */
2839 /* This is X0201 kana introduction */
2840 input_mode = X0201; shift_mode = X0201;
2842 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2843 /* This is ASCII / JIS X 0201 Roman introduction (back to single-byte mode) */
2844 input_mode = ASCII; shift_mode = FALSE;
2846 } else if (broken_f&0x2) {
2847 input_mode = ASCII; shift_mode = FALSE;
2852 /* maintain various input_mode here */
2856 } else if ( c1 == 'N' || c1 == 'n' ){
2858 c3 = (*i_getc)(f); /* skip SS2 */
2859 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2874 } else if (c1 == ESC && iconv == s_iconv) {
2875 /* ESC in Shift_JIS */
2876 if ((c1 = (*i_getc)(f)) == EOF) {
2877 /* (*oconv)(0, ESC); don't send bogus code */
2879 } else if (c1 == '$') {
2881 if ((c1 = (*i_getc)(f)) == EOF) {
2883 (*oconv)(0, ESC); don't send bogus code
2884 (*oconv)(0, '$'); */
2887 if (('E' <= c1 && c1 <= 'G') ||
2888 ('O' <= c1 && c1 <= 'Q')) {
2896 static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2897 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
2898 while ((c1 = (*i_getc)(f)) != EOF) {
2899 if (SPACE <= c1 && c1 <= 'z') {
2900 (*oconv)(0, c1 + c0);
2901 } else break; /* c1 == SO */
2905 if (c1 == EOF) LAST;
2912 } else if (c1 == NL || c1 == CR) {
2914 input_mode = ASCII; set_iconv(FALSE, 0);
2916 } else if (mime_decode_f && !mime_decode_mode){
2918 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2926 } else { /* if (c1 == CR)*/
2927 if ((c1=(*i_getc)(f))!=EOF) {
2931 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2946 if (prev_cr && c1 == NL) crmode_f = CRLF;
2949 } else if (c1 == DEL && input_mode == X0208 ) {
2959 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2962 if ((c0 = (*i_getc)(f)) != EOF) {
2965 if ((c3 = (*i_getc)(f)) != EOF) {
2967 (*iconv)(c2, c1, c0|c3);
2972 /* 3 bytes EUC or UTF-8 */
2973 if ((c0 = (*i_getc)(f)) != EOF) {
2975 (*iconv)(c2, c1, c0);
2983 0x7F <= c2 && c2 <= 0x92 &&
2984 0x21 <= c1 && c1 <= 0x7E) {
2986 if(c1 == 0x7F) return 0;
2987 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2990 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2994 (*oconv)(PREFIX_EUCG3 | c2, c1);
2996 #endif /* X0212_ENABLE */
2998 (*oconv)(PREFIX_EUCG3 | c2, c1);
3001 (*oconv)(input_mode, c1); /* other special case */
3007 /* goto next_word */
3011 (*iconv)(EOF, 0, 0);
3012 if (!is_inputcode_set)
3015 struct input_code *p = input_code_list;
3016 struct input_code *result = p;
3018 if (p->score < result->score) result = p;
3021 set_input_codename(result->name);
3028 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3030 nkf_char ret, c3, c0;
3034 /** it must NOT be in the kanji shifted sequence */
3035 /** it must NOT be written in JIS7 */
3036 /** and it must be after 2 byte 8bit code */
3042 while ((c1 = (*i_getc)(f)) != EOF) {
3048 if (push_hold_buf(c1) == EOF || estab_f){
3054 struct input_code *p = input_code_list;
3055 struct input_code *result = p;
3060 if (p->status_func && p->score < result->score){
3065 set_iconv(TRUE, result->iconv_func);
3070 ** 1) EOF is detected, or
3071 ** 2) Code is established, or
3072 ** 3) Buffer is FULL (but last word is pushed)
3074 ** in 1) and 3) cases, we continue to use
3075 ** Kanji codes by oconv and leave estab_f unchanged.
3080 while (hold_index < hold_count){
3081 c2 = hold_buf[hold_index++];
3083 #ifdef NUMCHAR_OPTION
3084 || is_unicode_capsule(c2)
3089 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3090 (*iconv)(X0201, c2, 0);
3093 if (hold_index < hold_count){
3094 c1 = hold_buf[hold_index++];
3104 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3107 if (hold_index < hold_count){
3108 c0 = hold_buf[hold_index++];
3109 } else if ((c0 = (*i_getc)(f)) == EOF) {
3115 if (hold_index < hold_count){
3116 c3 = hold_buf[hold_index++];
3117 } else if ((c3 = (*i_getc)(f)) == EOF) {
3122 (*iconv)(c2, c1, c0|c3);
3127 /* 3 bytes EUC or UTF-8 */
3128 if (hold_index < hold_count){
3129 c0 = hold_buf[hold_index++];
3130 } else if ((c0 = (*i_getc)(f)) == EOF) {
3136 (*iconv)(c2, c1, c0);
3139 if (c0 == EOF) break;
3144 nkf_char push_hold_buf(nkf_char c2)
3146 if (hold_count >= HOLD_SIZE*2)
3148 hold_buf[hold_count++] = (unsigned char)c2;
3149 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert a Shift_JIS double-byte code (c2 = lead byte,
 * c1 = trail byte) to the two-byte form used by the rest of the converter.
 *
 * Steps visible here, in order:
 *   - SHIFTJIS_CP932 builds: table lookups through shiftjis_cp932 and
 *     cp932inv for the IBM-extension and CP932INV lead-byte ranges;
 *   - when x0213_f is off, IBM-extension lead bytes are looked up in
 *     shiftjis_x0212 and the row is tagged with PREFIX_EUCG3;
 *   - when x0213_f is on, lead bytes >= 0xF0 are mapped to a
 *     PREFIX_EUCG3-tagged row;
 *   - otherwise the generic arithmetic mapping is applied.
 *
 * NOTE(review): the stores through p2/p1 and the return statements are not
 * part of this listing.  Presumably the result goes out through p2/p1 and a
 * non-zero return means failure (s_iconv returns early on non-zero) -- confirm.
 */
3152 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3154 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* Indexed below as [c2 - 0xF0][c1 > 0x9E]. */
3157 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3158 #ifdef SHIFTJIS_CP932
3159 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3160 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3167 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3168 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3174 #endif /* SHIFTJIS_CP932 */
3176 if (!x0213_f && is_ibmext_in_sjis(c2)){
3177 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
/* High byte of the table value becomes the row, tagged with PREFIX_EUCG3. */
3180 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3193 if(x0213_f && c2 >= 0xF0){
3194 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3195 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3196 }else{ /* 78<=k<=94 */
3197 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3198 if (0x9E < c1) c2++;
/*
 * Generic mapping: every lead byte covers two rows; the second row is
 * selected when the trail byte is above 0x9E.
 */
3201 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3202 if (0x9E < c1) c2++;
/* Rebase the trail byte; the offset depends on whether it is above DEL. */
3205 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3212 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter for Shift_JIS (c2 = lead byte, c1 = trail byte;
 * c0 is not used in the lines shown).
 *
 * EOF, 0 and values of c2 below SPACE take their own branch.  When x0213_f
 * is off, lead bytes 0xF0-0xF9 with trail bytes 0x40-0xFC are mapped to a
 * value starting at 0xE000 and tagged with CLASS_UNICODE.  Everything else
 * goes through s2e_conv(); a non-zero result from it is returned unchanged.
 */
3219 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3223 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3225 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
/* Trail byte 0x7F: return 0 without converting. */
3227 if(c1 == 0x7F) return 0;
/*
 * 188 cells per lead byte (0x40-0xFC without 0x7F); trail bytes above 0x7E
 * are shifted down by one to close the gap left by 0x7F.
 */
3228 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3231 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3232 if (ret) return ret;
/*
 * e_iconv: input converter for EUC-JP (c2, c1 and, for three-byte codes
 * introduced by 0x8f, c0).
 *
 * Branches visible here:
 *   - c2 == 0x8f: three-byte code.  Rows 0xF5-0xFE are mapped to
 *     CLASS_UNICODE values starting at 0xE3AC when neither cp51932_f nor
 *     x0213_f is set; otherwise c2 and c1 are packed into one value and,
 *     in SHIFTJIS_CP932 builds, round-tripped through e2s_conv/s2e_conv.
 *   - c2 == SSO, and EOF / 0 / control values: separate branches whose
 *     bodies are not part of this listing.
 *   - two-byte codes: rows 0xF5-0xFE are mapped to CLASS_UNICODE values
 *     starting at 0xE000 when cp51932_f is off and ms_ucs_map_f is set;
 *     with cp51932_f, rows 0x79-0x7c are round-tripped through
 *     e2s_conv/s2e_conv.
 */
3238 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3243 }else if (c2 == 0x8f){
3247 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3248 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/* 0xE3AC == 0xE000 + 10*94: continues right after the ten two-byte rows. */
3249 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
/* Pack the 0x8f prefix and the 7-bit row into a single value. */
3252 c2 = (c2 << 8) | (c1 & 0x7f);
3254 #ifdef SHIFTJIS_CP932
3257 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3258 s2e_conv(s2, s1, &c2, &c1);
3265 #endif /* SHIFTJIS_CP932 */
3267 #endif /* X0212_ENABLE */
3268 } else if (c2 == SSO){
3271 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3274 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3275 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/* 94 cells per row, ten rows (0xF5-0xFE). */
3276 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3281 #ifdef SHIFTJIS_CP932
3282 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3284 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3285 s2e_conv(s2, s1, &c2, &c1);
3292 #endif /* SHIFTJIS_CP932 */
3299 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert a UTF-8 sequence (c2 = first byte, c1 = second,
 * c0 = remaining byte(s)) to a two-byte code returned through p2/p1.
 *
 * First bytes 0xc0-0xef are handed to unicode_to_jis_common().  In
 * NUMCHAR_OPTION builds *p1 can instead receive the decoded code point
 * tagged with CLASS_UNICODE; the condition for that is not part of this
 * listing (presumably when the table lookup fails -- confirm).
 */
3300 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3307 }else if (0xc0 <= c2 && c2 <= 0xef) {
3308 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3309 #ifdef NUMCHAR_OPTION
3312 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input converter for UTF-8 (c2 = first byte, c1 = second byte,
 * c0 = remaining byte(s); for four-byte sequences c0 carries two bytes,
 * as the 0xc0c0 / 0x8080 masks below show).
 *
 * The first byte is classified through w_iconv_utf8_1st_byte and the
 * continuation bytes are range-checked.  While c0 is still 0 the function
 * returns -1 or -2; presumably that tells the caller how many more bytes to
 * supply -- confirm against the caller.  Four-byte sequences are decoded
 * with ww16_conv() and tagged with CLASS_UNICODE; the rest go to w2e_conv().
 */
3320 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Class of a first byte, indexed by (c2 - 0xC0): 20/21 for 0xC0-0xDF,
 * 30-33 for 0xE0-0xEF, 40-43 for 0xF0-0xF7, 50/60/70 for 0xF8-0xFF.
 * Only the last four rows of the initializer are part of this listing.
 */
3323 static const int w_iconv_utf8_1st_byte[] =
3325 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3326 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3327 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3328 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3330 if (c2 < 0 || 0xff < c2) {
3331 }else if (c2 == 0) { /* 0 : 1 byte*/
3333 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
/*
 * The case labels of this switch are not part of this listing; each group
 * of checks below belongs to one first-byte class and restricts the
 * second byte to the range allowed for that class.
 */
3336 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3338 if (c1 < 0x80 || 0xBF < c1) return 0;
3341 if (c0 == 0) return -1;
3342 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3347 if (c0 == 0) return -1;
3348 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3352 if (c0 == 0) return -1;
3353 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3357 if (c0 == 0) return -2;
3358 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3362 if (c0 == 0) return -2;
3363 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3367 if (c0 == 0) return -2;
3368 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3376 if (c2 == 0 || c2 == EOF){
3377 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3378 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3381 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3390 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point val as UTF-8.
 *
 * The first byte goes to *p2, the second to *p1 and the rest to *p0.  For a
 * four-byte sequence *p0 carries the third and fourth byte packed as
 * (third << 8) | fourth.  The branch for the smallest values and the
 * final else are not part of this listing.
 */
3391 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3398 }else if (val < 0x800){
3399 *p2 = 0xc0 | (val >> 6);
3400 *p1 = 0x80 | (val & 0x3f);
3402 } else if (val <= NKF_INT32_C(0xFFFF)) {
3403 *p2 = 0xe0 | (val >> 12);
3404 *p1 = 0x80 | ((val >> 6) & 0x3f);
3405 *p0 = 0x80 | (val & 0x3f);
3406 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/*
 * NOTE(review): this first byte does not match the other four-byte code in
 * this file: w_oconv emits 0xF0 | (val>>18) and ww16_conv decodes
 * (c2 & 0x0f) << 18.  0xe0 | (val >> 16) looks like a defect -- confirm and
 * fix together with the callers that rely on it.
 */
3407 *p2 = 0xe0 | (val >> 16);
3408 *p1 = 0x80 | ((val >> 12) & 0x3f);
/* Third byte in bits 8-15, fourth byte in bits 0-7. */
3409 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3418 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode a UTF-8 sequence into a code point value (val).
 *
 * c2 is the first byte and c1 the second; c0 holds the third byte, or for
 * four-byte sequences the third and fourth byte packed as
 * (third << 8) | fourth.  The branch before the first one shown, the
 * low-order bits of each case and the return are not part of this listing.
 */
3419 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3424 } else if (c2 >= 0xf0){
3425 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3426 val = (c2 & 0x0f) << 18;
3427 val |= (c1 & 0x3f) << 12;
/* Payload of the third byte sits in bits 8-13 of c0; move it to bits 6-11. */
3428 val |= (c0 & 0x3f00) >> 2;
3430 }else if (c2 >= 0xe0){
3431 val = (c2 & 0x0f) << 12;
3432 val |= (c1 & 0x3f) << 6;
3434 }else if (c2 >= 0xc0){
3435 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert the code point val to a two-byte code returned
 * through p2/p1.
 *
 * The value is first re-encoded as UTF-8 with w16w_conv() and then looked
 * up with unicode_to_jis_common().  In NUMCHAR_OPTION builds *p1 can be set
 * to the code point tagged with CLASS_UNICODE instead; the condition is not
 * part of this listing (presumably a failed lookup -- confirm).
 */
3443 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3445 nkf_char c2, c1, c0;
3452 w16w_conv(val, &c2, &c1, &c0);
3453 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3454 #ifdef NUMCHAR_OPTION
3457 *p1 = CLASS_UNICODE | val;
3466 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input converter for UTF-16.  c2 is the high byte and c1 the
 * low byte of a 16-bit unit; for a surrogate pair c0 carries the whole low
 * surrogate.
 *
 * Units below 0x80 and EOF take their own branch.  A high surrogate
 * (c2 in 0xD8-0xDB) followed by a valid low surrogate is combined into one
 * code point tagged with CLASS_UNICODE.  Any other unit is converted with
 * w16e_conv(); a non-zero result is returned unchanged.
 */
3467 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3470 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3473 }else if (0xD8 <= c2 && c2 <= 0xDB) {
/* The second unit must be a low surrogate. */
3474 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/*
 * (c2 << 18) + (c1 << 10) is the high surrogate shifted left by 10;
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000.
 */
3476 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3478 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3483 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3484 if (ret) return ret;
/*
 * w_iconv32: input converter for UTF-32; c1 carries the code point.
 *
 * Values below 0x80 (with c2 == 0) and EOF take their own branch.  Code
 * points for which is_unicode_bmp() holds are converted with w16e_conv();
 * the remaining ones are tagged with CLASS_UNICODE.  A non-zero ret is
 * returned unchanged.
 */
3489 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3493 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3494 } else if (is_unicode_bmp(c1)) {
3495 ret = w16e_conv(c1, &c2, &c1);
3498 c1 = CLASS_UNICODE | c1;
3500 if (ret) return ret;
/*
 * unicode_to_jis_common: map a UTF-8 sequence (c2 = first byte, c1 = second,
 * c0 = third) to a two-byte code returned through p2/p1.
 *
 * Two-byte sequences (c2 < 0xe0) are looked up with
 * w_iconv_common(c2, c1, ...) in the utf8_to_euc_2bytes* table selected by
 * ms_ucs_map_f.  The branch guarded by c0 < 0xF0 uses
 * w_iconv_common(c1, c0, ppp[c2 - 0xE0], ...) with the utf8_to_euc_3bytes*
 * tables.  When no_best_fit_chars_f is set, selected sequences are rejected
 * up front with return value 1.  In SHIFTJIS_CP932 builds a successful
 * PREFIX_EUCG3 result is round-tripped through e2s_conv/s2e_conv unless
 * cp932inv_f is set.
 */
3505 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3507 const unsigned short *const *pp;
3508 const unsigned short *const *const *ppp;
/*
 * Reject tables for the no_best_fit_chars_f checks.  Each has 64 entries
 * and is indexed by (c1 & 0x3F); a non-zero entry makes the function
 * return 1.
 */
3509 static const int no_best_fit_chars_table_C2[] =
3510 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3511 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3512 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3513 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3514 static const int no_best_fit_chars_table_C2_ms[] =
3515 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3517 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3518 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3519 static const int no_best_fit_chars_table_932_C2[] =
3520 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3523 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3524 static const int no_best_fit_chars_table_932_C3[] =
3525 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3526 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3527 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3528 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte UTF-8 sequences. */
3534 }else if(c2 < 0xe0){
3535 if(no_best_fit_chars_f){
3536 if(ms_ucs_map_f == UCS_MAP_CP932){
3539 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3542 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3545 }else if(!cp932inv_f){
3548 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3551 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3554 }else if(ms_ucs_map_f == UCS_MAP_MS){
3555 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3556 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* Table selection by mapping flavour; the default arm is not shown here. */
3574 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3575 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3576 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3578 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * The lookup in this branch uses c2, c1 and c0, i.e. a three-byte
 * sequence; note that the guard tests c0 rather than c2.
 */
3579 }else if(c0 < 0xF0){
3580 if(no_best_fit_chars_f){
3581 if(ms_ucs_map_f == UCS_MAP_CP932){
3582 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3583 }else if(ms_ucs_map_f == UCS_MAP_MS){
/*
 * The switch/case lines selecting c2 and c1 for the checks below are not
 * part of this listing; only the final test on c0 is visible.
 */
3588 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3591 if(c0 == 0x92) return 1;
3596 if(c1 == 0x80 || c0 == 0x9C) return 1;
3599 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3604 if(c0 == 0x94) return 1;
3607 if(c0 == 0xBB) return 1;
3617 if(c0 == 0x95) return 1;
3620 if(c0 == 0xA5) return 1;
3627 if(c0 == 0x8D) return 1;
3630 if(c0 == 0x9E && !cp932inv_f) return 1;
3633 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3641 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3642 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3643 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
/* One second-level table per first byte 0xE0..; c1 and c0 index into it. */
3645 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3647 #ifdef SHIFTJIS_CP932
3648 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3650 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3651 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup shared by the UTF-8 converters.
 *
 * pp is an array of psize row pointers.  c1 and c0 are range-checked
 * (c1 against psize, c0 against sizeof_utf8_to_euc_C2) before use; the
 * lines that derive the actual indices from them are not part of this
 * listing.  The function returns 1 when pp or the selected row is null,
 * when an index is out of range, or when the table entry is 0.  With
 * no_cp932ext_f set, entries in row 0x2D and entries above 0xF300 are
 * filtered as well (the guarded statement is not shown).
 */
3660 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3663 const unsigned short *p;
3666 if (pp == 0) return 1;
3669 if (c1 < 0 || psize <= c1) return 1;
3671 if (p == 0) return 1;
3674 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3676 if (val == 0) return 1;
3677 if (no_cp932ext_f && (
3678 (val>>8) == 0x2D || /* NEC special characters */
3679 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A result whose high part equals SO is reported as X0201. */
3687 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: pass hexadecimal digits of c to the callback f.
 *
 * Each digit is delivered as f(0, digit), using upper-case letters.  The
 * loop that drives shift is not part of this listing.
 */
3694 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3696 const char *hex = "0123456789ABCDEF";
3702 (*f)(0, hex[(c>>shift)&0xF]);
/*
 * encode_fallback_html: send the decimal digits of c to oconv, most
 * significant digit first, each as oconv(0, '0' + digit).
 *
 * The two highest digits are emitted only when c is large enough to need
 * them; the guards for the lower digits, and whatever text is emitted
 * around the number, are not part of this listing.
 */
3712 void encode_fallback_html(nkf_char c)
3717 if(c >= NKF_INT32_C(1000000))
3718 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3719 if(c >= NKF_INT32_C(100000))
3720 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3722 (*oconv)(0, 0x30+(c/10000 )%10);
3724 (*oconv)(0, 0x30+(c/1000 )%10);
3726 (*oconv)(0, 0x30+(c/100 )%10);
3728 (*oconv)(0, 0x30+(c/10 )%10);
3730 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: fallback encoder that writes c in hexadecimal by
 * feeding oconv through nkf_each_char_to_hex().  The text emitted around
 * the digits is not part of this listing.
 */
3735 void encode_fallback_xml(nkf_char c)
3740 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: fallback encoder that writes c as upper-case
 * hexadecimal digits through oconv.
 *
 * Values outside the BMP (is_unicode_bmp() false) get two extra leading
 * digits (bits 20-23 and 16-19); the four low digits (bits 0-15) follow.
 * The prefix characters emitted before the digits are not part of this
 * listing.
 */
3745 void encode_fallback_java(nkf_char c)
3747 const char *hex = "0123456789ABCDEF";
3750 if(!is_unicode_bmp(c)){
3754 (*oconv)(0, hex[(c>>20)&0xF]);
3755 (*oconv)(0, hex[(c>>16)&0xF]);
3759 (*oconv)(0, hex[(c>>12)&0xF]);
3760 (*oconv)(0, hex[(c>> 8)&0xF]);
3761 (*oconv)(0, hex[(c>> 4)&0xF]);
3762 (*oconv)(0, hex[ c &0xF]);
/*
 * encode_fallback_perl: fallback encoder that writes c in hexadecimal by
 * feeding oconv through nkf_each_char_to_hex().  The text emitted around
 * the digits is not part of this listing.
 */
3766 void encode_fallback_perl(nkf_char c)
3771 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: fallback encoder that ignores the incoming value
 * and emits unicode_subchar instead, passed to oconv as its high byte (c2)
 * and low byte (c1).
 */
3776 void encode_fallback_subchar(nkf_char c)
3778 c = unicode_subchar;
3779 (*oconv)((c>>8)&0xFF, c&0xFF);
3784 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the Unicode value for a two-byte code (c2, c1).
 *
 * A row table p is selected first: euc_to_utf8_1byte, a row of
 * x0212_to_utf8_2bytes for PREFIX_EUCG3 codes, or a row of one of the
 * euc_to_utf8_2bytes* tables chosen by ms_ucs_map_f.  c1 is then reduced
 * to a 0-based cell index and range-checked.  The conditions of the first
 * branches, the final table read and the return are not part of this
 * listing.
 */
3785 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3787 const unsigned short *p;
3790 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3798 p = euc_to_utf8_1byte;
3800 } else if (is_eucg3(c2)){
/* Special case for one specific code when the ASCII mapping is active. */
3801 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* Row index: drop the prefix/high bit and make it 0-based. */
3804 c2 = (c2&0x7f) - 0x21;
3805 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3806 p = x0212_to_utf8_2bytes[c2];
3812 c2 = (c2&0x7f) - 0x21;
3813 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3815 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3816 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3817 euc_to_utf8_2bytes_ms[c2];
/* Cell index within the selected row. */
3822 c1 = (c1 & 0x7f) - 0x21;
3823 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output converter that writes UTF-8 through o_putc.
 *
 * output_bom_f is cleared here; the lines around it (presumably the BOM
 * output) are not part of this listing.  In NUMCHAR_OPTION builds a
 * CLASS_UNICODE-tagged c1 (with c2 == 0) is encoded directly as a two-,
 * three- or four-byte sequence.  ISO8859_1 characters are written as
 * c1 | 0x80.  Other codes are mapped with e2w_conv() and encoded with
 * w16w_conv().
 */
3828 void w_oconv(nkf_char c2, nkf_char c1)
3834 output_bom_f = FALSE;
3845 #ifdef NUMCHAR_OPTION
3846 if (c2 == 0 && is_unicode_capsule(c1)){
/* Strip the class tag to obtain the code point. */
3847 val = c1 & VALUE_MASK;
3850 }else if (val < 0x800){
3851 (*o_putc)(0xC0 | (val >> 6));
3852 (*o_putc)(0x80 | (val & 0x3f));
3853 } else if (val <= NKF_INT32_C(0xFFFF)) {
3854 (*o_putc)(0xE0 | (val >> 12));
3855 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3856 (*o_putc)(0x80 | (val & 0x3f));
3857 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3858 (*o_putc)(0xF0 | ( val>>18));
3859 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3860 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3861 (*o_putc)(0x80 | ( val & 0x3f));
3868 output_mode = ASCII;
3870 } else if (c2 == ISO8859_1) {
3871 output_mode = ISO8859_1;
3872 (*o_putc)(c1 | 0x080);
3875 val = e2w_conv(c2, c1);
3877 w16w_conv(val, &c2, &c1, &c0);
/* The last byte is written only when w16w_conv produced one. */
3881 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output converter that writes UTF-16 through o_putc in the byte
 * order selected by output_endian.
 *
 * The first lines clear output_bom_f and write 0xFF as one of the bytes in
 * either byte order; the other byte and the enclosing condition are not
 * part of this listing (presumably the byte-order mark -- confirm).  In
 * NUMCHAR_OPTION builds CLASS_UNICODE-tagged values are split into high and
 * low byte when is_unicode_bmp() holds, or written as a surrogate pair when
 * c1 <= UNICODE_MAX.  Other codes are mapped with e2w_conv().
 */
3887 void w_oconv16(nkf_char c2, nkf_char c1)
3890 output_bom_f = FALSE;
3891 if (output_endian == ENDIAN_LITTLE){
3892 (*o_putc)((unsigned char)'\377');
3896 (*o_putc)((unsigned char)'\377');
3905 if (c2 == ISO8859_1) {
3908 #ifdef NUMCHAR_OPTION
3909 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3910 if (is_unicode_bmp(c1)) {
3911 c2 = (c1 >> 8) & 0xff;
3915 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset into the shift. */
3916 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3917 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3918 if (output_endian == ENDIAN_LITTLE){
3919 (*o_putc)(c2 & 0xff);
3920 (*o_putc)((c2 >> 8) & 0xff);
3921 (*o_putc)(c1 & 0xff);
3922 (*o_putc)((c1 >> 8) & 0xff);
3924 (*o_putc)((c2 >> 8) & 0xff);
3925 (*o_putc)(c2 & 0xff);
3926 (*o_putc)((c1 >> 8) & 0xff);
3927 (*o_putc)(c1 & 0xff);
3934 nkf_char val = e2w_conv(c2, c1);
3935 c2 = (val >> 8) & 0xff;
3939 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter that writes UTF-32 through o_putc in the byte
 * order selected by output_endian.
 *
 * The first lines clear output_bom_f and write 0xFF as one of the bytes in
 * either byte order; the remaining bytes and the enclosing condition are
 * not part of this listing (presumably the byte-order mark -- confirm).
 * Codes that are neither ISO8859_1 nor CLASS_UNICODE-tagged are mapped with
 * e2w_conv().  The three low bytes of c1 are written as shown; the write
 * of the fourth byte is not part of this listing.
 */
3948 void w_oconv32(nkf_char c2, nkf_char c1)
3951 output_bom_f = FALSE;
3952 if (output_endian == ENDIAN_LITTLE){
3953 (*o_putc)((unsigned char)'\377');
3961 (*o_putc)((unsigned char)'\377');
3970 if (c2 == ISO8859_1) {
3972 #ifdef NUMCHAR_OPTION
3973 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3977 c1 = e2w_conv(c2, c1);
3980 if (output_endian == ENDIAN_LITTLE){
3981 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3982 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3983 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3987 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3988 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3989 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output converter that writes EUC-JP through o_putc.
 *
 * In NUMCHAR_OPTION builds a CLASS_UNICODE-tagged value is first converted
 * with w16e_conv().  If it is still tagged afterwards, values
 * 0xE000-0xE757 are mapped arithmetically to a row/cell pair when x0212_f
 * is set; otherwise encode_fallback is called if it is set.  After that the
 * function dispatches on c2: 0, X0201 (written as SSO followed by c1|0x80),
 * ISO8859_1, PREFIX_EUCG3 codes, and ordinary two-byte codes (written as
 * c2|0x80, c1|0x80).
 */
3994 void e_oconv(nkf_char c2, nkf_char c1)
3996 #ifdef NUMCHAR_OPTION
3997 if (c2 == 0 && is_unicode_capsule(c1)){
3998 w16e_conv(c1, &c2, &c1);
/* Still tagged: w16e_conv() found no table mapping. */
3999 if (c2 == 0 && is_unicode_capsule(c1)){
4000 c2 = c1 & VALUE_MASK;
/* 0xE000-0xE757 spans 1880 values, i.e. 20 rows of 94 cells. */
4001 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
/*
 * The lines that compute the row number are not part of this listing;
 * rows 0-9 become 0x75.., later rows are offset by 0x8FEB instead.
 */
4005 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4006 c1 = 0x21 + c1 % 94;
4009 (*o_putc)((c2 & 0x7f) | 0x080);
4010 (*o_putc)(c1 | 0x080);
4012 (*o_putc)((c2 & 0x7f) | 0x080);
4013 (*o_putc)(c1 | 0x080);
4017 if (encode_fallback) (*encode_fallback)(c1);
4026 } else if (c2 == 0) {
4027 output_mode = ASCII;
4029 } else if (c2 == X0201) {
4030 output_mode = JAPANESE_EUC;
4031 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4032 } else if (c2 == ISO8859_1) {
4033 output_mode = ISO8859_1;
4034 (*o_putc)(c1 | 0x080);
4036 } else if (is_eucg3(c2)){
4037 output_mode = JAPANESE_EUC;
4038 #ifdef SHIFTJIS_CP932
/* Round trip through Shift_JIS; the enclosing condition is not shown. */
4041 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4042 s2e_conv(s2, s1, &c2, &c1);
4047 output_mode = ASCII;
4049 }else if (is_eucg3(c2)){
4052 (*o_putc)((c2 & 0x7f) | 0x080);
4053 (*o_putc)(c1 | 0x080);
4056 (*o_putc)((c2 & 0x7f) | 0x080);
4057 (*o_putc)(c1 | 0x080);
/* Ordinary two-byte code: both bytes must be graphic characters. */
4061 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4062 set_iconv(FALSE, 0);
4063 return; /* too late to rescue this char */
4065 output_mode = JAPANESE_EUC;
4066 (*o_putc)(c2 | 0x080);
4067 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: move a row number in 0x75-0x7f into an extended range.
 *
 * Two cases are visible: one rebases the row to start at 0x109, the other
 * at 0x113.  The conditions that choose between them and the return are
 * not part of this listing.
 */
4072 nkf_char x0212_shift(nkf_char c)
4077 if (0x75 <= c && c <= 0x7f){
4078 ret = c + (0x109 - 0x75);
4081 if (0x75 <= c && c <= 0x7f){
4082 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map an extended row number back into the 0x75.. range.
 *
 * Values 0x7f-0x88 are rebased to start at 0x75; values 0x89-0x92 are
 * rebased the same way and additionally tagged with PREFIX_EUCG3 | 0x80.
 * The initial value of ret and the return are not part of this listing.
 */
4089 nkf_char x0212_unshift(nkf_char c)
4092 if (0x7f <= c && c <= 0x88){
4093 ret = c + (0x75 - 0x7f);
4094 }else if (0x89 <= c && c <= 0x92){
4095 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4099 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert a two-byte code (c2 = row, c1 = cell) to Shift_JIS
 * bytes stored through p2 (lead byte) and p1 (trail byte).  Either pointer
 * may be null, in which case that byte is not stored.
 *
 * The first part works on ndx (its derivation from c2 is not part of this
 * listing): rows 0x21-0x2F and 0x6E-0x7E are mapped arithmetically, other
 * graphic rows through the x0212_shiftjis table.  The last part is the
 * generic mapping; it returns 1 when c2 is above 0x7F.
 */
4101 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4107 if((0x21 <= ndx && ndx <= 0x2F)){
4108 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4109 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4111 }else if(0x6E <= ndx && ndx <= 0x7E){
4112 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4113 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4119 else if(nkf_isgraph(ndx)){
4121 const unsigned short *ptr;
4122 ptr = x0212_shiftjis[ndx - 0x21];
4124 val = ptr[(c1 & 0x7f) - 0x21];
4133 c2 = x0212_shift(c2);
4135 #endif /* X0212_ENABLE */
4137 if(0x7F < c2) return 1;
/*
 * Two rows share one lead byte.  Odd rows use the lower trail-byte range
 * (offset 0x1f or 0x20 depending on c1), even rows the upper one
 * (offset 0x7e).
 */
4138 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4139 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output converter that writes Shift_JIS through o_putc.
 *
 * In NUMCHAR_OPTION builds a CLASS_UNICODE-tagged value is first converted
 * with w16e_conv().  If it is still tagged afterwards, values
 * 0xE000-0xE757 are mapped arithmetically to lead bytes 0xF0.. when
 * x0213_f is off; otherwise encode_fallback is called if it is set.  After
 * that the function dispatches on c2: 0, X0201, ISO8859_1, PREFIX_EUCG3
 * codes (through e2s_conv) and ordinary two-byte codes (through e2s_conv,
 * then optionally cp932inv in SHIFTJIS_CP932 builds).
 */
4143 void s_oconv(nkf_char c2, nkf_char c1)
4145 #ifdef NUMCHAR_OPTION
4146 if (c2 == 0 && is_unicode_capsule(c1)){
4147 w16e_conv(c1, &c2, &c1);
/* Still tagged: w16e_conv() found no table mapping. */
4148 if (c2 == 0 && is_unicode_capsule(c1)){
4149 c2 = c1 & VALUE_MASK;
/* 0xE000-0xE757 spans 1880 values, i.e. 10 lead bytes of 188 cells. */
4150 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4153 c2 = c1 / 188 + 0xF0;
/* Trail bytes start at 0x40 and skip 0x7F. */
4155 c1 += 0x40 + (c1 > 0x3e);
4160 if(encode_fallback)(*encode_fallback)(c1);
4169 } else if (c2 == 0) {
4170 output_mode = ASCII;
4172 } else if (c2 == X0201) {
4173 output_mode = SHIFT_JIS;
4175 } else if (c2 == ISO8859_1) {
4176 output_mode = ISO8859_1;
4177 (*o_putc)(c1 | 0x080);
4179 } else if (is_eucg3(c2)){
4180 output_mode = SHIFT_JIS;
4181 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Ordinary two-byte code: both bytes must be printable. */
4187 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4188 set_iconv(FALSE, 0);
4189 return; /* too late to rescue this char */
4191 output_mode = SHIFT_JIS;
4192 e2s_conv(c2, c1, &c2, &c1);
4194 #ifdef SHIFTJIS_CP932
4196 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4197 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4203 #endif /* SHIFTJIS_CP932 */
/* Emit the prefix byte registered for this trail byte, if any. */
4206 if (prefix_table[(unsigned char)c1]){
4207 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: output converter for the escape-sequence based JIS encoding,
 * written through o_putc.
 *
 * output_mode remembers the character set currently designated.  Each
 * branch compares it with the set needed for (c2, c1) -- ASCII, X0212,
 * X0213_1, X0213_2, X0201, ISO8859_1 or X0208 -- and, when they differ,
 * updates output_mode and writes the designating bytes (ascii_intro,
 * kanji_intro or the set identifier masked with 0x7F are the parts visible
 * here).
 *
 * In NUMCHAR_OPTION builds a CLASS_UNICODE-tagged value is first converted
 * with w16e_conv().  If it is still tagged afterwards, values
 * 0xE000-0xE757 are mapped arithmetically to rows 0x7F.. with 94 cells per
 * row when ms_ucs_map_f is set; otherwise encode_fallback is called if it
 * is set.
 *
 * Two-byte codes whose row or cell lies outside the accepted range are
 * dropped by the early return near the end; the accepted row range has two
 * variants (upper bound 0x92 or 0x7e) chosen by a condition that is not
 * part of this listing.
 */
4213 void j_oconv(nkf_char c2, nkf_char c1)
4215 #ifdef NUMCHAR_OPTION
4216 if (c2 == 0 && is_unicode_capsule(c1)){
4217 w16e_conv(c1, &c2, &c1);
4218 if (c2 == 0 && is_unicode_capsule(c1)){
4219 c2 = c1 & VALUE_MASK;
4220 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4223 c2 = 0x7F + c1 / 94;
4224 c1 = 0x21 + c1 % 94;
4226 if (encode_fallback) (*encode_fallback)(c1);
4233 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4236 (*o_putc)(ascii_intro);
4237 output_mode = ASCII;
4241 } else if (is_eucg3(c2)){
4243 if(output_mode!=X0213_2){
4244 output_mode = X0213_2;
4248 (*o_putc)(X0213_2&0x7F);
4251 if(output_mode!=X0212){
4252 output_mode = X0212;
4256 (*o_putc)(X0212&0x7F);
4259 (*o_putc)(c2 & 0x7f);
4262 } else if (c2==X0201) {
4263 if (output_mode!=X0201) {
4264 output_mode = X0201;
4270 } else if (c2==ISO8859_1) {
4271 /* iso8859 introduction, or 8th bit on */
4272 /* Can we convert in 7bit form using ESC-'-'-A ?
4274 output_mode = ISO8859_1;
4276 } else if (c2 == 0) {
4277 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4280 (*o_putc)(ascii_intro);
4281 output_mode = ASCII;
4286 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4287 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4289 if (output_mode!=X0213_1) {
4290 output_mode = X0213_1;
4294 (*o_putc)(X0213_1&0x7F);
4296 }else if (output_mode != X0208) {
4297 output_mode = X0208;
4300 (*o_putc)(kanji_intro);
/*
 * base64_conv: filter stage that calls mime_prechar(c2, c1) and then hands
 * the character on to the next stage, o_base64conv.
 */
4307 void base64_conv(nkf_char c2, nkf_char c1)
4309 mime_prechar(c2, c1);
4310 (*o_base64conv)(c2,c1);
/*
 * State for broken_getc()/broken_ungetc(): a small push-back stack
 * (broken_buf with broken_counter entries, popped from the top) and
 * broken_last, which the conditions below compare against ESC.
 */
4314 static nkf_char broken_buf[3];
4315 static int broken_counter = 0;
4316 static int broken_last = 0;
/*
 * broken_getc: input filter that watches for designation sequences whose
 * leading ESC is missing.
 *
 * Pending push-back characters are returned first.  Otherwise a '$' not
 * preceded by ESC (while input_mode is ASCII or X0201) followed by '@' or
 * 'B', or a '(' not preceded by ESC (while input_mode is X0208 or X0201)
 * followed by 'J' or 'B', is recognised and both characters are queued in
 * broken_buf.  What is returned in those cases (presumably a synthesised
 * ESC -- confirm) is not part of this listing.
 */
4317 nkf_char broken_getc(FILE *f)
4321 if (broken_counter>0) {
4322 return broken_buf[--broken_counter];
4325 if (c=='$' && broken_last != ESC
4326 && (input_mode==ASCII || input_mode==X0201)) {
4329 if (c1=='@'|| c1=='B') {
/* Queued in reverse so that c is popped before c1. */
4330 broken_buf[0]=c1; broken_buf[1]=c;
4337 } else if (c=='(' && broken_last != ESC
4338 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4341 if (c1=='J'|| c1=='B') {
4342 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto the broken_buf stack used by
 * broken_getc().
 *
 * The character is stored only while fewer than two entries are pending;
 * in the lines shown nothing happens otherwise.  f is not used in the
 * lines shown, and the return statement is not part of this listing.
 */
4355 nkf_char broken_ungetc(nkf_char c, FILE *f)
4357 if (broken_counter<2)
4358 broken_buf[broken_counter++]=c;
/*
 * cr_conv: line-ending conversion filter in front of o_crconv, controlled
 * by crmode_f.
 *
 * For a '\n' a '\r' is sent to o_crconv when crmode_f is CRLF or CR; the
 * statements that follow in each case are not part of this listing.  The
 * last branch excludes the character '\032' when crmode_f is NL.
 */
4362 void cr_conv(nkf_char c2, nkf_char c1)
4366 if (! (c2==0&&c1==NL) ) {
4372 } else if (c1=='\r') {
4374 } else if (c1=='\n') {
4375 if (crmode_f==CRLF) {
4376 (*o_crconv)(0,'\r');
4377 } else if (crmode_f==CR) {
4378 (*o_crconv)(0,'\r');
4382 } else if (c1!='\032' || crmode_f!=NL){
4388 Return value of fold_conv()
4390 \n add newline and output char
4391 \r add newline and output nothing
4394 1 (or else) normal output
4396 fold state in prev (previous character)
4398 >0x80 Japanese (X0208/X0201)
4403 This fold algorithm does not preserve leading spaces in a line.
4404 This is the main difference from fmt.
/* Column width of a character: 2 when c2 is non-zero, otherwise 1. */
4407 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding filter.
 *
 * Each incoming character is classified into fold_state while f_line (the
 * running column count, advanced by char_size) and f_prev (the previous
 * character, with 0x80 set for Japanese) are updated.  The switch at the
 * end acts on fold_state.  fold_len is the target width; fold_margin is
 * the extra room granted before a fold is forced.  fold_preserve_f changes
 * how '\r' and '\n' are treated.
 */
4409 void fold_conv(nkf_char c2, nkf_char c1)
4412 nkf_char fold_state;
4414 if (c1== '\r' && !fold_preserve_f) {
4415 fold_state=0; /* ignore cr */
4416 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4418 fold_state=0; /* ignore cr */
4419 } else if (c1== BS) {
4420 if (f_line>0) f_line--;
4422 } else if (c2==EOF && f_line != 0) { /* close open last line */
4424 } else if ((c1=='\n' && !fold_preserve_f)
4425 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4426 && fold_preserve_f)) {
4428 if (fold_preserve_f) {
4432 } else if ((f_prev == c1 && !fold_preserve_f)
4433 || (f_prev == '\n' && fold_preserve_f)
4434 ) { /* duplicate newline */
4437 fold_state = '\n'; /* output two newline */
4443 if (f_prev&0x80) { /* Japanese? */
4445 fold_state = 0; /* ignore given single newline */
4446 } else if (f_prev==' ') {
4450 if (++f_line<=fold_len)
4454 fold_state = '\r'; /* fold and output nothing */
4458 } else if (c1=='\f') {
4461 fold_state = '\n'; /* output newline and clear */
4462 } else if ( (c2==0 && c1==' ')||
4463 (c2==0 && c1=='\t')||
4464 (c2=='!'&& c1=='!')) {
4465 /* X0208 kankaku or ascii space */
4466 if (f_prev == ' ') {
4467 fold_state = 0; /* remove duplicate spaces */
4470 if (++f_line<=fold_len)
4471 fold_state = ' '; /* output ASCII space only */
4473 f_prev = ' '; f_line = 0;
4474 fold_state = '\r'; /* fold and output nothing */
4478 prev0 = f_prev; /* we still need this one... , but almost done */
4480 if (c2 || c2==X0201)
4481 f_prev |= 0x80; /* this is Japanese */
4482 f_line += char_size(c2,c1);
4483 if (f_line<=fold_len) { /* normal case */
4486 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4487 f_line = char_size(c2,c1);
4488 fold_state = '\n'; /* We can't wait, do fold now */
4489 } else if (c2==X0201) {
4490 /* simple kinsoku rules return 1 means no folding */
4491 if (c1==(0xde&0x7f)) fold_state = 1; /* dakuten (voiced sound mark) */
4492 else if (c1==(0xdf&0x7f)) fold_state = 1; /* handakuten (semi-voiced sound mark) */
4493 else if (c1==(0xa4&0x7f)) fold_state = 1; /* ideographic full stop */
4494 else if (c1==(0xa3&0x7f)) fold_state = 1; /* fullwidth comma */
4495 else if (c1==(0xa1&0x7f)) fold_state = 1; /* right corner bracket */
4496 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4497 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4499 fold_state = '\n';/* add one new f_line before this character */
4502 fold_state = '\n';/* add one new f_line before this character */
4505 /* kinsoku point in ASCII */
4506 if ( c1==')'|| /* { [ ( */
4517 /* just after special */
4518 } else if (!is_alnum(prev0)) {
4519 f_line = char_size(c2,c1);
4521 } else if ((prev0==' ') || /* ignored new f_line */
4522 (prev0=='\n')|| /* ignored new f_line */
4523 (prev0&0x80)) { /* X0208 - ASCII */
4524 f_line = char_size(c2,c1);
4525 fold_state = '\n';/* add one new f_line before this character */
4527 fold_state = 1; /* default no fold in ASCII */
4531 if (c1=='"') fold_state = 1; /* ideographic comma */
4532 else if (c1=='#') fold_state = 1; /* ideographic full stop */
4533 else if (c1=='W') fold_state = 1; /* right corner bracket */
4534 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis */
4535 else if (c1=='$') fold_state = 1; /* fullwidth comma */
4536 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
4537 else if (c1=='\'') fold_state = 1; /* fullwidth plus sign */
4538 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
4539 else if (c1==')') fold_state = 1; /* fullwidth question mark */
4540 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
4541 else if (c1=='+') fold_state = 1; /* dakuten (voiced sound mark) */
4542 else if (c1==',') fold_state = 1; /* handakuten (semi-voiced sound mark) */
4543 /* default no fold in kinsoku */
4546 f_line = char_size(c2,c1);
4547 /* add one new f_line before this character */
4550 f_line = char_size(c2,c1);
4552 /* add one new f_line before this character */
4557 /* terminator process */
4558 switch(fold_state) {
/*
 * z_prev2/z_prev1 hold a pending X0201 character (class and code) that is
 * waiting to see whether a dakuten or handakuten follows.
 */
4577 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion filter in front of o_zconv.
 *
 * With x0201_f set, an X0201 character is converted through the cv table,
 * or through dv / ev when it is followed by a dakuten / handakuten; the
 * tables hold two entries per character, indexed by (code - SPACE) * 2.
 * The alpha_f branches handle JIS X 0208 rows 0x23 (alphabet), 0x21
 * (symbols) and 0x25 (katakana), plus an entity substitution under
 * alpha_f & 0x4.
 *
 * fullwidth_to_halfwidth is indexed by (c1 - 0x20).  A non-zero entry is
 * sent to o_zconv as X0201 characters: the high byte first, then the low
 * byte; entries of 0 are not converted by the lines shown.
 *
 * NOTE(review): the four entity string literals in the switch further down
 * appear to have been HTML-unescaped in this listing, and the one for the
 * double quote is not a valid C literal as written.  Presumably they were
 * the gt, lt, quot and amp character entities -- confirm against the
 * upstream source before building.
 */
4579 void z_conv(nkf_char c2, nkf_char c1)
4582 /* if (c2) c1 &= 0x7f; assertion */
4584 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4589 if (x0201_f && z_prev2==X0201) { /* X0201 */
4590 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4592 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4594 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4596 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4600 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4609 if (x0201_f && c2==X0201) {
4610 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4611 /* wait for a following dakuten or handakuten */
4612 z_prev1 = c1; z_prev2 = c2;
4615 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4620 if (alpha_f && c2 == 0x23 ) {
4621 /* JISX0208 Alphabet */
4623 } else if (alpha_f && c2 == 0x21 ) {
4624 /* JISX0208 Kigou */
4629 } else if (alpha_f&0x4) {
4634 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4640 case '>': entity = ">"; break;
4641 case '<': entity = "<"; break;
4642 case '\"': entity = """; break;
4643 case '&': entity = "&"; break;
4646 while (*entity) (*o_zconv)(0, *entity++);
4651 } else if (alpha_f & 0x10 && c2 == 0x25) {
4652 /* JISX0208 Katakana */
4653 static const int fullwidth_to_halfwidth[] =
4655 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4656 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4657 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4658 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4659 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4660 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4661 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4662 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4663 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4664 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4665 0x0000, 0x0000, 0x2600, 0x5D00, 0x0000, 0x0000, 0x0000, 0x0000,
4666 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4668 if (fullwidth_to_halfwidth[c1-0x20]){
4669 c2 = fullwidth_to_halfwidth[c1-0x20];
4670 (*o_zconv)(X0201, c2>>8);
4672 (*o_zconv)(X0201, c2&0xFF);
/*
 * rot13(c) / rot47(c): rotation ciphers written as chained conditionals.
 *
 * rot13 adds 13 up to 'M' / 'm' and subtracts 13 up to 'Z' / 'z'; rot47
 * adds 47 up to 'O' and subtracts 47 up to '~'.  The lower-bound tests and
 * the fall-through value are not part of this listing.
 *
 * The argument is expanded several times and is not parenthesised, so pass
 * a simple expression without side effects.
 */
4681 #define rot13(c) ( \
4683 (c <= 'M') ? (c + 13): \
4684 (c <= 'Z') ? (c - 13): \
4686 (c <= 'm') ? (c + 13): \
4687 (c <= 'z') ? (c - 13): \
4691 #define rot47(c) ( \
4693 ( c <= 'O' ) ? (c + 47) : \
4694 ( c <= '~' ) ? (c - 47) : \
/*
 * rot_conv: rotation filter in front of o_rot_conv.
 *
 * Single-byte classes (c2 == 0, X0201, ISO8859_1) are told apart from
 * two-byte codes; the transformation applied in each case (presumably
 * rot13 and rot47 respectively -- confirm) is not part of this listing.
 * The character is then passed on to o_rot_conv.
 */
4698 void rot_conv(nkf_char c2, nkf_char c1)
4700 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4706 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: kana conversion filter in front of o_hira_conv.
 *
 * The mode tests and the row rewrites are not part of this listing; by the
 * name this presumably converts between hiragana and katakana -- confirm.
 * What is visible:
 *   - cells 0x21-0x73 of a row are converted and passed on;
 *   - cell 0x74 is replaced by the CLASS_UNICODE value 0x3094 when the
 *     output converter is w_oconv or w_oconv16;
 *   - row 0x21 cells 0x33/0x34 and 0x35/0x36 get special treatment;
 *   - the reverse direction accepts the CLASS_UNICODE value 0x3094 and
 *     row 0x24 cells 0x21-0x73.
 */
4709 void hira_conv(nkf_char c2, nkf_char c1)
4713 if (0x20 < c1 && c1 < 0x74) {
4715 (*o_hira_conv)(c2,c1);
/*
 * NOTE(review): w_oconv32 is not part of this test although it is also a
 * Unicode output converter -- confirm whether that is intentional.
 */
4717 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4719 c1 = CLASS_UNICODE | 0x3094;
4720 (*o_hira_conv)(c2,c1);
4723 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4725 (*o_hira_conv)(c2,c1);
4730 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4733 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4735 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4739 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: checking filter in front of
 * o_iso2022jp_check_conv.
 *
 * Three tests are visible: c2 in 0x00-0x20 together with c1 in 0x7f-0xff,
 * c2 in rows 0x29-0x2f or 0x75-0x7e, and a combined value c falling inside
 * one of the [start, end] pairs of the range table.  What happens on a
 * match (presumably the character is replaced -- confirm) is not part of
 * this listing.  The character is finally passed on to the next stage.
 */
4743 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4745 static const nkf_char range[RANGE_NUM_MAX][2] = {
4766 nkf_char start, end, c;
4768 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4772 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4777 for (i = 0; i < RANGE_NUM_MAX; i++) {
4778 start = range[i][0];
4781 if (c >= start && c <= end) {
4786 (*o_iso2022jp_check_conv)(c2,c1);
4790 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * MIME encoded-word prefixes recognised by mime_begin_strict().  "\075" is
 * '='.  The three tables that follow are parallel to this one: entry j of
 * each describes pattern j, so all four must be kept in the same order.
 * mime_begin_strict() scans this table until it reaches a null entry.
 */
4792 const unsigned char *mime_pattern[] = {
4793 (const unsigned char *)"\075?EUC-JP?B?",
4794 (const unsigned char *)"\075?SHIFT_JIS?B?",
4795 (const unsigned char *)"\075?ISO-8859-1?Q?",
4796 (const unsigned char *)"\075?ISO-8859-1?B?",
4797 (const unsigned char *)"\075?ISO-2022-JP?B?",
4798 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4799 #if defined(UTF8_INPUT_ENABLE)
4800 (const unsigned char *)"\075?UTF-8?B?",
4801 (const unsigned char *)"\075?UTF-8?Q?",
4803 (const unsigned char *)"\075?US-ASCII?Q?",
4808 /* marker used to raise the priority of the matching input code */
4809 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4810 e_iconv, s_iconv, 0, 0, 0, 0,
4811 #if defined(UTF8_INPUT_ENABLE)
/* Character-set class for each pattern. */
4817 const nkf_char mime_encode[] = {
4818 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4819 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding for each pattern: 'B' or 'Q'. */
4826 const nkf_char mime_encode_method[] = {
4827 'B', 'B','Q', 'B', 'B', 'Q',
4828 #if defined(UTF8_INPUT_ENABLE)
/* Size of the recovery buffer and scan limit used by the mime_begin functions. */
4836 #define MAXRECOVER 20
/*
 * switch_mime_getc: insert the MIME decoder into the input chain.
 *
 * Unless it is already installed, the current i_getc/i_ungetc are saved in
 * i_mgetc/i_mungetc and replaced by mime_getc/mime_ungetc.  In STRICT_MIME
 * mode a second layer (mime_getc_buf/mime_ungetc_buf) is inserted between
 * the decoder and the saved functions.
 */
4838 void switch_mime_getc(void)
4840 if (i_getc!=mime_getc) {
4841 i_mgetc = i_getc; i_getc = mime_getc;
4842 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4843 if(mime_f==STRICT_MIME) {
4844 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4845 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: remove the MIME decoder from the input chain.
 *
 * Undoes switch_mime_getc(): the buffered layer is removed first in
 * STRICT_MIME mode, then the saved unget function is restored (the
 * matching restore of i_getc is not part of this listing).  If an input
 * converter was saved in mime_iconv_back it is reinstalled with
 * set_iconv() and the saved pointer is cleared.
 */
4850 void unswitch_mime_getc(void)
4852 if(mime_f==STRICT_MIME) {
4853 i_mgetc = i_mgetc_buf;
4854 i_mungetc = i_mungetc_buf;
4857 i_ungetc = i_mungetc;
4858 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4859 mime_iconv_back = NULL;
/*
 * mime_begin_strict: after "=?" has been seen, match the following input
 * against mime_pattern (case-insensitively) and enter MIME decode mode.
 *
 * Characters are read one at a time and remembered in r[].  When the
 * current pattern stops matching, the remaining patterns are searched for
 * one that agrees with everything read so far.  If none does, the
 * recovery path is taken (not part of this listing).  On a match,
 * mime_decode_mode is set to the pattern's encoding letter and, in lines
 * whose enclosing conditions are not shown here, the current input
 * converter is saved in mime_iconv_back and the pattern's converter from
 * mime_priority_func is installed.  The result of mime_integrity() is
 * returned on the path shown at the end.
 */
4862 nkf_char mime_begin_strict(FILE *f)
4866 const unsigned char *p,*q;
4867 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4869 mime_decode_mode = FALSE;
4870 /* =? has been checked */
4872 p = mime_pattern[j];
4875 for(i=2;p[i]>' ';i++) { /* start at =? */
4876 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4877 /* pattern fails, try next one */
4879 while (mime_pattern[++j]) {
4880 p = mime_pattern[j];
4881 for(k=2;k<i;k++) /* assume length(p) > i */
4882 if (p[k]!=q[k]) break;
4883 if (k==i && nkf_toupper(c1)==p[k]) break;
4885 p = mime_pattern[j];
4886 if (p) continue; /* found next one, continue */
4887 /* all fails, output from recovery buffer */
/* Every pattern ends in "?B?" or "?Q?", so p[i-2] is the encoding letter. */
4895 mime_decode_mode = p[i-2];
4897 mime_iconv_back = iconv;
4898 set_iconv(FALSE, mime_priority_func[j]);
4899 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4901 if (mime_decode_mode=='B') {
4902 mimebuf_f = unbuf_f;
4904 /* do MIME integrity check */
4905 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: read the next character for the MIME decoder, either from
 * the underlying input (when mimebuf_f is set) or from the Fifo at
 * mime_input, which is advanced.
 */
4913 nkf_char mime_getc_buf(FILE *f)
4915 /* we don't keep eof of Fifo, because it contains ?= as
4916 a terminator. It was checked in mime_integrity. */
4917 return ((mimebuf_f)?
4918 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: push c back for mime_getc_buf(), either into the
 * underlying input or into the Fifo by stepping mime_input back one slot.
 * The condition that selects between the two (presumably mimebuf_f, as in
 * mime_getc_buf -- confirm) and the return are not part of this listing.
 */
4921 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4924 (*i_mungetc_buf)(c,f);
4926 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict recognition of a MIME encoded-word header after
 * "=?" has been seen.
 *
 * Every character read is also appended to the Fifo so that the input can
 * be replayed if this turns out not to be a MIME header.  The charset part
 * is skipped without checking its name, then the encoding letter ('B' or
 * 'Q', either case) sets mime_decode_mode.  At most MAXRECOVER characters
 * are examined.  If no encoding was recognised, mime_decode_mode is set to
 * 1 so that the buffered characters are re-read without decoding.
 *
 * Returns the last character read, which callers use only to detect EOF.
 */
4930 nkf_char mime_begin(FILE *f)
4935 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4936 /* re-read and convert again from mime_buffer. */
4938 /* =? has been checked */
4940 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4941 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4942 /* We accept any character type even if it is broken by new lines */
4943 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4944 if (c1=='\n'||c1==' '||c1=='\r'||
4945 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4947 /* Failed. But this could be another MIME preamble */
4955 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4956 if (!(++i<MAXRECOVER) || c1==EOF) break;
4957 if (c1=='b'||c1=='B') {
4958 mime_decode_mode = 'B';
4959 } else if (c1=='q'||c1=='Q') {
4960 mime_decode_mode = 'Q';
4964 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4965 if (!(++i<MAXRECOVER) || c1==EOF) break;
4967 mime_decode_mode = FALSE;
4973 if (!mime_decode_mode) {
4974 /* false MIME preamble, restart from mime_buffer */
4975 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4976 /* Since we are in MIME mode until buffer becomes empty, */
4977 /* we never go into mime_begin again for a while. */
4980 /* discard mime preamble, and goto MIME mode */
4982 /* do no MIME integrity check */
4983 return c1; /* used only for checking EOF */
/*
 * no_putc: output function taking one character.  The body is not part of
 * this listing; by its name it presumably discards c -- confirm.
 */
4987 void no_putc(nkf_char c)
/*
 * debug: write str followed by a newline to stderr.  Any condition guarding
 * the output is not part of this listing.
 */
4992 void debug(const char *str)
4995 fprintf(stderr, "%s\n", str);
/*
 * set_input_codename: record the name of the detected input encoding.
 *
 * If codename is non-empty and differs from the name recorded so far (the
 * first part of the condition is not part of this listing),
 * is_inputcode_mixed is set.  The pointer itself is stored, not a copy, so
 * codename must stay valid for as long as input_codename is read.
 */
5000 void set_input_codename(char *codename)
5004 strcmp(codename, "") != 0 &&
5005 strcmp(codename, input_codename) != 0)
5007 is_inputcode_mixed = TRUE;
5009 input_codename = codename;
5010 is_inputcode_set = TRUE;
5013 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the guessed input encoding to stdout.
 *
 * Output is "<filename>:" (only when filename is not NULL) followed by the
 * encoding name and, when a line-ending style was derived from crmode_f,
 * that style in parentheses.  The name stays "BINARY" when the input was
 * detected as mixed; the value used when input_codename is empty is not
 * part of this listing.
 */
5014 void print_guessed_code(char *filename)
5016 char *codename = "BINARY";
5017 char *str_crmode = NULL;
5018 if (!is_inputcode_mixed) {
5019 if (strcmp(input_codename, "") == 0) {
5022 codename = input_codename;
5024 if (crmode_f == CR) str_crmode = "CR";
5025 else if (crmode_f == NL) str_crmode = "LF";
5026 else if (crmode_f == CRLF) str_crmode = "CRLF";
5028 if (filename != NULL) printf("%s:", filename);
5029 if (str_crmode != NULL) printf("%s (%s)\n", codename, str_crmode);
5030 else printf("%s\n", codename);
/* Shared reader for two-hex-digit escapes introduced by the character `ch`.
 *
 * ch: escape introducer (cap_getc passes ':' and url_getc passes '%').
 * f:  input stream handed on to g and u.
 * g:  getc-style function used to read characters.
 * u:  ungetc-style function used to push characters back.
 *
 * Returns (hex2bin(c2) << 4) | hex2bin(c3) when both characters after the
 * introducer pass nkf_isxdigit().
 * NOTE(review): what is returned and pushed back when a character is not a
 * hex digit is not visible here -- confirm against the full file.
 */
5036 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5038 nkf_char c1, c2, c3;
/* First candidate digit (high nibble). */
5044 if (!nkf_isxdigit(c2)){
/* Second candidate digit (low nibble). */
5049 if (!nkf_isxdigit(c3)){
5054 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Reader for ':'-introduced hex escapes: delegates to hex_getc with ':' as
 * the introducer and i_cgetc / i_cungetc as the underlying get / unget
 * functions.  Returns whatever hex_getc returns. */
5057 nkf_char cap_getc(FILE *f)
5059 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push-back counterpart of cap_getc: forwards `c` and `f` unchanged to the
 * function currently stored in i_cungetc and returns its result. */
5062 nkf_char cap_ungetc(nkf_char c, FILE *f)
5064 return (*i_cungetc)(c, f);
/* Reader for '%'-introduced hex escapes: delegates to hex_getc with '%' as
 * the introducer and i_ugetc / i_uungetc as the underlying get / unget
 * functions.  Returns whatever hex_getc returns. */
5067 nkf_char url_getc(FILE *f)
5069 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push-back counterpart of url_getc: forwards `c` and `f` unchanged to the
 * function currently stored in i_uungetc and returns its result. */
5072 nkf_char url_ungetc(nkf_char c, FILE *f)
5074 return (*i_uungetc)(c, f);
5078 #ifdef NUMCHAR_OPTION
/* Input filter that turns a numeric character reference into one code
 * point (the usage text calls this "--numchar-input  Convert Unicode
 * Character Reference").
 *
 * f: input stream, read through i_ngetc and pushed back through i_nungetc.
 *
 * On a successful parse the value is returned tagged as
 * CLASS_UNICODE | c.
 * NOTE(review): the recognition of the reference prefix and terminator,
 * and the push-back performed when parsing fails, are not visible here.
 */
5079 nkf_char numchar_getc(FILE *f)
5081 nkf_char (*g)(FILE *) = i_ngetc;
5082 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* Hexadecimal form: an 'x' or 'X' marker followed by at most 7 digits. */
5093 if (buf[i] == 'x' || buf[i] == 'X'){
5094 for (j = 0; j < 7; j++){
5096 if (!nkf_isxdigit(buf[i])){
5103 c |= hex2bin(buf[i]);
/* Decimal form: at most 8 digits.  hex2bin is reused for the digit value. */
5106 for (j = 0; j < 8; j++){
5110 if (!nkf_isdigit(buf[i])){
5117 c += hex2bin(buf[i]);
5123 return CLASS_UNICODE | c;
/* Push-back counterpart of numchar_getc: forwards `c` and `f` unchanged to
 * the function currently stored in i_nungetc and returns its result. */
5132 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5134 return (*i_nungetc)(c, f);
5138 #ifdef UNICODE_NORMALIZATION
5140 /* Normalization Form C */
/* Input filter that replaces a decomposed (NFD) sequence found in
 * normalization_table with the composed (NFC) sequence from the same entry.
 *
 * f: input stream, read through i_nfc_getc and pushed back through
 *    i_nfc_ungetc.
 *
 * NOTE(review): how buf is filled, what is pushed back and which value is
 * finally returned are not visible here -- confirm against the full file.
 */
5141 nkf_char nfc_getc(FILE *f)
5143 nkf_char (*g)(FILE *f) = i_nfc_getc;
5144 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5145 int i=0, j, k=1, lower, upper;
5147 const nkf_nfchar *array;
/* Keep going while the previous pass compared at least one element (k > 0)
 * and buf[i] does not have its top two bits set to 10. */
5150 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
/* Binary search over the table, ordered by the .nfd sequences. */
5151 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5152 while (upper >= lower) {
5153 j = (lower+upper) / 2;
5154 array = normalization_table[j].nfd;
/* Compare element by element; the first difference decides which half of
 * the table to continue in. */
5155 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5156 if (array[k] != buf[k]){
5157 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Copy the .nfc sequence of entry j over the start of buf; i ends up as the
 * number of elements copied. */
5164 array = normalization_table[j].nfc;
5165 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5166 buf[i] = (nkf_char)(array[i]);
/* Push-back counterpart of nfc_getc: forwards `c` and `f` unchanged to the
 * function currently stored in i_nfc_ungetc and returns its result. */
5177 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5179 return (*i_nfc_ungetc)(c, f);
5181 #endif /* UNICODE_NORMALIZATION */
/* Body of the MIME-decoding character reader.
 * NOTE(review): the signature line is not visible here; presumably this is
 * mime_getc(FILE *f) -- confirm.
 *
 * Visible behaviour, in order:
 *  - If the FIFO holds data (mime_top != mime_last) the next buffered byte
 *    is returned.
 *  - If mime_decode_mode is 1 or FALSE, the mode is reset to FALSE,
 *    unswitch_mime_getc() is called and a character is read directly with
 *    i_getc.
 *  - In 'Q' mode: '_' yields a space (unless mimebuf_f is FIXED_MIME), "?="
 *    ends the encoded word, '=' followed by a control character is treated
 *    as a soft line wrap, and "=XY" yields (hex2bin(X) << 4) + hex2bin(Y).
 *  - In 'B' mode: four input characters c1..c4 are collected, "?=" ends the
 *    encoded word, and the four 6-bit values from base64decode() are
 *    repacked into bytes appended to the FIFO; Fifo(mime_top++) is returned.
 *  - After a closing "?=" in either mode, the following characters are
 *    collected in a malloc'ed buffer (lwsp_buf, grown with realloc) and may
 *    be pushed back with i_ungetc.
 *
 * Returns the next decoded character, or EOF at the points where the
 * underlying reader reports EOF.
 */
5187 nkf_char c1, c2, c3, c4, cc;
5188 nkf_char t1, t2, t3, t4, mode, exit_mode;
5189 nkf_char lwsp_count;
5192 nkf_char lwsp_size = 128;
5194 if (mime_top != mime_last) { /* Something is in FIFO */
5195 return Fifo(mime_top++);
/* Not decoding (mode 1 means: only drain the FIFO): hand the stream back
 * to the plain reader. */
5197 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5198 mime_decode_mode=FALSE;
5199 unswitch_mime_getc();
5200 return (*i_getc)(f);
/* With FIXED_MIME the exit mode equals the current mode, so the later
 * "prepare for quit" assignments leave the mode unchanged. */
5203 if (mimebuf_f == FIXED_MIME)
5204 exit_mode = mime_decode_mode;
/* ---- Q encoding ---- */
5207 if (mime_decode_mode == 'Q') {
5208 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5210 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
/* A control character, space, or anything at or above DEL leaves Q mode. */
5211 if (c1<=' ' || DEL<=c1) {
5212 mime_decode_mode = exit_mode; /* prepare for quit */
/* Anything other than '=' (or '?', unless FIXED_MIME) needs no look-ahead. */
5215 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5219 mime_decode_mode = exit_mode; /* prepare for quit */
5220 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5221 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5222 /* end Q encoding */
5223 input_mode = exit_mode;
/* Collect what follows the closing "?=" so it can be kept or pushed back.
 * Five bytes of slack are allocated beyond lwsp_size. */
5225 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5226 if (lwsp_buf==NULL) {
5227 perror("can't malloc");
5230 while ((c1=(*i_getc)(f))!=EOF) {
5235 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5243 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5244 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5259 lwsp_buf[lwsp_count] = (unsigned char)c1;
/* The result of realloc goes to a temporary first, so the old buffer is
 * still reachable if the call fails. */
5260 if (lwsp_count++>lwsp_size){
5262 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5263 if (lwsp_buf_new==NULL) {
5265 perror("can't realloc");
5268 lwsp_buf = lwsp_buf_new;
/* Push the collected characters back, last one first, unless the next
 * character is '=' and the collection ended in SPACE or TAB.  The loop
 * stops before index 0. */
5274 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5276 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5277 i_ungetc(lwsp_buf[lwsp_count],f);
5283 if (c1=='='&&c2<' ') { /* this is soft wrap */
/* NOTE(review): as shown, a character is read in the loop condition and
 * again in the loop body, so every second character skips the <=' ' test,
 * and an EOF returned by the condition read is only noticed on the
 * following read.  Check whether testing c1 == EOF in the body was
 * intended. */
5284 while((c1 = (*i_mgetc)(f)) <=' ') {
5285 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5287 mime_decode_mode = 'Q'; /* still in MIME */
5288 goto restart_mime_q;
5291 mime_decode_mode = 'Q'; /* still in MIME */
/* "=XY" escape: c2 and c3 are the two hex digits. */
5295 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5296 if (c2<=' ') return c2;
5297 mime_decode_mode = 'Q'; /* still in MIME */
5298 return ((hex2bin(c2)<<4) + hex2bin(c3));
/* Neither 'Q' nor 'B': stop decoding and pass the next character through. */
5301 if (mime_decode_mode != 'B') {
5302 mime_decode_mode = FALSE;
5303 return (*i_mgetc)(f);
5307 /* Base64 encoding */
5309 MIME allows line break in the middle of
5310 Base64, but we are very pessimistic in decoding
5311 in unbuf mode because MIME encoded code may broken by
5312 less or editor's control sequence (such as ESC-[-K in unbuffered
5313 mode. ignore incomplete MIME.
5315 mode = mime_decode_mode;
5316 mime_decode_mode = exit_mode; /* prepare for quit */
5318 while ((c1 = (*i_mgetc)(f))<=' ') {
5323 if ((c2 = (*i_mgetc)(f))<=' ') {
5326 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5327 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5330 if ((c1 == '?') && (c2 == '=')) {
5333 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5334 if (lwsp_buf==NULL) {
5335 perror("can't malloc");
5338 while ((c1=(*i_getc)(f))!=EOF) {
5343 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5351 if ((c1=(*i_getc)(f))!=EOF) {
5355 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5370 lwsp_buf[lwsp_count] = (unsigned char)c1;
5371 if (lwsp_count++>lwsp_size){
5373 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5374 if (lwsp_buf_new==NULL) {
5376 perror("can't realloc");
5379 lwsp_buf = lwsp_buf_new;
5385 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5387 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5388 i_ungetc(lwsp_buf[lwsp_count],f);
5395 if ((c3 = (*i_mgetc)(f))<=' ') {
5398 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5399 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5403 if ((c4 = (*i_mgetc)(f))<=' ') {
5406 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5407 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5411 mime_decode_mode = mode; /* still in MIME sigh... */
5413 /* BASE 64 decoding */
5415 t1 = 0x3f & base64decode(c1);
5416 t2 = 0x3f & base64decode(c2);
5417 t3 = 0x3f & base64decode(c3);
5418 t4 = 0x3f & base64decode(c4);
5419 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5421 Fifo(mime_last++) = (unsigned char)cc;
5422 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5424 Fifo(mime_last++) = (unsigned char)cc;
5425 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5427 Fifo(mime_last++) = (unsigned char)cc;
5432 return Fifo(mime_top++);
/* Push `c` back onto the front of the MIME FIFO: mime_top is decremented
 * and the slot it then designates receives the low byte of c.
 *
 * f is part of the ungetc-style signature; the visible line does not use it.
 * NOTE(review): the return statement is not visible here. */
5435 nkf_char mime_ungetc(nkf_char c, FILE *f)
5437 Fifo(--mime_top) = (unsigned char)c;
/* Pre-read a candidate MIME encoded word into the FIFO and decide whether
 * it can be decoded.
 *
 * f: input stream, read with i_getc.
 * p: NUL-terminated byte string copied into the FIFO before reading.
 *
 * Reading stops when the FIFO is full, when c is '=' while d is '?', or at
 * a character outside '+', '/', '=', '?' and alphanumerics.  In the
 * incomplete case the data is left in the FIFO undecoded: mime_last is set
 * to mime_input, mime_decode_mode to 1, and switch_mime_getc() is called.
 * NOTE(review): the declarations of c and d, where d is updated, and the
 * return statements are not visible here.
 */
5441 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5445 /* In buffered mode, read until =? or NL or buffer full
5447 mime_input = mime_top;
5448 mime_last = mime_top;
5450 while(*p) Fifo(mime_input++) = *p++;
5453 while((c=(*i_getc)(f))!=EOF) {
5454 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5455 break; /* buffer full */
5457 if (c=='=' && d=='?') {
5458 /* checked. skip header, start decode */
5459 Fifo(mime_input++) = (unsigned char)c;
5460 /* mime_last_input = mime_input; */
/* Only these characters may appear while scanning; anything else ends the
 * scan. */
5465 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5467 /* Should we check length mod 4? */
5468 Fifo(mime_input++) = (unsigned char)c;
5471 /* In case of Incomplete MIME, no MIME decode */
5472 Fifo(mime_input++) = (unsigned char)c;
5473 mime_last = mime_input; /* point undecoded buffer */
5474 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5475 switch_mime_getc(); /* anyway we need buffered getc */
/* Map one Base64 alphabet character to its 6-bit value (0..63).
 *
 * c: input character.
 *
 * The offsets are written as character constants whose ASCII codes equal
 * the intended numbers: 'G' is 71 ('a' - 26), '4' is 52, '>' is 62 and '?'
 * is 63, so the code relies on an ASCII execution character set.
 * NOTE(review): the range tests selecting the first two branches and the
 * return statement are not visible here.  The last assignment yields 63
 * and is annotated as the '/' case; whether other characters also reach it
 * should be confirmed.
 */
5479 nkf_char base64decode(nkf_char c)
5484 i = c - 'A'; /* A..Z 0-25 */
5486 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5488 } else if (c > '/') {
5489 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5490 } else if (c == '+') {
5491 i = '>' /* 62 */ ; /* + 62 */
5493 i = '?' /* 63 */ ; /* / 63 */
/* Base64 encoding alphabet, indexed by 6-bit value. */
5498 static const char basis_64[] =
5499 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Previous input byte, kept so its low bits can be combined with the next
 * byte by the Base64 encoder (see mimeout_addchar and close_mime). */
5501 static nkf_char b64c;
/* Capacity of the pending-output buffer used by the MIME encoder; the
 * array has one extra slot. */
5502 #define MIMEOUT_BUF_LENGTH (60)
5503 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5504 int mimeout_buf_count = 0;
/* Flag consulted and cleared by open_mime when handling leading white
 * space in mimeout_buf. */
5505 int mimeout_preserve_space = 0;
/* Convert a value 0..15 to an upper-case hex digit character.
 * NOTE(review): the argument is not parenthesised inside the macro, so
 * callers must pass an already parenthesised expression (the visible call
 * sites in mimeout_addchar do). */
5506 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/* Start a MIME encoded word for the given output mode.
 *
 * mode: output code; it is matched against mime_encode[] to choose the
 *       header in mime_pattern[] and the encoding method in
 *       mime_encode_method[].  mime_pattern[0] is the default header when
 *       nothing matches.
 *
 * Side effects: sets mimeout_mode, may emit buffered white space through
 * o_mputc, clears mimeout_preserve_space, and finally replays the
 * characters held in mimeout_buf through mime_putc.
 * NOTE(review): the statements that write the header itself and any line
 * folding are not visible here.
 */
5508 void open_mime(nkf_char mode)
5510 const unsigned char *p;
5513 p = mime_pattern[0];
5514 for(i=0;mime_pattern[i];i++) {
5515 if (mode == mime_encode[i]) {
5516 p = mime_pattern[i];
5520 mimeout_mode = mime_encode_method[i];
/* Special handling once the current output line is already past column 45. */
5523 if (base64_count>45) {
5524 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5525 (*o_mputc)(mimeout_buf[i]);
/* White space at the front of the pending buffer is treated specially
 * unless the caller asked for it to be preserved. */
5531 if (!mimeout_preserve_space && mimeout_buf_count>0
5532 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5533 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5537 if (!mimeout_preserve_space) {
5538 for (;i<mimeout_buf_count;i++) {
5539 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5540 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5541 (*o_mputc)(mimeout_buf[i]);
5548 mimeout_preserve_space = FALSE;
/* The count is reset before replaying so that mime_putc starts from an
 * empty buffer. */
5554 j = mimeout_buf_count;
5555 mimeout_buf_count = 0;
5557 mime_putc(mimeout_buf[i]);
/* Finish the current MIME encoded word.
 *
 * Depending on mimeout_mode, the bits still held in b64c are flushed as a
 * final Base64 character: either the low 2 bits shifted left by 4, or the
 * low 4 bits shifted left by 2.
 * NOTE(review): the case labels, the padding characters, the closing
 * sequence and the mode reset are not visible here.
 */
5561 void close_mime(void)
5571 switch(mimeout_mode) {
5576 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5582 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5588 if (mimeout_f!=FIXED_MIME) {
5590 } else if (mimeout_mode != 'Q')
/* Encode one byte `c` according to mimeout_mode and write the result
 * through o_mputc.
 *
 * Visible encodings:
 *  - hex escape: a non-alphanumeric byte is written as two upper-case hex
 *    digits produced by itoh4 (high nibble first);
 *  - Base64: the first byte of a group emits basis_64[c>>2]; the second
 *    combines the low 2 bits of the previous byte (b64c) with the high
 *    4 bits of c; the third combines the low 4 bits of b64c with the high
 *    2 bits of c and then emits the low 6 bits of c.
 * NOTE(review): the case labels, the updates of b64c / mimeout_mode and the
 * output column counter are not visible here.
 */
5595 void mimeout_addchar(nkf_char c)
5597 switch(mimeout_mode) {
5602 } else if(!nkf_isalnum(c)) {
5604 (*o_mputc)(itoh4(((c>>4)&0xf)));
5605 (*o_mputc)(itoh4((c&0xf)));
5614 (*o_mputc)(basis_64[c>>2]);
5619 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5625 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5626 (*o_mputc)(basis_64[c & 0x3F]);
5637 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/* Hook run before a character pair (c2, c1) is passed on to the MIME
 * encoder; it breaks the output line when the encoded text would get too
 * long.
 *
 * The projected width is base64_count plus mimeout_buf_count/3*4 (the
 * Base64 size of the pending bytes).  When it exceeds the limit (73 in one
 * branch, 66 in the other) three calls go to o_base64conv: (EOF,0), then
 * (0,NL), then (0,SPACE).
 * NOTE(review): the conditions choosing between the two limits are not
 * visible here.  The remainder of the block is commented-out code that
 * refers to mime_lastchar1 / mime_lastchar2.
 */
5639 void mime_prechar(nkf_char c2, nkf_char c1)
5643 if (base64_count + mimeout_buf_count/3*4> 73){
5644 (*o_base64conv)(EOF,0);
5645 (*o_base64conv)(0,NL);
5646 (*o_base64conv)(0,SPACE);
5649 if (base64_count + mimeout_buf_count/3*4> 66){
5650 (*o_base64conv)(EOF,0);
5651 (*o_base64conv)(0,NL);
5652 (*o_base64conv)(0,SPACE);
5654 }/*else if (mime_lastchar2){
5655 if (c1 <=DEL && !nkf_isspace(c1)){
5656 (*o_base64conv)(0,SPACE);
5660 if (c2 && mime_lastchar2 == 0
5661 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5662 (*o_base64conv)(0,SPACE);
5665 /*mime_lastchar2 = c2;
5666 mime_lastchar1 = c1;*/
/* Output-side MIME encoder entry point: accepts one character (or EOF) and
 * decides whether to buffer it, write it unencoded, or encode it.
 *
 * c: character to output; EOF flushes pending data.
 *
 * Visible structure:
 *  - mimeout_f == FIXED_MIME: line length is checked against 71 columns
 *    (separately for 'Q' mode) and EOF is handled on its own path.
 *  - otherwise, on EOF the pending mimeout_buf contents are drained through
 *    mimeout_addchar.
 *  - mimeout_mode == 'Q': ASCII / ISO8859_1 characters are examined for
 *    line breaks, white space and line length (70 columns).
 *  - mimeout_mode == 0 (no encoded word open): ASCII text is accumulated in
 *    mimeout_buf and written with o_mputc; open_mime(output_mode) is called
 *    when the buffer exceeds MIMEOUT_BUF_LENGTH or when the character is
 *    not plain ASCII / ISO8859_1 text.
 *  - otherwise (the existing comment says 'B', 1, 2): pending text is
 *    either encoded with mimeout_addchar or written raw with o_mputc,
 *    depending on the last buffered character and on c.
 * NOTE(review): many statements (loop headers, close_mime calls, returns)
 * are not visible here; read the summary as a guide to the visible lines
 * only.
 */
5669 void mime_putc(nkf_char c)
5674 if (mimeout_f == FIXED_MIME){
5675 if (mimeout_mode == 'Q'){
5676 if (base64_count > 71){
5677 if (c!=CR && c!=NL) {
5684 if (base64_count > 71){
5689 if (c == EOF) { /* c==EOF */
5693 if (c != EOF) { /* c==EOF */
5699 /* mimeout_f != FIXED_MIME */
5701 if (c == EOF) { /* c==EOF */
/* Take the pending count and clear it before draining the buffer. */
5702 j = mimeout_buf_count;
5703 mimeout_buf_count = 0;
5706 if (!nkf_isblank(mimeout_buf[j-1])) {
5708 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5711 mimeout_addchar(mimeout_buf[i]);
5715 mimeout_addchar(mimeout_buf[i]);
5719 mimeout_addchar(mimeout_buf[i]);
5725 mimeout_addchar(mimeout_buf[i]);
5731 if (mimeout_mode=='Q') {
5732 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5733 if (c == CR || c == NL) {
5738 } else if (c <= SPACE) {
5740 if (base64_count > 70) {
5744 if (!nkf_isblank(c)) {
/* Remember the most recently buffered character for the decisions below. */
5755 if (mimeout_buf_count > 0){
5756 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* No encoded word is open. */
5761 if (!mimeout_mode) {
5762 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5763 if (nkf_isspace(c)) {
5764 if (c==CR || c==NL) {
5767 for (i=0;i<mimeout_buf_count;i++) {
5768 (*o_mputc)(mimeout_buf[i]);
5769 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
/* Restart the buffer with the white-space character just received. */
5775 mimeout_buf[0] = (char)c;
5776 mimeout_buf_count = 1;
5778 if (base64_count > 1
5779 && base64_count + mimeout_buf_count > 76
5780 && mimeout_buf[0] != CR && mimeout_buf[0] != NL){
5783 if (!nkf_isspace(mimeout_buf[0])){
5788 mimeout_buf[mimeout_buf_count++] = (char)c;
5789 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5790 open_mime(output_mode);
5795 if (lastchar==CR || lastchar == NL){
5796 for (i=0;i<mimeout_buf_count;i++) {
5797 (*o_mputc)(mimeout_buf[i]);
5800 mimeout_buf_count = 0;
/* Write everything except the trailing space, then keep a single space
 * buffered. */
5802 if (lastchar==SPACE) {
5803 for (i=0;i<mimeout_buf_count-1;i++) {
5804 (*o_mputc)(mimeout_buf[i]);
5807 mimeout_buf[0] = SPACE;
5808 mimeout_buf_count = 1;
5810 open_mime(output_mode);
5813 /* mimeout_mode == 'B', 1, 2 */
5814 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5815 if (lastchar == CR || lastchar == NL){
5816 if (nkf_isblank(c)) {
5817 for (i=0;i<mimeout_buf_count;i++) {
5818 mimeout_addchar(mimeout_buf[i]);
5820 mimeout_buf_count = 0;
5821 } else if (SPACE<c && c<DEL) {
5823 for (i=0;i<mimeout_buf_count;i++) {
5824 (*o_mputc)(mimeout_buf[i]);
5827 mimeout_buf_count = 0;
5830 if (c==SPACE || c==TAB || c==CR || c==NL) {
5831 for (i=0;i<mimeout_buf_count;i++) {
5832 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5834 for (i=0;i<mimeout_buf_count;i++) {
5835 (*o_mputc)(mimeout_buf[i]);
5838 mimeout_buf_count = 0;
5841 mimeout_buf[mimeout_buf_count++] = (char)c;
5842 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5844 for (i=0;i<mimeout_buf_count;i++) {
5845 (*o_mputc)(mimeout_buf[i]);
5848 mimeout_buf_count = 0;
5852 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5853 mimeout_buf[mimeout_buf_count++] = (char)c;
5854 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5855 j = mimeout_buf_count;
5856 mimeout_buf_count = 0;
5858 mimeout_addchar(mimeout_buf[i]);
5865 if (mimeout_buf_count>0) {
5866 j = mimeout_buf_count;
5867 mimeout_buf_count = 0;
5869 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5871 mimeout_addchar(mimeout_buf[i]);
5877 (*o_mputc)(mimeout_buf[i]);
5879 open_mime(output_mode);
5886 #if defined(PERL_XS) || defined(WIN32DLL)
/* Reset of the converter's global state to its defaults.
 * NOTE(review): the signature line is not visible here; presumably this is
 * reinit(void) -- confirm.  The block opens under the PERL_XS / WIN32DLL
 * condition above.
 *
 * Visible resets: MIME flags, Unicode mapping / fallback / endianness
 * options (under their feature macros), input-code detection flags, the
 * 256-entry prefix_table, the pending MIME output count, folding and
 * escape-sequence defaults, every o_*conv hook (pointed at no_connection),
 * the input unget hooks (pointed at std_ungetc), and the input code name
 * (set to the empty string).
 */
5890 struct input_code *p = input_code_list;
5903 mime_f = STRICT_MIME;
5904 mime_decode_f = FALSE;
5909 #if defined(MSDOS) || defined(__OS2__)
5914 iso2022jp_f = FALSE;
5915 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5916 ms_ucs_map_f = UCS_MAP_ASCII;
5918 #ifdef UTF8_INPUT_ENABLE
5919 no_cp932ext_f = FALSE;
5920 no_best_fit_chars_f = FALSE;
5921 encode_fallback = NULL;
5922 unicode_subchar = '?';
5923 input_endian = ENDIAN_BIG;
5925 #ifdef UTF8_OUTPUT_ENABLE
5926 output_bom_f = FALSE;
5927 output_endian = ENDIAN_BIG;
5929 #ifdef UNICODE_NORMALIZATION
5942 is_inputcode_mixed = FALSE;
5943 is_inputcode_set = FALSE;
5947 #ifdef SHIFTJIS_CP932
5957 for (i = 0; i < 256; i++){
5958 prefix_table[i] = 0;
5962 mimeout_buf_count = 0;
5967 fold_preserve_f = FALSE;
5970 kanji_intro = DEFAULT_J;
5971 ascii_intro = DEFAULT_R;
5972 fold_margin = FOLD_MARGIN;
5973 output_conv = DEFAULT_CONV;
5974 oconv = DEFAULT_CONV;
/* Every optional output stage is pointed at no_connection, which reports an
 * internal error if it is ever called. */
5975 o_zconv = no_connection;
5976 o_fconv = no_connection;
5977 o_crconv = no_connection;
5978 o_rot_conv = no_connection;
5979 o_hira_conv = no_connection;
5980 o_base64conv = no_connection;
5981 o_iso2022jp_check_conv = no_connection;
5984 i_ungetc = std_ungetc;
5986 i_bungetc = std_ungetc;
5989 i_mungetc = std_ungetc;
5990 i_mgetc_buf = std_getc;
5991 i_mungetc_buf = std_ungetc;
5992 output_mode = ASCII;
5995 mime_decode_mode = FALSE;
6001 z_prev2=0,z_prev1=0;
6003 iconv_for_check = 0;
6005 input_codename = "";
/* Placeholder for an unconnected two-argument converter hook: forwards to
 * no_connection2 with 0 as the third argument and discards its result. */
6012 void no_connection(nkf_char c2, nkf_char c1)
6014 no_connection2(c2,c1,0);
/* Placeholder for an unconnected three-argument converter hook: reports
 * "nkf internal module connection failure." on stderr.
 *
 * c2, c1, c0 are not used by the visible lines.
 * The return of 0 exists only to satisfy lint.
 * NOTE(review): any statement between the message and the return (for
 * example a call that terminates the program) is not visible here.
 */
6017 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6019 fprintf(stderr,"nkf internal module connection failure.\n");
6021 return 0; /* LINT */
/* Redirects every fprintf call that follows to dllprintf.
 * NOTE(review): the preprocessor condition guarding this define is not
 * visible here. */
6026 #define fprintf dllprintf
/* Body of the usage/help printer: writes the option summary to stderr, one
 * fprintf per line.  Which "Output code" line is printed depends on the
 * DEFAULT_CODE_* macro that is defined, and several option lines appear
 * only when their feature macro (UTF8_INPUT_ENABLE, UTF8_OUTPUT_ENABLE,
 * NUMCHAR_OPTION) is defined.
 * NOTE(review): the signature line is not visible here; presumably this is
 * usage(void) -- confirm. */
6030 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6031 fprintf(stderr,"Flags:\n");
6032 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6033 #ifdef DEFAULT_CODE_SJIS
6034 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6036 #ifdef DEFAULT_CODE_JIS
6037 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6039 #ifdef DEFAULT_CODE_EUC
6040 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6042 #ifdef DEFAULT_CODE_UTF8
6043 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6045 #ifdef UTF8_OUTPUT_ENABLE
6046 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6048 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6049 #ifdef UTF8_INPUT_ENABLE
6050 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6052 fprintf(stderr,"t no conversion\n");
6053 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6054 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6055 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6056 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6057 fprintf(stderr,"v Show this usage. V: show version\n");
6058 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6059 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6060 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6061 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6062 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6063 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6064 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6065 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6066 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6068 fprintf(stderr,"T Text mode output\n");
6070 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6071 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6072 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6073 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6074 fprintf(stderr,"\n");
6075 fprintf(stderr,"Long name options\n");
6076 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6077 fprintf(stderr," Specify the input or output codeset\n");
6078 fprintf(stderr," --fj --unix --mac --windows\n");
6079 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6080 fprintf(stderr," Convert for the system or code\n");
6081 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6082 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6083 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6085 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6087 #ifdef NUMCHAR_OPTION
6088 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6090 #ifdef UTF8_INPUT_ENABLE
6091 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6092 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6095 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6096 fprintf(stderr," Overwrite original listed files by filtered result\n");
6097 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6099 fprintf(stderr," -g --guess Guess the input code\n");
6100 fprintf(stderr," --help --version Show this help/the version\n");
6101 fprintf(stderr," For more information, see also man nkf\n");
6102 fprintf(stderr,"\n");
/* Body of the version printer: writes "Network Kanji Filter Version
 * <NKF_VERSION> (<NKF_RELEASE_DATE>) " to stderr, followed by a
 * platform-specific part of the format string chosen by the MSDOS /
 * __WIN16__ / __WIN32__ / __OS2__ conditions, and then the CopyRight text
 * on its own line.
 * NOTE(review): the signature line and the platform string fragments are
 * not visible here; presumably this is version(void) -- confirm. */
6108 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6109 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6112 #if defined(MSDOS) && defined(__WIN16__)
6115 #if defined(MSDOS) && defined(__WIN32__)
6121 ,NKF_VERSION,NKF_RELEASE_DATE);
6122 fprintf(stderr,"\n%s\n",CopyRight);
6127 ** Patch authors:
6128 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
6129 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
6130 ** ohta@src.ricoh.co.jp (Junn Ohta)
6131 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
6132 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
6133 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
6134 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
6135 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
6136 ** GHG00637@nifty-serve.or.jp (COW)