1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.187 2008/11/07 02:37:21 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-10-28"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
50 # define INCL_DOSERRORS
54 /* state of output_mode and input_mode
133 NKF_ENCODING_TABLE_SIZE,
134 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
135 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
136 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
137 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
138 JIS_X_0208 = 0x1168, /* @B */
139 JIS_X_0212 = 0x1159, /* D */
140 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
141 JIS_X_0213_2 = 0x1229, /* P */
142 JIS_X_0213_1 = 0x1233, /* Q */
145 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
146 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
147 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
148 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
149 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
150 static void j_oconv(nkf_char c2, nkf_char c1);
151 static void s_oconv(nkf_char c2, nkf_char c1);
152 static void e_oconv(nkf_char c2, nkf_char c1);
153 static void w_oconv(nkf_char c2, nkf_char c1);
154 static void w_oconv16(nkf_char c2, nkf_char c1);
155 static void w_oconv32(nkf_char c2, nkf_char c1);
159 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
160 void (*oconv)(nkf_char c2, nkf_char c1);
161 } nkf_native_encoding;
/*
 * Native (base) encodings.  Each initializer supplies a label string
 * followed by the input converter (iconv) and the output converter (oconv)
 * used for that encoding family.
 * Note that ASCII and ISO-2022-JP both use e_iconv on the input side and
 * differ only in their output converter.
 * NOTE(review): the first struct member is declared outside this view;
 * it is presumably the encoding name -- confirm against the typedef.
 */
163 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
164 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
165 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
166 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
167 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
168 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
169 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
174 const nkf_native_encoding *base_encoding;
177 nkf_encoding nkf_encoding_table[] = {
178 {ASCII, "US-ASCII", &NkfEncodingASCII},
179 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
180 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
181 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
182 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
183 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
184 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
185 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
186 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
187 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
188 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
189 {CP10001, "CP10001", &NkfEncodingShift_JIS},
190 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
191 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
192 {CP51932, "CP51932", &NkfEncodingEUC_JP},
193 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
194 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
195 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
196 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
197 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
198 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
199 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
200 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
201 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
202 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
203 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
204 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
205 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
206 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
207 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
208 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
209 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
210 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
211 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
212 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
213 {BINARY, "BINARY", &NkfEncodingASCII},
220 } encoding_name_to_id_table[] = {
223 {"ISO-2022-JP", ISO_2022_JP},
224 {"ISO2022JP-CP932", CP50220},
225 {"CP50220", CP50220},
226 {"CP50221", CP50221},
227 {"CSISO2022JP", CP50221},
228 {"CP50222", CP50222},
229 {"ISO-2022-JP-1", ISO_2022_JP_1},
230 {"ISO-2022-JP-3", ISO_2022_JP_3},
231 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
232 {"SHIFT_JIS", SHIFT_JIS},
234 {"WINDOWS-31J", WINDOWS_31J},
235 {"CSWINDOWS31J", WINDOWS_31J},
236 {"CP932", WINDOWS_31J},
237 {"MS932", WINDOWS_31J},
238 {"CP10001", CP10001},
241 {"EUCJP-NKF", EUCJP_NKF},
242 {"CP51932", CP51932},
243 {"EUC-JP-MS", EUCJP_MS},
244 {"EUCJP-MS", EUCJP_MS},
245 {"EUCJPMS", EUCJP_MS},
246 {"EUC-JP-ASCII", EUCJP_ASCII},
247 {"EUCJP-ASCII", EUCJP_ASCII},
248 {"SHIFT_JISX0213", SHIFT_JISX0213},
249 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
250 {"EUC-JISX0213", EUC_JISX0213},
251 {"EUC-JIS-2004", EUC_JIS_2004},
254 {"UTF-8-BOM", UTF_8_BOM},
255 {"UTF8-MAC", UTF8_MAC},
256 {"UTF-8-MAC", UTF8_MAC},
258 {"UTF-16BE", UTF_16BE},
259 {"UTF-16BE-BOM", UTF_16BE_BOM},
260 {"UTF-16LE", UTF_16LE},
261 {"UTF-16LE-BOM", UTF_16LE_BOM},
263 {"UTF-32BE", UTF_32BE},
264 {"UTF-32BE-BOM", UTF_32BE_BOM},
265 {"UTF-32LE", UTF_32LE},
266 {"UTF-32LE-BOM", UTF_32LE_BOM},
271 #if defined(DEFAULT_CODE_JIS)
272 #define DEFAULT_ENCIDX ISO_2022_JP
273 #elif defined(DEFAULT_CODE_SJIS)
274 #define DEFAULT_ENCIDX SHIFT_JIS
275 #elif defined(DEFAULT_CODE_WINDOWS_31J)
276 #define DEFAULT_ENCIDX WINDOWS_31J
277 #elif defined(DEFAULT_CODE_EUC)
278 #define DEFAULT_ENCIDX EUC_JP
279 #elif defined(DEFAULT_CODE_UTF8)
280 #define DEFAULT_ENCIDX UTF_8
/*
 * Locale-independent character classification / conversion macros.
 *
 * Every use of the macro parameter is parenthesized so that an expression
 * argument (e.g. `x & 0x7F`, `a | b`) is evaluated as a unit instead of
 * being re-associated with the operators inside the macro body.
 *
 * The parameter is still evaluated more than once: do not pass arguments
 * with side effects (`*p++`, `getc(f)`, ...).
 *
 * SP, TAB, CR, LF, DEL, SS3 and CP932_TABLE_BEGIN/END are defined elsewhere
 * in this file.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP<=(c) && (c)<='~')
#define nkf_isgraph(c)  ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second byte (bits 8..15) of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and '"' (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* True for SP <= c < 0x60 (0xE0 & 0x7F). */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
310 #define HOLD_SIZE 1024
311 #if defined(INT_IS_SHORT)
312 #define IOBUF_SIZE 2048
314 #define IOBUF_SIZE 16384
317 #define DEFAULT_J 'B'
318 #define DEFAULT_R 'B'
325 /* MIME preprocessor */
327 #ifdef EASYWIN /*Easy Win */
328 extern POINT _BufferSize;
337 void (*status_func)(struct input_code *, nkf_char);
338 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
342 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
343 static nkf_encoding *input_encoding = NULL;
344 static nkf_encoding *output_encoding = NULL;
346 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
348 * 0: Shift_JIS, eucJP-ascii
353 #define UCS_MAP_ASCII 0
355 #define UCS_MAP_CP932 2
356 #define UCS_MAP_CP10001 3
357 static int ms_ucs_map_f = UCS_MAP_ASCII;
359 #ifdef UTF8_INPUT_ENABLE
360 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
361 static int no_cp932ext_f = FALSE;
362 /* ignore ZERO WIDTH NO-BREAK SPACE */
363 static int no_best_fit_chars_f = FALSE;
364 static int input_endian = ENDIAN_BIG;
365 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
366 static void (*encode_fallback)(nkf_char c) = NULL;
367 static void w_status(struct input_code *, nkf_char);
369 #ifdef UTF8_OUTPUT_ENABLE
370 static int output_bom_f = FALSE;
371 static int output_endian = ENDIAN_BIG;
374 static void std_putc(nkf_char c);
375 static nkf_char std_getc(FILE *f);
376 static nkf_char std_ungetc(nkf_char c,FILE *f);
378 static nkf_char broken_getc(FILE *f);
379 static nkf_char broken_ungetc(nkf_char c,FILE *f);
381 static nkf_char mime_getc(FILE *f);
383 static void mime_putc(nkf_char c);
387 #if !defined(PERL_XS) && !defined(WIN32DLL)
388 static unsigned char stdibuf[IOBUF_SIZE];
389 static unsigned char stdobuf[IOBUF_SIZE];
/*
 * Conversion option flags (file-scope state).
 * NOTE(review): the code that sets and consumes most of these flags lies
 * outside this view; remarks marked "presumably" are unverified here.
 */
393 static int unbuf_f = FALSE; /* presumably unbuffered output (usage text: "b,u") -- TODO confirm */
394 static int estab_f = FALSE; /* tested in set_iconv() before the input code name is recorded; presumably "input encoding established" */
395 static int nop_f = FALSE; /* NOTE(review): meaning not evident from this view */
396 static int binmode_f = TRUE; /* binary mode */
397 static int rot_f = FALSE; /* ROT13/47 mode (usage text: "r {de/en}crypt ROT13/47") */
398 static int hira_f = FALSE; /* hiragana/katakana conversion */
399 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
400 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
401 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
402 static int mimebuf_f = FALSE; /* MIME buffered input */
403 static int broken_f = FALSE; /* convert ESC-less broken JIS */
404 static int iso8859_f = FALSE; /* ISO8859 through */
405 static int mimeout_f = FALSE; /* base64 mode */
406 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
407 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
409 #ifdef UNICODE_NORMALIZATION
410 static int nfc_f = FALSE;
411 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
416 static int cap_f = FALSE;
417 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
418 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
420 static int url_f = FALSE;
421 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
422 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * nkf_char tagging.
 *
 * The top byte (CLASS_MASK) of an nkf_char carries a class tag and the low
 * 24 bits (VALUE_MASK) carry the value; CLASS_UNICODE marks a value that is
 * a Unicode code point.  PREFIX_EUCG3 (0x8F00) is OR-ed into a character by
 * nkf_char_euc3_new().
 *
 * The predicate macros parenthesize their argument, like the *_new macros,
 * so that an expression argument such as `tag | value` is masked as a whole
 * (`&` binds tighter than `|`).
 * NKF_INT32_C is defined elsewhere in this file.
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c)        ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c)     ((c) | CLASS_UNICODE)
/* True when the class tag of c is CLASS_UNICODE. */
#define nkf_char_unicode_p(c)       (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the 24-bit value of c is within the BMP (<= 0xFFFF). */
#define nkf_char_unicode_bmp_p(c)   (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the 24-bit value of c is a valid code point (<= 0x10FFFF). */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
437 #ifdef NUMCHAR_OPTION
438 static int numchar_f = FALSE;
439 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
440 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
444 static int noout_f = FALSE;
445 static void no_putc(nkf_char c);
446 static int debug_f = FALSE;
447 static void debug(const char *str);
448 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
451 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
452 static void set_input_codename(const char *codename);
455 static int exec_f = 0;
458 #ifdef SHIFTJIS_CP932
459 /* invert IBM extended characters to others */
460 static int cp51932_f = FALSE;
462 /* invert NEC-selected IBM extended characters to IBM extended characters */
463 static int cp932inv_f = TRUE;
465 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
466 #endif /* SHIFTJIS_CP932 */
468 static int x0212_f = FALSE;
469 static int x0213_f = FALSE;
471 static unsigned char prefix_table[256];
473 static void e_status(struct input_code *, nkf_char);
474 static void s_status(struct input_code *, nkf_char);
476 struct input_code input_code_list[] = {
477 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
478 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
479 #ifdef UTF8_INPUT_ENABLE
480 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
485 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
486 static int base64_count = 0;
488 /* X0208 -> ASCII converter */
491 static int f_line = 0; /* chars in line */
492 static int f_prev = 0;
493 static int fold_preserve_f = FALSE; /* preserve new lines */
494 static int fold_f = FALSE;
495 static int fold_len = 0;
498 static unsigned char kanji_intro = DEFAULT_J;
499 static unsigned char ascii_intro = DEFAULT_R;
503 #define FOLD_MARGIN 10
504 #define DEFAULT_FOLD 60
506 static int fold_margin = FOLD_MARGIN;
508 /* process default */
511 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
513 fprintf(stderr,"nkf internal module connection failure.\n");
519 no_connection(nkf_char c2, nkf_char c1)
521 no_connection2(c2,c1,0);
524 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
525 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
527 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
535 /* static redirections */
537 static void (*o_putc)(nkf_char c) = std_putc;
539 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
540 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
542 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
543 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
545 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
547 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
548 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
550 /* for strict mime */
551 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
552 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
555 static int output_mode = ASCII; /* output kanji mode */
556 static int input_mode = ASCII; /* input kanji mode */
557 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
559 /* X0201 / X0208 conversion tables */
561 /* X0201 kana conversion table */
563 static const unsigned char cv[]= {
564 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
565 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
566 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
567 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
568 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
569 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
570 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
571 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
572 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
573 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
574 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
575 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
576 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
577 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
578 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
579 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
583 /* X0201 kana conversion table for dakuten (voiced sound mark) */
585 static const unsigned char dv[]= {
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
591 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
592 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
593 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
594 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
595 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
597 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
606 static const unsigned char ev[]= {
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
618 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 /* X0208 kigou conversion table */
627 /* 0x8140 - 0x819e */
628 static const unsigned char fv[] = {
630 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
631 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
632 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
634 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
635 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
636 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
638 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
646 static int option_mode = 0;
647 static int file_out_f = FALSE;
649 static int overwrite_f = FALSE;
650 static int preserve_time_f = FALSE;
651 static int backup_f = FALSE;
652 static char *backup_suffix = "";
655 static int eolmode_f = 0; /* CR, LF, CRLF */
656 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
657 static nkf_char prev_cr = 0; /* CR or 0 */
658 #ifdef EASYWIN /*Easy Win */
659 static int end_check;
662 #define STD_GC_BUFSIZE (256)
663 nkf_char std_gc_buf[STD_GC_BUFSIZE];
667 nkf_str_caseeql(const char *src, const char *target)
670 for (i = 0; src[i] && target[i]; i++) {
671 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
673 if (src[i] || target[i]) return FALSE;
678 nkf_enc_from_index(int idx)
680 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
683 return &nkf_encoding_table[idx];
687 nkf_enc_find_index(const char *name)
690 if (name[0] == 'X' && *(name+1) == '-') name += 2;
691 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
692 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
693 return encoding_name_to_id_table[i].id;
700 nkf_enc_find(const char *name)
703 idx = nkf_enc_find_index(name);
704 if (idx < 0) return 0;
705 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for an encoding descriptor pointer `enc`.
 *
 *   nkf_enc_name             -> enc->name
 *   nkf_enc_to_index         -> enc->id
 *   nkf_enc_to_base_encoding -> enc->base_encoding
 *   nkf_enc_to_iconv/oconv   -> the iconv / oconv member of the base encoding
 *   nkf_enc_asciicompat      -> base encoding is NkfEncodingASCII or
 *                               NkfEncodingISO_2022_JP
 *   nkf_enc_unicode_p        -> base encoding is one of the UTF-8/16/32 ones
 *   nkf_enc_cp5022x_p        -> id is CP50220, CP50221 or CP50222
 *
 * The predicate macros expand `enc` more than once; pass an expression
 * without side effects.
 */
708 #define nkf_enc_name(enc) (enc)->name
709 #define nkf_enc_to_index(enc) (enc)->id
710 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
711 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
712 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
713 #define nkf_enc_asciicompat(enc) (\
714 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
715 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
716 #define nkf_enc_unicode_p(enc) (\
717 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
718 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
719 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
720 #define nkf_enc_cp5022x_p(enc) (\
721 nkf_enc_to_index(enc) == CP50220 ||\
722 nkf_enc_to_index(enc) == CP50221 ||\
723 nkf_enc_to_index(enc) == CP50222)
725 #ifdef DEFAULT_CODE_LOCALE
729 #ifdef HAVE_LANGINFO_H
730 return nl_langinfo(CODESET);
731 #elif defined(__WIN32__)
734 int len = sprintf(buf, "CP%d", GetACP());
736 str = malloc(len + 1);
742 #elif defined(__OS2__)
743 # if defined(INT_IS_SHORT)
749 ULONG ulCP[1], ulncp;
750 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
751 if (ulCP[0] == 932 || ulCP[0] == 943)
752 strcpy(buf, "Shift_JIS");
754 sprintf(buf, "CP%lu", ulCP[0]);
763 nkf_locale_encoding()
765 nkf_encoding *enc = 0;
766 char *encname = nkf_locale_charmap();
768 enc = nkf_enc_find(encname);
771 #endif /* DEFAULT_CODE_LOCALE */
776 return &nkf_encoding_table[UTF_8];
780 nkf_default_encoding()
782 nkf_encoding *enc = 0;
783 #ifdef DEFAULT_CODE_LOCALE
784 enc = nkf_locale_encoding();
785 #elif defined(DEFAULT_ENCIDX)
786 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
788 if (!enc) enc = nkf_utf8_encoding();
794 #define fprintf dllprintf
800 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
807 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
809 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
810 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
811 #ifdef UTF8_OUTPUT_ENABLE
812 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
814 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
815 #ifdef UTF8_INPUT_ENABLE
816 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
819 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
820 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
821 "r {de/en}crypt ROT13/47\n"
822 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
823 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
824 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
825 "l ISO8859-1 (Latin-1) support\n"
826 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
827 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
828 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
829 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
830 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
831 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
833 "T Text mode output\n"
835 "O Output to File (DEFAULT 'nkf.out')\n"
836 "I Convert non ISO-2022-JP charactor to GETA\n"
837 "d,c Convert line breaks -d: LF -c: CRLF\n"
838 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
839 "v, V Show this usage. V: show configuration\n"
841 "Long name options\n"
842 " --ic=<input codeset> --oc=<output codeset>\n"
843 " Specify the input or output codeset\n"
844 " --fj --unix --mac --windows\n"
845 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
846 " Convert for the system or code\n"
847 " --hiragana --katakana --katakana-hiragana\n"
848 " To Hiragana/Katakana Conversion\n"
849 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
851 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
853 #ifdef NUMCHAR_OPTION
854 " --numchar-input Convert Unicode Character Reference\n"
856 #ifdef UTF8_INPUT_ENABLE
857 " --fb-{skip, html, xml, perl, java, subchar}\n"
858 " Specify how nkf handles unassigned characters\n"
861 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
862 " Overwrite original listed files by filtered result\n"
863 " --overwrite preserves timestamp of original files\n"
865 " -g --guess Guess the input code\n"
866 " --help --version Show this help/the version\n"
867 " For more information, see also man nkf\n"
873 show_configuration(void)
876 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
879 " Compile-time options:\n"
880 " Compiled at: " __DATE__ " " __TIME__ "\n"
883 " Default output encoding: "
884 #ifdef DEFAULT_CODE_LOCALE
885 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
886 #elif defined(DEFAULT_ENCIDX)
887 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
893 " Default output end of line: "
894 #if DEFAULT_NEWLINE == CR
896 #elif DEFAULT_NEWLINE == CRLF
902 " Decode MIME encoded string: "
903 #if MIME_DECODE_DEFAULT
909 " Convert JIS X 0201 Katakana: "
916 " --help, --version output: "
917 #if HELP_OUTPUT_HELP_OUTPUT
928 get_backup_filename(const char *suffix, const char *filename)
930 char *backup_filename;
931 int asterisk_count = 0;
933 int filename_length = strlen(filename);
935 for(i = 0; suffix[i]; i++){
936 if(suffix[i] == '*') asterisk_count++;
940 backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
941 if (!backup_filename){
942 perror("Can't malloc backup filename.");
946 for(i = 0, j = 0; suffix[i];){
947 if(suffix[i] == '*'){
948 backup_filename[j] = '\0';
949 strncat(backup_filename, filename, filename_length);
951 j += filename_length;
953 backup_filename[j++] = suffix[i++];
956 backup_filename[j] = '\0';
958 j = filename_length + strlen(suffix);
959 backup_filename = malloc(j + 1);
960 strcpy(backup_filename, filename);
961 strcat(backup_filename, suffix);
962 backup_filename[j] = '\0';
964 return backup_filename;
968 #ifdef UTF8_INPUT_ENABLE
970 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
977 (*f)(0, bin2hex(c>>shift));
988 encode_fallback_html(nkf_char c)
993 if(c >= NKF_INT32_C(1000000))
994 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
995 if(c >= NKF_INT32_C(100000))
996 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
998 (*oconv)(0, 0x30+(c/10000 )%10);
1000 (*oconv)(0, 0x30+(c/1000 )%10);
1002 (*oconv)(0, 0x30+(c/100 )%10);
1004 (*oconv)(0, 0x30+(c/10 )%10);
1006 (*oconv)(0, 0x30+ c %10);
1012 encode_fallback_xml(nkf_char c)
1017 nkf_each_char_to_hex(oconv, c);
1023 encode_fallback_java(nkf_char c)
1027 if(!nkf_char_unicode_bmp_p(c)){
1031 (*oconv)(0, bin2hex(c>>20));
1032 (*oconv)(0, bin2hex(c>>16));
1036 (*oconv)(0, bin2hex(c>>12));
1037 (*oconv)(0, bin2hex(c>> 8));
1038 (*oconv)(0, bin2hex(c>> 4));
1039 (*oconv)(0, bin2hex(c ));
1044 encode_fallback_perl(nkf_char c)
1049 nkf_each_char_to_hex(oconv, c);
1055 encode_fallback_subchar(nkf_char c)
1057 c = unicode_subchar;
1058 (*oconv)((c>>8)&0xFF, c&0xFF);
1063 static const struct {
1087 {"katakana-hiragana","h3"},
1095 #ifdef UTF8_OUTPUT_ENABLE
1105 {"fb-subchar=", ""},
1107 #ifdef UTF8_INPUT_ENABLE
1108 {"utf8-input", "W"},
1109 {"utf16-input", "W16"},
1110 {"no-cp932ext", ""},
1111 {"no-best-fit-chars",""},
1113 #ifdef UNICODE_NORMALIZATION
1114 {"utf8mac-input", ""},
1126 #ifdef NUMCHAR_OPTION
1127 {"numchar-input", ""},
1133 #ifdef SHIFTJIS_CP932
1144 set_input_encoding(nkf_encoding *enc)
1146 switch (nkf_enc_to_index(enc)) {
1153 #ifdef SHIFTJIS_CP932
1156 #ifdef UTF8_OUTPUT_ENABLE
1157 ms_ucs_map_f = UCS_MAP_CP932;
1167 case ISO_2022_JP_2004:
1174 #ifdef SHIFTJIS_CP932
1177 #ifdef UTF8_OUTPUT_ENABLE
1178 ms_ucs_map_f = UCS_MAP_CP932;
1183 #ifdef SHIFTJIS_CP932
1186 #ifdef UTF8_OUTPUT_ENABLE
1187 ms_ucs_map_f = UCS_MAP_CP10001;
1195 #ifdef SHIFTJIS_CP932
1198 #ifdef UTF8_OUTPUT_ENABLE
1199 ms_ucs_map_f = UCS_MAP_CP932;
1203 #ifdef SHIFTJIS_CP932
1206 #ifdef UTF8_OUTPUT_ENABLE
1207 ms_ucs_map_f = UCS_MAP_MS;
1211 #ifdef SHIFTJIS_CP932
1214 #ifdef UTF8_OUTPUT_ENABLE
1215 ms_ucs_map_f = UCS_MAP_ASCII;
1218 case SHIFT_JISX0213:
1219 case SHIFT_JIS_2004:
1221 #ifdef SHIFTJIS_CP932
1228 #ifdef SHIFTJIS_CP932
1232 #ifdef UTF8_INPUT_ENABLE
1233 #ifdef UNICODE_NORMALIZATION
1241 input_endian = ENDIAN_BIG;
1245 input_endian = ENDIAN_LITTLE;
1250 input_endian = ENDIAN_BIG;
1254 input_endian = ENDIAN_LITTLE;
1261 set_output_encoding(nkf_encoding *enc)
1263 switch (nkf_enc_to_index(enc)) {
1266 #ifdef SHIFTJIS_CP932
1267 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1269 #ifdef UTF8_OUTPUT_ENABLE
1270 ms_ucs_map_f = UCS_MAP_CP932;
1274 #ifdef SHIFTJIS_CP932
1275 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1277 #ifdef UTF8_OUTPUT_ENABLE
1278 ms_ucs_map_f = UCS_MAP_CP932;
1283 #ifdef SHIFTJIS_CP932
1284 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1290 #ifdef SHIFTJIS_CP932
1291 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1297 #ifdef UTF8_OUTPUT_ENABLE
1298 ms_ucs_map_f = UCS_MAP_CP932;
1302 #ifdef UTF8_OUTPUT_ENABLE
1303 ms_ucs_map_f = UCS_MAP_CP10001;
1308 #ifdef SHIFTJIS_CP932
1309 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1311 #ifdef UTF8_OUTPUT_ENABLE
1312 ms_ucs_map_f = UCS_MAP_ASCII;
1317 #ifdef SHIFTJIS_CP932
1318 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1320 #ifdef UTF8_OUTPUT_ENABLE
1321 ms_ucs_map_f = UCS_MAP_ASCII;
1325 #ifdef SHIFTJIS_CP932
1326 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1328 #ifdef UTF8_OUTPUT_ENABLE
1329 ms_ucs_map_f = UCS_MAP_CP932;
1334 #ifdef UTF8_OUTPUT_ENABLE
1335 ms_ucs_map_f = UCS_MAP_MS;
1340 #ifdef UTF8_OUTPUT_ENABLE
1341 ms_ucs_map_f = UCS_MAP_ASCII;
1344 case SHIFT_JISX0213:
1345 case SHIFT_JIS_2004:
1347 #ifdef SHIFTJIS_CP932
1348 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1355 #ifdef SHIFTJIS_CP932
1356 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1359 #ifdef UTF8_OUTPUT_ENABLE
1361 output_bom_f = TRUE;
1365 output_bom_f = TRUE;
1368 output_endian = ENDIAN_LITTLE;
1369 output_bom_f = FALSE;
1372 output_endian = ENDIAN_LITTLE;
1373 output_bom_f = TRUE;
1376 output_bom_f = TRUE;
1379 output_endian = ENDIAN_LITTLE;
1380 output_bom_f = FALSE;
1383 output_endian = ENDIAN_LITTLE;
1384 output_bom_f = TRUE;
1390 static struct input_code*
1391 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1394 struct input_code *p = input_code_list;
1396 if (iconv_func == p->iconv_func){
1406 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1408 #ifdef INPUT_CODE_FIX
1409 if (f || !input_encoding)
1416 #ifdef INPUT_CODE_FIX
1417 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1423 if (estab_f && iconv_for_check != iconv){
1424 struct input_code *p = find_inputcode_byfunc(iconv);
1426 set_input_codename(p->name);
1429 iconv_for_check = iconv;
1436 x0212_shift(nkf_char c)
1441 if (0x75 <= c && c <= 0x7f){
1442 ret = c + (0x109 - 0x75);
1445 if (0x75 <= c && c <= 0x7f){
1446 ret = c + (0x113 - 0x75);
1454 x0212_unshift(nkf_char c)
1457 if (0x7f <= c && c <= 0x88){
1458 ret = c + (0x75 - 0x7f);
1459 }else if (0x89 <= c && c <= 0x92){
1460 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1464 #endif /* X0212_ENABLE */
1467 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1473 if((0x21 <= ndx && ndx <= 0x2F)){
1474 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1475 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1477 }else if(0x6E <= ndx && ndx <= 0x7E){
1478 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1479 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1485 else if(nkf_isgraph(ndx)){
1487 const unsigned short *ptr;
1488 ptr = x0212_shiftjis[ndx - 0x21];
1490 val = ptr[(c1 & 0x7f) - 0x21];
1499 c2 = x0212_shift(c2);
1501 #endif /* X0212_ENABLE */
1503 if(0x7F < c2) return 1;
1504 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1505 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1510 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1512 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1515 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1516 if (0xFC < c1) return 1;
1517 #ifdef SHIFTJIS_CP932
1518 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1519 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1526 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1527 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1533 #endif /* SHIFTJIS_CP932 */
1535 if (!x0213_f && is_ibmext_in_sjis(c2)){
1536 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1539 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1552 if(x0213_f && c2 >= 0xF0){
1553 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1554 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1555 }else{ /* 78<=k<=94 */
1556 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1557 if (0x9E < c1) c2++;
1560 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1561 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1562 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1563 if (0x9E < c1) c2++;
1566 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1573 c2 = x0212_unshift(c2);
1580 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
1582 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1590 }else if (val < 0x800){
1591 *p1 = 0xc0 | (val >> 6);
1592 *p2 = 0x80 | (val & 0x3f);
1595 } else if (nkf_char_unicode_bmp_p(val)) {
1596 *p1 = 0xe0 | (val >> 12);
1597 *p2 = 0x80 | ((val >> 6) & 0x3f);
1598 *p3 = 0x80 | ( val & 0x3f);
1600 } else if (nkf_char_unicode_value_p(val)) {
1601 *p1 = 0xe0 | (val >> 16);
1602 *p2 = 0x80 | ((val >> 12) & 0x3f);
1603 *p3 = 0x80 | ((val >> 6) & 0x3f);
1604 *p4 = 0x80 | ( val & 0x3f);
1614 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1621 else if (c1 <= 0xC3) {
1622 /* trail byte or invalid */
1625 else if (c1 <= 0xDF) {
1627 wc = (c1 & 0x1F) << 6;
1630 else if (c1 <= 0xEF) {
1632 wc = (c1 & 0x0F) << 12;
1633 wc |= (c2 & 0x3F) << 6;
1636 else if (c2 <= 0xF4) {
1638 wc = (c1 & 0x0F) << 18;
1639 wc |= (c2 & 0x3F) << 12;
1640 wc |= (c3 & 0x3F) << 6;
1650 #ifdef UTF8_INPUT_ENABLE
1652 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1653 const unsigned short *const *pp, nkf_char psize,
1654 nkf_char *p2, nkf_char *p1)
1657 const unsigned short *p;
1660 if (pp == 0) return 1;
1663 if (c1 < 0 || psize <= c1) return 1;
1665 if (p == 0) return 1;
1668 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1670 if (val == 0) return 1;
1671 if (no_cp932ext_f && (
1672 (val>>8) == 0x2D || /* NEC special characters */
1673 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1681 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1689 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1691 const unsigned short *const *pp;
1692 const unsigned short *const *const *ppp;
1693 static const char no_best_fit_chars_table_C2[] =
1694 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1695 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1696 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1697 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1698 static const char no_best_fit_chars_table_C2_ms[] =
1699 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1700 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1701 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1702 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1703 static const char no_best_fit_chars_table_932_C2[] =
1704 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1705 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1706 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1707 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1708 static const char no_best_fit_chars_table_932_C3[] =
1709 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1710 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1711 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1712 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1718 }else if(c2 < 0xe0){
1719 if(no_best_fit_chars_f){
1720 if(ms_ucs_map_f == UCS_MAP_CP932){
1723 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1726 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1729 }else if(!cp932inv_f){
1732 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1735 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1738 }else if(ms_ucs_map_f == UCS_MAP_MS){
1739 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1740 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1758 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1759 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1760 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1762 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1763 }else if(c0 < 0xF0){
1764 if(no_best_fit_chars_f){
1765 if(ms_ucs_map_f == UCS_MAP_CP932){
1766 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1767 }else if(ms_ucs_map_f == UCS_MAP_MS){
1772 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1775 if(c0 == 0x92) return 1;
1780 if(c1 == 0x80 || c0 == 0x9C) return 1;
1783 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1788 if(c0 == 0x94) return 1;
1791 if(c0 == 0xBB) return 1;
1801 if(c0 == 0x95) return 1;
1804 if(c0 == 0xA5) return 1;
1811 if(c0 == 0x8D) return 1;
1814 if(c0 == 0x9E && !cp932inv_f) return 1;
1817 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1825 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1826 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1827 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1829 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1831 #ifdef SHIFTJIS_CP932
1832 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1834 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1835 s2e_conv(s2, s1, p2, p1);
1844 #ifdef UTF8_OUTPUT_ENABLE
1846 e2w_conv(nkf_char c2, nkf_char c1)
1848 const unsigned short *p;
1850 if (c2 == JIS_X_0201_1976_K) {
1851 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1859 p = euc_to_utf8_1byte;
1861 } else if (is_eucg3(c2)){
1862 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1865 c2 = (c2&0x7f) - 0x21;
1866 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1867 p = x0212_to_utf8_2bytes[c2];
1873 c2 = (c2&0x7f) - 0x21;
1874 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1876 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1877 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1878 euc_to_utf8_2bytes_ms[c2];
1883 c1 = (c1 & 0x7f) - 0x21;
1884 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1891 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1898 }else if (0xc0 <= c2 && c2 <= 0xef) {
1899 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1900 #ifdef NUMCHAR_OPTION
1903 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1911 #ifdef UTF8_INPUT_ENABLE
1913 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1915 nkf_char c1, c2, c3, c4;
1922 else if (nkf_char_unicode_bmp_p(val)){
1923 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1924 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1927 *p1 = nkf_char_unicode_new(val);
1933 *p1 = nkf_char_unicode_new(val);
1940 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1942 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1943 if (iso2022jp_f && !x0201_f) {
1944 c2 = GETA1; c1 = GETA2;
1946 c2 = JIS_X_0201_1976_K;
1950 }else if (c2 == 0x8f){
1954 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1955 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1956 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1959 c2 = (c2 << 8) | (c1 & 0x7f);
1961 #ifdef SHIFTJIS_CP932
1964 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1965 s2e_conv(s2, s1, &c2, &c1);
1972 #endif /* SHIFTJIS_CP932 */
1974 #endif /* X0212_ENABLE */
1975 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
1978 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1979 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1980 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1985 #ifdef SHIFTJIS_CP932
1986 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1988 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1989 s2e_conv(s2, s1, &c2, &c1);
1996 #endif /* SHIFTJIS_CP932 */
2004 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2006 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2007 if (iso2022jp_f && !x0201_f) {
2008 c2 = GETA1; c1 = GETA2;
2012 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2014 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2016 if(c1 == 0x7F) return 0;
2017 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2020 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2021 if (ret) return ret;
2028 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2030 nkf_char ret = 0, c4 = 0;
2031 static const char w_iconv_utf8_1st_byte[] =
2033 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2034 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2035 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2036 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2043 if (c1 < 0 || 0xff < c1) {
2044 }else if (c1 == 0) { /* 0 : 1 byte*/
2046 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2049 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2051 if (c2 < 0x80 || 0xBF < c2) return 0;
2054 if (c3 == 0) return -1;
2055 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2060 if (c3 == 0) return -1;
2061 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2065 if (c3 == 0) return -1;
2066 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2070 if (c3 == 0) return -2;
2071 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2075 if (c3 == 0) return -2;
2076 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2080 if (c3 == 0) return -2;
2081 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2089 if (c1 == 0 || c1 == EOF){
2090 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2091 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2094 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* Status returned by the Unicode input converters when a code point is
 * rejected (unicode_iconv returns it for an unpaired surrogate and for
 * values outside the accepted range).  The replacement list is
 * parenthesized so the negative literal always expands as one operand. */
#define NKF_ICONV_INVALID_CODE_RANGE (-13)
2104 unicode_iconv(nkf_char wc)
2112 }else if ((wc>>11) == 27) {
2113 /* unpaired surrogate */
2114 return NKF_ICONV_INVALID_CODE_RANGE;
2115 }else if (wc < 0xFFFF) {
2116 ret = w16e_conv(wc, &c2, &c1);
2117 if (ret) return ret;
2118 }else if (wc < 0x10FFFF) {
2120 c1 = nkf_char_unicode_new(wc);
2122 return NKF_ICONV_INVALID_CODE_RANGE;
/* Negative status codes that ask the caller to supply more input bytes.
 * nkf_iconv_utf_16 returns NKF_ICONV_NEED_TWO_MORE_BYTES when the unit
 * following a high surrogate is not a low surrogate.  Both replacement
 * lists are parenthesized so the negative literal expands as one operand. */
#define NKF_ICONV_NEED_ONE_MORE_BYTE (-1)
#define NKF_ICONV_NEED_TWO_MORE_BYTES (-2)
/* Combine a UTF-16 surrogate pair into a single code point.  The constant
 * is (0xD800 << 10) + 0xDC00 - 0x10000: both surrogate biases and the
 * supplementary-plane offset folded into one subtraction. */
#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2132 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2141 if (input_endian == ENDIAN_BIG) {
2142 if (0xD8 <= c1 && c1 <= 0xDB) {
2143 if (0xDC <= c3 && c3 <= 0xDF) {
2144 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2145 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2150 if (0xD8 <= c2 && c2 <= 0xDB) {
2151 if (0xDC <= c4 && c4 <= 0xDF) {
2152 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2153 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2159 return (*unicode_iconv)(wc);
2163 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2169 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2175 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2184 switch(input_endian){
2186 wc = c2 << 16 | c3 << 8 | c4;
2189 wc = c3 << 16 | c2 << 8 | c1;
2192 wc = c1 << 16 | c4 << 8 | c3;
2195 wc = c4 << 16 | c1 << 8 | c2;
2198 return NKF_ICONV_INVALID_CODE_RANGE;
2201 return (*unicode_iconv)(wc);
2205 #define output_ascii_escape_sequence(mode) do { \
2206 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2209 (*o_putc)(ascii_intro); \
2210 output_mode = mode; \
2215 output_escape_sequence(int mode)
2217 if (output_mode == mode)
2225 case JIS_X_0201_1976_K:
2233 (*o_putc)(kanji_intro);
2258 j_oconv(nkf_char c2, nkf_char c1)
2260 #ifdef NUMCHAR_OPTION
2261 if (c2 == 0 && nkf_char_unicode_p(c1)){
2262 w16e_conv(c1, &c2, &c1);
2263 if (c2 == 0 && nkf_char_unicode_p(c1)){
2264 c2 = c1 & VALUE_MASK;
2265 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2268 c2 = 0x7F + c1 / 94;
2269 c1 = 0x21 + c1 % 94;
2271 if (encode_fallback) (*encode_fallback)(c1);
2278 output_ascii_escape_sequence(ASCII);
2281 else if (c2 == EOF) {
2282 output_ascii_escape_sequence(ASCII);
2285 else if (c2 == ISO_8859_1) {
2286 output_ascii_escape_sequence(ISO_8859_1);
2289 else if (c2 == JIS_X_0201_1976_K) {
2290 output_escape_sequence(JIS_X_0201_1976_K);
2293 } else if (is_eucg3(c2)){
2294 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2295 (*o_putc)(c2 & 0x7f);
2300 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2301 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2302 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2309 e_oconv(nkf_char c2, nkf_char c1)
2311 if (c2 == 0 && nkf_char_unicode_p(c1)){
2312 w16e_conv(c1, &c2, &c1);
2313 if (c2 == 0 && nkf_char_unicode_p(c1)){
2314 c2 = c1 & VALUE_MASK;
2315 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2319 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2320 c1 = 0x21 + c1 % 94;
2323 (*o_putc)((c2 & 0x7f) | 0x080);
2324 (*o_putc)(c1 | 0x080);
2326 (*o_putc)((c2 & 0x7f) | 0x080);
2327 (*o_putc)(c1 | 0x080);
2331 if (encode_fallback) (*encode_fallback)(c1);
2339 } else if (c2 == 0) {
2340 output_mode = ASCII;
2342 } else if (c2 == JIS_X_0201_1976_K) {
2343 output_mode = EUC_JP;
2344 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2345 } else if (c2 == ISO_8859_1) {
2346 output_mode = ISO_8859_1;
2347 (*o_putc)(c1 | 0x080);
2349 } else if (is_eucg3(c2)){
2350 output_mode = EUC_JP;
2351 #ifdef SHIFTJIS_CP932
2354 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2355 s2e_conv(s2, s1, &c2, &c1);
2360 output_mode = ASCII;
2362 }else if (is_eucg3(c2)){
2365 (*o_putc)((c2 & 0x7f) | 0x080);
2366 (*o_putc)(c1 | 0x080);
2369 (*o_putc)((c2 & 0x7f) | 0x080);
2370 (*o_putc)(c1 | 0x080);
2374 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2375 set_iconv(FALSE, 0);
2376 return; /* too late to rescue this char */
2378 output_mode = EUC_JP;
2379 (*o_putc)(c2 | 0x080);
2380 (*o_putc)(c1 | 0x080);
2385 s_oconv(nkf_char c2, nkf_char c1)
2387 #ifdef NUMCHAR_OPTION
2388 if (c2 == 0 && nkf_char_unicode_p(c1)){
2389 w16e_conv(c1, &c2, &c1);
2390 if (c2 == 0 && nkf_char_unicode_p(c1)){
2391 c2 = c1 & VALUE_MASK;
2392 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2395 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2397 c1 += 0x40 + (c1 > 0x3e);
2402 if(encode_fallback)(*encode_fallback)(c1);
2411 } else if (c2 == 0) {
2412 output_mode = ASCII;
2414 } else if (c2 == JIS_X_0201_1976_K) {
2415 output_mode = SHIFT_JIS;
2417 } else if (c2 == ISO_8859_1) {
2418 output_mode = ISO_8859_1;
2419 (*o_putc)(c1 | 0x080);
2421 } else if (is_eucg3(c2)){
2422 output_mode = SHIFT_JIS;
2423 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2429 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2430 set_iconv(FALSE, 0);
2431 return; /* too late to rescue this char */
2433 output_mode = SHIFT_JIS;
2434 e2s_conv(c2, c1, &c2, &c1);
2436 #ifdef SHIFTJIS_CP932
2438 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2439 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2445 #endif /* SHIFTJIS_CP932 */
2448 if (prefix_table[(unsigned char)c1]){
2449 (*o_putc)(prefix_table[(unsigned char)c1]);
2455 #ifdef UTF8_OUTPUT_ENABLE
2457 w_oconv(nkf_char c2, nkf_char c1)
2463 output_bom_f = FALSE;
2474 if (c2 == 0 && nkf_char_unicode_p(c1)){
2475 val = c1 & VALUE_MASK;
2476 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2478 if (c2) (*o_putc)(c2);
2479 if (c3) (*o_putc)(c3);
2480 if (c4) (*o_putc)(c4);
2487 val = e2w_conv(c2, c1);
2489 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2491 if (c2) (*o_putc)(c2);
2492 if (c3) (*o_putc)(c3);
2493 if (c4) (*o_putc)(c4);
2499 w_oconv16(nkf_char c2, nkf_char c1)
2502 output_bom_f = FALSE;
2503 if (output_endian == ENDIAN_LITTLE){
2517 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2518 if (nkf_char_unicode_bmp_p(c1)) {
2519 c2 = (c1 >> 8) & 0xff;
2523 if (c1 <= UNICODE_MAX) {
2524 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2525 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2526 if (output_endian == ENDIAN_LITTLE){
2527 (*o_putc)(c2 & 0xff);
2528 (*o_putc)((c2 >> 8) & 0xff);
2529 (*o_putc)(c1 & 0xff);
2530 (*o_putc)((c1 >> 8) & 0xff);
2532 (*o_putc)((c2 >> 8) & 0xff);
2533 (*o_putc)(c2 & 0xff);
2534 (*o_putc)((c1 >> 8) & 0xff);
2535 (*o_putc)(c1 & 0xff);
2541 nkf_char val = e2w_conv(c2, c1);
2542 c2 = (val >> 8) & 0xff;
2547 if (output_endian == ENDIAN_LITTLE){
2557 w_oconv32(nkf_char c2, nkf_char c1)
2560 output_bom_f = FALSE;
2561 if (output_endian == ENDIAN_LITTLE){
2579 if (c2 == ISO_8859_1) {
2581 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2584 c1 = e2w_conv(c2, c1);
2587 if (output_endian == ENDIAN_LITTLE){
2588 (*o_putc)( c1 & 0xFF);
2589 (*o_putc)((c1 >> 8) & 0xFF);
2590 (*o_putc)((c1 >> 16) & 0xFF);
2594 (*o_putc)((c1 >> 16) & 0xFF);
2595 (*o_putc)((c1 >> 8) & 0xFF);
2596 (*o_putc)( c1 & 0xFF);
/* Single-bit flags, each one the previous flag shifted left by one bit.
 * set_code_score() ORs them into input_code.score and clr_code_score()
 * masks them back out. */
2601 #define SCORE_L2 (1) /* JIS level-2 kanji */
2602 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2603 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2604 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped through CP932 (IBM extended characters) */
2605 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2606 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* character that does not exist */
2607 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
2608 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
2610 #define SCORE_INIT (SCORE_iMIME) /* starting score; status_reset() assigns it */
2612 static const char score_table_A0[] = {
2615 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2616 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2619 static const char score_table_F0[] = {
2620 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2621 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2622 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2623 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2627 set_code_score(struct input_code *ptr, nkf_char score)
2630 ptr->score |= score;
2635 clr_code_score(struct input_code *ptr, nkf_char score)
2638 ptr->score &= ~score;
2643 code_score(struct input_code *ptr)
2645 nkf_char c2 = ptr->buf[0];
2646 #ifdef UTF8_OUTPUT_ENABLE
2647 nkf_char c1 = ptr->buf[1];
2650 set_code_score(ptr, SCORE_ERROR);
2651 }else if (c2 == SS2){
2652 set_code_score(ptr, SCORE_KANA);
2653 }else if (c2 == 0x8f){
2654 set_code_score(ptr, SCORE_X0212);
2655 #ifdef UTF8_OUTPUT_ENABLE
2656 }else if (!e2w_conv(c2, c1)){
2657 set_code_score(ptr, SCORE_NO_EXIST);
2659 }else if ((c2 & 0x70) == 0x20){
2660 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2661 }else if ((c2 & 0x70) == 0x70){
2662 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2663 }else if ((c2 & 0x70) >= 0x50){
2664 set_code_score(ptr, SCORE_L2);
2669 status_disable(struct input_code *ptr)
2674 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2678 status_push_ch(struct input_code *ptr, nkf_char c)
2680 ptr->buf[ptr->index++] = c;
2684 status_clear(struct input_code *ptr)
2691 status_reset(struct input_code *ptr)
2694 ptr->score = SCORE_INIT;
2698 status_reinit(struct input_code *ptr)
2701 ptr->_file_stat = 0;
2705 status_check(struct input_code *ptr, nkf_char c)
2707 if (c <= DEL && estab_f){
2713 s_status(struct input_code *ptr, nkf_char c)
2717 status_check(ptr, c);
2722 }else if (nkf_char_unicode_p(c)){
2724 }else if (0xa1 <= c && c <= 0xdf){
2725 status_push_ch(ptr, SS2);
2726 status_push_ch(ptr, c);
2729 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2731 status_push_ch(ptr, c);
2732 }else if (0xed <= c && c <= 0xee){
2734 status_push_ch(ptr, c);
2735 #ifdef SHIFTJIS_CP932
2736 }else if (is_ibmext_in_sjis(c)){
2738 status_push_ch(ptr, c);
2739 #endif /* SHIFTJIS_CP932 */
2741 }else if (0xf0 <= c && c <= 0xfc){
2743 status_push_ch(ptr, c);
2744 #endif /* X0212_ENABLE */
2746 status_disable(ptr);
2750 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2751 status_push_ch(ptr, c);
2752 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2756 status_disable(ptr);
2760 #ifdef SHIFTJIS_CP932
2761 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2762 status_push_ch(ptr, c);
2763 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2764 set_code_score(ptr, SCORE_CP932);
2769 #endif /* SHIFTJIS_CP932 */
2770 status_disable(ptr);
2773 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2774 status_push_ch(ptr, c);
2775 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2776 set_code_score(ptr, SCORE_CP932);
2779 status_disable(ptr);
2786 e_status(struct input_code *ptr, nkf_char c)
2790 status_check(ptr, c);
2795 }else if (nkf_char_unicode_p(c)){
2797 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2799 status_push_ch(ptr, c);
2801 }else if (0x8f == c){
2803 status_push_ch(ptr, c);
2804 #endif /* X0212_ENABLE */
2806 status_disable(ptr);
2810 if (0xa1 <= c && c <= 0xfe){
2811 status_push_ch(ptr, c);
2815 status_disable(ptr);
2820 if (0xa1 <= c && c <= 0xfe){
2822 status_push_ch(ptr, c);
2824 status_disable(ptr);
2826 #endif /* X0212_ENABLE */
2830 #ifdef UTF8_INPUT_ENABLE
2832 w_status(struct input_code *ptr, nkf_char c)
2836 status_check(ptr, c);
2841 }else if (nkf_char_unicode_p(c)){
2843 }else if (0xc0 <= c && c <= 0xdf){
2845 status_push_ch(ptr, c);
2846 }else if (0xe0 <= c && c <= 0xef){
2848 status_push_ch(ptr, c);
2849 }else if (0xf0 <= c && c <= 0xf4){
2851 status_push_ch(ptr, c);
2853 status_disable(ptr);
2858 if (0x80 <= c && c <= 0xbf){
2859 status_push_ch(ptr, c);
2860 if (ptr->index > ptr->stat){
2861 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2862 && ptr->buf[2] == 0xbf);
2863 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2864 &ptr->buf[0], &ptr->buf[1]);
2871 status_disable(ptr);
2875 if (0x80 <= c && c <= 0xbf){
2876 if (ptr->index < ptr->stat){
2877 status_push_ch(ptr, c);
2882 status_disable(ptr);
2890 code_status(nkf_char c)
2892 int action_flag = 1;
2893 struct input_code *result = 0;
2894 struct input_code *p = input_code_list;
2896 if (!p->status_func) {
2900 if (!p->status_func)
2902 (p->status_func)(p, c);
2905 }else if(p->stat == 0){
2916 if (result && !estab_f){
2917 set_iconv(TRUE, result->iconv_func);
2918 }else if (c <= DEL){
2919 struct input_code *ptr = input_code_list;
2933 return std_gc_buf[--std_gc_ndx];
2940 std_ungetc(nkf_char c, FILE *f)
2942 if (std_gc_ndx == STD_GC_BUFSIZE){
2945 std_gc_buf[std_gc_ndx++] = c;
2951 std_putc(nkf_char c)
2958 static unsigned char hold_buf[HOLD_SIZE*2];
2959 static int hold_count = 0;
2961 push_hold_buf(nkf_char c2)
2963 if (hold_count >= HOLD_SIZE*2)
2965 hold_buf[hold_count++] = (unsigned char)c2;
2966 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2970 h_conv(FILE *f, int c1, int c2)
2976 /** it must NOT be in the kanji shift sequence */
2977 /** it must NOT be written in JIS7 */
2978 /** and it must be after 2 byte 8bit code */
2984 while ((c2 = (*i_getc)(f)) != EOF) {
2990 if (push_hold_buf(c2) == EOF || estab_f) {
2996 struct input_code *p = input_code_list;
2997 struct input_code *result = p;
3002 if (p->status_func && p->score < result->score) {
3007 set_iconv(TRUE, result->iconv_func);
3012 ** 1) EOF is detected, or
3013 ** 2) Code is established, or
3014 ** 3) Buffer is FULL (but last word is pushed)
3016 ** in 1) and 3) cases, we continue to use
3017 ** Kanji codes by oconv and leave estab_f unchanged.
3022 while (hold_index < hold_count){
3023 c1 = hold_buf[hold_index++];
3027 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3028 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3031 if (hold_index < hold_count){
3032 c2 = hold_buf[hold_index++];
3042 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3045 if (hold_index < hold_count){
3046 c3 = hold_buf[hold_index++];
3047 } else if ((c3 = (*i_getc)(f)) == EOF) {
3052 if (hold_index < hold_count){
3053 c4 = hold_buf[hold_index++];
3054 } else if ((c4 = (*i_getc)(f)) == EOF) {
3059 (*iconv)(c1, c2, (c3<<8)|c4);
3064 /* 3 bytes EUC or UTF-8 */
3065 if (hold_index < hold_count){
3066 c3 = hold_buf[hold_index++];
3067 } else if ((c3 = (*i_getc)(f)) == EOF) {
3073 (*iconv)(c1, c2, c3);
3076 if (c3 == EOF) break;
3082 * Check and Ignore BOM
3088 switch(c2 = (*i_getc)(f)){
3090 if((c2 = (*i_getc)(f)) == 0x00){
3091 if((c2 = (*i_getc)(f)) == 0xFE){
3092 if((c2 = (*i_getc)(f)) == 0xFF){
3093 if(!input_encoding){
3094 set_iconv(TRUE, w_iconv32);
3096 if (iconv == w_iconv32) {
3097 input_endian = ENDIAN_BIG;
3100 (*i_ungetc)(0xFF,f);
3101 }else (*i_ungetc)(c2,f);
3102 (*i_ungetc)(0xFE,f);
3103 }else if(c2 == 0xFF){
3104 if((c2 = (*i_getc)(f)) == 0xFE){
3105 if(!input_encoding){
3106 set_iconv(TRUE, w_iconv32);
3108 if (iconv == w_iconv32) {
3109 input_endian = ENDIAN_2143;
3112 (*i_ungetc)(0xFF,f);
3113 }else (*i_ungetc)(c2,f);
3114 (*i_ungetc)(0xFF,f);
3115 }else (*i_ungetc)(c2,f);
3116 (*i_ungetc)(0x00,f);
3117 }else (*i_ungetc)(c2,f);
3118 (*i_ungetc)(0x00,f);
3121 if((c2 = (*i_getc)(f)) == 0xBB){
3122 if((c2 = (*i_getc)(f)) == 0xBF){
3123 if(!input_encoding){
3124 set_iconv(TRUE, w_iconv);
3126 if (iconv == w_iconv) {
3129 (*i_ungetc)(0xBF,f);
3130 }else (*i_ungetc)(c2,f);
3131 (*i_ungetc)(0xBB,f);
3132 }else (*i_ungetc)(c2,f);
3133 (*i_ungetc)(0xEF,f);
3136 if((c2 = (*i_getc)(f)) == 0xFF){
3137 if((c2 = (*i_getc)(f)) == 0x00){
3138 if((c2 = (*i_getc)(f)) == 0x00){
3139 if(!input_encoding){
3140 set_iconv(TRUE, w_iconv32);
3142 if (iconv == w_iconv32) {
3143 input_endian = ENDIAN_3412;
3146 (*i_ungetc)(0x00,f);
3147 }else (*i_ungetc)(c2,f);
3148 (*i_ungetc)(0x00,f);
3149 }else (*i_ungetc)(c2,f);
3150 if(!input_encoding){
3151 set_iconv(TRUE, w_iconv16);
3153 if (iconv == w_iconv16) {
3154 input_endian = ENDIAN_BIG;
3157 (*i_ungetc)(0xFF,f);
3158 }else (*i_ungetc)(c2,f);
3159 (*i_ungetc)(0xFE,f);
3162 if((c2 = (*i_getc)(f)) == 0xFE){
3163 if((c2 = (*i_getc)(f)) == 0x00){
3164 if((c2 = (*i_getc)(f)) == 0x00){
3165 if(!input_encoding){
3166 set_iconv(TRUE, w_iconv32);
3168 if (iconv == w_iconv32) {
3169 input_endian = ENDIAN_LITTLE;
3172 (*i_ungetc)(0x00,f);
3173 }else (*i_ungetc)(c2,f);
3174 (*i_ungetc)(0x00,f);
3175 }else (*i_ungetc)(c2,f);
3176 if(!input_encoding){
3177 set_iconv(TRUE, w_iconv16);
3179 if (iconv == w_iconv16) {
3180 input_endian = ENDIAN_LITTLE;
3183 (*i_ungetc)(0xFE,f);
3184 }else (*i_ungetc)(c2,f);
3185 (*i_ungetc)(0xFF,f);
3200 init_broken_state(void)
3202 memset(&broken_state, 0, sizeof(broken_state));
3208 broken_state.buf[broken_state.count++] = c;
3212 pop_broken_buf(void)
3214 return broken_state.buf[--broken_state.count];
3218 broken_getc(FILE *f)
3222 if (broken_state.count > 0) {
3223 return pop_broken_buf();
3226 if (c=='$' && broken_state.status != ESC
3227 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3229 broken_state.status = 0;
3230 if (c1=='@'|| c1=='B') {
3231 push_broken_buf(c1);
3238 } else if (c=='(' && broken_state.status != ESC
3239 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3241 broken_state.status = 0;
3242 if (c1=='J'|| c1=='B') {
3243 push_broken_buf(c1);
3251 broken_state.status = c;
3257 broken_ungetc(nkf_char c, FILE *f)
3259 if (broken_state.count < 2)
3265 eol_conv(nkf_char c2, nkf_char c1)
3267 if (guess_f && input_eol != EOF) {
3268 if (c2 == 0 && c1 == LF) {
3269 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3270 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3271 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3273 else if (!input_eol) input_eol = CR;
3274 else if (input_eol != CR) input_eol = EOF;
3276 if (prev_cr || (c2 == 0 && c1 == LF)) {
3278 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3279 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3281 if (c2 == 0 && c1 == CR) prev_cr = CR;
3282 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3286 Return value of fold_conv()
3288 LF add newline and output char
3289 CR add newline and output nothing
3292 1 (or else) normal output
3294 fold state in prev (previous character)
3296 >0x80 Japanese (X0208/X0201)
3301 This fold algorithm does not preserve leading space in a line.
3302 This is the main difference from fmt.
/* Width of one character as counted by fold_conv: 2 when c2 is non-zero,
 * otherwise 1.  c1 is not used.  The argument is parenthesized so that an
 * expression argument (for example another conditional) cannot regroup
 * with the macro's own conditional operator. */
#define char_size(c2,c1) ((c2) ? 2 : 1)
3308 fold_conv(nkf_char c2, nkf_char c1)
3311 nkf_char fold_state;
3313 if (c1== CR && !fold_preserve_f) {
3314 fold_state=0; /* ignore cr */
3315 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3317 fold_state=0; /* ignore cr */
3318 } else if (c1== BS) {
3319 if (f_line>0) f_line--;
3321 } else if (c2==EOF && f_line != 0) { /* close open last line */
3323 } else if ((c1==LF && !fold_preserve_f)
3324 || ((c1==CR||(c1==LF&&f_prev!=CR))
3325 && fold_preserve_f)) {
3327 if (fold_preserve_f) {
3331 } else if ((f_prev == c1 && !fold_preserve_f)
3332 || (f_prev == LF && fold_preserve_f)
3333 ) { /* duplicate newline */
3336 fold_state = LF; /* output two newline */
3342 if (f_prev&0x80) { /* Japanese? */
3344 fold_state = 0; /* ignore given single newline */
3345 } else if (f_prev==SP) {
3349 if (++f_line<=fold_len)
3353 fold_state = CR; /* fold and output nothing */
3357 } else if (c1=='\f') {
3360 fold_state = LF; /* output newline and clear */
3361 } else if ( (c2==0 && c1==SP)||
3362 (c2==0 && c1==TAB)||
3363 (c2=='!'&& c1=='!')) {
3364 /* X0208 kankaku or ascii space */
3366 fold_state = 0; /* remove duplicate spaces */
3369 if (++f_line<=fold_len)
3370 fold_state = SP; /* output ASCII space only */
3372 f_prev = SP; f_line = 0;
3373 fold_state = CR; /* fold and output nothing */
3377 prev0 = f_prev; /* we still need this one... , but almost done */
3379 if (c2 || c2 == JIS_X_0201_1976_K)
3380 f_prev |= 0x80; /* this is Japanese */
3381 f_line += char_size(c2,c1);
3382 if (f_line<=fold_len) { /* normal case */
3385 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3386 f_line = char_size(c2,c1);
3387 fold_state = LF; /* We can't wait, do fold now */
3388 } else if (c2 == JIS_X_0201_1976_K) {
3389 /* simple kinsoku rules return 1 means no folding */
3390 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3391 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3392 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3393 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3394 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3395 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3396 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3398 fold_state = LF;/* add one new f_line before this character */
3401 fold_state = LF;/* add one new f_line before this character */
3404 /* kinsoku point in ASCII */
3405 if ( c1==')'|| /* { [ ( */
3416 /* just after special */
3417 } else if (!is_alnum(prev0)) {
3418 f_line = char_size(c2,c1);
3420 } else if ((prev0==SP) || /* ignored new f_line */
3421 (prev0==LF)|| /* ignored new f_line */
3422 (prev0&0x80)) { /* X0208 - ASCII */
3423 f_line = char_size(c2,c1);
3424 fold_state = LF;/* add one new f_line before this character */
3426 fold_state = 1; /* default no fold in ASCII */
3430 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3431 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3432 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3433 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3434 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3435 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3436 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3437 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3438 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3439 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3440 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3441 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3442 /* default no fold in kinsoku */
3445 f_line = char_size(c2,c1);
3446 /* add one new f_line before this character */
3449 f_line = char_size(c2,c1);
3451 /* add one new f_line before this character */
3456 /* terminator process */
3457 switch(fold_state) {
3459 OCONV_NEWLINE((*o_fconv));
3465 OCONV_NEWLINE((*o_fconv));
/*
 * z_conv: width-conversion stage of the output filter chain. It receives
 * one character as (c2, c1) and forwards results through (*o_zconv).
 * NOTE: this listing omits many lines of the function (braces, else
 * branches, returns), so only the visible statements are described.
 *  - z_prev2/z_prev1 are file-scope state. When z_prev2 is
 *    JIS_X_0201_1976_K, z_prev1 is used as the pending katakana that may
 *    combine with a following dakuten or handakuten. The dv, ev and cv
 *    tables are indexed with (code-SP)*2 and supply the two values passed
 *    to o_zconv.
 *  - alpha_f&1 with c2==0x23, c2==0x21, alpha_f&4 and alpha_f&8 select the
 *    alphabet, symbol and entity branches; most of their bodies are on
 *    lines not shown.
 *  - with alpha_f&8 and c2==0 the chosen entity string is emitted one
 *    byte at a time with c2 set to 0.
 *  - c2==0x25 looks c1-0x20 up in fullwidth_to_halfwidth; the high byte of
 *    a non-zero entry is sent first, and a later call sends the low byte
 *    (its guard is not shown).
 * NOTE(review): the four entity literals in the switch read as the bare
 * characters, and the double-quote case is not a valid string literal.
 * They look as if HTML entities (gt, lt, quot, amp) were unescaped by some
 * tool - confirm against the upstream source.
 */
3476 static nkf_char z_prev2=0,z_prev1=0;
3479 z_conv(nkf_char c2, nkf_char c1)
3482 /* if (c2) c1 &= 0x7f; assertion */
3484 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3490 if (z_prev2 == JIS_X_0201_1976_K) {
3491 if (c2 == JIS_X_0201_1976_K) {
3492 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3494 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3496 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3498 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3503 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3505 if (c2 == JIS_X_0201_1976_K) {
3506 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3507 /* wait for dakuten or handakuten */
3512 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3523 if (alpha_f&1 && c2 == 0x23) {
3524 /* JISX0208 Alphabet */
3526 } else if (c2 == 0x21) {
3527 /* JISX0208 Kigou */
3532 } else if (alpha_f&4) {
3537 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3543 if (alpha_f&8 && c2 == 0) {
3545 const char *entity = 0;
3547 case '>': entity = ">"; break;
3548 case '<': entity = "<"; break;
3549 case '\"': entity = """; break;
3550 case '&': entity = "&"; break;
3553 while (*entity) (*o_zconv)(0, *entity++);
3559 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3564 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3568 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3572 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3576 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3580 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3584 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3588 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3592 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3597 (*o_zconv)(JIS_X_0201_1976_K, c);
3600 } else if (c2 == 0x25) {
3601 /* JISX0208 Katakana */
3602 static const int fullwidth_to_halfwidth[] =
3604 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3605 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3606 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3607 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3608 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3609 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3610 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3611 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3612 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3613 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3614 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3615 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3617 if (fullwidth_to_halfwidth[c1-0x20]){
3618 c2 = fullwidth_to_halfwidth[c1-0x20];
3619 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3621 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/*
 * rot13(c) / rot47(c): character rotation macros built from chained
 * conditional expressions. The visible arms add 13 up to M and m and
 * subtract 13 up to Z and z (rot13), and add 47 up to O and subtract 47 up
 * to ~ (rot47). The lower-bound arms and the closing lines of both macros
 * are not visible in this listing.
 * The argument c is expanded several times and is not parenthesised, so
 * callers should pass a plain variable without side effects.
 */
3631 #define rot13(c) ( \
3633 (c <= 'M') ? (c + 13): \
3634 (c <= 'Z') ? (c - 13): \
3636 (c <= 'm') ? (c + 13): \
3637 (c <= 'z') ? (c - 13): \
3641 #define rot47(c) ( \
3643 ( c <= 'O') ? (c + 47) : \
3644 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: rotation stage of the output filter chain.
 * It tests whether c2 is 0, JIS_X_0201_1976_K or ISO_8859_1 and hands the
 * (c2, c1) pair on to the next stage through (*o_rot_conv).
 * The statements that rewrite c1/c2 inside the branches are not visible in
 * this listing (presumably they apply rot13/rot47 - confirm upstream).
 */
3649 rot_conv(nkf_char c2, nkf_char c1)
3651 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3657 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: kana conversion stage of the output filter chain; every
 * visible path ends by forwarding (c2, c1) to (*o_hira_conv).
 * Visible tests:
 *  - c1 in 0x21..0x73, and c1 == 0x74 when the output encoding is Unicode;
 *    in the latter case c1 is replaced by the Unicode character U+3094.
 *  - c2 == 0x21 with c1 0x33/0x34, and later c2 == 0x21 with c1 0x35/0x36.
 *  - c2 == 0 with c1 equal to U+3094, and c2 == 0x24 with c1 in 0x21..0x73.
 * The outer tests on c2 and on the conversion flags, and the assignments
 * that change c2, are on lines not shown here, so the direction of each
 * conversion cannot be confirmed from this listing.
 */
3661 hira_conv(nkf_char c2, nkf_char c1)
3665 if (0x20 < c1 && c1 < 0x74) {
3667 (*o_hira_conv)(c2,c1);
3669 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3671 c1 = nkf_char_unicode_new(0x3094);
3672 (*o_hira_conv)(c2,c1);
3675 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3677 (*o_hira_conv)(c2,c1);
3682 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3685 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3687 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3691 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: checking stage of the output filter chain that
 * finally forwards (c2, c1) to (*o_iso2022jp_check_conv).
 * Three conditions are tested before forwarding:
 *  - c2 in 0x00..0x20 together with c1 in 0x7f..0xff,
 *  - c2 in 0x29..0x2f or 0x75..0x7e,
 *  - a value c lying inside one of the RANGE_NUM_MAX [start, end] pairs of
 *    the local range table.
 * What is done when a condition matches, how c is computed and the table
 * contents are on lines not visible in this listing.
 */
3696 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3698 #define RANGE_NUM_MAX 18
3699 static const nkf_char range[RANGE_NUM_MAX][2] = {
3720 nkf_char start, end, c;
3722 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3726 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3731 for (i = 0; i < RANGE_NUM_MAX; i++) {
3732 start = range[i][0];
3735 if (c >= start && c <= end) {
3740 (*o_iso2022jp_check_conv)(c2,c1);
/*
 * MIME encoded-word tables. mime_pattern, mime_priority_func, mime_encode
 * and mime_encode_method are parallel arrays: the same index selects the
 * header prefix, the input converter, the character-set code and the
 * encoding letter (B or Q). \075 is the octal escape for the equals sign.
 * Other code walks mime_pattern until a null entry, so the list is
 * expected to end with a null pointer (that line is not visible here).
 * The UTF-8 rows exist only when UTF8_INPUT_ENABLE is defined; keep the
 * four tables in step when adding a row.
 */
3744 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3746 static const unsigned char *mime_pattern[] = {
3747 (const unsigned char *)"\075?EUC-JP?B?",
3748 (const unsigned char *)"\075?SHIFT_JIS?B?",
3749 (const unsigned char *)"\075?ISO-8859-1?Q?",
3750 (const unsigned char *)"\075?ISO-8859-1?B?",
3751 (const unsigned char *)"\075?ISO-2022-JP?B?",
3752 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3753 #if defined(UTF8_INPUT_ENABLE)
3754 (const unsigned char *)"\075?UTF-8?B?",
3755 (const unsigned char *)"\075?UTF-8?Q?",
3757 (const unsigned char *)"\075?US-ASCII?Q?",
3762 /* marker used to raise the priority of the matching input code */
3763 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3764 e_iconv, s_iconv, 0, 0, 0, 0,
3765 #if defined(UTF8_INPUT_ENABLE)
3771 static const nkf_char mime_encode[] = {
3772 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3773 #if defined(UTF8_INPUT_ENABLE)
3780 static const nkf_char mime_encode_method[] = {
3781 'B', 'B','Q', 'B', 'B', 'Q',
3782 #if defined(UTF8_INPUT_ENABLE)
/*
 * State of the MIME input FIFO. The buffer is addressed only through
 * mime_input_buf(n), which masks the index with MIME_BUF_MASK; because
 * MIME_BUF_SIZE is a power of two the unsigned positions may run freely
 * and wrap around the ring. A further position named top is used by the
 * accessors below (its declaration is not visible in this listing).
 * mime_iconv_back remembers an input converter so that it can be restored
 * later; MAXRECOVER bounds the recovery buffers used while a MIME header
 * is being recognised.
 */
3790 /* MIME preprocessor fifo */
3792 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3793 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3794 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3796 unsigned char buf[MIME_BUF_SIZE];
3798 unsigned int last; /* decoded */
3799 unsigned int input; /* undecoded */
3801 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3803 #define MAXRECOVER 20
/*
 * mime_input_buf_unshift: push one byte back onto the front of the MIME
 * FIFO by moving top one slot backwards and storing c (truncated to
 * unsigned char) there. No check against the other ring positions is
 * visible.
 */
3806 mime_input_buf_unshift(nkf_char c)
3808 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/*
 * mime_ungetc: ungetc replacement used while the MIME reader is installed.
 * The byte goes back into the MIME FIFO; f is not used by the visible
 * statement. The return statement is not visible in this listing.
 */
3812 mime_ungetc(nkf_char c, FILE *f)
3814 mime_input_buf_unshift(c);
/*
 * mime_ungetc_buf: push back a byte on the undecoded side of the MIME
 * reader. Two alternatives are visible: delegate to the saved
 * (*i_mungetc_buf), or step the input position of the FIFO back and store
 * c there. The condition choosing between them is on a line not shown
 * (presumably the same mimebuf_f test as in mime_getc_buf - confirm).
 */
3819 mime_ungetc_buf(nkf_char c, FILE *f)
3822 (*i_mungetc_buf)(c,f);
3824 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/*
 * mime_getc_buf: fetch the next undecoded byte for the MIME reader.
 * When mimebuf_f is non-zero the byte comes from the saved getter
 * (*i_mgetc_buf); otherwise it is taken from the FIFO at the input
 * position, which is then advanced.
 */
3829 mime_getc_buf(FILE *f)
3831 /* we do not keep the EOF of mime_input_buf, because it contains ?= as
3832 a terminator. It was checked in mime_integrity. */
3833 return ((mimebuf_f)?
3834 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/*
 * switch_mime_getc: install the MIME reader in front of the current input
 * functions, unless mime_getc is already installed.
 * The current i_getc/i_ungetc are saved in i_mgetc/i_mungetc and replaced
 * by mime_getc/mime_ungetc. In STRICT_MIME mode one more layer is added:
 * the saved pair moves to i_mgetc_buf/i_mungetc_buf and i_mgetc/i_mungetc
 * become the buffered variants mime_getc_buf/mime_ungetc_buf.
 */
3838 switch_mime_getc(void)
3840 if (i_getc!=mime_getc) {
3841 i_mgetc = i_getc; i_getc = mime_getc;
3842 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3843 if(mime_f==STRICT_MIME) {
3844 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3845 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc.
 * In STRICT_MIME mode the extra buffered layer is removed first by
 * restoring i_mgetc/i_mungetc from their _buf copies; then i_ungetc is
 * restored from i_mungetc (the matching restore of i_getc is on a line not
 * visible here). If an input converter was saved in mime_iconv_back it is
 * reinstalled with set_iconv, and the saved pointer is cleared.
 */
3851 unswitch_mime_getc(void)
3853 if(mime_f==STRICT_MIME) {
3854 i_mgetc = i_mgetc_buf;
3855 i_mungetc = i_mungetc_buf;
3858 i_ungetc = i_mungetc;
3859 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3860 mime_iconv_back = NULL;
/*
 * mime_integrity: pre-scan an encoded word into the MIME FIFO before
 * decoding starts. f is the input stream and p the header pattern that
 * was already matched.
 * Visible behaviour: input and last are reset to top, the pattern bytes
 * are copied into the FIFO, and bytes are then read with (*i_getc) until
 * EOF or until the ring is full. When the closing pair is recognised the
 * input position is rewound to q, the position just after the copied
 * header, so decoding starts at the payload. A byte outside the accepted
 * set (+, /, =, ? and alphanumerics) is tested for, but the action taken
 * is on a line not shown. On the failure path last is moved to input,
 * mime_decode_mode becomes 1 (replay without decoding) and the MIME reader
 * is installed. Return statements and the update of d are not visible.
 * NOTE(review): in this listing the comment opened on the second line has
 * no visible terminator; the closing line appears to be missing.
 */
3864 mime_integrity(FILE *f, const unsigned char *p)
3868 /* In buffered mode, read until =? or NL or buffer full
3870 mime_input_state.input = mime_input_state.top;
3871 mime_input_state.last = mime_input_state.top;
3873 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3875 q = mime_input_state.input;
3876 while((c=(*i_getc)(f))!=EOF) {
3877 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3878 break; /* buffer full */
3880 if (c=='=' && d=='?') {
3881 /* checked. skip header, start decode */
3882 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3883 /* mime_last_input = mime_input_state.input; */
3884 mime_input_state.input = q;
3888 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3890 /* Should we check length mod 4? */
3891 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3894 /* In case of Incomplete MIME, no MIME decode */
3895 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3896 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3897 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3898 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * mime_begin_strict: recognise a MIME encoded-word header in strict mode.
 * The leading two bytes were consumed by the caller, so matching starts at
 * index 2. Each byte read with (*i_getc) is stored in the recovery buffer
 * r and compared, after nkf_toupper, with the current pattern from
 * mime_pattern. On a mismatch the later patterns are searched for one
 * that shares the bytes matched so far and also accepts the new byte; if
 * none is left the bytes are replayed from r (those lines are not visible
 * here).
 * On success mime_decode_mode is taken from p[i-2], the encoding letter
 * in front of the final question mark of the pattern; the current iconv is
 * saved in mime_iconv_back and mime_priority_func[j] is installed. For
 * B encoding mimebuf_f is set from unbuf_f, and the result of
 * mime_integrity is returned.
 * NOTE(review): the assignments of j and q are not visible here; q must
 * refer to the previously tried pattern for the prefix comparison to be
 * meaningful - confirm upstream.
 */
3903 mime_begin_strict(FILE *f)
3907 const unsigned char *p,*q;
3908 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3910 mime_decode_mode = FALSE;
3911 /* =? has been checked */
3913 p = mime_pattern[j];
3916 for(i=2;p[i]>SP;i++) { /* start at =? */
3917 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3918 /* pattern fails, try next one */
3920 while (mime_pattern[++j]) {
3921 p = mime_pattern[j];
3922 for(k=2;k<i;k++) /* assume length(p) > i */
3923 if (p[k]!=q[k]) break;
3924 if (k==i && nkf_toupper(c1)==p[k]) break;
3926 p = mime_pattern[j];
3927 if (p) continue; /* found next one, continue */
3928 /* all fails, output from recovery buffer */
3936 mime_decode_mode = p[i-2];
3938 mime_iconv_back = iconv;
3939 set_iconv(FALSE, mime_priority_func[j]);
3940 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3942 if (mime_decode_mode=='B') {
3943 mimebuf_f = unbuf_f;
3945 /* do MIME integrity check */
3946 return mime_integrity(f,mime_pattern[j]);
3960 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3961 /* re-read and convert again from mime_buffer. */
3963 /* =? has been checked */
3964 k = mime_input_state.last;
3965 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3966 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3967 /* We accept any character type even if it is broken by new lines */
3968 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3969 if (c1==LF||c1==SP||c1==CR||
3970 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3972 /* Failed. But this could be another MIME preamble */
3974 mime_input_state.last--;
3980 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3981 if (!(++i<MAXRECOVER) || c1==EOF) break;
3982 if (c1=='b'||c1=='B') {
3983 mime_decode_mode = 'B';
3984 } else if (c1=='q'||c1=='Q') {
3985 mime_decode_mode = 'Q';
3989 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3990 if (!(++i<MAXRECOVER) || c1==EOF) break;
3992 mime_decode_mode = FALSE;
3998 if (!mime_decode_mode) {
3999 /* false MIME preamble, restart from mime_buffer */
4000 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4001 /* Since we are in MIME mode until buffer becomes empty, */
4002 /* we never go into mime_begin again for a while. */
4005 /* discard mime preamble, and goto MIME mode */
4006 mime_input_state.last = k;
4007 /* do no MIME integrity check */
4008 return c1; /* used only for checking EOF */
/*
 * debug: print a diagnostic string followed by a newline on stderr.
 * A null str is printed as the text NULL instead of being dereferenced.
 * Any guard deciding whether the message is printed at all is on a line
 * not visible in this listing.
 */
4019 debug(const char *str)
4022 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: record the name of the detected input encoding.
 * The first name reported is stored as is. If a later call reports a
 * different name, input_codename becomes the empty string, which the
 * guessing code treats as conflicting (binary) input. The pointer is
 * stored, not copied, so codename must outlive its use.
 */
4028 set_input_codename(const char *codename)
4030 if (!input_codename) {
4031 input_codename = codename;
4032 } else if (strcmp(codename, input_codename) != 0) {
4033 input_codename = "";
/*
 * get_guessed_code: turn the recorded input_codename into the final name
 * of the guessed encoding and return it.
 *  - empty name (conflicting detections)  -> BINARY
 *  - no name at all                       -> ASCII
 *  - Shift_JIS, EUC-JP and ISO-2022-JP are refined to a vendor variant
 *    (CP932, EUCJP-MS, CP51932, CP50221, CP50220) according to the score
 *    bits of the input_code entry that belongs to the active iconv.
 * The result is also stored back into input_codename.
 * NOTE(review): p comes from find_inputcode_byfunc(iconv) and is
 * dereferenced without a visible null check - confirm it cannot be null
 * on these paths.
 */
4038 get_guessed_code(void)
4040 if (input_codename && !*input_codename) {
4041 input_codename = "BINARY";
4043 struct input_code *p = find_inputcode_byfunc(iconv);
4044 if (!input_codename) {
4045 input_codename = "ASCII";
4046 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4047 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4048 input_codename = "CP932";
4049 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4050 if (p->score & (SCORE_X0212))
4051 input_codename = "EUCJP-MS";
4052 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4053 input_codename = "CP51932";
4054 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4055 if (p->score & (SCORE_KANA))
4056 input_codename = "CP50221";
4057 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4058 input_codename = "CP50220";
4061 return input_codename;
/*
 * print_guessed_code: write the guessed input encoding to stdout, prefixed
 * by the file name and a colon when filename is not null. Compiled only
 * for the stand-alone program (neither PERL_XS nor WIN32DLL).
 * The name printed comes from get_guessed_code unless the recorded name is
 * the empty string, which is handled by a branch not visible here. The
 * trailing conditional chain selects a suffix describing the input line
 * endings from input_eol (CR, LF, CRLF, or EOF for mixed); the call that
 * uses it is on lines not shown.
 */
4064 #if !defined(PERL_XS) && !defined(WIN32DLL)
4066 print_guessed_code(char *filename)
4068 if (filename != NULL) printf("%s: ", filename);
4069 if (input_codename && !*input_codename) {
4072 input_codename = get_guessed_code();
4074 printf("%s\n", input_codename);
4078 input_eol == CR ? " (CR)" :
4079 input_eol == LF ? " (LF)" :
4080 input_eol == CRLF ? " (CRLF)" :
4081 input_eol == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared decoder for escapes made of an introducer character ch
 * followed by two hexadecimal digits. g and u are the getc/ungetc pair of
 * the underlying input layer.
 * When both following characters are hex digits the function returns the
 * byte (high nibble from c2, low nibble from c3). The handling of
 * non-hex characters (the bodies of the two tests) and the initial read
 * and comparison against ch are on lines not visible in this listing.
 */
4091 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4093 nkf_char c1, c2, c3;
4099 if (!nkf_isxdigit(c2)){
4104 if (!nkf_isxdigit(c3)){
4109 return (hex2bin(c2) << 4) | hex2bin(c3);
4115 return hex_getc(':', f, i_cgetc, i_cungetc);
/*
 * cap_ungetc: ungetc counterpart of the colon-hex input layer; it simply
 * forwards to the saved lower-level function (*i_cungetc) and returns its
 * result.
 */
4119 cap_ungetc(nkf_char c, FILE *f)
4121 return (*i_cungetc)(c, f);
4127 return hex_getc('%', f, i_ugetc, i_uungetc);
/*
 * url_ungetc: ungetc counterpart of the percent-hex input layer; it simply
 * forwards to the saved lower-level function (*i_uungetc) and returns its
 * result.
 */
4131 url_ungetc(nkf_char c, FILE *f)
4133 return (*i_uungetc)(c, f);
/*
 * numchar_getc: input layer that decodes a numeric character reference
 * into a single Unicode nkf_char (available only with NUMCHAR_OPTION).
 * g and u are the getc/ungetc pair of the layer below.
 * Two number forms are visible: after an x or X up to 7 hexadecimal
 * digits are accumulated, otherwise up to 8 decimal digits. hex2bin is
 * used for the digit value in both loops. On success the value is wrapped
 * with nkf_char_unicode_new.
 * The recognition of the introducer and terminator, the shifts or
 * multiplications applied to c, and the fallback that pushes the consumed
 * characters back are on lines not visible in this listing.
 */
4137 #ifdef NUMCHAR_OPTION
4139 numchar_getc(FILE *f)
4141 nkf_char (*g)(FILE *) = i_ngetc;
4142 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4153 if (buf[i] == 'x' || buf[i] == 'X'){
4154 for (j = 0; j < 7; j++){
4156 if (!nkf_isxdigit(buf[i])){
4163 c |= hex2bin(buf[i]);
4166 for (j = 0; j < 8; j++){
4170 if (!nkf_isdigit(buf[i])){
4177 c += hex2bin(buf[i]);
4183 return nkf_char_unicode_new(c);
/*
 * numchar_ungetc: ungetc counterpart of the numeric-reference input layer;
 * it forwards to the saved lower-level function (*i_nungetc) and returns
 * its result.
 */
4193 numchar_ungetc(nkf_char c, FILE *f)
4195 return (*i_nungetc)(c, f);
4199 #ifdef UNICODE_NORMALIZATION
4201 /* Normalization Form C */
4205 nkf_char (*g)(FILE *f) = i_nfc_getc;
4206 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4207 int i=0, j, k=1, lower, upper;
4209 const unsigned char *array;
4212 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4213 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4214 while (upper >= lower) {
4215 j = (lower+upper) / 2;
4216 array = normalization_table[j].nfd;
4217 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4218 if (array[k] != buf[k]){
4219 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4226 array = normalization_table[j].nfc;
4227 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4228 buf[i] = (nkf_char)(array[i]);
/*
 * nfc_ungetc: ungetc counterpart of the normalization input layer; it
 * forwards to the saved lower-level function (*i_nfc_ungetc) and returns
 * its result.
 */
4240 nfc_ungetc(nkf_char c, FILE *f)
4242 return (*i_nfc_ungetc)(c, f);
4244 #endif /* UNICODE_NORMALIZATION */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 * Character constants are used as plain numbers: the question mark is 63,
 * the greater-than sign is 62, the digit four is 52 and the letter G is
 * the code of a minus 26, as the inline comments spell out.
 * Besides the standard alphabet the function also accepts minus (62) and
 * underscore (63). The range tests for the upper-case and lower-case
 * branches and the return statement are on lines not visible here.
 */
4248 base64decode(nkf_char c)
4253 i = c - 'A'; /* A..Z 0-25 */
4254 } else if (c == '_') {
4255 i = '?' /* 63 */ ; /* _ 63 */
4257 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4259 } else if (c > '/') {
4260 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4261 } else if (c == '+' || c == '-') {
4262 i = '>' /* 62 */ ; /* + and - 62 */
4264 i = '?' /* 63 */ ; /* / 63 */
4272 nkf_char c1, c2, c3, c4, cc;
4273 nkf_char t1, t2, t3, t4, mode, exit_mode;
4274 nkf_char lwsp_count;
4277 nkf_char lwsp_size = 128;
4279 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4280 return mime_input_buf(mime_input_state.top++);
4282 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4283 mime_decode_mode=FALSE;
4284 unswitch_mime_getc();
4285 return (*i_getc)(f);
4288 if (mimebuf_f == FIXED_MIME)
4289 exit_mode = mime_decode_mode;
4292 if (mime_decode_mode == 'Q') {
4293 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4295 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4296 if (c1<=SP || DEL<=c1) {
4297 mime_decode_mode = exit_mode; /* prepare for quit */
4300 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4304 mime_decode_mode = exit_mode; /* prepare for quit */
4305 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4306 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4307 /* end Q encoding */
4308 input_mode = exit_mode;
4310 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4311 if (lwsp_buf==NULL) {
4312 perror("can't malloc");
4315 while ((c1=(*i_getc)(f))!=EOF) {
4320 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4328 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4329 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4344 lwsp_buf[lwsp_count] = (unsigned char)c1;
4345 if (lwsp_count++>lwsp_size){
4347 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4348 if (lwsp_buf_new==NULL) {
4350 perror("can't realloc");
4353 lwsp_buf = lwsp_buf_new;
4359 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4361 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4362 i_ungetc(lwsp_buf[lwsp_count],f);
4368 if (c1=='='&&c2<SP) { /* this is soft wrap */
4369 while((c1 = (*i_mgetc)(f)) <=SP) {
4370 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4372 mime_decode_mode = 'Q'; /* still in MIME */
4373 goto restart_mime_q;
4376 mime_decode_mode = 'Q'; /* still in MIME */
4380 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4381 if (c2<=SP) return c2;
4382 mime_decode_mode = 'Q'; /* still in MIME */
4383 return ((hex2bin(c2)<<4) + hex2bin(c3));
4386 if (mime_decode_mode != 'B') {
4387 mime_decode_mode = FALSE;
4388 return (*i_mgetc)(f);
4392 /* Base64 encoding */
4394 MIME allows line break in the middle of
4395 Base64, but we are very pessimistic in decoding
4396 in unbuf mode because MIME encoded code may broken by
4397 less or editor's control sequence (such as ESC-[-K in unbuffered
4398 mode. ignore incomplete MIME.
4400 mode = mime_decode_mode;
4401 mime_decode_mode = exit_mode; /* prepare for quit */
4403 while ((c1 = (*i_mgetc)(f))<=SP) {
4408 if ((c2 = (*i_mgetc)(f))<=SP) {
4411 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4412 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4415 if ((c1 == '?') && (c2 == '=')) {
4418 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4419 if (lwsp_buf==NULL) {
4420 perror("can't malloc");
4423 while ((c1=(*i_getc)(f))!=EOF) {
4428 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4436 if ((c1=(*i_getc)(f))!=EOF) {
4440 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4455 lwsp_buf[lwsp_count] = (unsigned char)c1;
4456 if (lwsp_count++>lwsp_size){
4458 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4459 if (lwsp_buf_new==NULL) {
4461 perror("can't realloc");
4464 lwsp_buf = lwsp_buf_new;
4470 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4472 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4473 i_ungetc(lwsp_buf[lwsp_count],f);
4480 if ((c3 = (*i_mgetc)(f))<=SP) {
4483 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4484 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4488 if ((c4 = (*i_mgetc)(f))<=SP) {
4491 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4492 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4496 mime_decode_mode = mode; /* still in MIME sigh... */
4498 /* BASE 64 decoding */
4500 t1 = 0x3f & base64decode(c1);
4501 t2 = 0x3f & base64decode(c2);
4502 t3 = 0x3f & base64decode(c3);
4503 t4 = 0x3f & base64decode(c4);
4504 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4506 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4507 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4509 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4510 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4512 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4517 return mime_input_buf(mime_input_state.top++);
/*
 * Definitions for the MIME encoder.
 * basis_64 is the standard Base64 alphabet, indexed by a 6-bit value.
 * MIMEOUT_BUF_LENGTH sizes the pending-output buffer; the buffer member is
 * declared one byte larger than that length. The remaining members of the
 * state structure are on lines not visible in this listing.
 */
4520 static const char basis_64[] =
4521 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4523 #define MIMEOUT_BUF_LENGTH (60)
4525 char buf[MIMEOUT_BUF_LENGTH+1];
4530 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * open_mime: start a MIME encoded word for character set code mode.
 * The header is looked up by scanning mime_encode for mode; the first
 * pattern is the default if nothing matches, and mimeout_mode is set from
 * mime_encode_method at the index where the scan stopped.
 * If more than 45 columns are already used (base64_count) a line break is
 * written first, preceded by a pending blank when there is one. Leading
 * white space held in mimeout_state.buf is written out unencoded, then the
 * rest of the pending buffer is fed back through mime_putc after the
 * counter has been reset so that it ends up inside the encoded word.
 * The statements that write the header itself and update base64_count are
 * on lines not visible in this listing.
 */
4533 open_mime(nkf_char mode)
4535 const unsigned char *p;
4538 p = mime_pattern[0];
4539 for(i=0;mime_pattern[i];i++) {
4540 if (mode == mime_encode[i]) {
4541 p = mime_pattern[i];
4545 mimeout_mode = mime_encode_method[i];
4547 if (base64_count>45) {
4548 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4549 (*o_mputc)(mimeout_state.buf[i]);
4552 PUT_NEWLINE((*o_mputc));
4555 if (mimeout_state.count>0
4556 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4557 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4561 for (;i<mimeout_state.count;i++) {
4562 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4563 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4564 (*o_mputc)(mimeout_state.buf[i]);
4574 j = mimeout_state.count;
4575 mimeout_state.count = 0;
4577 mime_putc(mimeout_state.buf[i]);
/*
 * mime_prechar: line-length control run before each character is passed
 * to the MIME encoder.
 * The projected column is base64_count plus the encoded size of the
 * pending bytes (count/3*4). While an encoded word is open
 * (mimeout_mode > 0) the word is closed with an EOF marker, a newline is
 * written and a continuation space is sent once the projection exceeds 73
 * or 66; which limit applies depends on tests that are not visible here.
 * When no word is open and the projection exceeds 60 (and c2 is not EOF)
 * the encoding letter is chosen - Q for ASCII or ISO_8859_1 output,
 * otherwise B - a word is opened, and the same close/newline/space
 * sequence follows.
 */
4582 mime_prechar(nkf_char c2, nkf_char c1)
4584 if (mimeout_mode > 0){
4586 if (base64_count + mimeout_state.count/3*4> 73){
4587 (*o_base64conv)(EOF,0);
4588 OCONV_NEWLINE((*o_base64conv));
4589 (*o_base64conv)(0,SP);
4593 if (base64_count + mimeout_state.count/3*4> 66) {
4594 (*o_base64conv)(EOF,0);
4595 OCONV_NEWLINE((*o_base64conv));
4596 (*o_base64conv)(0,SP);
4602 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4603 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4604 open_mime(output_mode);
4605 (*o_base64conv)(EOF,0);
4606 OCONV_NEWLINE((*o_base64conv));
4607 (*o_base64conv)(0,SP);
4626 switch(mimeout_mode) {
4631 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4637 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4642 if (mimeout_mode > 0) {
4643 if (mimeout_f!=FIXED_MIME) {
4645 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte into the currently open encoded word,
 * dispatching on mimeout_mode.
 *  - In the quoted branch a byte that is not alphanumeric is written as
 *    two hexadecimal digits (the escape character itself is written on a
 *    line not visible here).
 *  - The Base64 branch is a three-step state machine: the first byte
 *    emits its top six bits and is kept in mimeout_state.state; the second
 *    combines the two saved low bits with its high nibble; the third
 *    combines the four saved low bits with its top two bits and then emits
 *    its own low six bits.
 * The case labels, the state transitions of mimeout_mode and the updates
 * of base64_count are on lines not visible in this listing.
 */
4651 mimeout_addchar(nkf_char c)
4653 switch(mimeout_mode) {
4658 } else if(!nkf_isalnum(c)) {
4660 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4661 (*o_mputc)(bin2hex((c&0xf)));
4669 mimeout_state.state=c;
4670 (*o_mputc)(basis_64[c>>2]);
4675 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4676 mimeout_state.state=c;
4681 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4682 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc: MIME-encoding output stage; c is one output byte or EOF.
 * This listing omits many lines (braces, else branches, returns), so only
 * the visible structure is summarised:
 *  1. FIXED_MIME output: line breaks are inserted once base64_count
 *     exceeds 71, with separate handling for Q mode and for EOF.
 *  2. EOF in the other modes: a word is opened if text is pending while
 *     mimeout_mode is -1, then the pending buffer is flushed through
 *     mimeout_addchar.
 *  3. Q mode: control characters, blanks and line ends are handled
 *     separately and lines are broken after column 70.
 *  4. mimeout_mode <= 0 (no word open): plain ASCII is collected in
 *     mimeout_state.buf and written unencoded at white space; open_mime is
 *     called when encoding becomes necessary or the buffer exceeds
 *     MIMEOUT_BUF_LENGTH.
 *  5. Base64 modes: ASCII following a line end or white space may be
 *     written outside the word, other bytes are buffered and passed to
 *     mimeout_addchar.
 * lastchar is the most recently buffered byte, used to detect a preceding
 * CR or LF.
 */
4694 mime_putc(nkf_char c)
4699 if (mimeout_f == FIXED_MIME){
4700 if (mimeout_mode == 'Q'){
4701 if (base64_count > 71){
4702 if (c!=CR && c!=LF) {
4704 PUT_NEWLINE((*o_mputc));
4709 if (base64_count > 71){
4711 PUT_NEWLINE((*o_mputc));
4714 if (c == EOF) { /* c==EOF */
4718 if (c != EOF) { /* c==EOF */
4724 /* mimeout_f != FIXED_MIME */
4726 if (c == EOF) { /* c==EOF */
4727 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4728 j = mimeout_state.count;
4729 mimeout_state.count = 0;
4731 if (mimeout_mode > 0) {
4732 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4734 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4737 mimeout_addchar(mimeout_state.buf[i]);
4741 mimeout_addchar(mimeout_state.buf[i]);
4745 mimeout_addchar(mimeout_state.buf[i]);
4751 mimeout_addchar(mimeout_state.buf[i]);
4757 if (mimeout_state.count > 0){
4758 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4763 if (mimeout_mode=='Q') {
4764 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4765 if (c == CR || c == LF) {
4770 } else if (c <= SP) {
4772 if (base64_count > 70) {
4773 PUT_NEWLINE((*o_mputc));
4776 if (!nkf_isblank(c)) {
4781 if (base64_count > 70) {
4783 PUT_NEWLINE((*o_mputc));
4786 open_mime(output_mode);
4788 if (!nkf_noescape_mime(c)) {
4799 if (mimeout_mode <= 0) {
4800 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4801 if (nkf_isspace(c)) {
4803 if (mimeout_mode == -1) {
4806 if (c==CR || c==LF) {
4808 open_mime(output_mode);
4814 for (i=0;i<mimeout_state.count;i++) {
4815 (*o_mputc)(mimeout_state.buf[i]);
4816 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4827 mimeout_state.buf[0] = (char)c;
4828 mimeout_state.count = 1;
4830 if (base64_count > 1
4831 && base64_count + mimeout_state.count > 76
4832 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4833 PUT_NEWLINE((*o_mputc));
4835 if (!nkf_isspace(mimeout_state.buf[0])){
4840 mimeout_state.buf[mimeout_state.count++] = (char)c;
4841 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4842 open_mime(output_mode);
4847 if (lastchar==CR || lastchar == LF){
4848 for (i=0;i<mimeout_state.count;i++) {
4849 (*o_mputc)(mimeout_state.buf[i]);
4852 mimeout_state.count = 0;
4855 for (i=0;i<mimeout_state.count-1;i++) {
4856 (*o_mputc)(mimeout_state.buf[i]);
4859 mimeout_state.buf[0] = SP;
4860 mimeout_state.count = 1;
4862 open_mime(output_mode);
4865 /* mimeout_mode == 'B', 1, 2 */
4866 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4867 if (lastchar == CR || lastchar == LF){
4868 if (nkf_isblank(c)) {
4869 for (i=0;i<mimeout_state.count;i++) {
4870 mimeout_addchar(mimeout_state.buf[i]);
4872 mimeout_state.count = 0;
4873 } else if (SP<c && c<DEL) {
4875 for (i=0;i<mimeout_state.count;i++) {
4876 (*o_mputc)(mimeout_state.buf[i]);
4879 mimeout_state.count = 0;
4881 mimeout_state.buf[mimeout_state.count++] = (char)c;
4884 if (c==SP || c==TAB || c==CR || c==LF) {
4885 for (i=0;i<mimeout_state.count;i++) {
4886 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4888 for (i=0;i<mimeout_state.count;i++) {
4889 (*o_mputc)(mimeout_state.buf[i]);
4892 mimeout_state.count = 0;
4895 mimeout_state.buf[mimeout_state.count++] = (char)c;
4896 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4898 for (i=0;i<mimeout_state.count;i++) {
4899 (*o_mputc)(mimeout_state.buf[i]);
4902 mimeout_state.count = 0;
4906 if (mimeout_state.count>0 && SP<c && c!='=') {
4907 mimeout_state.buf[mimeout_state.count++] = (char)c;
4908 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4909 j = mimeout_state.count;
4910 mimeout_state.count = 0;
4912 mimeout_addchar(mimeout_state.buf[i]);
4919 if (mimeout_state.count>0) {
4920 j = mimeout_state.count;
4921 mimeout_state.count = 0;
4923 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4925 mimeout_addchar(mimeout_state.buf[i]);
4931 (*o_mputc)(mimeout_state.buf[i]);
4933 open_mime(output_mode);
/*
 * base64_conv: MIME stage of the output filter chain. It first lets
 * mime_prechar apply the line-length rules for the coming character and
 * then forwards (c2, c1) unchanged to the next stage (*o_base64conv).
 */
4940 base64_conv(nkf_char c2, nkf_char c1)
4942 mime_prechar(c2, c1);
4943 (*o_base64conv)(c2,c1);
/*
 * nkf_iconv_t: state of an iconv based converter. The visible members are
 * the two buffer sizes and the output buffer pointer; other code also
 * uses members named cd and input_buffer, whose declarations are on lines
 * not visible in this listing.
 */
4947 typedef struct nkf_iconv_t {
4950 size_t input_buffer_size;
4951 char *output_buffer;
4952 size_t output_buffer_size;
/*
 * nkf_iconv_new: set up a converter from fromcode to tocode.
 * It allocates an input buffer of IOBUF_SIZE bytes and an output buffer of
 * twice that size, then opens the iconv descriptor. Each failure is
 * reported with perror; what happens after the report is on lines not
 * visible here.
 * NOTE(review): converter is declared as a structure value but is used
 * with the arrow operator, so as shown this does not compile.
 * NOTE(review): the unsupported-conversion report passes a format string
 * to fprintf where a FILE pointer is expected and passes the int result to
 * perror; it needs to be rewritten as an fprintf to stderr.
 * NOTE(review): after a failed malloc only perror is visible; confirm the
 * function does not carry on with a null buffer.
 */
4956 nkf_iconv_new(char *tocode, char *fromcode)
4958 nkf_iconv_t converter;
4960 converter->input_buffer_size = IOBUF_SIZE;
4961 converter->input_buffer = malloc(converter->input_buffer_size);
4962 if (converter->input_buffer == NULL)
4963 perror("can't malloc");
4965 converter->output_buffer_size = IOBUF_SIZE * 2;
4966 converter->output_buffer = malloc(converter->output_buffer_size);
4967 if (converter->output_buffer == NULL)
4968 perror("can't malloc");
4970 converter->cd = iconv_open(tocode, fromcode);
4971 if (converter->cd == (iconv_t)-1)
4975 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
4978 perror("can't iconv_open");
/*
 * nkf_iconv_convert: read bytes from the input layer into the input
 * buffer of the converter, run iconv on them and write the converted bytes
 * with (*o_putc). On an iconv error the visible recovery steps are: move
 * the unconsumed input to the start of the buffer, or double the output
 * buffer with realloc, or report the failure with perror; the errno tests
 * selecting between them are on lines not visible here.
 * NOTE(review): the read loop uses f although the parameter is named
 * input, and no declaration of f is visible.
 * NOTE(review): the read loop breaks while input_length is still below
 * the buffer size, that is after the first byte; a buffer-full test would
 * be expected instead.
 * NOTE(review): after iconv returns, output_length holds the space left
 * and output_buffer has been advanced, so the write loop does not look
 * like it walks the bytes just produced.
 * NOTE(review): realloc is given converter->outbuf, while the rest of the
 * code uses the member name output_buffer.
 */
4984 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4986 size_t invalid = (size_t)0;
4987 char *input_buffer = converter->input_buffer;
4988 size_t input_length = (size_t)0;
4989 char *output_buffer = converter->output_buffer;
4990 size_t output_length = converter->output_buffer_size;
4995 while ((c = (*i_getc)(f)) != EOF) {
4996 input_buffer[input_length++] = c;
4997 if (input_length < converter->input_buffer_size) break;
5001 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5002 while (output_length-- > 0) {
5003 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5005 if (ret == (size_t) - 1) {
5008 if (input_buffer != converter->input_buffer)
5009 memmove(converter->input_buffer, input_buffer, input_length);
5012 converter->output_buffer_size *= 2;
5013 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5014 if (output_buffer == NULL) {
5015 perror("can't realloc");
5018 converter->output_buffer = output_buffer;
5021 perror("can't iconv");
/*
 * nkf_iconv_close: release the two buffers of a converter and close its
 * iconv descriptor.
 * NOTE(review): the parameter is named convert but the body uses
 * converter, and the members freed are called inbuf/outbuf while the rest
 * of the code uses input_buffer/output_buffer. As shown this does not
 * compile; the names need to be made consistent.
 */
5034 nkf_iconv_close(nkf_iconv_t *convert)
5036 free(converter->inbuf);
5037 free(converter->outbuf);
5038 iconv_close(converter->cd);
5047 struct input_code *p = input_code_list;
5059 mime_f = MIME_DECODE_DEFAULT;
5060 mime_decode_f = FALSE;
5065 x0201_f = X0201_DEFAULT;
5066 iso2022jp_f = FALSE;
5067 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5068 ms_ucs_map_f = UCS_MAP_ASCII;
5070 #ifdef UTF8_INPUT_ENABLE
5071 no_cp932ext_f = FALSE;
5072 no_best_fit_chars_f = FALSE;
5073 encode_fallback = NULL;
5074 unicode_subchar = '?';
5075 input_endian = ENDIAN_BIG;
5077 #ifdef UTF8_OUTPUT_ENABLE
5078 output_bom_f = FALSE;
5079 output_endian = ENDIAN_BIG;
5081 #ifdef UNICODE_NORMALIZATION
5097 #ifdef SHIFTJIS_CP932
5107 for (i = 0; i < 256; i++){
5108 prefix_table[i] = 0;
5112 mimeout_state.count = 0;
5117 fold_preserve_f = FALSE;
5120 kanji_intro = DEFAULT_J;
5121 ascii_intro = DEFAULT_R;
5122 fold_margin = FOLD_MARGIN;
5123 o_zconv = no_connection;
5124 o_fconv = no_connection;
5125 o_eol_conv = no_connection;
5126 o_rot_conv = no_connection;
5127 o_hira_conv = no_connection;
5128 o_base64conv = no_connection;
5129 o_iso2022jp_check_conv = no_connection;
5132 i_ungetc = std_ungetc;
5134 i_bungetc = std_ungetc;
5137 i_mungetc = std_ungetc;
5138 i_mgetc_buf = std_getc;
5139 i_mungetc_buf = std_ungetc;
5140 output_mode = ASCII;
5142 mime_decode_mode = FALSE;
5148 init_broken_state();
5149 z_prev2=0,z_prev1=0;
5151 iconv_for_check = 0;
5153 input_codename = NULL;
5154 input_encoding = NULL;
5155 output_encoding = NULL;
/*
 * module_connection: build the conversion pipeline for the options in
 * effect.
 * Output side: the output encoding is resolved (explicit setting, then
 * nkf_default_encoding, then ISO_2022_JP when only guessing or producing
 * no output) and oconv starts as the converter of that encoding. Each
 * enabled filter is then inserted in front of the current chain by saving
 * oconv in its own o_* pointer and replacing oconv with the filter, so the
 * filter connected last runs first.
 * Input side: starting from std_ungetc, each enabled input layer saves
 * the current i_getc/i_ungetc pair in its own pointers and installs its
 * functions. Finally the input converter is chosen: the one belonging to
 * input_encoding when that is set, otherwise e_iconv as the initial guess.
 * The option tests guarding several of the stages, and the return
 * statements, are on lines not visible in this listing.
 */
5162 module_connection(void)
5164 if (input_encoding) set_input_encoding(input_encoding);
5165 if (!output_encoding) {
5166 output_encoding = nkf_default_encoding();
5168 if (!output_encoding) {
5169 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5172 set_output_encoding(output_encoding);
5173 oconv = nkf_enc_to_oconv(output_encoding);
5176 /* replace continuation module, from output side */
5178 /* output redirection */
5180 if (noout_f || guess_f){
5187 if (mimeout_f == TRUE) {
5188 o_base64conv = oconv; oconv = base64_conv;
5190 /* base64_count = 0; */
5193 if (eolmode_f || guess_f) {
5194 o_eol_conv = oconv; oconv = eol_conv;
5197 o_rot_conv = oconv; oconv = rot_conv;
5200 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5203 o_hira_conv = oconv; oconv = hira_conv;
5206 o_fconv = oconv; oconv = fold_conv;
5209 if (alpha_f || x0201_f) {
5210 o_zconv = oconv; oconv = z_conv;
5214 i_ungetc = std_ungetc;
5215 /* input redirection */
5218 i_cgetc = i_getc; i_getc = cap_getc;
5219 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5222 i_ugetc = i_getc; i_getc = url_getc;
5223 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5226 #ifdef NUMCHAR_OPTION
5228 i_ngetc = i_getc; i_getc = numchar_getc;
5229 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5232 #ifdef UNICODE_NORMALIZATION
5234 i_nfc_getc = i_getc; i_getc = nfc_getc;
5235 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5238 if (mime_f && mimebuf_f==FIXED_MIME) {
5239 i_mgetc = i_getc; i_getc = mime_getc;
5240 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5243 i_bgetc = i_getc; i_getc = broken_getc;
5244 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5246 if (input_encoding) {
5247 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5249 set_iconv(FALSE, e_iconv);
5253 struct input_code *p = input_code_list;
5262 Conversion main loop. Code detection only.
5265 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * NOTE(review): the function header for this body is not visible in this
 * excerpt. The visible part sets up the pipeline with module_connection() and
 * then reads every character from `f` through i_getc until EOF; the loop body
 * is not shown.
 */
5272 module_connection();
5273 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands, meant to be used inside a loop body:
 * NEXT, SKIP and MORE all expand to `continue` (SKIP first clears c2, MORE
 * first copies c1 into c2); SEND expands to an empty statement so control
 * falls through; LAST expands to `break`.
 * NOTE(review): SKIP and MORE expand to two statements without a
 * do { } while (0) wrapper, so they are only safe inside braces.
 *
 * set_input_mode(mode) stores `mode` in input_mode; the visible continuation
 * lines also record the codename "ISO-2022-JP" and pass it to debug(). The
 * condition guarding those two calls is not visible in this excerpt.
 */
5280 #define NEXT continue /* no output, get next */
5281 #define SKIP c2=0;continue /* no output, get next */
5282 #define MORE c2=c1;continue /* need one more byte */
5283 #define SEND ; /* output c1 and c2, get next */
5284 #define LAST break /* end of loop, go closing */
5285 #define set_input_mode(mode) do { \
5286 input_mode = mode; \
5288 set_input_codename("ISO-2022-JP"); \
5289 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop. Reads characters from `f` through
 * i_getc, tracks shift state and ISO-2022 escape sequences, and hands decoded
 * characters to the installed input converter (iconv) or directly to oconv.
 *
 * Reports "no output encoding given" on stderr when module_connection()
 * returns a negative value.
 *
 * NOTE(review): this excerpt is missing many lines (braces, NEXT/SKIP/MORE/
 * SEND/LAST statements, some conditions), so the comments below describe only
 * what is visible.
 */
5293 kanji_convert(FILE *f)
5295 nkf_char c1=0, c2=0, c3=0, c4=0;
5296 int shift_mode = 0; /* 0, 1, 2, 3 */
5298 int is_8bit = FALSE;
5300 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5305 output_mode = ASCII;
5307 if (module_connection() < 0) {
5308 #if !defined(PERL_XS) && !defined(WIN32DLL)
5309 fprintf(stderr, "no output encoding given\n");
5315 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: consume four bytes per character. */
5316 if(iconv == w_iconv32){
5317 while ((c1 = (*i_getc)(f)) != EOF &&
5318 (c2 = (*i_getc)(f)) != EOF &&
5319 (c3 = (*i_getc)(f)) != EOF &&
5320 (c4 = (*i_getc)(f)) != EOF) {
5321 nkf_iconv_utf_32(c1, c2, c3, c4);
5323 (*i_ungetc)(EOF, f);
/*
 * UTF-16 input: consume two bytes; when nkf_iconv_utf_16() returns -2, two
 * more bytes are read and the call is repeated with all four.
 */
5325 else if (iconv == w_iconv16) {
5326 while ((c1 = (*i_getc)(f)) != EOF &&
5327 (c2 = (*i_getc)(f)) != EOF) {
5328 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5329 (c3 = (*i_getc)(f)) != EOF &&
5330 (c4 = (*i_getc)(f)) != EOF) {
5331 nkf_iconv_utf_16(c1, c2, c3, c4);
5334 (*i_ungetc)(EOF, f);
/* Byte-oriented input: one byte per iteration, c2 holds a pending first byte. */
5338 while ((c1 = (*i_getc)(f)) != EOF) {
5339 #ifdef INPUT_CODE_FIX
5340 if (!input_encoding)
5346 /* in case of 8th bit is on */
5347 if (!estab_f&&!mime_decode_mode) {
5348 /* in case of not established yet */
5349 /* It is still ambiguous */
5350 if (h_conv(f, c2, c1)==EOF) {
5358 /* in case of already established */
5360 /* ignore bogus code */
5368 /* 2nd byte of 7 bit code or SJIS */
5372 else if (nkf_char_unicode_p(c1)) {
5378 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5381 } else if (c1 > DEL) {
5383 if (!estab_f && !iso8859_f) {
5384 /* not established yet */
5386 } else { /* estab_f==TRUE */
/* Single-byte katakana range under Shift_JIS, plus 0xFD/0xFE under the CP10001 map. */
5392 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5393 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5395 c2 = JIS_X_0201_1976_K;
5400 /* already established */
5404 } else if (SP < c1 && c1 < DEL) {
5405 /* in case of Roman characters */
5407 /* output 1 shifted byte */
5411 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5412 /* output 1 shifted byte */
5413 c2 = JIS_X_0201_1976_K;
5416 /* look like bogus code */
5419 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5420 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5421 /* in case of Kanji shifted */
5423 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5424 /* Check MIME code */
5425 if ((c1 = (*i_getc)(f)) == EOF) {
5428 } else if (c1 == '?') {
5429 /* =? is mime conversion start sequence */
5430 if(mime_f == STRICT_MIME) {
5431 /* check in real detail */
5432 if (mime_begin_strict(f) == EOF)
5435 } else if (mime_begin(f) == EOF)
5444 /* normal ASCII code */
/* SI, SO and ESC are honoured only while no 8-bit byte was seen, or inside MIME decoding. */
5447 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5450 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5453 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5454 if ((c1 = (*i_getc)(f)) == EOF) {
5455 /* (*oconv)(0, ESC); don't send bogus code */
5458 else if (c1 == '&') {
5460 if ((c1 = (*i_getc)(f)) == EOF) {
/* ESC $ ... : designation of a multi-byte character set. */
5466 else if (c1 == '$') {
5468 if ((c1 = (*i_getc)(f)) == EOF) {
5469 /* don't send bogus code
5471 (*oconv)(0, '$'); */
5473 } else if (c1 == '@' || c1 == 'B') {
5475 set_input_mode(JIS_X_0208);
5477 } else if (c1 == '(') {
5479 if ((c1 = (*i_getc)(f)) == EOF) {
5480 /* don't send bogus code
5486 } else if (c1 == '@'|| c1 == 'B') {
5488 set_input_mode(JIS_X_0208);
5491 } else if (c1 == 'D'){
5492 set_input_mode(JIS_X_0212);
5494 #endif /* X0212_ENABLE */
5495 } else if (c1 == 'O' || c1 == 'Q'){
5496 set_input_mode(JIS_X_0213_1);
5498 } else if (c1 == 'P'){
5499 set_input_mode(JIS_X_0213_2);
5502 /* could be some special code */
5509 } else if (broken_f&0x2) {
5510 /* accept any ESC-(-x as broken code ... */
5511 input_mode = JIS_X_0208;
/* ESC ( ... : designation of a single-byte character set. */
5520 } else if (c1 == '(') {
5522 if ((c1 = (*i_getc)(f)) == EOF) {
5523 /* don't send bogus code
5525 (*oconv)(0, '('); */
5528 else if (c1 == 'I') {
5529 /* JIS X 0201 Katakana */
5530 set_input_mode(JIS_X_0201_1976_K);
5533 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5534 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5535 set_input_mode(ASCII);
5538 else if (broken_f&0x2) {
5539 set_input_mode(ASCII);
5548 else if (c1 == '.') {
5550 if ((c1 = (*i_getc)(f)) == EOF) {
5553 else if (c1 == 'A') {
5564 else if (c1 == 'N') {
5567 if (g2 == ISO_8859_1) {
5582 } else if (c1 == ESC && iconv == s_iconv) {
5583 /* ESC in Shift_JIS */
5584 if ((c1 = (*i_getc)(f)) == EOF) {
5585 /* (*oconv)(0, ESC); don't send bogus code */
5587 } else if (c1 == '$') {
5589 if ((c1 = (*i_getc)(f)) == EOF) {
5591 } else if (('E' <= c1 && c1 <= 'G') ||
5592 ('O' <= c1 && c1 <= 'Q')) {
/*
 * The selector byte picks a base value from jphone_emoji_first_table
 * (index c1 % 7); each following byte in SP..'z' is emitted as base + byte.
 */
5600 static const nkf_char jphone_emoji_first_table[7] =
5601 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5602 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5603 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5604 while (SP <= c1 && c1 <= 'z') {
5605 (*oconv)(0, c1 + c3);
5606 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5621 } else if (c1 == LF || c1 == CR) {
/* presumably the "reset to ASCII on newline" broken-JIS handling; the guard is not visible -- TODO confirm */
5623 input_mode = ASCII; set_iconv(FALSE, 0);
5625 } else if (mime_decode_f && !mime_decode_mode){
5627 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5635 } else { /* if (c1 == CR)*/
5636 if ((c1=(*i_getc)(f))!=EOF) {
5640 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
/* Hand the pair to the input converter; further bytes are fetched for longer sequences. */
5660 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5663 if ((c3 = (*i_getc)(f)) != EOF) {
5666 if ((c4 = (*i_getc)(f)) != EOF) {
5668 (*iconv)(c2, c1, c3|c4);
5673 /* 3 bytes EUC or UTF-8 */
5674 if ((c3 = (*i_getc)(f)) != EOF) {
5676 (*iconv)(c2, c1, c3);
5684 0x7F <= c2 && c2 <= 0x92 &&
5685 0x21 <= c1 && c1 <= 0x7E) {
/* Maps the (c2, c1) pair linearly (94 cells per row) onto code points starting at 0xE000. */
5687 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5690 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5694 (*oconv)(PREFIX_EUCG3 | c2, c1);
5696 #endif /* X0212_ENABLE */
5698 (*oconv)(PREFIX_EUCG3 | c2, c1);
5701 (*oconv)(input_mode, c1); /* other special case */
5707 /* goto next_word */
/* End of input: signal EOF to the input converter. */
5711 (*iconv)(EOF, 0, 0);
/* No codename recorded yet: pick the input_code_list entry with the lowest score. */
5712 if (!input_codename)
5715 struct input_code *p = input_code_list;
5716 struct input_code *result = p;
5718 if (p->score < result->score) result = p;
5721 set_input_codename(result->name);
5723 debug(result->name);
5731 * int options(unsigned char *cp)
/*
 * options: parse one option string and update the global conversion flags.
 *
 * cp points at the option text. Everything up to and including the first '-'
 * is skipped, then option letters are processed one at a time. "--name" and
 * "--name=value" are looked up in the long_option table; an entry with a
 * non-empty alias is replaced by its short-option string.
 *
 * Unknown options are reported on stderr unless built as PERL_XS or WIN32DLL.
 *
 * NOTE(review): this excerpt is missing many lines (braces, `continue`/`break`
 * statements, several case labels), so the comments below describe only what
 * is visible.
 */
5738 options(unsigned char *cp)
5742 unsigned char *cp_back = NULL;
/* Skip up to and including the first '-'. */
5747 while(*cp && *cp++!='-');
/* cp_back is presumably the position to resume at after an alias has been consumed -- TODO confirm */
5748 while (*cp || cp_back) {
5756 case '-': /* literal options */
5757 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* Match cp against each table name, comparing up to '=' or the end of the name. */
5761 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5762 p = (unsigned char *)long_option[i].name;
5763 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5764 if (*p == cp[j] || cp[j] == SP){
5771 #if !defined(PERL_XS) && !defined(WIN32DLL)
5772 fprintf(stderr, "unknown long option: --%s\n", cp);
/* Advance cp to the end of this long option (next space or terminator). */
5776 while(*cp && *cp != SP && cp++);
5777 if (long_option[i].alias[0]){
5779 cp = (unsigned char *)long_option[i].alias;
/* From here on p is presumably the option's argument text; its assignment is not visible -- TODO confirm */
5781 if (strcmp(long_option[i].name, "ic=") == 0){
5782 enc = nkf_enc_find((char *)p);
5784 input_encoding = enc;
5787 if (strcmp(long_option[i].name, "oc=") == 0){
5788 enc = nkf_enc_find((char *)p);
5789 /* if (enc <= 0) continue; */
5791 output_encoding = enc;
5794 if (strcmp(long_option[i].name, "guess=") == 0){
5795 if (p[0] == '0' || p[0] == '1') {
/* --overwrite keeps the original timestamp; --in-place does not. */
5803 if (strcmp(long_option[i].name, "overwrite") == 0){
5806 preserve_time_f = TRUE;
5809 if (strcmp(long_option[i].name, "overwrite=") == 0){
5812 preserve_time_f = TRUE;
/* NOTE(review): no NULL check on malloc() is visible before strcpy() -- confirm */
5814 backup_suffix = malloc(strlen((char *) p) + 1);
5815 strcpy(backup_suffix, (char *) p);
5818 if (strcmp(long_option[i].name, "in-place") == 0){
5821 preserve_time_f = FALSE;
5824 if (strcmp(long_option[i].name, "in-place=") == 0){
5827 preserve_time_f = FALSE;
/* NOTE(review): no NULL check on malloc() is visible before strcpy() -- confirm */
5829 backup_suffix = malloc(strlen((char *) p) + 1);
5830 strcpy(backup_suffix, (char *) p);
5835 if (strcmp(long_option[i].name, "cap-input") == 0){
5839 if (strcmp(long_option[i].name, "url-input") == 0){
5844 #ifdef NUMCHAR_OPTION
5845 if (strcmp(long_option[i].name, "numchar-input") == 0){
5851 if (strcmp(long_option[i].name, "no-output") == 0){
5855 if (strcmp(long_option[i].name, "debug") == 0){
5860 if (strcmp(long_option[i].name, "cp932") == 0){
5861 #ifdef SHIFTJIS_CP932
5865 #ifdef UTF8_OUTPUT_ENABLE
5866 ms_ucs_map_f = UCS_MAP_CP932;
5870 if (strcmp(long_option[i].name, "no-cp932") == 0){
5871 #ifdef SHIFTJIS_CP932
5875 #ifdef UTF8_OUTPUT_ENABLE
5876 ms_ucs_map_f = UCS_MAP_ASCII;
5880 #ifdef SHIFTJIS_CP932
5881 if (strcmp(long_option[i].name, "cp932inv") == 0){
5888 if (strcmp(long_option[i].name, "x0212") == 0){
5895 if (strcmp(long_option[i].name, "exec-in") == 0){
5899 if (strcmp(long_option[i].name, "exec-out") == 0){
5904 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5905 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5906 no_cp932ext_f = TRUE;
5909 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5910 no_best_fit_chars_f = TRUE;
/* --fb-*: select the fallback handler stored in encode_fallback. */
5913 if (strcmp(long_option[i].name, "fb-skip") == 0){
5914 encode_fallback = NULL;
5917 if (strcmp(long_option[i].name, "fb-html") == 0){
5918 encode_fallback = encode_fallback_html;
5921 if (strcmp(long_option[i].name, "fb-xml") == 0){
5922 encode_fallback = encode_fallback_xml;
5925 if (strcmp(long_option[i].name, "fb-java") == 0){
5926 encode_fallback = encode_fallback_java;
5929 if (strcmp(long_option[i].name, "fb-perl") == 0){
5930 encode_fallback = encode_fallback_perl;
5933 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5934 encode_fallback = encode_fallback_subchar;
/*
 * --fb-subchar=N: N is parsed as decimal, hexadecimal (second character
 * 'x' or 'X') or octal, then passed through w16e_conv() and repacked as
 * i<<8 | j.
 * NOTE(review): the digit loops and w16e_conv() reuse `i`, which also
 * indexes long_option above -- confirm long_option[i] is not read afterwards.
 */
5937 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5938 encode_fallback = encode_fallback_subchar;
5939 unicode_subchar = 0;
5941 /* decimal number */
5942 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5943 unicode_subchar *= 10;
5944 unicode_subchar += hex2bin(p[i]);
5946 }else if(p[1] == 'x' || p[1] == 'X'){
5947 /* hexadecimal number */
5948 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5949 unicode_subchar <<= 4;
5950 unicode_subchar |= hex2bin(p[i]);
5954 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5955 unicode_subchar *= 8;
5956 unicode_subchar += hex2bin(p[i]);
5959 w16e_conv(unicode_subchar, &i, &j);
5960 unicode_subchar = i<<8 | j;
5964 #ifdef UTF8_OUTPUT_ENABLE
5965 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5966 ms_ucs_map_f = UCS_MAP_MS;
5970 #ifdef UNICODE_NORMALIZATION
5971 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=XYZ...: every graphic character after the first maps to the first one in prefix_table. */
5976 if (strcmp(long_option[i].name, "prefix=") == 0){
5977 if (nkf_isgraph(p[0])){
5978 for (i = 1; nkf_isgraph(p[i]); i++){
5979 prefix_table[p[i]] = p[0];
5984 #if !defined(PERL_XS) && !defined(WIN32DLL)
5985 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5990 case 'b': /* buffered mode */
5993 case 'u': /* non buffered mode */
5996 case 't': /* transparent mode */
6001 } else if (*cp=='2') {
6005 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6013 case 'j': /* JIS output */
6015 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6017 case 'e': /* AT&T EUC output */
6018 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6020 case 's': /* SJIS output */
6021 output_encoding = nkf_enc_from_index(WINDOWS_31J);
6023 case 'l': /* ISO8859 Latin-1 support, no conversion */
6024 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6025 input_encoding = nkf_enc_from_index(ISO_8859_1);
6027 case 'i': /* Kanji IN ESC-$-@/B */
6028 if (*cp=='@'||*cp=='B')
6029 kanji_intro = *cp++;
6031 case 'o': /* ASCII IN ESC-(-J/B */
6032 if (*cp=='J'||*cp=='B'||*cp=='H')
6033 ascii_intro = *cp++;
6037 bit:1 katakana->hiragana
6038 bit:2 hiragana->katakana
/* A following digit is OR-ed into hira_f as a value, not as a bit position. */
6040 if ('9'>= *cp && *cp>='0')
6041 hira_f |= (*cp++ -'0');
6048 #if defined(MSDOS) || defined(__OS2__)
6055 show_configuration();
6063 #ifdef UTF8_OUTPUT_ENABLE
6064 case 'w': /* UTF-8 output */
6069 output_encoding = nkf_enc_from_index(UTF_8N);
6071 output_bom_f = TRUE;
6072 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6076 if ('1'== cp[0] && '6'==cp[1]) {
6079 } else if ('3'== cp[0] && '2'==cp[1]) {
6083 output_encoding = nkf_enc_from_index(UTF_8);
6088 output_endian = ENDIAN_LITTLE;
6089 } else if (cp[0] == 'B') {
6092 output_encoding = nkf_enc_from_index(enc_idx);
/* Resolve UTF-16/UTF-32 to the endian-specific encoding, without or with BOM. */
6097 enc_idx = enc_idx == UTF_16
6098 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6099 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6101 output_bom_f = TRUE;
6102 enc_idx = enc_idx == UTF_16
6103 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6104 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6106 output_encoding = nkf_enc_from_index(enc_idx);
6110 #ifdef UTF8_INPUT_ENABLE
6111 case 'W': /* UTF input */
6114 input_encoding = nkf_enc_from_index(UTF_8);
6117 if ('1'== cp[0] && '6'==cp[1]) {
6119 input_endian = ENDIAN_BIG;
6121 } else if ('3'== cp[0] && '2'==cp[1]) {
6123 input_endian = ENDIAN_BIG;
6126 input_encoding = nkf_enc_from_index(UTF_8);
6131 input_endian = ENDIAN_LITTLE;
6132 } else if (cp[0] == 'B') {
6134 input_endian = ENDIAN_BIG;
6136 enc_idx = (enc_idx == UTF_16
6137 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6138 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6139 input_encoding = nkf_enc_from_index(enc_idx);
6143 /* Input code assumption */
6144 case 'J': /* ISO-2022-JP input */
6145 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6147 case 'E': /* EUC-JP input */
6148 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6150 case 'S': /* Windows-31J input */
6151 input_encoding = nkf_enc_from_index(WINDOWS_31J);
6153 case 'Z': /* Convert X0208 alphabet to ASCII */
6155 bit:0 Convert JIS X 0208 Alphabet to ASCII
6156 bit:1 Convert Kankaku to one space
6157 bit:2 Convert Kankaku to two spaces
6158 bit:3 Convert HTML Entity
6159 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
/* Each digit d sets bit d of alpha_f; with no digit, bit 0 is set. */
6161 while ('0'<= *cp && *cp <='9') {
6162 alpha_f |= 1 << (*cp++ - '0');
6164 if (!alpha_f) alpha_f = 1;
6166 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6167 x0201_f = FALSE; /* No X0201->X0208 conversion */
6169 ESC-(-I in JIS, EUC, MS Kanji
6170 SI/SO in JIS, EUC, MS Kanji
6171 SS2 in EUC, JIS, not in MS Kanji
6172 MS Kanji (0xa0-0xdf)
6174 ESC-(-I in JIS (0x20-0x5f)
6175 SS2 in EUC (0xa0-0xdf)
6176 0xa0-0xd in MS Kanji (0xa0-0xdf)
6179 case 'X': /* Convert X0201 kana to X0208 */
6182 case 'F': /* preserve new lines */
6183 fold_preserve_f = TRUE;
/* falls through: -F is -f with fold_preserve_f set */
6184 case 'f': /* folding -f60 or -f */
6187 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6189 fold_len += *cp++ - '0';
/* A fold length outside the open range (0, BUFSIZ) is replaced by DEFAULT_FOLD. */
6191 if (!(0<fold_len && fold_len<BUFSIZ))
6192 fold_len = DEFAULT_FOLD;
6196 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6198 fold_margin += *cp++ - '0';
6202 case 'm': /* MIME support */
6203 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6204 if (*cp=='B'||*cp=='Q') {
6205 mime_decode_mode = *cp++;
6206 mimebuf_f = FIXED_MIME;
6207 } else if (*cp=='N') {
6208 mime_f = TRUE; cp++;
6209 } else if (*cp=='S') {
6210 mime_f = STRICT_MIME; cp++;
6211 } else if (*cp=='0') {
6212 mime_decode_f = FALSE;
6213 mime_f = FALSE; cp++;
6215 mime_f = STRICT_MIME;
6218 case 'M': /* MIME output */
6221 mimeout_f = FIXED_MIME; cp++;
6222 } else if (*cp=='Q') {
6224 mimeout_f = FIXED_MIME; cp++;
6229 case 'B': /* Broken JIS support */
6231 bit:1 allow any x on ESC-(-x or ESC-$-x
6232 bit:2 reset to ascii on NL
/* A following digit d sets bit d of broken_f. */
6234 if ('9'>= *cp && *cp>='0')
6235 broken_f |= 1<<(*cp++ -'0');
6240 case 'O':/* for Output file */
6244 case 'c':/* add cr code */
6247 case 'd':/* delete cr code */
6250 case 'I': /* ISO-2022-JP output */
6253 case 'L': /* line mode */
6254 if (*cp=='u') { /* unix */
6255 eolmode_f = LF; cp++;
6256 } else if (*cp=='m') { /* mac */
6257 eolmode_f = CR; cp++;
6258 } else if (*cp=='w') { /* windows */
6259 eolmode_f = CRLF; cp++;
6260 } else if (*cp=='0') { /* no conversion */
6261 eolmode_f = 0; cp++;
6266 if ('2' <= *cp && *cp <= '9') {
6269 } else if (*cp == '0' || *cp == '1') {
6278 /* multiple options in a string are allowed for the Perl module */
6279 while(*cp && *cp++!='-');
6282 #if !defined(PERL_XS) && !defined(WIN32DLL)
6283 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6285 /* bogus option but ignored */
6293 #include "nkf32dll.c"
6294 #elif defined(PERL_XS)
6295 #else /* WIN32DLL */
/*
 * main: command-line entry point (not built for WIN32DLL or PERL_XS).
 *
 * Processes the leading "-..." arguments, then converts either stdin or each
 * named file with kanji_convert(). When output goes to a file, stdout is
 * redirected to a temporary file next to the original, which afterwards
 * receives the original's mode (and optionally its timestamp) and is renamed
 * over the original; the original is first renamed to a backup name when a
 * backup file name is produced.
 *
 * NOTE(review): this excerpt is missing many lines (braces, declarations,
 * conditions, return statements), so the comments below describe only what is
 * visible.
 */
6297 main(int argc, char **argv)
6302 char *outfname = NULL;
6305 #ifdef EASYWIN /*Easy Win */
6306 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6308 #ifdef DEFAULT_CODE_LOCALE
6309 setlocale(LC_CTYPE, "");
/* Consume every leading argument that starts with '-'. */
6311 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6312 cp = (unsigned char *)*argv;
/* Runs the remaining arguments as a child command connected through a pipe. */
6317 if (pipe(fds) < 0 || (pid = fork()) < 0){
6328 execvp(argv[1], &argv[1]);
/* Several flags are saved and restored around a step that is not visible here (presumably a re-initialisation -- TODO confirm). */
6345 int debug_f_back = debug_f;
6348 int exec_f_back = exec_f;
6351 int x0212_f_back = x0212_f;
6353 int x0213_f_back = x0213_f;
6354 int guess_f_back = guess_f;
6356 guess_f = guess_f_back;
6359 debug_f = debug_f_back;
6362 exec_f = exec_f_back;
6364 x0212_f = x0212_f_back;
6365 x0213_f = x0213_f_back;
6368 if (binmode_f == TRUE)
6369 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6370 if (freopen("","wb",stdout) == NULL)
6377 setbuf(stdout, (char *) NULL);
6379 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6382 if (binmode_f == TRUE)
6383 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6384 if (freopen("","rb",stdin) == NULL) return (-1);
6388 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* Convert standard input (presumably the no-file-arguments path -- TODO confirm). */
6392 kanji_convert(stdin);
6393 if (guess_f) print_guessed_code(NULL);
6397 int is_argument_error = FALSE;
6399 input_codename = NULL;
6402 iconv_for_check = 0;
/* A file that cannot be opened is recorded in is_argument_error. */
6404 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6406 is_argument_error = TRUE;
6414 /* reopen file for stdout */
6415 if (file_out_f == TRUE) {
/* The temporary name starts from a copy of the original file name. */
6418 outfname = malloc(strlen(origfname)
6419 + strlen(".nkftmpXXXXXX")
6425 strcpy(outfname, origfname);
/* Scan backwards for the last path separator ('/' or '\\'). */
6429 for (i = strlen(outfname); i; --i){
6430 if (outfname[i - 1] == '/'
6431 || outfname[i - 1] == '\\'){
/* NOTE(review): open() does not expand the XXXXXX template itself; presumably a template-expanding call sits on a line not visible here -- confirm */
6437 strcat(outfname, "ntXXXXXX");
6439 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6440 S_IREAD | S_IWRITE);
6442 strcat(outfname, ".nkftmpXXXXXX");
6443 fd = mkstemp(outfname);
/* Keep a duplicate of stdout in fd_backup, then point stdout at the temporary file. */
6446 || (fd_backup = dup(fileno(stdout))) < 0
6447 || dup2(fd, fileno(stdout)) < 0
6458 outfname = "nkf.out";
6461 if(freopen(outfname, "w", stdout) == NULL) {
6465 if (binmode_f == TRUE) {
6466 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6467 if (freopen("","wb",stdout) == NULL)
6474 if (binmode_f == TRUE)
6475 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6476 if (freopen("","rb",fin) == NULL)
6481 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
/* The file name is passed to print_guessed_code() only when more than one file is processed. */
6485 char *filename = NULL;
6487 if (nfiles > 1) filename = origfname;
6488 if (guess_f) print_guessed_code(filename);
6494 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Point stdout back at the descriptor saved in fd_backup. */
6502 if (dup2(fd_backup, fileno(stdout)) < 0){
6505 if (stat(origfname, &sb)) {
6506 fprintf(stderr, "Can't stat %s\n", origfname);
6508 /* Restore permissions: apply the original file's mode to the output file. */
6509 if (chmod(outfname, sb.st_mode)) {
6510 fprintf(stderr, "Can't set permission %s\n", outfname);
6513 /* Restore the timestamp: apply the original file's times to the output file. */
6514 if(preserve_time_f){
6515 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6516 tb[0] = tb[1] = sb.st_mtime;
6517 if (utime(outfname, tb)) {
6518 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6521 tb.actime = sb.st_atime;
6522 tb.modtime = sb.st_mtime;
6523 if (utime(outfname, &tb)) {
6524 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* Move the original aside to its backup name; the condition guarding this step is not visible. */
6529 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6531 unlink(backup_filename);
6533 if (rename(origfname, backup_filename)) {
6534 perror(backup_filename);
6535 fprintf(stderr, "Can't rename %s to %s\n",
6536 origfname, backup_filename);
6538 free(backup_filename);
6541 if (unlink(origfname)){
/* Finally move the converted temporary file into the original's place. */
6546 if (rename(outfname, origfname)) {
6548 fprintf(stderr, "Can't rename %s to %s\n",
6549 outfname, origfname);
6556 if (is_argument_error)
6559 #ifdef EASYWIN /*Easy Win */
6560 if (file_out_f == FALSE)
6561 scanf("%d",&end_check);
6564 #else /* for Other OS */
6565 if (file_out_f == TRUE)
6567 #endif /*Easy Win */
6570 #endif /* WIN32DLL */