1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.176 2008/02/08 11:37:12 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-02-07"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
45 /* state of output_mode and input_mode
124 NKF_ENCODING_TABLE_SIZE,
125 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
126 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
127 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
128 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
129 JIS_X_0208 = 0x1168, /* @B */
130 JIS_X_0212 = 0x1159, /* D */
131 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
132 JIS_X_0213_2 = 0x1229, /* P */
133 JIS_X_0213_1 = 0x1233, /* Q */
136 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
139 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
140 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
141 void j_oconv(nkf_char c2, nkf_char c1);
142 void s_oconv(nkf_char c2, nkf_char c1);
143 void e_oconv(nkf_char c2, nkf_char c1);
144 void w_oconv(nkf_char c2, nkf_char c1);
145 void w_oconv16(nkf_char c2, nkf_char c1);
146 void w_oconv32(nkf_char c2, nkf_char c1);
150 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
151 void (*oconv)(nkf_char c2, nkf_char c1);
152 } nkf_native_encoding;
154 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
155 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
156 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
157 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
158 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
159 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
160 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
165 const nkf_native_encoding *base_encoding;
168 nkf_encoding nkf_encoding_table[] = {
169 {ASCII, "US-ASCII", &NkfEncodingASCII},
170 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
171 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
172 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
173 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
174 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
177 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
178 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
179 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
180 {CP10001, "CP10001", &NkfEncodingShift_JIS},
181 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
182 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
183 {CP51932, "CP51932", &NkfEncodingEUC_JP},
184 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
185 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
186 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
187 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
188 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
189 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
190 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
191 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
192 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
193 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
194 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
195 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
196 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
197 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
198 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
199 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
200 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
201 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
202 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
203 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
204 {BINARY, "BINARY", &NkfEncodingASCII},
211 } encoding_name_to_id_table[] = {
214 {"ISO-2022-JP", ISO_2022_JP},
215 {"ISO2022JP-CP932", CP50220},
216 {"CP50220", CP50220},
217 {"CP50221", CP50221},
218 {"CSISO2022JP", CP50221},
219 {"CP50222", CP50222},
220 {"ISO-2022-JP-1", ISO_2022_JP_1},
221 {"ISO-2022-JP-3", ISO_2022_JP_3},
222 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
223 {"SHIFT_JIS", SHIFT_JIS},
225 {"WINDOWS-31J", WINDOWS_31J},
226 {"CSWINDOWS31J", WINDOWS_31J},
227 {"CP932", WINDOWS_31J},
228 {"MS932", WINDOWS_31J},
229 {"CP10001", CP10001},
232 {"EUCJP-NKF", EUCJP_NKF},
233 {"CP51932", CP51932},
234 {"EUC-JP-MS", EUCJP_MS},
235 {"EUCJP-MS", EUCJP_MS},
236 {"EUCJPMS", EUCJP_MS},
237 {"EUC-JP-ASCII", EUCJP_ASCII},
238 {"EUCJP-ASCII", EUCJP_ASCII},
239 {"SHIFT_JISX0213", SHIFT_JISX0213},
240 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
241 {"EUC-JISX0213", EUC_JISX0213},
242 {"EUC-JIS-2004", EUC_JIS_2004},
245 {"UTF-8-BOM", UTF_8_BOM},
246 {"UTF8-MAC", UTF8_MAC},
247 {"UTF-8-MAC", UTF8_MAC},
249 {"UTF-16BE", UTF_16BE},
250 {"UTF-16BE-BOM", UTF_16BE_BOM},
251 {"UTF-16LE", UTF_16LE},
252 {"UTF-16LE-BOM", UTF_16LE_BOM},
254 {"UTF-32BE", UTF_32BE},
255 {"UTF-32BE-BOM", UTF_32BE_BOM},
256 {"UTF-32LE", UTF_32LE},
257 {"UTF-32LE-BOM", UTF_32LE_BOM},
262 #if defined(DEFAULT_CODE_JIS)
263 #define DEFAULT_ENCIDX ISO_2022_JP
264 #elif defined(DEFAULT_CODE_SJIS)
265 #define DEFAULT_ENCIDX SHIFT_JIS
266 #elif defined(DEFAULT_CODE_EUC)
267 #define DEFAULT_ENCIDX EUC_JP
268 #elif defined(DEFAULT_CODE_UTF8)
269 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII character classification and conversion helpers.
 *
 * Each macro compares its argument against plain character literals (or
 * against the SP/TAB/CR/LF/DEL/SS3/CP932_TABLE_* constants defined
 * elsewhere in this file).  Every use of a macro argument is parenthesised
 * so that an arbitrary expression (e.g. "a | b" or "x ? y : z") can be
 * passed without changing the meaning of the expansion.
 *
 * NOTE: arguments are still evaluated more than once; do not pass
 * expressions with side effects (c++, getc(), ...).
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit; any other character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF and the characters between SP and DEL (exclusive)
 * other than ? = _ ( ) . and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
299 #define HOLD_SIZE 1024
300 #if defined(INT_IS_SHORT)
301 #define IOBUF_SIZE 2048
303 #define IOBUF_SIZE 16384
306 #define DEFAULT_J 'B'
307 #define DEFAULT_R 'B'
314 /* MIME preprocessor */
316 #ifdef EASYWIN /*Easy Win */
317 extern POINT _BufferSize;
326 void (*status_func)(struct input_code *, nkf_char);
327 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
331 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
332 static nkf_encoding *input_encoding = NULL;
333 static nkf_encoding *output_encoding = NULL;
335 static int kanji_convert(FILE *f);
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* rot13/47 mode */
388 static int hira_f = FALSE; /* hira/kata henkan */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
400 static int nfc_f = FALSE;
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * nkf_char tagging helpers.
 *
 * The bits selected by CLASS_MASK carry a class tag and the bits selected
 * by VALUE_MASK carry the value; CLASS_UNICODE is the tag tested by
 * nkf_char_unicode_p().  PREFIX_EUCG3 is OR-ed into a value by
 * nkf_char_euc3_new().
 *
 * UNICODE_BMP_MAX and UNICODE_MAX are already wrapped in NKF_INT32_C, so
 * the predicates below use them directly.  Wrapping them a second time is
 * redundant and, if NKF_INT32_C (defined outside this chunk) is implemented
 * by pasting a suffix onto its argument, would paste onto the macro *name*
 * and fail to compile -- NOTE(review): confirm against NKF_INT32_C.
 *
 * Macro arguments are parenthesised so expressions may be passed safely.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
428 static int numchar_f = FALSE;
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
475 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
476 static int base64_count = 0;
478 /* X0208 -> ASCII converter */
481 static int f_line = 0; /* chars in line */
482 static int f_prev = 0;
483 static int fold_preserve_f = FALSE; /* preserve new lines */
484 static int fold_f = FALSE;
485 static int fold_len = 0;
488 static unsigned char kanji_intro = DEFAULT_J;
489 static unsigned char ascii_intro = DEFAULT_R;
493 #define FOLD_MARGIN 10
494 #define DEFAULT_FOLD 60
496 static int fold_margin = FOLD_MARGIN;
498 /* process default */
500 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
502 fprintf(stderr,"nkf internal module connection failure.\n");
507 void no_connection(nkf_char c2, nkf_char c1)
509 no_connection2(c2,c1,0);
512 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
513 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
515 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
516 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
517 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
518 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
523 /* static redirections */
525 static void (*o_putc)(nkf_char c) = std_putc;
527 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
528 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
530 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
531 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
533 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
535 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
536 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
538 /* for strict mime */
539 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
540 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
543 static int output_mode = ASCII; /* output kanji mode */
544 static int input_mode = ASCII; /* input kanji mode */
545 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
547 /* X0201 / X0208 conversion tables */
549 /* X0201 kana conversion table */
551 static const unsigned char cv[]= {
552 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
553 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
554 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
555 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
556 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
557 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
558 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
559 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
560 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
561 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
562 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
563 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
564 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
565 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
566 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
567 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
571 /* X0201 kana conversion table for dakuten */
573 static const unsigned char dv[]= {
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
578 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
579 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
580 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
581 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
582 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
583 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
584 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
585 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 /* X0201 kana conversion table for handakuten */
594 static const unsigned char ev[]= {
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
606 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 /* X0208 kigou conversion table */
615 /* 0x8140 - 0x819e */
616 static const unsigned char fv[] = {
618 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
619 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
620 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
622 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
623 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
624 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
626 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
628 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
634 static int option_mode = 0;
635 static int file_out_f = FALSE;
637 static int overwrite_f = FALSE;
638 static int preserve_time_f = FALSE;
639 static int backup_f = FALSE;
640 static char *backup_suffix = "";
643 static int eolmode_f = 0; /* CR, LF, CRLF */
644 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
645 static nkf_char prev_cr = 0; /* CR or 0 */
646 #ifdef EASYWIN /*Easy Win */
647 static int end_check;
650 #define STD_GC_BUFSIZE (256)
651 nkf_char std_gc_buf[STD_GC_BUFSIZE];
654 char* nkf_strcpy(const char *str)
656 char* result = malloc(strlen(str) + 1);
665 static void nkf_str_upcase(const char *src, char *dest, size_t length)
668 for (; i < length && src[i]; i++) {
669 dest[i] = nkf_toupper(src[i]);
674 static nkf_encoding *nkf_enc_from_index(int idx)
676 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
679 return &nkf_encoding_table[idx];
682 static int nkf_enc_find_index(const char *name)
685 if (*name == 'X' && *(name+1) == '-') name += 2;
686 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
687 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
688 return encoding_name_to_id_table[i].id;
694 static nkf_encoding *nkf_enc_find(const char *name)
697 idx = nkf_enc_find_index(name);
698 if (idx < 0) return 0;
699 return nkf_enc_from_index(idx);
702 #define nkf_enc_name(enc) (enc)->name
703 #define nkf_enc_to_index(enc) (enc)->id
704 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
705 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
706 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
707 #define nkf_enc_asciicompat(enc) (\
708 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
709 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
710 #define nkf_enc_unicode_p(enc) (\
711 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
712 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
713 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
714 #define nkf_enc_cp5022x_p(enc) (\
715 nkf_enc_to_index(enc) == CP50220 ||\
716 nkf_enc_to_index(enc) == CP50221 ||\
717 nkf_enc_to_index(enc) == CP50222)
#ifdef DEFAULT_CODE_LOCALE
/*
 * Return the name of the character map of the current locale, or NULL
 * when no supported way of querying it is compiled in.
 *
 * With HAVE_LANGINFO_H the string comes from nl_langinfo(CODESET).
 * On __WIN32__ the name is "CP" followed by the value of GetACP(),
 * formatted into a static buffer (so the result must not be freed and is
 * overwritten by the next call).
 */
static char* nkf_locale_charmap(void)
{
#ifdef HAVE_LANGINFO_H
    return nl_langinfo(CODESET);
#elif defined(__WIN32__)
    /* "CP" + at most 10 decimal digits + NUL fits easily in 16 bytes. */
    static char buf[16];
    sprintf(buf, "CP%u", (unsigned int)GetACP());
    return buf;
#else
    return NULL;
#endif
}

/*
 * Look up the encoding named by nkf_locale_charmap().
 * Returns 0 when the charmap name is unavailable or nkf_enc_find()
 * does not recognise it.
 */
static nkf_encoding* nkf_locale_encoding(void)
{
    nkf_encoding *enc = 0;
    char *encname = nkf_locale_charmap();
    if (encname)
	enc = nkf_enc_find(encname);
    return enc;
}
#endif /* DEFAULT_CODE_LOCALE */
742 static nkf_encoding* nkf_default_encoding()
744 nkf_encoding *enc = 0;
745 #ifdef DEFAULT_CODE_LOCALE
746 enc = nkf_locale_encoding();
748 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
755 #define fprintf dllprintf
760 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
766 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
768 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
769 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
770 #ifdef UTF8_OUTPUT_ENABLE
771 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
773 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
774 #ifdef UTF8_INPUT_ENABLE
775 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
778 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
779 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
780 "r {de/en}crypt ROT13/47\n"
781 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
782 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
783 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
784 "l ISO8859-1 (Latin-1) support\n"
785 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
786 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
787 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
788 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
789 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
790 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
792 "T Text mode output\n"
794 "O Output to File (DEFAULT 'nkf.out')\n"
795 "I Convert non ISO-2022-JP charactor to GETA\n"
796 "d,c Convert line breaks -d: LF -c: CRLF\n"
797 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
798 "v, V Show this usage. V: show configuration\n"
800 "Long name options\n"
801 " --ic=<input codeset> --oc=<output codeset>\n"
802 " Specify the input or output codeset\n"
803 " --fj --unix --mac --windows\n"
804 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
805 " Convert for the system or code\n"
806 " --hiragana --katakana --katakana-hiragana\n"
807 " To Hiragana/Katakana Conversion\n"
808 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
810 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
812 #ifdef NUMCHAR_OPTION
813 " --numchar-input Convert Unicode Character Reference\n"
815 #ifdef UTF8_INPUT_ENABLE
816 " --fb-{skip, html, xml, perl, java, subchar}\n"
817 " Specify how nkf handles unassigned characters\n"
820 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
821 " Overwrite original listed files by filtered result\n"
822 " --overwrite preserves timestamp of original files\n"
824 " -g --guess Guess the input code\n"
825 " --help --version Show this help/the version\n"
826 " For more information, see also man nkf\n"
831 void show_configuration(void)
834 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
837 " Compile-time options:\n"
838 " Compiled at: " __DATE__ " " __TIME__ "\n"
841 " Default output encoding: "
842 #ifdef DEFAULT_CODE_LOCALE
843 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
845 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
851 " Default output end of line: "
852 #if DEFAULT_NEWLINE == CR
854 #elif DEFAULT_NEWLINE == CRLF
860 " Decode MIME encoded string: "
861 #if MIME_DECODE_DEFAULT
867 " Convert JIS X 0201 Katakana: "
874 " --help, --version output: "
875 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build a backup file name from a suffix pattern and a file name.
 *
 * If suffix contains one or more '*' characters, the result is suffix with
 * every '*' replaced by filename.  Otherwise the result is filename
 * followed by suffix.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL (after reporting through perror()) when the allocation
 * fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total_length;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* every one-byte '*' grows into a full copy of filename */
	total_length = (suffix_length - asterisk_count)
	    + asterisk_count * filename_length;
    } else {
	total_length = filename_length + suffix_length;
    }

    /* one allocation, sized for the branch actually taken, plus the NUL */
    backup_filename = malloc(total_length + 1);
    if (!backup_filename) {
	perror("Can't malloc backup filename.");
	return NULL;
    }

    if (asterisk_count) {
	for (i = 0, j = 0; i < suffix_length; i++) {
	    if (suffix[i] == '*') {
		memcpy(backup_filename + j, filename, filename_length);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
    } else {
	memcpy(backup_filename, filename, filename_length);
	memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[total_length] = '\0';

    return backup_filename;
}
925 #ifdef UTF8_INPUT_ENABLE
926 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
933 (*f)(0, bin2hex(c>>shift));
943 void encode_fallback_html(nkf_char c)
948 if(c >= NKF_INT32_C(1000000))
949 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
950 if(c >= NKF_INT32_C(100000))
951 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
953 (*oconv)(0, 0x30+(c/10000 )%10);
955 (*oconv)(0, 0x30+(c/1000 )%10);
957 (*oconv)(0, 0x30+(c/100 )%10);
959 (*oconv)(0, 0x30+(c/10 )%10);
961 (*oconv)(0, 0x30+ c %10);
966 void encode_fallback_xml(nkf_char c)
971 nkf_each_char_to_hex(oconv, c);
976 void encode_fallback_java(nkf_char c)
980 if(!nkf_char_unicode_bmp_p(c)){
984 (*oconv)(0, bin2hex(c>>20));
985 (*oconv)(0, bin2hex(c>>16));
989 (*oconv)(0, bin2hex(c>>12));
990 (*oconv)(0, bin2hex(c>> 8));
991 (*oconv)(0, bin2hex(c>> 4));
992 (*oconv)(0, bin2hex(c ));
996 void encode_fallback_perl(nkf_char c)
1001 nkf_each_char_to_hex(oconv, c);
1006 void encode_fallback_subchar(nkf_char c)
1008 c = unicode_subchar;
1009 (*oconv)((c>>8)&0xFF, c&0xFF);
1014 static const struct {
1038 {"katakana-hiragana","h3"},
1046 #ifdef UTF8_OUTPUT_ENABLE
1056 {"fb-subchar=", ""},
1058 #ifdef UTF8_INPUT_ENABLE
1059 {"utf8-input", "W"},
1060 {"utf16-input", "W16"},
1061 {"no-cp932ext", ""},
1062 {"no-best-fit-chars",""},
1064 #ifdef UNICODE_NORMALIZATION
1065 {"utf8mac-input", ""},
1077 #ifdef NUMCHAR_OPTION
1078 {"numchar-input", ""},
1084 #ifdef SHIFTJIS_CP932
1094 static void set_input_encoding(nkf_encoding *enc)
1096 switch (nkf_enc_to_index(enc)) {
1103 #ifdef SHIFTJIS_CP932
1106 #ifdef UTF8_OUTPUT_ENABLE
1107 ms_ucs_map_f = UCS_MAP_CP932;
1117 case ISO_2022_JP_2004:
1124 #ifdef SHIFTJIS_CP932
1127 #ifdef UTF8_OUTPUT_ENABLE
1128 ms_ucs_map_f = UCS_MAP_CP932;
1133 #ifdef SHIFTJIS_CP932
1136 #ifdef UTF8_OUTPUT_ENABLE
1137 ms_ucs_map_f = UCS_MAP_CP10001;
1145 #ifdef SHIFTJIS_CP932
1148 #ifdef UTF8_OUTPUT_ENABLE
1149 ms_ucs_map_f = UCS_MAP_CP932;
1153 #ifdef SHIFTJIS_CP932
1156 #ifdef UTF8_OUTPUT_ENABLE
1157 ms_ucs_map_f = UCS_MAP_MS;
1161 #ifdef SHIFTJIS_CP932
1164 #ifdef UTF8_OUTPUT_ENABLE
1165 ms_ucs_map_f = UCS_MAP_ASCII;
1168 case SHIFT_JISX0213:
1169 case SHIFT_JIS_2004:
1171 #ifdef SHIFTJIS_CP932
1178 #ifdef SHIFTJIS_CP932
1182 #ifdef UTF8_INPUT_ENABLE
1183 #ifdef UNICODE_NORMALIZATION
1191 input_endian = ENDIAN_BIG;
1195 input_endian = ENDIAN_LITTLE;
1200 input_endian = ENDIAN_BIG;
1204 input_endian = ENDIAN_LITTLE;
1210 static void set_output_encoding(nkf_encoding *enc)
1212 switch (nkf_enc_to_index(enc)) {
1215 #ifdef SHIFTJIS_CP932
1216 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1218 #ifdef UTF8_OUTPUT_ENABLE
1219 ms_ucs_map_f = UCS_MAP_CP932;
1223 #ifdef SHIFTJIS_CP932
1224 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1226 #ifdef UTF8_OUTPUT_ENABLE
1227 ms_ucs_map_f = UCS_MAP_CP932;
1232 #ifdef SHIFTJIS_CP932
1233 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1239 #ifdef SHIFTJIS_CP932
1240 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1246 #ifdef UTF8_OUTPUT_ENABLE
1247 ms_ucs_map_f = UCS_MAP_CP932;
1251 #ifdef UTF8_OUTPUT_ENABLE
1252 ms_ucs_map_f = UCS_MAP_CP10001;
1257 #ifdef SHIFTJIS_CP932
1258 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1260 #ifdef UTF8_OUTPUT_ENABLE
1261 ms_ucs_map_f = UCS_MAP_ASCII;
1266 #ifdef SHIFTJIS_CP932
1267 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1269 #ifdef UTF8_OUTPUT_ENABLE
1270 ms_ucs_map_f = UCS_MAP_ASCII;
1274 #ifdef SHIFTJIS_CP932
1275 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1277 #ifdef UTF8_OUTPUT_ENABLE
1278 ms_ucs_map_f = UCS_MAP_CP932;
1283 #ifdef UTF8_OUTPUT_ENABLE
1284 ms_ucs_map_f = UCS_MAP_MS;
1289 #ifdef UTF8_OUTPUT_ENABLE
1290 ms_ucs_map_f = UCS_MAP_ASCII;
1293 case SHIFT_JISX0213:
1294 case SHIFT_JIS_2004:
1296 #ifdef SHIFTJIS_CP932
1297 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1304 #ifdef SHIFTJIS_CP932
1305 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1308 #ifdef UTF8_OUTPUT_ENABLE
1310 output_bom_f = TRUE;
1314 output_bom_f = TRUE;
1317 output_endian = ENDIAN_LITTLE;
1318 output_bom_f = FALSE;
1321 output_endian = ENDIAN_LITTLE;
1322 output_bom_f = TRUE;
1325 output_bom_f = TRUE;
1328 output_endian = ENDIAN_LITTLE;
1329 output_bom_f = FALSE;
1332 output_endian = ENDIAN_LITTLE;
1333 output_bom_f = TRUE;
1339 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1342 struct input_code *p = input_code_list;
1344 if (iconv_func == p->iconv_func){
1353 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1355 #ifdef INPUT_CODE_FIX
1356 if (f || !input_encoding)
1363 #ifdef INPUT_CODE_FIX
1364 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1370 if (estab_f && iconv_for_check != iconv){
1371 struct input_code *p = find_inputcode_byfunc(iconv);
1373 set_input_codename(p->name);
1376 iconv_for_check = iconv;
1382 nkf_char x0212_shift(nkf_char c)
1387 if (0x75 <= c && c <= 0x7f){
1388 ret = c + (0x109 - 0x75);
1391 if (0x75 <= c && c <= 0x7f){
1392 ret = c + (0x113 - 0x75);
1399 nkf_char x0212_unshift(nkf_char c)
1402 if (0x7f <= c && c <= 0x88){
1403 ret = c + (0x75 - 0x7f);
1404 }else if (0x89 <= c && c <= 0x92){
1405 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1409 #endif /* X0212_ENABLE */
1411 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1417 if((0x21 <= ndx && ndx <= 0x2F)){
1418 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1419 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1421 }else if(0x6E <= ndx && ndx <= 0x7E){
1422 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1423 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1429 else if(nkf_isgraph(ndx)){
1431 const unsigned short *ptr;
1432 ptr = x0212_shiftjis[ndx - 0x21];
1434 val = ptr[(c1 & 0x7f) - 0x21];
1443 c2 = x0212_shift(c2);
1445 #endif /* X0212_ENABLE */
1447 if(0x7F < c2) return 1;
1448 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1449 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1453 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1455 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1458 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1459 #ifdef SHIFTJIS_CP932
1460 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1461 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1468 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1469 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1475 #endif /* SHIFTJIS_CP932 */
1477 if (!x0213_f && is_ibmext_in_sjis(c2)){
1478 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1481 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1494 if(x0213_f && c2 >= 0xF0){
1495 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1496 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1497 }else{ /* 78<=k<=94 */
1498 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1499 if (0x9E < c1) c2++;
1502 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1503 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1504 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1505 if (0x9E < c1) c2++;
1508 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1515 c2 = x0212_unshift(c2);
1522 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the Unicode value val as UTF-8, storing the lead byte in *p1 and
 * the continuation bytes in *p2..*p4.
 * Visible branches: 2-byte form for val < 0x800, 3-byte form for values
 * accepted by nkf_char_unicode_bmp_p(), 4-byte form for values accepted by
 * nkf_char_unicode_value_p().
 */
1523 void nkf_unicode_to_utf8(nkf_char val, int *p1, int *p2, int *p3, int *p4)
1531 }else if (val < 0x800){
1532 *p1 = 0xc0 | (val >> 6);
1533 *p2 = 0x80 | (val & 0x3f);
1536 } else if (nkf_char_unicode_bmp_p(val)) {
1537 *p1 = 0xe0 | (val >> 12);
1538 *p2 = 0x80 | ((val >> 6) & 0x3f);
1539 *p3 = 0x80 | ( val & 0x3f);
1541 } else if (nkf_char_unicode_value_p(val)) {
/*
 * 4-byte form: the three continuation bytes below carry bits 17..0, so the
 * lead byte must be 11110xxx holding bits 20..18 (was 0xe0 | (val >> 16),
 * which produced a 3-byte lead byte with the wrong payload bits).
 */
1542 *p1 = 0xf0 | (val >> 18);
1543 *p2 = 0x80 | ((val >> 12) & 0x3f);
1544 *p3 = 0x80 | ((val >> 6) & 0x3f);
1545 *p4 = 0x80 | ( val & 0x3f);
/*
 * Decode one UTF-8 sequence (c1 = lead byte, c2..c4 = following bytes) into
 * a Unicode value accumulated in wc. The lead byte c1 alone selects the
 * sequence length.
 */
1554 nkf_char nkf_utf8_to_unicode(int c1, int c2, int c3, int c4)
/*
 * 0x80..0xBF are trail bytes and 0xC0/0xC1 can only start overlong forms.
 * 0xC2 and 0xC3 are valid 2-byte lead bytes (U+0080..U+00FF) and must fall
 * through to the 2-byte branch (the cutoff was 0xC3).
 */
1561 else if (c1 <= 0xC1) {
1562 /* trail byte or invalid */
1565 else if (c1 <= 0xDF) {
1567 wc = (c1 & 0x1F) << 6;
1570 else if (c1 <= 0xEF) {
1572 wc = (c1 & 0x0F) << 12;
1573 wc |= (c2 & 0x3F) << 6;
/*
 * 4-byte sequences are identified by the lead byte (0xF0..0xF4). The test
 * used c2, which is a trail byte and is always <= 0xF4.
 */
1576 else if (c1 <= 0xF4) {
1578 wc = (c1 & 0x0F) << 18;
1579 wc |= (c2 & 0x3F) << 12;
1580 wc |= (c3 & 0x3F) << 6;
1590 #ifdef UTF8_INPUT_ENABLE
1591 static int unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1592 const unsigned short *const *pp, nkf_char psize,
1593 nkf_char *p2, nkf_char *p1)
1596 const unsigned short *p;
1599 if (pp == 0) return 1;
1602 if (c1 < 0 || psize <= c1) return 1;
1604 if (p == 0) return 1;
1607 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1609 if (val == 0) return 1;
1610 if (no_cp932ext_f && (
1611 (val>>8) == 0x2D || /* NEC special characters */
1612 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1620 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1627 static nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1629 const unsigned short *const *pp;
1630 const unsigned short *const *const *ppp;
1631 static const char no_best_fit_chars_table_C2[] =
1632 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1633 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1634 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1635 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1636 static const char no_best_fit_chars_table_C2_ms[] =
1637 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1638 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1639 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1640 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1641 static const char no_best_fit_chars_table_932_C2[] =
1642 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1643 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1644 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1645 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1646 static const char no_best_fit_chars_table_932_C3[] =
1647 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1648 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1649 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1650 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1656 }else if(c2 < 0xe0){
1657 if(no_best_fit_chars_f){
1658 if(ms_ucs_map_f == UCS_MAP_CP932){
1661 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1664 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1667 }else if(!cp932inv_f){
1670 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1673 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1676 }else if(ms_ucs_map_f == UCS_MAP_MS){
1677 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1678 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1696 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1697 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1698 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1700 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1701 }else if(c0 < 0xF0){
1702 if(no_best_fit_chars_f){
1703 if(ms_ucs_map_f == UCS_MAP_CP932){
1704 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1705 }else if(ms_ucs_map_f == UCS_MAP_MS){
1710 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1713 if(c0 == 0x92) return 1;
1718 if(c1 == 0x80 || c0 == 0x9C) return 1;
1721 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1726 if(c0 == 0x94) return 1;
1729 if(c0 == 0xBB) return 1;
1739 if(c0 == 0x95) return 1;
1742 if(c0 == 0xA5) return 1;
1749 if(c0 == 0x8D) return 1;
1752 if(c0 == 0x9E && !cp932inv_f) return 1;
1755 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1763 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1764 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1765 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1767 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1769 #ifdef SHIFTJIS_CP932
1770 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1772 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1773 s2e_conv(s2, s1, p2, p1);
1782 #ifdef UTF8_OUTPUT_ENABLE
1783 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
1785 const unsigned short *p;
1787 if (c2 == JIS_X_0201_1976_K) {
1788 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1796 p = euc_to_utf8_1byte;
1798 } else if (is_eucg3(c2)){
1799 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1802 c2 = (c2&0x7f) - 0x21;
1803 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1804 p = x0212_to_utf8_2bytes[c2];
1810 c2 = (c2&0x7f) - 0x21;
1811 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1813 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1814 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1815 euc_to_utf8_2bytes_ms[c2];
1820 c1 = (c1 & 0x7f) - 0x21;
1821 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1827 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1834 }else if (0xc0 <= c2 && c2 <= 0xef) {
1835 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1836 #ifdef NUMCHAR_OPTION
1839 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1847 #ifdef UTF8_INPUT_ENABLE
1848 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1857 else if (nkf_char_unicode_bmp_p(val)){
1858 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1859 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1862 *p1 = nkf_char_unicode_new(val);
1868 *p1 = nkf_char_unicode_new(val);
1874 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1876 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1877 if (iso2022jp_f && !x0201_f) {
1878 c2 = GETA1; c1 = GETA2;
1880 c2 = JIS_X_0201_1976_K;
1884 }else if (c2 == 0x8f){
1888 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1889 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1890 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1893 c2 = (c2 << 8) | (c1 & 0x7f);
1895 #ifdef SHIFTJIS_CP932
1898 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1899 s2e_conv(s2, s1, &c2, &c1);
1906 #endif /* SHIFTJIS_CP932 */
1908 #endif /* X0212_ENABLE */
1909 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
1912 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1913 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1914 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1919 #ifdef SHIFTJIS_CP932
1920 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1922 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1923 s2e_conv(s2, s1, &c2, &c1);
1930 #endif /* SHIFTJIS_CP932 */
1937 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1939 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
1940 if (iso2022jp_f && !x0201_f) {
1941 c2 = GETA1; c1 = GETA2;
1945 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
1947 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
1949 if(c1 == 0x7F) return 0;
1950 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
1953 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
1954 if (ret) return ret;
1960 nkf_char w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
1962 nkf_char ret = 0, c4 = 0;
1963 static const char w_iconv_utf8_1st_byte[] =
1965 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1966 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1967 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
1968 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
1975 if (c1 < 0 || 0xff < c1) {
1976 }else if (c1 == 0) { /* 0 : 1 byte*/
1978 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
1981 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
1983 if (c2 < 0x80 || 0xBF < c2) return 0;
1986 if (c3 == 0) return -1;
1987 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
1992 if (c3 == 0) return -1;
1993 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
1997 if (c3 == 0) return -1;
1998 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2002 if (c3 == 0) return -2;
2003 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2007 if (c3 == 0) return -2;
2008 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2012 if (c3 == 0) return -2;
2013 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2021 if (c1 == 0 || c1 == EOF){
2022 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2023 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2026 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* Error code returned for values that are not convertible code points. */
2034 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * Convert one Unicode value wc to the internal (c2, c1) pair.
 * NOTE(review): the return type is size_t, yet the function returns the
 * negative constant above and the result of w16e_conv(); callers presumably
 * rely on the conversion back to a signed type - confirm.
 */
2035 static size_t unicode_iconv(nkf_char wc)
/* (wc >> 11) == 27 matches exactly 0xD800..0xDFFF, the surrogate range. */
2043 }else if ((wc>>11) == 27) {
2044 /* unpaired surrogate */
2045 return NKF_ICONV_INVALID_CODE_RANGE;
/* Values below 0xFFFF go through the table-driven w16e_conv(). */
2046 }else if (wc < 0xFFFF) {
2047 ret = w16e_conv(wc, &c2, &c1);
2048 if (ret) return ret;
/*
 * Values from 0xFFFF up to but excluding 0x10FFFF are passed through as
 * tagged Unicode values.
 * NOTE(review): the bound is exclusive, so 0x10FFFF itself is not accepted
 * by this branch - confirm this is intended.
 */
2049 }else if (wc < 0x10FFFF) {
2051 c1 = nkf_char_unicode_new(wc);
2053 return NKF_ICONV_INVALID_CODE_RANGE;
/* Return codes asking the caller to supply more input bytes. */
2059 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2060 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/*
 * Combine a surrogate pair into a code point.
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, so the subtraction strips
 * both surrogate bases and adds the 0x10000 supplementary-plane offset.
 */
2061 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * Decode one UTF-16 character from up to four bytes and hand the code point
 * to unicode_iconv(). Which bytes are the high bytes depends on
 * input_endian.
 */
2062 size_t nkf_iconv_utf_16(int c1, int c2, int c3, int c4)
2071 if (input_endian == ENDIAN_BIG) {
/* Big endian: c1 is the high byte; 0xD8..0xDB marks a lead surrogate. */
2072 if (0xD8 <= c1 && c1 <= 0xDB) {
/* A trail surrogate (high byte 0xDC..0xDF) must follow; otherwise ask for two more bytes. */
2073 if (0xDC <= c3 && c3 <= 0xDF) {
2074 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2075 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* Same check with the byte roles swapped: c2 and c4 are the high bytes. */
2080 if (0xD8 <= c2 && c2 <= 0xDB) {
2081 if (0xDC <= c4 && c4 <= 0xDF) {
2082 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2083 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2089 return (*unicode_iconv)(wc);
2092 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2097 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2102 size_t nkf_iconv_utf_32(int c1, int c2, int c3, int c4)
2111 switch(input_endian){
2113 wc = c2 << 16 | c3 << 8 | c4;
2116 wc = c3 << 16 | c2 << 8 | c1;
2119 wc = c1 << 16 | c4 << 8 | c3;
2122 wc = c4 << 16 | c1 << 8 | c2;
2125 return NKF_ICONV_INVALID_CODE_RANGE;
2128 return (*unicode_iconv)(wc);
2132 #define output_ascii_escape_sequence(mode) do { \
2133 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2136 (*o_putc)(ascii_intro); \
2137 output_mode = mode; \
2141 void output_escape_sequence(int mode)
2143 if (output_mode == mode)
2151 case JIS_X_0201_1976_K:
2159 (*o_putc)(kanji_intro);
2183 void j_oconv(nkf_char c2, nkf_char c1)
2185 #ifdef NUMCHAR_OPTION
2186 if (c2 == 0 && nkf_char_unicode_p(c1)){
2187 w16e_conv(c1, &c2, &c1);
2188 if (c2 == 0 && nkf_char_unicode_p(c1)){
2189 c2 = c1 & VALUE_MASK;
2190 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2193 c2 = 0x7F + c1 / 94;
2194 c1 = 0x21 + c1 % 94;
2196 if (encode_fallback) (*encode_fallback)(c1);
2203 output_ascii_escape_sequence(ASCII);
2206 else if (c2 == EOF) {
2207 output_ascii_escape_sequence(ASCII);
2210 else if (c2 == ISO_8859_1) {
2211 output_ascii_escape_sequence(ISO_8859_1);
2214 else if (c2 == JIS_X_0201_1976_K) {
2215 output_escape_sequence(JIS_X_0201_1976_K);
2218 } else if (is_eucg3(c2)){
2219 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2220 (*o_putc)(c2 & 0x7f);
2225 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2226 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2227 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2233 void e_oconv(nkf_char c2, nkf_char c1)
2235 if (c2 == 0 && nkf_char_unicode_p(c1)){
2236 w16e_conv(c1, &c2, &c1);
2237 if (c2 == 0 && nkf_char_unicode_p(c1)){
2238 c2 = c1 & VALUE_MASK;
2239 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2243 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2244 c1 = 0x21 + c1 % 94;
2247 (*o_putc)((c2 & 0x7f) | 0x080);
2248 (*o_putc)(c1 | 0x080);
2250 (*o_putc)((c2 & 0x7f) | 0x080);
2251 (*o_putc)(c1 | 0x080);
2255 if (encode_fallback) (*encode_fallback)(c1);
2263 } else if (c2 == 0) {
2264 output_mode = ASCII;
2266 } else if (c2 == JIS_X_0201_1976_K) {
2267 output_mode = EUC_JP;
2268 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2269 } else if (c2 == ISO_8859_1) {
2270 output_mode = ISO_8859_1;
2271 (*o_putc)(c1 | 0x080);
2273 } else if (is_eucg3(c2)){
2274 output_mode = EUC_JP;
2275 #ifdef SHIFTJIS_CP932
2278 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2279 s2e_conv(s2, s1, &c2, &c1);
2284 output_mode = ASCII;
2286 }else if (is_eucg3(c2)){
2289 (*o_putc)((c2 & 0x7f) | 0x080);
2290 (*o_putc)(c1 | 0x080);
2293 (*o_putc)((c2 & 0x7f) | 0x080);
2294 (*o_putc)(c1 | 0x080);
2298 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2299 set_iconv(FALSE, 0);
2300 return; /* too late to rescue this char */
2302 output_mode = EUC_JP;
2303 (*o_putc)(c2 | 0x080);
2304 (*o_putc)(c1 | 0x080);
2308 void s_oconv(nkf_char c2, nkf_char c1)
2310 #ifdef NUMCHAR_OPTION
2311 if (c2 == 0 && nkf_char_unicode_p(c1)){
2312 w16e_conv(c1, &c2, &c1);
2313 if (c2 == 0 && nkf_char_unicode_p(c1)){
2314 c2 = c1 & VALUE_MASK;
2315 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2318 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2320 c1 += 0x40 + (c1 > 0x3e);
2325 if(encode_fallback)(*encode_fallback)(c1);
2334 } else if (c2 == 0) {
2335 output_mode = ASCII;
2337 } else if (c2 == JIS_X_0201_1976_K) {
2338 output_mode = SHIFT_JIS;
2340 } else if (c2 == ISO_8859_1) {
2341 output_mode = ISO_8859_1;
2342 (*o_putc)(c1 | 0x080);
2344 } else if (is_eucg3(c2)){
2345 output_mode = SHIFT_JIS;
2346 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2352 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2353 set_iconv(FALSE, 0);
2354 return; /* too late to rescue this char */
2356 output_mode = SHIFT_JIS;
2357 e2s_conv(c2, c1, &c2, &c1);
2359 #ifdef SHIFTJIS_CP932
2361 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2362 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2368 #endif /* SHIFTJIS_CP932 */
2371 if (prefix_table[(unsigned char)c1]){
2372 (*o_putc)(prefix_table[(unsigned char)c1]);
2378 #ifdef UTF8_OUTPUT_ENABLE
2379 void w_oconv(nkf_char c2, nkf_char c1)
2385 output_bom_f = FALSE;
2396 if (c2 == 0 && nkf_char_unicode_p(c1)){
2397 val = c1 & VALUE_MASK;
2398 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2400 if (c2) (*o_putc)(c2);
2401 if (c3) (*o_putc)(c3);
2402 if (c4) (*o_putc)(c4);
2409 val = e2w_conv(c2, c1);
2411 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2413 if (c2) (*o_putc)(c2);
2414 if (c3) (*o_putc)(c3);
2415 if (c4) (*o_putc)(c4);
2420 void w_oconv16(nkf_char c2, nkf_char c1)
2423 output_bom_f = FALSE;
2424 if (output_endian == ENDIAN_LITTLE){
2438 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2439 if (nkf_char_unicode_bmp_p(c1)) {
2440 c2 = (c1 >> 8) & 0xff;
2444 if (c1 <= UNICODE_MAX) {
2445 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2446 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2447 if (output_endian == ENDIAN_LITTLE){
2448 (*o_putc)(c2 & 0xff);
2449 (*o_putc)((c2 >> 8) & 0xff);
2450 (*o_putc)(c1 & 0xff);
2451 (*o_putc)((c1 >> 8) & 0xff);
2453 (*o_putc)((c2 >> 8) & 0xff);
2454 (*o_putc)(c2 & 0xff);
2455 (*o_putc)((c1 >> 8) & 0xff);
2456 (*o_putc)(c1 & 0xff);
2462 nkf_char val = e2w_conv(c2, c1);
2463 c2 = (val >> 8) & 0xff;
2467 if (output_endian == ENDIAN_LITTLE){
2476 void w_oconv32(nkf_char c2, nkf_char c1)
2479 output_bom_f = FALSE;
2480 if (output_endian == ENDIAN_LITTLE){
2498 if (c2 == ISO_8859_1) {
2500 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2503 c1 = e2w_conv(c2, c1);
2506 if (output_endian == ENDIAN_LITTLE){
2507 (*o_putc)( c1 & 0xFF);
2508 (*o_putc)((c1 >> 8) & 0xFF);
2509 (*o_putc)((c1 >> 16) & 0xFF);
2513 (*o_putc)((c1 >> 16) & 0xFF);
2514 (*o_putc)((c1 >> 8) & 0xFF);
2515 (*o_putc)( c1 & 0xFF);
2520 #define SCORE_L2 (1) /*
\e$BBh
\e(B2
\e$B?e=`4A;z
\e(B */
2521 #define SCORE_KANA (SCORE_L2 << 1) /*
\e$B$$$o$f$kH>3Q%+%J
\e(B */
2522 #define SCORE_DEPEND (SCORE_KANA << 1) /*
\e$B5!<o0MB8J8;z
\e(B */
2523 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932
\e$B$K$h$kFI$_49$(
\e(B (IBM extended characters) */
2524 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2525 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /*
\e$BB8:_$7$J$$J8;z
\e(B */
2526 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME
\e$B$K$h$k;XDj
\e(B */
2527 #define SCORE_ERROR (SCORE_iMIME << 1) /*
\e$B%(%i!<
\e(B */
2529 #define SCORE_INIT (SCORE_iMIME)
2531 static const char score_table_A0[] = {
2534 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2535 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags indexed by the low nibble of c2; code_score() uses this table
 * when (c2 & 0x70) == 0x70.
 * The element type is unsigned char because SCORE_ERROR is
 * (SCORE_iMIME << 1) == 128, which does not fit in a signed char: with a
 * signed plain char the entry would become negative and OR unrelated high
 * bits into ptr->score in set_code_score().
 */
2538 static const unsigned char score_table_F0[] = {
2539 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2540 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2541 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2542 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Set the given SCORE_* flag bits in ptr->score. */
2545 void set_code_score(struct input_code *ptr, nkf_char score)
2548 ptr->score |= score;
/* Clear the given SCORE_* flag bits in ptr->score. */
2552 void clr_code_score(struct input_code *ptr, nkf_char score)
2555 ptr->score &= ~score;
/*
 * Add penalty flags to ptr->score based on the character held in
 * ptr->buf[0] (c2) and, when UTF8_OUTPUT_ENABLE is defined, ptr->buf[1] (c1).
 * The branches are tried in order; the first match sets one SCORE_* flag.
 */
2559 void code_score(struct input_code *ptr)
2561 nkf_char c2 = ptr->buf[0];
2562 #ifdef UTF8_OUTPUT_ENABLE
2563 nkf_char c1 = ptr->buf[1];
/* NOTE(review): the condition guarding SCORE_ERROR is not visible in this excerpt. */
2566 set_code_score(ptr, SCORE_ERROR);
2567 }else if (c2 == SS2){
2568 set_code_score(ptr, SCORE_KANA);
2569 }else if (c2 == 0x8f){
2570 set_code_score(ptr, SCORE_X0212);
2571 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returning 0 is treated as "no such character". */
2572 }else if (!e2w_conv(c2, c1)){
2573 set_code_score(ptr, SCORE_NO_EXIST);
/* Bits 4..6 of c2 pick the table; the low nibble indexes into it. */
2575 }else if ((c2 & 0x70) == 0x20){
2576 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2577 }else if ((c2 & 0x70) == 0x70){
2578 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2579 }else if ((c2 & 0x70) >= 0x50){
2580 set_code_score(ptr, SCORE_L2);
2584 void status_disable(struct input_code *ptr)
2589 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2592 void status_push_ch(struct input_code *ptr, nkf_char c)
2594 ptr->buf[ptr->index++] = c;
2597 void status_clear(struct input_code *ptr)
2603 void status_reset(struct input_code *ptr)
2606 ptr->score = SCORE_INIT;
2609 void status_reinit(struct input_code *ptr)
2612 ptr->_file_stat = 0;
2615 void status_check(struct input_code *ptr, nkf_char c)
2617 if (c <= DEL && estab_f){
2622 void s_status(struct input_code *ptr, nkf_char c)
2626 status_check(ptr, c);
2631 }else if (nkf_char_unicode_p(c)){
2633 }else if (0xa1 <= c && c <= 0xdf){
2634 status_push_ch(ptr, SS2);
2635 status_push_ch(ptr, c);
2638 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2640 status_push_ch(ptr, c);
2641 }else if (0xed <= c && c <= 0xee){
2643 status_push_ch(ptr, c);
2644 #ifdef SHIFTJIS_CP932
2645 }else if (is_ibmext_in_sjis(c)){
2647 status_push_ch(ptr, c);
2648 #endif /* SHIFTJIS_CP932 */
2650 }else if (0xf0 <= c && c <= 0xfc){
2652 status_push_ch(ptr, c);
2653 #endif /* X0212_ENABLE */
2655 status_disable(ptr);
2659 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2660 status_push_ch(ptr, c);
2661 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2665 status_disable(ptr);
2669 #ifdef SHIFTJIS_CP932
2670 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2671 status_push_ch(ptr, c);
2672 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2673 set_code_score(ptr, SCORE_CP932);
2678 #endif /* SHIFTJIS_CP932 */
2679 status_disable(ptr);
2682 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2683 status_push_ch(ptr, c);
2684 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2685 set_code_score(ptr, SCORE_CP932);
2688 status_disable(ptr);
2694 void e_status(struct input_code *ptr, nkf_char c)
2698 status_check(ptr, c);
2703 }else if (nkf_char_unicode_p(c)){
2705 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2707 status_push_ch(ptr, c);
2709 }else if (0x8f == c){
2711 status_push_ch(ptr, c);
2712 #endif /* X0212_ENABLE */
2714 status_disable(ptr);
2718 if (0xa1 <= c && c <= 0xfe){
2719 status_push_ch(ptr, c);
2723 status_disable(ptr);
2728 if (0xa1 <= c && c <= 0xfe){
2730 status_push_ch(ptr, c);
2732 status_disable(ptr);
2734 #endif /* X0212_ENABLE */
2738 #ifdef UTF8_INPUT_ENABLE
2739 void w_status(struct input_code *ptr, nkf_char c)
2743 status_check(ptr, c);
2748 }else if (nkf_char_unicode_p(c)){
2750 }else if (0xc0 <= c && c <= 0xdf){
2752 status_push_ch(ptr, c);
2753 }else if (0xe0 <= c && c <= 0xef){
2755 status_push_ch(ptr, c);
2756 }else if (0xf0 <= c && c <= 0xf4){
2758 status_push_ch(ptr, c);
2760 status_disable(ptr);
2765 if (0x80 <= c && c <= 0xbf){
2766 status_push_ch(ptr, c);
2767 if (ptr->index > ptr->stat){
2768 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2769 && ptr->buf[2] == 0xbf);
2770 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2771 &ptr->buf[0], &ptr->buf[1]);
2778 status_disable(ptr);
2782 if (0x80 <= c && c <= 0xbf){
2783 if (ptr->index < ptr->stat){
2784 status_push_ch(ptr, c);
2789 status_disable(ptr);
2796 void code_status(nkf_char c)
2798 int action_flag = 1;
2799 struct input_code *result = 0;
2800 struct input_code *p = input_code_list;
2802 if (!p->status_func) {
2806 if (!p->status_func)
2808 (p->status_func)(p, c);
2811 }else if(p->stat == 0){
2822 if (result && !estab_f){
2823 set_iconv(TRUE, result->iconv_func);
2824 }else if (c <= DEL){
2825 struct input_code *ptr = input_code_list;
/*
 * Read one character, taking the most recently pushed-back character from
 * std_gc_buf first.
 * NOTE(review): the guard on std_gc_ndx and the fallback read from f are not
 * visible in this excerpt.
 */
2835 nkf_char std_getc(FILE *f)
2838 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back so that the next std_getc() returns it. The pushback stack is
 * std_gc_buf with std_gc_ndx entries.
 * NOTE(review): what happens when the buffer is full (the body of the check
 * below) is not visible in this excerpt.
 */
2844 nkf_char std_ungetc(nkf_char c, FILE *f)
2846 if (std_gc_ndx == STD_GC_BUFSIZE){
2849 std_gc_buf[std_gc_ndx++] = c;
2854 void std_putc(nkf_char c)
/* Bytes held back while the input encoding is still undetermined. */
2861 static unsigned char hold_buf[HOLD_SIZE*2];
/* Number of bytes currently stored in hold_buf. */
2862 static int hold_count = 0;
/*
 * Append the low byte of c2 to hold_buf.
 * Returns the new hold_count, or EOF when this push filled the buffer.
 * NOTE(review): the statement under the first check is not visible here;
 * presumably it returns EOF without storing when the buffer is already full.
 */
2863 nkf_char push_hold_buf(nkf_char c2)
2865 if (hold_count >= HOLD_SIZE*2)
2867 hold_buf[hold_count++] = (unsigned char)c2;
2868 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Hold input bytes while the encoding is undecided, then replay them
 * through the selected converter.
 *   f      - input stream, read through i_getc.
 *   c1, c2 - characters already read by the caller.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
2871 static int h_conv(FILE *f, int c1, int c2)
2877 /** it must NOT be in the kanji shift sequence */
2878 /** it must NOT be written in JIS7 */
2879 /** and it must be after 2 byte 8bit code */
/* Phase 1: keep reading and buffering. */
2885 while ((c2 = (*i_getc)(f)) != EOF) {
/* Checks for a full hold buffer or an established encoding. */
2891 if (push_hold_buf(c2) == EOF || estab_f) {
/* Look for the candidate with the lowest score among entries that have a status function. */
2897 struct input_code *p = input_code_list;
2898 struct input_code *result = p;
2903 if (p->status_func && p->score < result->score) {
2908 set_iconv(TRUE, result->iconv_func);
2913 ** 1) EOF is detected, or
2914 ** 2) Code is established, or
2915 ** 3) Buffer is FULL (but last word is pushed)
2917 ** in 1) and 3) cases, we continue to use
2918 ** Kanji codes by oconv and leave estab_f unchanged.
/* Phase 2: replay the held bytes through the chosen converter. */
2923 while (hold_index < hold_count){
2924 c1 = hold_buf[hold_index++];
/* With the Shift_JIS converter, 0xa1..0xdf is a single-byte JIS X 0201 kana. */
2928 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
2929 (*iconv)(JIS_X_0201_1976_K, c1, 0);
2932 if (hold_index < hold_count){
2933 c2 = hold_buf[hold_index++];
/*
 * Try a 2-byte conversion first. Further bytes are taken from the hold
 * buffer while it lasts, then from the stream.
 * NOTE(review): the case labels are not visible; presumably the converter's
 * return value selects the 4-byte and 3-byte paths below.
 */
2943 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
2946 if (hold_index < hold_count){
2947 c3 = hold_buf[hold_index++];
2948 } else if ((c3 = (*i_getc)(f)) == EOF) {
2953 if (hold_index < hold_count){
2954 c4 = hold_buf[hold_index++];
2955 } else if ((c4 = (*i_getc)(f)) == EOF) {
/* 4-byte form: the third and fourth bytes are packed into one argument. */
2960 (*iconv)(c1, c2, (c3<<8)|c4);
2965 /* 3 bytes EUC or UTF-8 */
2966 if (hold_index < hold_count){
2967 c3 = hold_buf[hold_index++];
2968 } else if ((c3 = (*i_getc)(f)) == EOF) {
2974 (*iconv)(c1, c2, c3);
2977 if (c3 == EOF) break;
2983 * Check and Ignore BOM
/*
 * Inspect the start of the stream for a byte order mark.
 * When no input encoding was specified, a recognised BOM selects the
 * converter via set_iconv(). When the active converter matches the BOM,
 * input_endian is set where applicable. Otherwise every byte that was read
 * is pushed back with i_ungetc in reverse order so that the stream is
 * unchanged.
 * Recognised prefixes, as seen from the comparisons and push-backs below:
 *   00 00 FE FF  UTF-32 (ENDIAN_BIG)      00 00 FF FE  UTF-32 (ENDIAN_2143)
 *   EF BB BF     UTF-8
 *   FE FF 00 00  UTF-32 (ENDIAN_3412)     FE FF        UTF-16 (ENDIAN_BIG)
 *   FF FE 00 00  UTF-32 (ENDIAN_LITTLE)   FF FE        UTF-16 (ENDIAN_LITTLE)
 * NOTE(review): the case labels and the statements taken when the converter
 * matches are only partly visible in this excerpt.
 */
2985 void check_bom(FILE *f)
2988 switch(c2 = (*i_getc)(f)){
/* First byte 0x00: candidates are 00 00 FE FF and 00 00 FF FE. */
2990 if((c2 = (*i_getc)(f)) == 0x00){
2991 if((c2 = (*i_getc)(f)) == 0xFE){
2992 if((c2 = (*i_getc)(f)) == 0xFF){
2993 if(!input_encoding){
2994 set_iconv(TRUE, w_iconv32);
2996 if (iconv == w_iconv32) {
2997 input_endian = ENDIAN_BIG;
3000 (*i_ungetc)(0xFF,f);
3001 }else (*i_ungetc)(c2,f);
3002 (*i_ungetc)(0xFE,f);
3003 }else if(c2 == 0xFF){
3004 if((c2 = (*i_getc)(f)) == 0xFE){
3005 if(!input_encoding){
3006 set_iconv(TRUE, w_iconv32);
3008 if (iconv == w_iconv32) {
3009 input_endian = ENDIAN_2143;
/*
 * Restore the fourth byte that was just matched, which is 0xFE (this line
 * pushed back 0xFF, turning 00 00 FF FE into 00 00 FF FF).
 */
3012 (*i_ungetc)(0xFE,f);
3013 }else (*i_ungetc)(c2,f);
3014 (*i_ungetc)(0xFF,f);
3015 }else (*i_ungetc)(c2,f);
3016 (*i_ungetc)(0x00,f);
3017 }else (*i_ungetc)(c2,f);
3018 (*i_ungetc)(0x00,f);
/* First byte 0xEF: candidate is the UTF-8 BOM EF BB BF. */
3021 if((c2 = (*i_getc)(f)) == 0xBB){
3022 if((c2 = (*i_getc)(f)) == 0xBF){
3023 if(!input_encoding){
3024 set_iconv(TRUE, w_iconv);
3026 if (iconv == w_iconv) {
3029 (*i_ungetc)(0xBF,f);
3030 }else (*i_ungetc)(c2,f);
3031 (*i_ungetc)(0xBB,f);
3032 }else (*i_ungetc)(c2,f);
3033 (*i_ungetc)(0xEF,f);
/* First byte 0xFE: try FE FF 00 00 (UTF-32) before plain FE FF (UTF-16). */
3036 if((c2 = (*i_getc)(f)) == 0xFF){
3037 if((c2 = (*i_getc)(f)) == 0x00){
3038 if((c2 = (*i_getc)(f)) == 0x00){
3039 if(!input_encoding){
3040 set_iconv(TRUE, w_iconv32);
3042 if (iconv == w_iconv32) {
3043 input_endian = ENDIAN_3412;
3046 (*i_ungetc)(0x00,f);
3047 }else (*i_ungetc)(c2,f);
3048 (*i_ungetc)(0x00,f);
3049 }else (*i_ungetc)(c2,f);
3050 if(!input_encoding){
3051 set_iconv(TRUE, w_iconv16);
3053 if (iconv == w_iconv16) {
3054 input_endian = ENDIAN_BIG;
3057 (*i_ungetc)(0xFF,f);
3058 }else (*i_ungetc)(c2,f);
3059 (*i_ungetc)(0xFE,f);
/* First byte 0xFF: try FF FE 00 00 (UTF-32) before plain FF FE (UTF-16). */
3062 if((c2 = (*i_getc)(f)) == 0xFE){
3063 if((c2 = (*i_getc)(f)) == 0x00){
3064 if((c2 = (*i_getc)(f)) == 0x00){
3065 if(!input_encoding){
3066 set_iconv(TRUE, w_iconv32);
3068 if (iconv == w_iconv32) {
3069 input_endian = ENDIAN_LITTLE;
3072 (*i_ungetc)(0x00,f);
3073 }else (*i_ungetc)(c2,f);
3074 (*i_ungetc)(0x00,f);
3075 }else (*i_ungetc)(c2,f);
3076 if(!input_encoding){
3077 set_iconv(TRUE, w_iconv16);
3079 if (iconv == w_iconv16) {
3080 input_endian = ENDIAN_LITTLE;
3083 (*i_ungetc)(0xFE,f);
3084 }else (*i_ungetc)(c2,f);
3085 (*i_ungetc)(0xFF,f);
3099 static void init_broken_state(void)
3101 memset(&broken_state, 0, sizeof(broken_state));
/*
 * Push one character onto the broken_state pushback buffer.
 * The parameter is declared as nkf_char, the type pop_broken_buf() returns;
 * the old identifier-only parameter list relied on implicit int, which is
 * not valid since C99.
 */
3104 static void push_broken_buf(nkf_char c)
3106 broken_state.buf[broken_state.count++] = c;
3109 static nkf_char pop_broken_buf(void)
3111 return broken_state.buf[--broken_state.count];
3114 nkf_char broken_getc(FILE *f)
3118 if (broken_state.count > 0) {
3119 return pop_broken_buf();
3122 if (c=='$' && broken_state.status != ESC
3123 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3125 broken_state.status = 0;
3126 if (c1=='@'|| c1=='B') {
3127 push_broken_buf(c1);
3134 } else if (c=='(' && broken_state.status != ESC
3135 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3137 broken_state.status = 0;
3138 if (c1=='J'|| c1=='B') {
3139 push_broken_buf(c1);
3147 broken_state.status = c;
3152 nkf_char broken_ungetc(nkf_char c, FILE *f)
3154 if (broken_state.count < 2)
/*
 * End-of-line filter: records which line ending the input uses (when
 * guessing is enabled) and rewrites line endings according to eolmode_f
 * before passing characters on to o_eol_conv.
 * A CR is remembered in prev_cr and only resolved when the next character
 * arrives.
 */
3159 void eol_conv(nkf_char c2, nkf_char c1)
/* Detection: input_eol starts at 0, becomes LF, CRLF or CR, and is set to EOF once styles are mixed. */
3161 if (guess_f && input_eol != EOF) {
3162 if (c2 == 0 && c1 == LF) {
3163 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3164 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3165 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3167 else if (!input_eol) input_eol = CR;
3168 else if (input_eol != CR) input_eol = EOF;
/* Output: a pending CR or a bare LF becomes one line break in the requested style. */
3170 if (prev_cr || (c2 == 0 && c1 == LF)) {
/* eolmode_f == LF emits LF only, == CR emits CR only, anything else emits CR LF. */
3172 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3173 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
/* Hold back a CR; pass through everything else except the LF handled above. */
3175 if (c2 == 0 && c1 == CR) prev_cr = CR;
3176 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3180 Return value of fold_conv()
3182 LF add newline and output char
3183 CR add newline and output nothing
3186 1 (or else) normal output
3188 fold state in prev (previous character)
3190 >0x80 Japanese (X0208/X0201)
3195 This fold algorithm does not preserve leading spaces in a line.
3196 This is the main difference from fmt.
3199 #define char_size(c2,c1) (c2?2:1)
3201 void fold_conv(nkf_char c2, nkf_char c1)
3204 nkf_char fold_state;
3206 if (c1== CR && !fold_preserve_f) {
3207 fold_state=0; /* ignore cr */
3208 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3210 fold_state=0; /* ignore cr */
3211 } else if (c1== BS) {
3212 if (f_line>0) f_line--;
3214 } else if (c2==EOF && f_line != 0) { /* close open last line */
3216 } else if ((c1==LF && !fold_preserve_f)
3217 || ((c1==CR||(c1==LF&&f_prev!=CR))
3218 && fold_preserve_f)) {
3220 if (fold_preserve_f) {
3224 } else if ((f_prev == c1 && !fold_preserve_f)
3225 || (f_prev == LF && fold_preserve_f)
3226 ) { /* duplicate newline */
3229 fold_state = LF; /* output two newline */
3235 if (f_prev&0x80) { /* Japanese? */
3237 fold_state = 0; /* ignore given single newline */
3238 } else if (f_prev==SP) {
3242 if (++f_line<=fold_len)
3246 fold_state = CR; /* fold and output nothing */
3250 } else if (c1=='\f') {
3253 fold_state = LF; /* output newline and clear */
3254 } else if ( (c2==0 && c1==SP)||
3255 (c2==0 && c1==TAB)||
3256 (c2=='!'&& c1=='!')) {
3257 /* X0208 kankaku or ascii space */
3259 fold_state = 0; /* remove duplicate spaces */
3262 if (++f_line<=fold_len)
3263 fold_state = SP; /* output ASCII space only */
3265 f_prev = SP; f_line = 0;
3266 fold_state = CR; /* fold and output nothing */
3270 prev0 = f_prev; /* we still need this one... , but almost done */
3272 if (c2 || c2 == JIS_X_0201_1976_K)
3273 f_prev |= 0x80; /* this is Japanese */
3274 f_line += char_size(c2,c1);
3275 if (f_line<=fold_len) { /* normal case */
3278 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3279 f_line = char_size(c2,c1);
3280 fold_state = LF; /* We can't wait, do fold now */
3281 } else if (c2 == JIS_X_0201_1976_K) {
3282 /* simple kinsoku rules return 1 means no folding */
3283 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3284 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3285 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3286 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3287 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3288 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3289 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3291 fold_state = LF;/* add one new f_line before this character */
3294 fold_state = LF;/* add one new f_line before this character */
3297 /* kinsoku point in ASCII */
3298 if ( c1==')'|| /* { [ ( */
3309 /* just after special */
3310 } else if (!is_alnum(prev0)) {
3311 f_line = char_size(c2,c1);
3313 } else if ((prev0==SP) || /* ignored new f_line */
3314 (prev0==LF)|| /* ignored new f_line */
3315 (prev0&0x80)) { /* X0208 - ASCII */
3316 f_line = char_size(c2,c1);
3317 fold_state = LF;/* add one new f_line before this character */
3319 fold_state = 1; /* default no fold in ASCII */
3323 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3324 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3325 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3326 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3327 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3328 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3329 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3330 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3331 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3332 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3333 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3334 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3335 /* default no fold in kinsoku */
3338 f_line = char_size(c2,c1);
3339 /* add one new f_line before this character */
3342 f_line = char_size(c2,c1);
3344 /* add one new f_line before this character */
3349 /* terminator process */
3350 switch(fold_state) {
3352 OCONV_NEWLINE((*o_fconv));
3358 OCONV_NEWLINE((*o_fconv));
/* Look-behind state for z_conv: the (c2, c1) of the previous character.
   z_conv tests z_prev2 against JIS_X_0201_1976_K and uses z_prev1 as the
   table index of that earlier kana. */
3369 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv -- width-conversion stage of the output chain; every result is
 * passed on through o_zconv.
 *
 * Visible behaviour:
 *  - JIS X 0201 kana in: if the previous character was also X0201 kana and
 *    the current one is a dakuten / handakuten, the combined form is taken
 *    from dv[] / ev[]; otherwise the plain form of the previous kana is taken
 *    from cv[].  Each table is indexed with (ch-SP)*2 and yields a (c2, c1)
 *    pair.  A kana that has a dv[] or ev[] entry is held back ("wait for")
 *    -- presumably by storing it in z_prev2/z_prev1 on lines not visible in
 *    this extract.
 *  - alpha_f bits 1, 4 and 8 select further conversions of JIS X 0208 rows
 *    0x23 (alphabet) and 0x21 (symbols) and an entity replacement for
 *    '>', '<', '"' and '&'.
 *  - JIS X 0208 katakana (row 0x25) can be mapped to JIS X 0201 kana through
 *    fullwidth_to_halfwidth[]: the high byte of an entry is emitted first and
 *    the low byte second (see the two o_zconv calls after the table); the low
 *    bytes used are 0x5E / 0x5F, the same values as (0xde&0x7f) / (0xdf&0x7f)
 *    tested above for dakuten / handakuten.
 * Conditions and statements between the visible lines are missing from this
 * extract, so the exact control flow must be confirmed against the full file.
 */
3371 void z_conv(nkf_char c2, nkf_char c1)
3374 /* if (c2) c1 &= 0x7f; assertion */
3376 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3382 if (z_prev2 == JIS_X_0201_1976_K) {
3383 if (c2 == JIS_X_0201_1976_K) {
3384 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3386 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3388 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3390 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3395 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3397 if (c2 == JIS_X_0201_1976_K) {
3398 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3399 /* wait for dakuten or handakuten */
3404 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3415 if (alpha_f&1 && c2 == 0x23) {
3416 /* JISX0208 Alphabet */
3418 } else if (c2 == 0x21) {
3419 /* JISX0208 Kigou (symbols) */
3424 } else if (alpha_f&4) {
3429 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3435 if (alpha_f&8 && c2 == 0) {
/* NOTE(review): the four entity strings below look HTML-unescaped in this
   copy (one of them is three consecutive double quotes, which is not a valid
   C string literal).  Presumably they should be the entity names for
   '>', '<', '"' and '&' -- confirm against the upstream source. */
3439 case '>': entity = ">"; break;
3440 case '<': entity = "<"; break;
3441 case '\"': entity = """; break;
3442 case '&': entity = "&"; break;
3445 while (*entity) (*o_zconv)(0, *entity++);
3451 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3456 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3460 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3464 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3468 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3472 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3476 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3480 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3484 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3489 (*o_zconv)(JIS_X_0201_1976_K, c);
3492 } else if (c2 == 0x25) {
3493 /* JISX0208 Katakana */
3494 static const int fullwidth_to_halfwidth[] =
3496 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3497 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3498 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3499 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3500 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3501 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3502 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3503 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3504 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3505 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3506 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3507 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3509 if (fullwidth_to_halfwidth[c1-0x20]){
3510 c2 = fullwidth_to_halfwidth[c1-0x20];
3511 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3513 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/*
 * rot13(c) / rot47(c) -- character rotation helpers, and rot_conv, the
 * output-chain stage that uses them.
 *
 * rot13: the visible arms add 13 up to 'M' / 'm' and subtract 13 up to
 *        'Z' / 'z'.
 * rot47: the visible arms add 47 up to 'O' and subtract 47 up to '~'.
 * The lower-bound tests and the fall-through value of both macros are on
 * lines missing from this extract.  Both macros evaluate c more than once,
 * so the argument must be free of side effects.
 *
 * rot_conv: takes the branch below when c2 is 0, JIS_X_0201_1976_K or
 * ISO_8859_1 (the branch body is not visible here) and then forwards the
 * character pair to o_rot_conv.
 */
3523 #define rot13(c) ( \
3525 (c <= 'M') ? (c + 13): \
3526 (c <= 'Z') ? (c - 13): \
3528 (c <= 'm') ? (c + 13): \
3529 (c <= 'z') ? (c - 13): \
3533 #define rot47(c) ( \
3535 ( c <= 'O') ? (c + 47) : \
3536 ( c <= '~') ? (c - 47) : \
3540 void rot_conv(nkf_char c2, nkf_char c1)
3542 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3548 (*o_rot_conv)(c2,c1);
/*
 * hira_conv -- kana conversion stage of the output chain; results are passed
 * on through o_hira_conv.
 *
 * The visible tests come in two groups:
 *  1. c1 in 0x21..0x73, the special case c1 == 0x74 when the output encoding
 *     is Unicode (the character is replaced by U+3094), and row 0x21 cells
 *     0x33 / 0x34;
 *  2. the reverse of the U+3094 case (c2 == 0), row 0x24 with c1 in
 *     0x21..0x73, and row 0x21 cells 0x35 / 0x36.
 * Presumably group 1 maps katakana to hiragana and group 2 hiragana to
 * katakana; the flag tests selecting each group and the assignments that
 * rewrite c2 / c1 are on lines missing from this extract -- TODO confirm.
 */
3551 void hira_conv(nkf_char c2, nkf_char c1)
3555 if (0x20 < c1 && c1 < 0x74) {
3557 (*o_hira_conv)(c2,c1);
3559 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3561 c1 = nkf_char_unicode_new(0x3094);
3562 (*o_hira_conv)(c2,c1);
3565 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3567 (*o_hira_conv)(c2,c1);
3572 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3575 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3577 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3581 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv -- output-chain stage that screens characters before
 * handing them to o_iso2022jp_check_conv.
 *
 * Three tests are visible:
 *  - c2 in 0x00..0x20 together with c1 in 0x7f..0xff;
 *  - c2 in rows 0x29..0x2f or 0x75..0x7e;
 *  - a value c (computed on a line not visible here) falling inside one of
 *    the RANGE_NUM_MAX [start, end] pairs of the local range[] table.
 * What is done when a test matches is on lines missing from this extract;
 * presumably the character is replaced before being forwarded -- TODO confirm.
 */
3585 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3587 #define RANGE_NUM_MAX 18
3588 static const nkf_char range[RANGE_NUM_MAX][2] = {
3609 nkf_char start, end, c;
3611 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3615 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3620 for (i = 0; i < RANGE_NUM_MAX; i++) {
3621 start = range[i][0];
3624 if (c >= start && c <= end) {
3629 (*o_iso2022jp_check_conv)(c2,c1);
3633 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/* Encoded-word prefixes that are recognized ("\075" is the octal escape for
   '=').  The array is walked until a null entry (see the loops over
   mime_pattern[] in mime_begin_strict and open_mime).  The three tables that
   follow are indexed in parallel with this one. */
3635 static const unsigned char *mime_pattern[] = {
3636 (const unsigned char *)"\075?EUC-JP?B?",
3637 (const unsigned char *)"\075?SHIFT_JIS?B?",
3638 (const unsigned char *)"\075?ISO-8859-1?Q?",
3639 (const unsigned char *)"\075?ISO-8859-1?B?",
3640 (const unsigned char *)"\075?ISO-2022-JP?B?",
3641 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3642 #if defined(UTF8_INPUT_ENABLE)
3643 (const unsigned char *)"\075?UTF-8?B?",
3644 (const unsigned char *)"\075?UTF-8?Q?",
3646 (const unsigned char *)"\075?US-ASCII?Q?",
3651 /* Marker used to raise the priority of the matching input code */
3652 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3653 e_iconv, s_iconv, 0, 0, 0, 0,
3654 #if defined(UTF8_INPUT_ENABLE)
/* Output mode associated with each pattern; open_mime compares its mode
   argument against these values to pick the prefix to emit. */
3660 static const nkf_char mime_encode[] = {
3661 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3662 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding letter of each pattern: 'B' or 'Q'. */
3669 static const nkf_char mime_encode_method[] = {
3670 'B', 'B','Q', 'B', 'B', 'Q',
3671 #if defined(UTF8_INPUT_ENABLE)
3679 /* MIME preprocessor fifo */
/* The buffer is addressed through mime_input_buf(n), which masks the index
   with MIME_BUF_MASK; the indices themselves (top, last, input) are therefore
   free-running unsigned counters.  MIME_BUF_SIZE must stay a power of two for
   the mask to work. */
3681 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3682 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3683 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3685 unsigned char buf[MIME_BUF_SIZE];
3687 unsigned int last; /* decoded */
3688 unsigned int input; /* undecoded */
/* Input converter saved by mime_begin_strict and reinstated (then cleared)
   by unswitch_mime_getc. */
3690 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Size of the recovery buffer in mime_begin_strict and upper bound on the
   number of preamble characters examined by mime_begin. */
3692 #define MAXRECOVER 20
/* Push c back onto the front of the MIME FIFO: the top index is moved back
   by one and c is stored there, truncated to unsigned char. */
3694 static void mime_input_buf_unshift(nkf_char c)
3696 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/* ungetc counterpart of mime_getc (installed as i_ungetc by
   switch_mime_getc): the character goes back into the MIME FIFO via
   mime_input_buf_unshift.  f is not used by the visible statement; the
   return statement is on a line missing from this extract. */
3699 nkf_char mime_ungetc(nkf_char c, FILE *f)
3701 mime_input_buf_unshift(c);
/* Push back a character on the undecoded side of the MIME reader.  Two
   actions are visible: delegating to the saved i_mungetc_buf, or moving the
   FIFO's input index back by one and storing c there.  The condition that
   chooses between them is not visible in this extract; presumably it is
   mimebuf_f, mirroring mime_getc_buf -- TODO confirm. */
3705 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
3708 (*i_mungetc_buf)(c,f);
3710 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/* Fetch the next undecoded character: from the underlying reader
   (i_mgetc_buf) when mimebuf_f is non-zero, otherwise from the MIME FIFO at
   the input index, which is then advanced. */
3714 nkf_char mime_getc_buf(FILE *f)
3716 /* we don't keep eof of mime_input_buf, because it contains ?= as
3717 a terminator. It was checked in mime_integrity. */
3718 return ((mimebuf_f)?
3719 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/* Install the MIME reader in front of the current input functions.
   Does nothing if mime_getc is already installed.  The previous
   i_getc / i_ungetc are saved in i_mgetc / i_mungetc.  In STRICT_MIME mode a
   second layer is added: the saved functions move on to
   i_mgetc_buf / i_mungetc_buf and the FIFO-backed
   mime_getc_buf / mime_ungetc_buf take their place. */
3722 void switch_mime_getc(void)
3724 if (i_getc!=mime_getc) {
3725 i_mgetc = i_getc; i_getc = mime_getc;
3726 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3727 if(mime_f==STRICT_MIME) {
3728 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3729 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/* Undo switch_mime_getc: in STRICT_MIME mode first drop the buffered layer,
   then restore i_ungetc from i_mungetc (the matching i_getc restore is on a
   line missing from this extract).  If mime_begin_strict saved an input
   converter in mime_iconv_back, it is reinstated with set_iconv and the
   saved pointer is cleared. */
3734 void unswitch_mime_getc(void)
3736 if(mime_f==STRICT_MIME) {
3737 i_mgetc = i_mgetc_buf;
3738 i_mungetc = i_mungetc_buf;
3741 i_ungetc = i_mungetc;
3742 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3743 mime_iconv_back = NULL;
/*
 * mime_integrity -- pre-scan an encoded-word into the MIME FIFO.
 *   f: input stream, read through i_getc.
 *   p: the header pattern that was matched; it is copied into the FIFO first.
 *
 * The input and last indices are reset to top, the pattern is stored, and q
 * remembers the position just after it.  Characters are then read until EOF
 * or until the ring buffer is full.  When '=' is read while d is '?' (d is
 * presumably the previous character; its assignment is not visible here),
 * the terminator has been found: the input index is rewound to q so that
 * decoding starts after the header.  A character outside
 * '+', '/', '=', '?' and alphanumerics is tested for as well (the action
 * taken is on a missing line).
 * If the scan ends without finding the terminator, the word is treated as
 * incomplete: last is moved up to input, mime_decode_mode is set to 1
 * (replay the buffer without decoding) and switch_mime_getc() installs the
 * buffered reader.  The return statements are not visible in this extract.
 */
3746 nkf_char mime_integrity(FILE *f, const unsigned char *p)
3750 /* In buffered mode, read until =? or NL or buffer full
3752 mime_input_state.input = mime_input_state.top;
3753 mime_input_state.last = mime_input_state.top;
3755 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3757 q = mime_input_state.input;
3758 while((c=(*i_getc)(f))!=EOF) {
3759 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3760 break; /* buffer full */
3762 if (c=='=' && d=='?') {
3763 /* checked. skip header, start decode */
3764 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3765 /* mime_last_input = mime_input_state.input; */
3766 mime_input_state.input = q;
3770 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3772 /* Should we check length mod 4? */
3773 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3776 /* In case of Incomplete MIME, no MIME decode */
3777 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3778 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3779 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3780 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * mime_begin_strict -- strict recognition of an encoded-word header.
 * Called after "=?" has already been consumed.  Input is compared, after
 * nkf_toupper, against mime_pattern[j] starting at offset 2; every character
 * read is also kept in r[] so it can be replayed if no pattern matches.
 * On a mismatch the remaining patterns are searched for one that shares the
 * prefix matched so far and accepts the current character.
 * On success the transfer encoding letter is taken from the pattern, the
 * current input converter is saved in mime_iconv_back and replaced by
 * mime_priority_func[j], and the result of mime_integrity() is returned.
 * The failure path (output from the recovery buffer) is on lines missing
 * from this extract.
 */
3784 nkf_char mime_begin_strict(FILE *f)
3788 const unsigned char *p,*q;
3789 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3791 mime_decode_mode = FALSE;
3792 /* =? has been checked */
3794 p = mime_pattern[j];
3797 for(i=2;p[i]>SP;i++) { /* start at =? */
3798 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3799 /* pattern fails, try next one */
3801 while (mime_pattern[++j]) {
3802 p = mime_pattern[j];
3803 for(k=2;k<i;k++) /* assume length(p) > i */
3804 if (p[k]!=q[k]) break;
3805 if (k==i && nkf_toupper(c1)==p[k]) break;
3807 p = mime_pattern[j];
3808 if (p) continue; /* found next one, continue */
3809 /* all fails, output from recovery buffer */
/* The loop above stops at the pattern's terminator, so p[i-1] is the final
   '?' and p[i-2] the encoding letter ('B' or 'Q'). */
3817 mime_decode_mode = p[i-2];
3819 mime_iconv_back = iconv;
3820 set_iconv(FALSE, mime_priority_func[j]);
3821 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3823 if (mime_decode_mode=='B') {
3824 mimebuf_f = unbuf_f;
3826 /* do MIME integrity check */
3827 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_begin -- lenient recognition of an encoded-word header.
 * Called after "=?" has already been consumed.  Everything read is appended
 * to the MIME FIFO at the last index so that it can be re-read as plain text
 * if the header turns out not to be a MIME preamble.  At most MAXRECOVER
 * characters are examined: a charset part made of alphanumerics, '-', '_'
 * and white space, then the encoding letter (b/B or q/Q), with EOF ending
 * the scan.
 * If no encoding was recognized, mime_decode_mode is set to 1 so that the
 * buffered text is replayed undecoded; otherwise last is reset to its
 * starting value k, discarding the preamble.
 * Returns the last character read (used by the caller only to detect EOF).
 */
3835 nkf_char mime_begin(FILE *f)
3840 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3841 /* re-read and convert again from mime_buffer. */
3843 /* =? has been checked */
3844 k = mime_input_state.last;
3845 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3846 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3847 /* We accept any character type even if it is broken by new lines */
3848 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3849 if (c1==LF||c1==SP||c1==CR||
3850 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3852 /* Failed. But this could be another MIME preamble */
3854 mime_input_state.last--;
3860 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3861 if (!(++i<MAXRECOVER) || c1==EOF) break;
3862 if (c1=='b'||c1=='B') {
3863 mime_decode_mode = 'B';
3864 } else if (c1=='q'||c1=='Q') {
3865 mime_decode_mode = 'Q';
3869 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3870 if (!(++i<MAXRECOVER) || c1==EOF) break;
3872 mime_decode_mode = FALSE;
3878 if (!mime_decode_mode) {
3879 /* false MIME preamble, restart from mime_buffer */
3880 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3881 /* Since we are in MIME mode until buffer becomes empty, */
3882 /* we never go into mime_begin again for a while. */
3885 /* discard mime preamble, and goto MIME mode */
3886 mime_input_state.last = k;
3887 /* do no MIME integrity check */
3888 return c1; /* used only for checking EOF */
3892 void no_putc(nkf_char c)
/* Write a diagnostic line to stderr: str followed by a newline, or the text
   "NULL" when str is a null pointer.  Any condition guarding the output is
   on a line missing from this extract. */
3897 void debug(const char *str)
3900 fprintf(stderr, "%s\n", str ? str : "NULL");
/* Record the name of the detected input encoding.  The first name is stored
   as is (the pointer is kept, not copied).  If a later call supplies a
   different name, input_codename becomes the empty string, which
   get_guessed_code later reports as "BINARY". */
3905 void set_input_codename(char *codename)
3907 if (!input_codename) {
3908 input_codename = codename;
3909 } else if (strcmp(codename, input_codename) != 0) {
3910 input_codename = "";
/*
 * get_guessed_code -- refine input_codename into the name that is reported
 * for the guessed input encoding, and return it.
 *   - empty string (conflicting detections)  -> "BINARY"
 *   - no name recorded                       -> "ASCII"
 *   - "Shift_JIS"   -> "CP932"    when the score has SCORE_DEPEND or SCORE_CP932
 *   - "EUC-JP"      -> "EUCJP-MS" when the score has SCORE_X0212,
 *                      else "CP51932" with SCORE_DEPEND or SCORE_CP932
 *   - "ISO-2022-JP" -> "CP50221"  when the score has SCORE_KANA,
 *                      else "CP50220" with SCORE_DEPEND or SCORE_CP932
 * The score is read from the input_code entry found for the current iconv
 * function.  Note that input_codename itself is overwritten.
 */
3914 static char* get_guessed_code(void)
3916 if (input_codename && !*input_codename) {
3917 input_codename = "BINARY";
3919 struct input_code *p = find_inputcode_byfunc(iconv);
3920 if (!input_codename) {
3921 input_codename = "ASCII";
3922 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
3923 if (p->score & (SCORE_DEPEND|SCORE_CP932))
3924 input_codename = "CP932";
3925 } else if (strcmp(input_codename, "EUC-JP") == 0) {
3926 if (p->score & (SCORE_X0212))
3927 input_codename = "EUCJP-MS";
3928 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3929 input_codename = "CP51932";
3930 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
3931 if (p->score & (SCORE_KANA))
3932 input_codename = "CP50221";
3933 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3934 input_codename = "CP50220";
3937 return input_codename;
/* Print the guessed input encoding on stdout (not built for PERL_XS or
   WIN32DLL).  When filename is non-NULL it is printed first as "name: ".
   The encoding name comes from get_guessed_code(); the end-of-line style in
   input_eol selects one of the suffixes " (CR)", " (LF)", " (CRLF)" or
   " (MIXED NL)".  The statements that print the suffix and the branch taken
   for an empty input_codename are on lines missing from this extract. */
3940 #if !defined(PERL_XS) && !defined(WIN32DLL)
3941 void print_guessed_code(char *filename)
3943 if (filename != NULL) printf("%s: ", filename);
3944 if (input_codename && !*input_codename) {
3947 input_codename = get_guessed_code();
3949 printf("%s\n", input_codename);
3953 input_eol == CR ? " (CR)" :
3954 input_eol == LF ? " (LF)" :
3955 input_eol == CRLF ? " (CRLF)" :
3956 input_eol == EOF ? " (MIXED NL)" :
/*
 * hex_getc -- shared reader for two-digit hexadecimal escapes.
 *   ch: the escape introducer (callers pass ':' or '%').
 *   f : input stream.
 *   g : getc function used to read from f.
 *   u : ungetc function used to push characters back.
 * When the two characters after the introducer are both hex digits, the
 * decoded byte (hex2bin(c2) << 4) | hex2bin(c3) is returned.  The handling
 * of a non-matching first character and of non-hex digits (presumably
 * pushing back via u) is on lines missing from this extract.
 */
3965 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
3967 nkf_char c1, c2, c3;
3973 if (!nkf_isxdigit(c2)){
3978 if (!nkf_isxdigit(c3)){
3983 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Input filter for ':'-introduced hex escapes: reads through hex_getc using
   the saved i_cgetc / i_cungetc pair. */
3986 nkf_char cap_getc(FILE *f)
3988 return hex_getc(':', f, i_cgetc, i_cungetc);
/* ungetc counterpart of cap_getc: forwards to the saved i_cungetc. */
3991 nkf_char cap_ungetc(nkf_char c, FILE *f)
3993 return (*i_cungetc)(c, f);
/* Input filter for '%'-introduced hex escapes: reads through hex_getc using
   the saved i_ugetc / i_uungetc pair. */
3996 nkf_char url_getc(FILE *f)
3998 return hex_getc('%', f, i_ugetc, i_uungetc);
/* ungetc counterpart of url_getc: forwards to the saved i_uungetc. */
4001 nkf_char url_ungetc(nkf_char c, FILE *f)
4003 return (*i_uungetc)(c, f);
/*
 * numchar_getc -- input filter that decodes numeric character references
 * (only built with NUMCHAR_OPTION).  It reads through the saved
 * i_ngetc / i_nungetc pair.
 * Two forms are visible: after 'x' or 'X' up to 7 hexadecimal digits are
 * accumulated, otherwise up to 8 decimal digits.  A successfully parsed
 * value is returned as nkf_char_unicode_new(c).
 * The recognition of the introducer and terminator (presumably "&#" and ';')
 * and the push-back on failure are on lines missing from this extract.
 */
4007 #ifdef NUMCHAR_OPTION
4008 nkf_char numchar_getc(FILE *f)
4010 nkf_char (*g)(FILE *) = i_ngetc;
4011 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4022 if (buf[i] == 'x' || buf[i] == 'X'){
4023 for (j = 0; j < 7; j++){
4025 if (!nkf_isxdigit(buf[i])){
4032 c |= hex2bin(buf[i]);
4035 for (j = 0; j < 8; j++){
4039 if (!nkf_isdigit(buf[i])){
4046 c += hex2bin(buf[i]);
4052 return nkf_char_unicode_new(c);
/* ungetc counterpart of numchar_getc: forwards to the saved i_nungetc. */
4061 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4063 return (*i_nungetc)(c, f);
4067 #ifdef UNICODE_NORMALIZATION
4069 /* Normalization Form C */
/*
 * nfc_getc -- input filter that replaces a decomposed byte sequence by its
 * composed form, reading through the saved i_nfc_getc / i_nfc_ungetc pair.
 * The bytes collected in buf[] are looked up in normalization_table[] with a
 * binary search keyed on each entry's .nfd byte string (lower / upper / j
 * below).  When an entry matches, its .nfc bytes are copied over buf[].
 * The loop is skipped when buf[i] is a continuation byte ((b & 0xc0) == 0x80).
 * How buf[] is filled and how unused bytes are pushed back is on lines
 * missing from this extract.
 */
4070 nkf_char nfc_getc(FILE *f)
4072 nkf_char (*g)(FILE *f) = i_nfc_getc;
4073 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4074 int i=0, j, k=1, lower, upper;
4076 const unsigned char *array;
4079 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4080 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4081 while (upper >= lower) {
4082 j = (lower+upper) / 2;
4083 array = normalization_table[j].nfd;
4084 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4085 if (array[k] != buf[k]){
4086 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4093 array = normalization_table[j].nfc;
4094 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4095 buf[i] = (nkf_char)(array[i]);
/* ungetc counterpart of nfc_getc: forwards to the saved i_nfc_ungetc. */
4106 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4108 return (*i_nfc_ungetc)(c, f);
4110 #endif /* UNICODE_NORMALIZATION */
/*
 * base64decode -- map one Base64 character to its 6-bit value.
 * Besides the standard alphabet, '-' is accepted as 62 and '_' as 63.
 * The values are written as character constants that happen to have the
 * wanted code: '>' is 62, '?' is 63, '4' is 52, and subtracting 'G' (71)
 * equals subtracting 'a' and adding 26.
 * The range tests for the letter branches and the return statement are on
 * lines missing from this extract.
 */
4113 static nkf_char base64decode(nkf_char c)
4118 i = c - 'A'; /* A..Z 0-25 */
4119 } else if (c == '_') {
4120 i = '?' /* 63 */ ; /* _ 63 */
4122 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4124 } else if (c > '/') {
4125 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4126 } else if (c == '+' || c == '-') {
4127 i = '>' /* 62 */ ; /* + and - 62 */
4129 i = '?' /* 63 */ ; /* / 63 */
4137 nkf_char c1, c2, c3, c4, cc;
4138 nkf_char t1, t2, t3, t4, mode, exit_mode;
4139 nkf_char lwsp_count;
4142 nkf_char lwsp_size = 128;
4144 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4145 return mime_input_buf(mime_input_state.top++);
4147 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4148 mime_decode_mode=FALSE;
4149 unswitch_mime_getc();
4150 return (*i_getc)(f);
4153 if (mimebuf_f == FIXED_MIME)
4154 exit_mode = mime_decode_mode;
4157 if (mime_decode_mode == 'Q') {
4158 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4160 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4161 if (c1<=SP || DEL<=c1) {
4162 mime_decode_mode = exit_mode; /* prepare for quit */
4165 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4169 mime_decode_mode = exit_mode; /* prepare for quit */
4170 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4171 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4172 /* end Q encoding */
4173 input_mode = exit_mode;
4175 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4176 if (lwsp_buf==NULL) {
4177 perror("can't malloc");
4180 while ((c1=(*i_getc)(f))!=EOF) {
4185 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4193 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4194 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4209 lwsp_buf[lwsp_count] = (unsigned char)c1;
4210 if (lwsp_count++>lwsp_size){
4212 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4213 if (lwsp_buf_new==NULL) {
4215 perror("can't realloc");
4218 lwsp_buf = lwsp_buf_new;
4224 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4226 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4227 i_ungetc(lwsp_buf[lwsp_count],f);
4233 if (c1=='='&&c2<SP) { /* this is soft wrap */
4234 while((c1 = (*i_mgetc)(f)) <=SP) {
4235 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4237 mime_decode_mode = 'Q'; /* still in MIME */
4238 goto restart_mime_q;
4241 mime_decode_mode = 'Q'; /* still in MIME */
4245 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4246 if (c2<=SP) return c2;
4247 mime_decode_mode = 'Q'; /* still in MIME */
4248 return ((hex2bin(c2)<<4) + hex2bin(c3));
4251 if (mime_decode_mode != 'B') {
4252 mime_decode_mode = FALSE;
4253 return (*i_mgetc)(f);
4257 /* Base64 encoding */
4259 MIME allows line break in the middle of
4260 Base64, but we are very pessimistic in decoding
4261 in unbuf mode because MIME encoded code may broken by
4262 less or editor's control sequence (such as ESC-[-K in unbuffered
4263 mode. ignore incomplete MIME.
4265 mode = mime_decode_mode;
4266 mime_decode_mode = exit_mode; /* prepare for quit */
4268 while ((c1 = (*i_mgetc)(f))<=SP) {
4273 if ((c2 = (*i_mgetc)(f))<=SP) {
4276 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4277 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4280 if ((c1 == '?') && (c2 == '=')) {
4283 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4284 if (lwsp_buf==NULL) {
4285 perror("can't malloc");
4288 while ((c1=(*i_getc)(f))!=EOF) {
4293 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4301 if ((c1=(*i_getc)(f))!=EOF) {
4305 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4320 lwsp_buf[lwsp_count] = (unsigned char)c1;
4321 if (lwsp_count++>lwsp_size){
4323 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4324 if (lwsp_buf_new==NULL) {
4326 perror("can't realloc");
4329 lwsp_buf = lwsp_buf_new;
4335 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4337 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4338 i_ungetc(lwsp_buf[lwsp_count],f);
4345 if ((c3 = (*i_mgetc)(f))<=SP) {
4348 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4349 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4353 if ((c4 = (*i_mgetc)(f))<=SP) {
4356 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4357 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4361 mime_decode_mode = mode; /* still in MIME sigh... */
4363 /* BASE 64 decoding */
4365 t1 = 0x3f & base64decode(c1);
4366 t2 = 0x3f & base64decode(c2);
4367 t3 = 0x3f & base64decode(c3);
4368 t4 = 0x3f & base64decode(c4);
4369 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4371 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4372 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4374 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4375 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4377 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4382 return mime_input_buf(mime_input_state.top++);
/* Base64 encoding alphabet, indexed by 6-bit value (used by mimeout_addchar
   and eof_mime). */
4385 static const char basis_64[] =
4386 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Capacity of the MIME output look-ahead buffer (mimeout_state.buf); the
   array has one extra element. */
4388 #define MIMEOUT_BUF_LENGTH (60)
4390 char buf[MIMEOUT_BUF_LENGTH+1];
4395 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * open_mime -- start an encoded-word on output.
 *   mode: output mode; it is matched against mime_encode[] to choose the
 *         prefix from mime_pattern[] (entry 0 is the fallback), and
 *         mimeout_mode is set from mime_encode_method[] at the same index.
 * Before the prefix is written, pending white space held in
 * mimeout_state.buf is passed through o_mputc, and a line break is emitted
 * when base64_count exceeds 45.  Afterwards the remaining buffered
 * characters are fed back through mime_putc with the buffer count reset.
 * Writing of the prefix itself and the exact index bookkeeping are on lines
 * missing from this extract.
 */
4397 static void open_mime(nkf_char mode)
4399 const unsigned char *p;
4402 p = mime_pattern[0];
4403 for(i=0;mime_pattern[i];i++) {
4404 if (mode == mime_encode[i]) {
4405 p = mime_pattern[i];
4409 mimeout_mode = mime_encode_method[i];
4411 if (base64_count>45) {
4412 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4413 (*o_mputc)(mimeout_state.buf[i]);
4416 PUT_NEWLINE((*o_mputc));
4419 if (mimeout_state.count>0
4420 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4421 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4425 for (;i<mimeout_state.count;i++) {
4426 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4427 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4428 (*o_mputc)(mimeout_state.buf[i]);
4438 j = mimeout_state.count;
4439 mimeout_state.count = 0;
4441 mime_putc(mimeout_state.buf[i]);
/*
 * mime_prechar -- line-length check performed before a character is handed
 * to the MIME encoder (called from base64_conv).
 * The projected column is base64_count plus the encoded size of the pending
 * buffer (mimeout_state.count/3*4).  When it exceeds the limit of the
 * current case (73, 66 or 60 in the visible branches) the encoded-word is
 * closed by sending (EOF, 0), a newline is written and a single space starts
 * the continuation line.  In the last visible branch (c2 != EOF, limit 60)
 * mimeout_mode is first chosen -- 'Q' for ASCII / ISO_8859_1 output,
 * otherwise 'B' -- and open_mime is called.
 * The conditions that select between the branches are on lines missing from
 * this extract.
 */
4445 static void mime_prechar(nkf_char c2, nkf_char c1)
4447 if (mimeout_mode > 0){
4449 if (base64_count + mimeout_state.count/3*4> 73){
4450 (*o_base64conv)(EOF,0);
4451 OCONV_NEWLINE((*o_base64conv));
4452 (*o_base64conv)(0,SP);
4456 if (base64_count + mimeout_state.count/3*4> 66) {
4457 (*o_base64conv)(EOF,0);
4458 OCONV_NEWLINE((*o_base64conv));
4459 (*o_base64conv)(0,SP);
4465 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4466 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4467 open_mime(output_mode);
4468 (*o_base64conv)(EOF,0);
4469 OCONV_NEWLINE((*o_base64conv));
4470 (*o_base64conv)(0,SP);
4477 static void close_mime(void)
/*
 * eof_mime -- flush the Base64 encoder at the end of an encoded-word.
 * Depending on mimeout_mode, the bits still held in mimeout_state.state are
 * written as a final Base64 character: the low 2 bits shifted left by 4, or
 * the low 4 bits shifted left by 2.  The case labels and any padding output
 * are on lines missing from this extract.  The trailing block runs when an
 * encoded-word is open (mimeout_mode > 0) and distinguishes FIXED_MIME
 * output from the other modes; its actions are likewise not visible.
 */
4485 static void eof_mime(void)
4487 switch(mimeout_mode) {
4492 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4498 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4503 if (mimeout_mode > 0) {
4504 if (mimeout_f!=FIXED_MIME) {
4506 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar -- encode one byte inside an open encoded-word and write
 * the result through o_mputc.
 * Q encoding: a non-alphanumeric byte is written as two hex digits from
 * bin2hex (the escape character itself is on a line not visible here).
 * Base64: the byte is combined with the leftover bits in
 * mimeout_state.state; the three visible steps emit the top 6 bits of the
 * first byte, then 2 + 4 bits, then 4 + 2 bits followed by the low 6 bits.
 * The case labels and mode transitions are on lines missing from this
 * extract.
 */
4511 static void mimeout_addchar(nkf_char c)
4513 switch(mimeout_mode) {
4518 } else if(!nkf_isalnum(c)) {
4520 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4521 (*o_mputc)(bin2hex((c&0xf)));
4529 mimeout_state.state=c;
4530 (*o_mputc)(basis_64[c>>2]);
4535 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4536 mimeout_state.state=c;
4541 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4542 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc -- entry point of the MIME output encoder; receives one output
 * byte, or EOF to flush.
 *
 * Visible structure:
 *  - mimeout_f == FIXED_MIME: lines are wrapped with PUT_NEWLINE once
 *    base64_count exceeds 71, separately for 'Q' and for the other mode.
 *  - otherwise, on EOF: an encoded-word is opened if text is pending
 *    (mimeout_mode == -1 with more than one buffered character) and the
 *    buffer is drained through mimeout_addchar.
 *  - mimeout_mode == 'Q': ASCII / ISO_8859_1 characters up to DEL are
 *    handled with line wrapping at base64_count > 70.
 *  - mimeout_mode <= 0 (no encoded-word open): characters are collected in
 *    mimeout_state.buf and written raw through o_mputc; open_mime is called
 *    when the buffer exceeds MIMEOUT_BUF_LENGTH or non-ASCII output starts.
 *  - remaining modes ('B', 1, 2): buffered characters are either encoded
 *    with mimeout_addchar or written raw, depending on the last character
 *    and on whether the buffer contains printable ASCII.
 * Many statements between the visible lines are missing from this extract;
 * the summary above names only what can be seen.
 */
4553 static void mime_putc(nkf_char c)
4558 if (mimeout_f == FIXED_MIME){
4559 if (mimeout_mode == 'Q'){
4560 if (base64_count > 71){
4561 if (c!=CR && c!=LF) {
4563 PUT_NEWLINE((*o_mputc));
4568 if (base64_count > 71){
4570 PUT_NEWLINE((*o_mputc));
4573 if (c == EOF) { /* c==EOF */
4577 if (c != EOF) { /* c!=EOF */
4583 /* mimeout_f != FIXED_MIME */
4585 if (c == EOF) { /* c==EOF */
4586 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4587 j = mimeout_state.count;
4588 mimeout_state.count = 0;
4590 if (mimeout_mode > 0) {
4591 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4593 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4596 mimeout_addchar(mimeout_state.buf[i]);
4600 mimeout_addchar(mimeout_state.buf[i]);
4604 mimeout_addchar(mimeout_state.buf[i]);
4610 mimeout_addchar(mimeout_state.buf[i]);
4616 if (mimeout_state.count > 0){
4617 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4622 if (mimeout_mode=='Q') {
4623 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4624 if (c == CR || c == LF) {
4629 } else if (c <= SP) {
4631 if (base64_count > 70) {
4632 PUT_NEWLINE((*o_mputc));
4635 if (!nkf_isblank(c)) {
4640 if (base64_count > 70) {
4642 PUT_NEWLINE((*o_mputc));
4645 open_mime(output_mode);
4647 if (!nkf_noescape_mime(c)) {
4658 if (mimeout_mode <= 0) {
4659 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4660 if (nkf_isspace(c)) {
4662 if (mimeout_mode == -1) {
4665 if (c==CR || c==LF) {
4667 open_mime(output_mode);
4673 for (i=0;i<mimeout_state.count;i++) {
4674 (*o_mputc)(mimeout_state.buf[i]);
4675 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4686 mimeout_state.buf[0] = (char)c;
4687 mimeout_state.count = 1;
4689 if (base64_count > 1
4690 && base64_count + mimeout_state.count > 76
4691 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4692 PUT_NEWLINE((*o_mputc));
4694 if (!nkf_isspace(mimeout_state.buf[0])){
4699 mimeout_state.buf[mimeout_state.count++] = (char)c;
4700 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4701 open_mime(output_mode);
4706 if (lastchar==CR || lastchar == LF){
4707 for (i=0;i<mimeout_state.count;i++) {
4708 (*o_mputc)(mimeout_state.buf[i]);
4711 mimeout_state.count = 0;
4714 for (i=0;i<mimeout_state.count-1;i++) {
4715 (*o_mputc)(mimeout_state.buf[i]);
4718 mimeout_state.buf[0] = SP;
4719 mimeout_state.count = 1;
4721 open_mime(output_mode);
4724 /* mimeout_mode == 'B', 1, 2 */
4725 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4726 if (lastchar == CR || lastchar == LF){
4727 if (nkf_isblank(c)) {
4728 for (i=0;i<mimeout_state.count;i++) {
4729 mimeout_addchar(mimeout_state.buf[i]);
4731 mimeout_state.count = 0;
4732 } else if (SP<c && c<DEL) {
4734 for (i=0;i<mimeout_state.count;i++) {
4735 (*o_mputc)(mimeout_state.buf[i]);
4738 mimeout_state.count = 0;
4740 mimeout_state.buf[mimeout_state.count++] = (char)c;
4743 if (c==SP || c==TAB || c==CR || c==LF) {
4744 for (i=0;i<mimeout_state.count;i++) {
4745 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4747 for (i=0;i<mimeout_state.count;i++) {
4748 (*o_mputc)(mimeout_state.buf[i]);
4751 mimeout_state.count = 0;
4754 mimeout_state.buf[mimeout_state.count++] = (char)c;
4755 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4757 for (i=0;i<mimeout_state.count;i++) {
4758 (*o_mputc)(mimeout_state.buf[i]);
4761 mimeout_state.count = 0;
4765 if (mimeout_state.count>0 && SP<c && c!='=') {
4766 mimeout_state.buf[mimeout_state.count++] = (char)c;
4767 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4768 j = mimeout_state.count;
4769 mimeout_state.count = 0;
4771 mimeout_addchar(mimeout_state.buf[i]);
4778 if (mimeout_state.count>0) {
4779 j = mimeout_state.count;
4780 mimeout_state.count = 0;
4782 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4784 mimeout_addchar(mimeout_state.buf[i]);
4790 (*o_mputc)(mimeout_state.buf[i]);
4792 open_mime(output_mode);
/* Output-chain stage installed when MIME encoding of the output is enabled:
   runs the line-length check (mime_prechar) and then forwards the character
   pair unchanged to o_base64conv. */
4798 void base64_conv(nkf_char c2, nkf_char c1)
4800 mime_prechar(c2, c1);
4801 (*o_base64conv)(c2,c1);
/* State of one iconv(3)-based converter: an input and an output staging
   buffer with their sizes.  The functions below also use a cd member (iconv
   descriptor) and an input_buffer member; their declarations are on lines
   missing from this extract. */
4805 typedef struct nkf_iconv_t {
4808 size_t input_buffer_size;
4809 char *output_buffer;
4810 size_t output_buffer_size;
/*
 * nkf_iconv_new -- allocate the staging buffers (IOBUF_SIZE for input, twice
 * that for output) and open an iconv descriptor converting fromcode to
 * tocode.
 *
 * NOTE(review): as visible here this function does not look finished:
 *  - converter is declared as an nkf_iconv_t object but is accessed with
 *    "->" throughout;
 *  - perror() is given the result of fprintf(), and that fprintf() call has
 *    no stream argument;
 *  - after a failed malloc only perror() is visible, with no early return.
 * Presumably this code is compiled out in normal builds -- confirm before
 * relying on it.
 */
4813 nkf_iconv_t nkf_iconv_new(char *tocode, char *fromcode)
4815 nkf_iconv_t converter;
4817 converter->input_buffer_size = IOBUF_SIZE;
4818 converter->input_buffer = malloc(converter->input_buffer_size);
4819 if (converter->input_buffer == NULL)
4820 perror("can't malloc");
4822 converter->output_buffer_size = IOBUF_SIZE * 2;
4823 converter->output_buffer = malloc(converter->output_buffer_size);
4824 if (converter->output_buffer == NULL)
4825 perror("can't malloc");
4827 converter->cd = iconv_open(tocode, fromcode);
4828 if (converter->cd == (iconv_t)-1)
4832 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
4835 perror("can't iconv_open");
/*
 * nkf_iconv_convert -- read bytes from the input, run them through iconv()
 * and write the converted bytes with o_putc.  On an iconv() error the
 * visible recovery paths move unconsumed input to the front of the input
 * buffer, or double the output buffer with realloc.
 *
 * NOTE(review): several things in the visible lines look wrong and should be
 * checked against the full file:
 *  - the read loop uses f, but the parameter is named input;
 *  - "if (input_length < converter->input_buffer_size) break;" leaves the
 *    loop while there is still room, i.e. after the first byte; presumably
 *    ">=" was intended;
 *  - the flush loop counts down output_length, which iconv() leaves as the
 *    remaining free space, and indexes through output_buffer, which iconv()
 *    has already advanced;
 *  - realloc is applied to converter->outbuf, while the struct member
 *    visible above is output_buffer;
 *  - elsewhere in this file iconv also names a function-pointer variable
 *    (it is assigned to mime_iconv_back), which would clash with the
 *    library call here.
 * Presumably this code is compiled out in normal builds -- TODO confirm.
 */
4840 size_t nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4842 size_t invalid = (size_t)0;
4843 char *input_buffer = converter->input_buffer;
4844 size_t input_length = (size_t)0;
4845 char *output_buffer = converter->output_buffer;
4846 size_t output_length = converter->output_buffer_size;
4851 while ((c = (*i_getc)(f)) != EOF) {
4852 input_buffer[input_length++] = c;
4853 if (input_length < converter->input_buffer_size) break;
4857 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
4858 while (output_length-- > 0) {
4859 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
4861 if (ret == (size_t) - 1) {
4864 if (input_buffer != converter->input_buffer)
4865 memmove(converter->input_buffer, input_buffer, input_length);
4868 converter->output_buffer_size *= 2;
4869 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
4870 if (output_buffer == NULL) {
4871 perror("can't realloc");
4874 converter->output_buffer = output_buffer;
4877 perror("can't iconv");
/*
 * nkf_iconv_close -- release a converter: free both buffers and close the
 * iconv descriptor.
 * NOTE(review): the parameter is named convert but the body uses converter,
 * and the members freed are inbuf / outbuf whereas the other functions use
 * input_buffer / output_buffer.  As written this cannot refer to the
 * parameter; presumably the code is compiled out -- TODO confirm.
 */
4889 void nkf_iconv_close(nkf_iconv_t *convert)
4891 free(converter->inbuf);
4892 free(converter->outbuf);
4893 iconv_close(converter->cd);
4901 struct input_code *p = input_code_list;
4913 mime_f = MIME_DECODE_DEFAULT;
4914 mime_decode_f = FALSE;
4919 x0201_f = X0201_DEFAULT;
4920 iso2022jp_f = FALSE;
4921 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
4922 ms_ucs_map_f = UCS_MAP_ASCII;
4924 #ifdef UTF8_INPUT_ENABLE
4925 no_cp932ext_f = FALSE;
4926 no_best_fit_chars_f = FALSE;
4927 encode_fallback = NULL;
4928 unicode_subchar = '?';
4929 input_endian = ENDIAN_BIG;
4931 #ifdef UTF8_OUTPUT_ENABLE
4932 output_bom_f = FALSE;
4933 output_endian = ENDIAN_BIG;
4935 #ifdef UNICODE_NORMALIZATION
4951 #ifdef SHIFTJIS_CP932
4961 for (i = 0; i < 256; i++){
4962 prefix_table[i] = 0;
4966 mimeout_state.count = 0;
4971 fold_preserve_f = FALSE;
4974 kanji_intro = DEFAULT_J;
4975 ascii_intro = DEFAULT_R;
4976 fold_margin = FOLD_MARGIN;
4977 o_zconv = no_connection;
4978 o_fconv = no_connection;
4979 o_eol_conv = no_connection;
4980 o_rot_conv = no_connection;
4981 o_hira_conv = no_connection;
4982 o_base64conv = no_connection;
4983 o_iso2022jp_check_conv = no_connection;
4986 i_ungetc = std_ungetc;
4988 i_bungetc = std_ungetc;
4991 i_mungetc = std_ungetc;
4992 i_mgetc_buf = std_getc;
4993 i_mungetc_buf = std_ungetc;
4994 output_mode = ASCII;
4996 mime_decode_mode = FALSE;
5002 init_broken_state();
5003 z_prev2=0,z_prev1=0;
5005 iconv_for_check = 0;
5007 input_codename = NULL;
5008 input_encoding = NULL;
5009 output_encoding = NULL;
/*
 * module_connection -- wire up the conversion pipeline from the current
 * option flags.
 *
 * Output side: the output encoding is resolved (explicit setting, then
 * nkf_default_encoding(), then ISO_2022_JP when only guessing or producing
 * no output) and oconv is set to that encoding's converter.  Each enabled
 * filter then saves the current oconv in its own o_* pointer and installs
 * itself as oconv, so the filter installed last (z_conv) is the first to
 * see a character.
 * Input side: the same chaining is applied to i_getc / i_ungetc for the
 * hex-escape, numeric-reference, normalization, MIME and broken-JIS readers.
 * Finally the input converter is set: the one for input_encoding if given,
 * otherwise e_iconv with automatic detection left enabled.
 *
 * The caller in kanji_convert treats a negative return value as "no output
 * encoding given"; the return statements and most of the flag tests guarding
 * the individual filters are on lines missing from this extract.
 */
5015 int module_connection(void)
5017 if (input_encoding) set_input_encoding(input_encoding);
5018 if (!output_encoding) {
5019 output_encoding = nkf_default_encoding();
5021 if (!output_encoding) {
5022 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5025 set_output_encoding(output_encoding);
5026 oconv = nkf_enc_to_oconv(output_encoding);
5029 /* replace continuation module, from output side */
5031 /* output redirection */
5033 if (noout_f || guess_f){
5040 if (mimeout_f == TRUE) {
5041 o_base64conv = oconv; oconv = base64_conv;
5043 /* base64_count = 0; */
5046 if (eolmode_f || guess_f) {
5047 o_eol_conv = oconv; oconv = eol_conv;
5050 o_rot_conv = oconv; oconv = rot_conv;
5053 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5056 o_hira_conv = oconv; oconv = hira_conv;
5059 o_fconv = oconv; oconv = fold_conv;
5062 if (alpha_f || x0201_f) {
5063 o_zconv = oconv; oconv = z_conv;
5067 i_ungetc = std_ungetc;
5068 /* input redirection */
5071 i_cgetc = i_getc; i_getc = cap_getc;
5072 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5075 i_ugetc = i_getc; i_getc = url_getc;
5076 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5079 #ifdef NUMCHAR_OPTION
5081 i_ngetc = i_getc; i_getc = numchar_getc;
5082 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5085 #ifdef UNICODE_NORMALIZATION
5087 i_nfc_getc = i_getc; i_getc = nfc_getc;
5088 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5091 if (mime_f && mimebuf_f==FIXED_MIME) {
5092 i_mgetc = i_getc; i_getc = mime_getc;
5093 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5096 i_bgetc = i_getc; i_getc = broken_getc;
5097 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5099 if (input_encoding) {
5100 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5102 set_iconv(FALSE, e_iconv);
5106 struct input_code *p = input_code_list;
5115 Conversion main loop. Code detection only.
/*
 * noconvert -- compiled only for the stand-alone program (neither PERL_XS
 * nor WIN32DLL).  It connects the filter modules and then pulls characters
 * from f through the input chain (*i_getc) until EOF.
 * NOTE(review): the loop body and the return value are not visible here.
 */
5118 #if !defined(PERL_XS) && !defined(WIN32DLL)
5119 nkf_char noconvert(FILE *f)
5124 module_connection();
5125 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands for the main conversion loop.  They expand to
 * bare continue/break statements, so they are only meaningful inside a
 * loop that uses c1/c2 as the current and pending bytes.
 */
5132 #define NEXT continue /* no output, get next */
5133 #define SKIP c2=0;continue /* drop the pending byte, no output, get next */
5134 #define MORE c2=c1;continue /* keep c1 as pending byte, need one more byte */
5135 #define SEND ; /* output c1 and c2, get next */
5136 #define LAST break /* end of loop, go closing */
/*
 * set_input_mode(mode): store mode in input_mode and report the input as
 * "ISO-2022-JP" through set_input_codename() and debug().
 * NOTE(review): some continuation lines of this macro are not visible here,
 * so the reporting part may be conditional.
 */
5137 #define set_input_mode(mode) do { \
5138 input_mode = mode; \
5140 set_input_codename("ISO-2022-JP"); \
5141 debug("ISO-2022-JP"); \
/*
 * kanji_convert -- main conversion loop: read stream f through the input
 * chain (*i_getc), work out which character set each byte belongs to, and
 * hand the decoded characters to the input converter (*iconv) or directly
 * to the output chain (*oconv).
 *
 * c1 is the byte just read and c2 the pending first byte of a multi-byte
 * character; c3/c4 hold further bytes where a converter asks for them.
 *
 * NOTE(review): many lines of this function (braces, NEXT/SKIP/MORE/SEND
 * statements, returns) are not visible here; the comments below only
 * describe what the visible lines establish.
 */
5144 int kanji_convert(FILE *f)
5146 nkf_char c1=0, c2=0, c3=0, c4=0;
5147 int shift_mode = 0; /* 0, 1, 2, 3 */
5149 int is_8bit = FALSE;
5151 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5156 output_mode = ASCII;
/* The filter chains must be connected before any byte is read. */
5158 if (module_connection() < 0) {
5159 #if !defined(PERL_XS) && !defined(WIN32DLL)
5160 fprintf(stderr, "no output encoding given\n");
/* UTF-32 input: consume four bytes per step and convert them as one unit. */
5166 #ifdef UTF8_INPUT_ENABLE
5167 if(iconv == w_iconv32){
5168 while ((c1 = (*i_getc)(f)) != EOF &&
5169 (c2 = (*i_getc)(f)) != EOF &&
5170 (c3 = (*i_getc)(f)) != EOF &&
5171 (c4 = (*i_getc)(f)) != EOF) {
5172 nkf_iconv_utf_32(c1, c2, c3, c4);
5174 (*i_ungetc)(EOF, f);
/*
 * UTF-16 input: consume two bytes per step; when the converter answers -2
 * two more bytes are read and all four are converted together
 * (presumably a surrogate pair -- TODO confirm against nkf_iconv_utf_16).
 */
5176 else if (iconv == w_iconv16) {
5177 while ((c1 = (*i_getc)(f)) != EOF &&
5178 (c2 = (*i_getc)(f)) != EOF) {
5179 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5180 (c3 = (*i_getc)(f)) != EOF &&
5181 (c4 = (*i_getc)(f)) != EOF) {
5182 nkf_iconv_utf_16(c1, c2, c3, c4);
5185 (*i_ungetc)(EOF, f);
/* All other encodings: byte-at-a-time state machine. */
5189 while ((c1 = (*i_getc)(f)) != EOF) {
5190 #ifdef INPUT_CODE_FIX
5191 if (!input_encoding)
5197 /* in case of 8th bit is on */
5198 if (!estab_f&&!mime_decode_mode) {
5199 /* in case of not established yet */
5200 /* It is still ambiguous */
5201 if (h_conv(f, c2, c1)==EOF) {
5209 /* in case of already established */
5211 /* ignore bogus code */
5219 /* 2nd byte of 7 bit code or SJIS */
5225 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5228 } else if (c1 > DEL) {
5230 if (!estab_f && !iso8859_f) {
5231 /* not established yet */
5233 } else { /* estab_f==TRUE */
/* Single-byte katakana: Shift_JIS 0xA0-0xDF, or 0xFD/0xFE under the CP10001 map. */
5239 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5240 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5242 c2 = JIS_X_0201_1976_K;
5247 /* already established */
5251 } else if (SP < c1 && c1 < DEL) {
5252 /* in case of Roman characters */
5254 /* output 1 shifted byte */
5258 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5259 /* output 1 shifted byte */
5260 c2 = JIS_X_0201_1976_K;
5263 /* look like bogus code */
5266 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5267 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5268 /* in case of Kanji shifted */
/* "=?" may open a MIME encoded word; only checked while not already decoding. */
5270 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5271 /* Check MIME code */
5272 if ((c1 = (*i_getc)(f)) == EOF) {
5275 } else if (c1 == '?') {
5276 /* =? is mime conversion start sequence */
5277 if(mime_f == STRICT_MIME) {
5278 /* check in real detail */
5279 if (mime_begin_strict(f) == EOF)
5282 } else if (mime_begin(f) == EOF)
5291 /* normal ASCII code */
/* SI, SO and ESC are only honoured for 7-bit input or inside MIME decoding. */
5294 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5297 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5300 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5301 if ((c1 = (*i_getc)(f)) == EOF) {
5302 /* (*oconv)(0, ESC); don't send bogus code */
5305 else if (c1 == '&') {
5307 if ((c1 = (*i_getc)(f)) == EOF) {
5313 else if (c1 == '$') {
5315 if ((c1 = (*i_getc)(f)) == EOF) {
5316 /* don't send bogus code
5318 (*oconv)(0, '$'); */
5320 } else if (c1 == '@' || c1 == 'B') {
5322 set_input_mode(JIS_X_0208);
5324 } else if (c1 == '(') {
5326 if ((c1 = (*i_getc)(f)) == EOF) {
5327 /* don't send bogus code
5333 } else if (c1 == '@'|| c1 == 'B') {
5335 set_input_mode(JIS_X_0208);
5338 } else if (c1 == 'D'){
5339 set_input_mode(JIS_X_0212);
5341 #endif /* X0212_ENABLE */
5342 } else if (c1 == 'O' || c1 == 'Q'){
5343 set_input_mode(JIS_X_0213_1);
5345 } else if (c1 == 'P'){
5346 set_input_mode(JIS_X_0213_2);
5349 /* could be some special code */
5356 } else if (broken_f&0x2) {
5357 /* accept any ESC-(-x as broken code ... */
5358 input_mode = JIS_X_0208;
5367 } else if (c1 == '(') {
5369 if ((c1 = (*i_getc)(f)) == EOF) {
5370 /* don't send bogus code
5372 (*oconv)(0, '('); */
5375 else if (c1 == 'I') {
5376 /* JIS X 0201 Katakana */
5377 set_input_mode(JIS_X_0201_1976_K);
5380 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5381 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5382 set_input_mode(ASCII);
5385 else if (broken_f&0x2) {
5386 set_input_mode(ASCII);
5395 else if (c1 == '.') {
5397 if ((c1 = (*i_getc)(f)) == EOF) {
5400 else if (c1 == 'A') {
5411 else if (c1 == 'N') {
5414 if (g2 == ISO_8859_1) {
5429 } else if (c1 == ESC && iconv == s_iconv) {
5430 /* ESC in Shift_JIS */
5431 if ((c1 = (*i_getc)(f)) == EOF) {
5432 /* (*oconv)(0, ESC); don't send bogus code */
5434 } else if (c1 == '$') {
5436 if ((c1 = (*i_getc)(f)) == EOF) {
5438 } else if (('E' <= c1 && c1 <= 'G') ||
5439 ('O' <= c1 && c1 <= 'Q')) {
/*
 * ESC $ followed by E-G or O-Q: the letter (mod 7) selects a base value
 * from the table; each following byte in SP..'z' is emitted as base + byte.
 */
5447 static const int jphone_emoji_first_table[7] =
5448 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5449 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5450 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5451 while (SP <= c1 && c1 <= 'z') {
5452 (*oconv)(0, c1 + c3);
5453 if ((c1 = (*i_getc)(f)) == EOF) LAST;
/* Line ends: the visible reset to ASCII also clears the established decoder. */
5468 } else if (c1 == LF || c1 == CR) {
5470 input_mode = ASCII; set_iconv(FALSE, 0);
5472 } else if (mime_decode_f && !mime_decode_mode){
5474 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5482 } else { /* if (c1 == CR)*/
5483 if ((c1=(*i_getc)(f))!=EOF) {
5487 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
/*
 * Hand the collected bytes to the input converter; the visible case bodies
 * read a third (and fourth) byte and call the converter again.
 */
5507 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5510 if ((c3 = (*i_getc)(f)) != EOF) {
5513 if ((c4 = (*i_getc)(f)) != EOF) {
5515 (*iconv)(c2, c1, c3|c4);
5520 /* 3 bytes EUC or UTF-8 */
5521 if ((c3 = (*i_getc)(f)) != EOF) {
5523 (*iconv)(c2, c1, c3);
/* Rows 0x7F..0x92 with a cell in 0x21..0x7E are mapped to code points from U+E000 upward. */
5531 0x7F <= c2 && c2 <= 0x92 &&
5532 0x21 <= c1 && c1 <= 0x7E) {
5534 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5537 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5541 (*oconv)(PREFIX_EUCG3 | c2, c1);
5543 #endif /* X0212_ENABLE */
5545 (*oconv)(PREFIX_EUCG3 | c2, c1);
5548 (*oconv)(input_mode, c1); /* other special case */
5554 /* goto next_word */
/* End of input: signal EOF to the input converter. */
5558 (*iconv)(EOF, 0, 0);
/* No code name was established: report the candidate with the lowest score. */
5559 if (!input_codename)
5562 struct input_code *p = input_code_list;
5563 struct input_code *result = p;
5565 if (p->score < result->score) result = p;
5568 set_input_codename(result->name);
5570 debug(result->name);
5578 * int options(unsigned char *cp)
/*
 * options -- parse one option string and update the global conversion
 * settings accordingly.
 *
 * cp points at a NUL-terminated option string.  Everything up to and
 * including the first '-' is skipped; after that each character selects a
 * short option, and "--name" / "--name=value" selects a long option that is
 * looked up in long_option[].  A long option with a non-empty alias is
 * replaced by its alias string (cp_back is used together with cp in the
 * loop condition; presumably it remembers where to resume -- the lines
 * that set it are not visible here).
 *
 * Fix in this revision: -W16[LB] / -W32[LB] now derive the input encoding
 * from input_endian.  The previous code consulted output_endian, so the
 * byte order requested for the input was ignored.
 *
 * NOTE(review): the return statements of this function are not visible
 * here.  The malloc() results used for backup_suffix are not checked.
 */
5584 int options(unsigned char *cp)
5588 unsigned char *cp_back = NULL;
/* Skip up to and including the first '-'. */
5594 while(*cp && *cp++!='-');
5595 while (*cp || cp_back) {
5603 case '-': /* literal options */
5604 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/*
 * Look the name up in long_option[]: compare up to '=' or the end of the
 * table entry; a match also requires the argument to end (or continue with
 * a space) at that point.  p is left pointing at the value for "name="
 * entries (the adjustment happens on lines not visible here).
 */
5608 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5609 p = (unsigned char *)long_option[i].name;
5610 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5611 if (*p == cp[j] || cp[j] == SP){
5618 #if !defined(PERL_XS) && !defined(WIN32DLL)
5619 fprintf(stderr, "unknown long option: --%s\n", cp);
/* Advance cp past this long option. */
5623 while(*cp && *cp != SP && cp++);
/* Aliased long options are re-parsed as the alias string. */
5624 if (long_option[i].alias[0]){
5626 cp = (unsigned char *)long_option[i].alias;
/* --ic= / --oc=: encoding names are upper-cased before the lookup. */
5628 if (strcmp(long_option[i].name, "ic=") == 0){
5629 nkf_str_upcase((char *)p, codeset, 32);
5630 enc = nkf_enc_find(codeset);
5632 input_encoding = enc;
5635 if (strcmp(long_option[i].name, "oc=") == 0){
5636 nkf_str_upcase((char *)p, codeset, 32);
5637 enc = nkf_enc_find(codeset);
5638 if (enc <= 0) continue;
5639 output_encoding = enc;
5642 if (strcmp(long_option[i].name, "guess=") == 0){
5643 if (p[0] == '0' || p[0] == '1') {
/* --overwrite keeps the original time stamp, --in-place does not. */
5651 if (strcmp(long_option[i].name, "overwrite") == 0){
5654 preserve_time_f = TRUE;
5657 if (strcmp(long_option[i].name, "overwrite=") == 0){
5660 preserve_time_f = TRUE;
5662 backup_suffix = malloc(strlen((char *) p) + 1);
5663 strcpy(backup_suffix, (char *) p);
5666 if (strcmp(long_option[i].name, "in-place") == 0){
5669 preserve_time_f = FALSE;
5672 if (strcmp(long_option[i].name, "in-place=") == 0){
5675 preserve_time_f = FALSE;
5677 backup_suffix = malloc(strlen((char *) p) + 1);
5678 strcpy(backup_suffix, (char *) p);
5683 if (strcmp(long_option[i].name, "cap-input") == 0){
5687 if (strcmp(long_option[i].name, "url-input") == 0){
5692 #ifdef NUMCHAR_OPTION
5693 if (strcmp(long_option[i].name, "numchar-input") == 0){
5699 if (strcmp(long_option[i].name, "no-output") == 0){
5703 if (strcmp(long_option[i].name, "debug") == 0){
5708 if (strcmp(long_option[i].name, "cp932") == 0){
5709 #ifdef SHIFTJIS_CP932
5713 #ifdef UTF8_OUTPUT_ENABLE
5714 ms_ucs_map_f = UCS_MAP_CP932;
5718 if (strcmp(long_option[i].name, "no-cp932") == 0){
5719 #ifdef SHIFTJIS_CP932
5723 #ifdef UTF8_OUTPUT_ENABLE
5724 ms_ucs_map_f = UCS_MAP_ASCII;
5728 #ifdef SHIFTJIS_CP932
5729 if (strcmp(long_option[i].name, "cp932inv") == 0){
5736 if (strcmp(long_option[i].name, "x0212") == 0){
5743 if (strcmp(long_option[i].name, "exec-in") == 0){
5747 if (strcmp(long_option[i].name, "exec-out") == 0){
5752 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5753 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5754 no_cp932ext_f = TRUE;
5757 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5758 no_best_fit_chars_f = TRUE;
/* --fb-*: choose how characters that cannot be encoded are written. */
5761 if (strcmp(long_option[i].name, "fb-skip") == 0){
5762 encode_fallback = NULL;
5765 if (strcmp(long_option[i].name, "fb-html") == 0){
5766 encode_fallback = encode_fallback_html;
5769 if (strcmp(long_option[i].name, "fb-xml") == 0){
5770 encode_fallback = encode_fallback_xml;
5773 if (strcmp(long_option[i].name, "fb-java") == 0){
5774 encode_fallback = encode_fallback_java;
5777 if (strcmp(long_option[i].name, "fb-perl") == 0){
5778 encode_fallback = encode_fallback_perl;
5781 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5782 encode_fallback = encode_fallback_subchar;
/*
 * --fb-subchar=N: N is read as decimal, hexadecimal (second character
 * 'x' or 'X') or octal.  The loop index i is reused as the digit cursor
 * here.
 */
5785 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5786 encode_fallback = encode_fallback_subchar;
5787 unicode_subchar = 0;
5789 /* decimal number */
5790 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5791 unicode_subchar *= 10;
5792 unicode_subchar += hex2bin(p[i]);
5794 }else if(p[1] == 'x' || p[1] == 'X'){
5795 /* hexadecimal number */
5796 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5797 unicode_subchar <<= 4;
5798 unicode_subchar |= hex2bin(p[i]);
5802 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5803 unicode_subchar *= 8;
5804 unicode_subchar += hex2bin(p[i]);
5807 w16e_conv(unicode_subchar, &i, &j);
5808 unicode_subchar = i<<8 | j;
5812 #ifdef UTF8_OUTPUT_ENABLE
5813 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5814 ms_ucs_map_f = UCS_MAP_MS;
5818 #ifdef UNICODE_NORMALIZATION
5819 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=XYZ...: X is stored in prefix_table[] for each following graphic character. */
5824 if (strcmp(long_option[i].name, "prefix=") == 0){
5825 if (nkf_isgraph(p[0])){
5826 for (i = 1; nkf_isgraph(p[i]); i++){
5827 prefix_table[p[i]] = p[0];
5832 #if !defined(PERL_XS) && !defined(WIN32DLL)
5833 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5838 case 'b': /* buffered mode */
5841 case 'u': /* non buffered mode */
5844 case 't': /* transparent mode */
5849 } else if (*cp=='2') {
5853 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
5861 case 'j': /* JIS output */
5863 output_encoding = nkf_enc_from_index(ISO_2022_JP);
5865 case 'e': /* AT&T EUC output */
5866 output_encoding = nkf_enc_from_index(EUCJP_NKF);
5868 case 's': /* SJIS output */
5869 output_encoding = nkf_enc_from_index(WINDOWS_31J);
5871 case 'l': /* ISO8859 Latin-1 support, no conversion */
5872 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
5873 input_encoding = nkf_enc_from_index(ISO_8859_1);
5875 case 'i': /* Kanji IN ESC-$-@/B */
5876 if (*cp=='@'||*cp=='B')
5877 kanji_intro = *cp++;
5879 case 'o': /* ASCII IN ESC-(-J/B */
5880 if (*cp=='J'||*cp=='B'||*cp=='H')
5881 ascii_intro = *cp++;
5885 bit:1 katakana->hiragana
5886 bit:2 hiragana->katakana
5888 if ('9'>= *cp && *cp>='0')
5889 hira_f |= (*cp++ -'0');
5896 #if defined(MSDOS) || defined(__OS2__)
5903 show_configuration();
5911 #ifdef UTF8_OUTPUT_ENABLE
5912 case 'w': /* UTF-8 output */
5917 output_encoding = nkf_enc_from_index(UTF_8N);
5919 output_bom_f = TRUE;
5920 output_encoding = nkf_enc_from_index(UTF_8_BOM);
5924 if ('1'== cp[0] && '6'==cp[1]) {
5927 } else if ('3'== cp[0] && '2'==cp[1]) {
5931 output_encoding = nkf_enc_from_index(UTF_8);
5936 output_endian = ENDIAN_LITTLE;
5937 } else if (cp[0] == 'B') {
5940 output_encoding = nkf_enc_from_index(enc_idx);
/* Output side: the concrete UTF-16/32 variant follows output_endian. */
5945 enc_idx = enc_idx == UTF_16
5946 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5947 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5949 output_bom_f = TRUE;
5950 enc_idx = enc_idx == UTF_16
5951 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
5952 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
5954 output_encoding = nkf_enc_from_index(enc_idx);
5958 #ifdef UTF8_INPUT_ENABLE
5959 case 'W': /* UTF input */
5962 input_encoding = nkf_enc_from_index(UTF_8);
/* -W16 / -W32 default to big endian; a following L or B overrides it. */
5965 if ('1'== cp[0] && '6'==cp[1]) {
5967 input_endian = ENDIAN_BIG;
5969 } else if ('3'== cp[0] && '2'==cp[1]) {
5971 input_endian = ENDIAN_BIG;
5974 input_encoding = nkf_enc_from_index(UTF_8);
5979 input_endian = ENDIAN_LITTLE;
5980 } else if (cp[0] == 'B') {
5982 input_endian = ENDIAN_BIG;
/*
 * Input side: pick the variant from input_endian, the value set just above.
 * output_endian belongs to the -w options and must not influence how the
 * input is decoded.
 */
5984 enc_idx = enc_idx == UTF_16
5985 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5986 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5987 input_encoding = nkf_enc_from_index(enc_idx);
5991 /* Input code assumption */
5992 case 'J': /* ISO-2022-JP input */
5993 input_encoding = nkf_enc_from_index(ISO_2022_JP);
5995 case 'E': /* EUC-JP input */
5996 input_encoding = nkf_enc_from_index(EUCJP_NKF);
5998 case 'S': /* Windows-31J input */
5999 input_encoding = nkf_enc_from_index(WINDOWS_31J);
6001 case 'Z': /* Convert X0208 alphabet to ASCII */
6003 bit:0 Convert JIS X 0208 Alphabet to ASCII
6004 bit:1 Convert Kankaku to one space
6005 bit:2 Convert Kankaku to two spaces
6006 bit:3 Convert HTML Entity
6007 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
/* Each digit after -Z sets the bit of that number; plain -Z means bit 0. */
6009 while ('0'<= *cp && *cp <='9') {
6010 alpha_f |= 1 << (*cp++ - '0');
6012 if (!alpha_f) alpha_f = 1;
6014 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6015 x0201_f = FALSE; /* No X0201->X0208 conversion */
6017 ESC-(-I in JIS, EUC, MS Kanji
6018 SI/SO in JIS, EUC, MS Kanji
6019 SS2 in EUC, JIS, not in MS Kanji
6020 MS Kanji (0xa0-0xdf)
6022 ESC-(-I in JIS (0x20-0x5f)
6023 SS2 in EUC (0xa0-0xdf)
6024 0xa0-0xd in MS Kanji (0xa0-0xdf)
6027 case 'X': /* Convert X0201 kana to X0208 */
/* -F sets fold_preserve_f and is followed directly by the -f case label. */
6030 case 'F': /* preserve new lines */
6031 fold_preserve_f = TRUE;
6032 case 'f': /* folding -f60 or -f */
6035 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6037 fold_len += *cp++ - '0';
/* Out-of-range fold lengths fall back to the default. */
6039 if (!(0<fold_len && fold_len<BUFSIZ))
6040 fold_len = DEFAULT_FOLD;
6044 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6046 fold_margin += *cp++ - '0';
6050 case 'm': /* MIME support */
6051 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6052 if (*cp=='B'||*cp=='Q') {
6053 mime_decode_mode = *cp++;
6054 mimebuf_f = FIXED_MIME;
6055 } else if (*cp=='N') {
6056 mime_f = TRUE; cp++;
6057 } else if (*cp=='S') {
6058 mime_f = STRICT_MIME; cp++;
6059 } else if (*cp=='0') {
6060 mime_decode_f = FALSE;
6061 mime_f = FALSE; cp++;
6063 mime_f = STRICT_MIME;
6066 case 'M': /* MIME output */
6069 mimeout_f = FIXED_MIME; cp++;
6070 } else if (*cp=='Q') {
6072 mimeout_f = FIXED_MIME; cp++;
6077 case 'B': /* Broken JIS support */
6079 bit:1 allow any x on ESC-(-x or ESC-$-x
6080 bit:2 reset to ascii on NL
6082 if ('9'>= *cp && *cp>='0')
6083 broken_f |= 1<<(*cp++ -'0');
6088 case 'O':/* for Output file */
6092 case 'c':/* add cr code */
6095 case 'd':/* delete cr code */
6098 case 'I': /* ISO-2022-JP output */
6101 case 'L': /* line mode */
6102 if (*cp=='u') { /* unix */
6103 eolmode_f = LF; cp++;
6104 } else if (*cp=='m') { /* mac */
6105 eolmode_f = CR; cp++;
6106 } else if (*cp=='w') { /* windows */
6107 eolmode_f = CRLF; cp++;
6108 } else if (*cp=='0') { /* no conversion */
6109 eolmode_f = 0; cp++;
6114 if ('2' <= *cp && *cp <= '9') {
6117 } else if (*cp == '0' || *cp == '1') {
6126 /* multiple options in one string are allowed for the Perl module */
6127 while(*cp && *cp++!='-');
6130 #if !defined(PERL_XS) && !defined(WIN32DLL)
6131 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6133 /* bogus option but ignored */
6141 #include "nkf32dll.c"
6142 #elif defined(PERL_XS)
6143 #else /* WIN32DLL */
/*
 * main -- command-line entry point (built only when neither WIN32DLL nor
 * PERL_XS is defined).
 *
 * Visible flow: consume the leading arguments that start with '-' as
 * options, optionally set up a pipe/fork/execvp child, then either convert
 * stdin to stdout (no file arguments) or convert each named file in turn.
 * When output goes to a file, stdout is redirected; the overwrite path
 * writes to a temporary file next to the original, copies permissions and
 * (optionally) time stamps onto it, and renames it over the original.
 *
 * NOTE(review): many lines (braces, conditions, returns, the call that
 * parses each option) are not visible here.
 */
6144 int main(int argc, char **argv)
6149 char *outfname = NULL;
6152 #ifdef EASYWIN /*Easy Win */
6153 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6155 #ifdef DEFAULT_CODE_LOCALE
6156 setlocale(LC_CTYPE, "");
/* Leading arguments beginning with '-' are option strings. */
6158 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6159 cp = (unsigned char *)*argv;
/* The remaining arguments are run as a command connected through a pipe. */
6164 if (pipe(fds) < 0 || (pid = fork()) < 0){
6175 execvp(argv[1], &argv[1]);
/*
 * A few flags are copied aside and written back afterwards
 * (presumably around a state reset on a line not visible here).
 */
6192 int debug_f_back = debug_f;
6195 int exec_f_back = exec_f;
6198 int x0212_f_back = x0212_f;
6200 int x0213_f_back = x0213_f;
6201 int guess_f_back = guess_f;
6203 guess_f = guess_f_back;
6206 debug_f = debug_f_back;
6209 exec_f = exec_f_back;
6211 x0212_f = x0212_f_back;
6212 x0213_f = x0213_f_back;
/* Binary mode and buffering for stdout, then for stdin. */
6215 if (binmode_f == TRUE)
6216 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6217 if (freopen("","wb",stdout) == NULL)
6224 setbuf(stdout, (char *) NULL);
6226 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6229 if (binmode_f == TRUE)
6230 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6231 if (freopen("","rb",stdin) == NULL) return (-1);
6235 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* Filter mode: convert stdin; with guess_f the detected code is printed. */
6239 kanji_convert(stdin);
6240 if (guess_f) print_guessed_code(NULL);
/* File mode: a failed open is remembered and processing continues. */
6244 int is_argument_error = FALSE;
6246 input_codename = NULL;
6249 iconv_for_check = 0;
6251 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6253 is_argument_error = TRUE;
6261 /* reopen file for stdout */
6262 if (file_out_f == TRUE) {
/* Build the temporary output name from the original file name. */
6265 outfname = malloc(strlen(origfname)
6266 + strlen(".nkftmpXXXXXX")
6272 strcpy(outfname, origfname);
/* Scan backwards for the last path separator ('/' or '\\'). */
6276 for (i = strlen(outfname); i; --i){
6277 if (outfname[i - 1] == '/'
6278 || outfname[i - 1] == '\\'){
6284 strcat(outfname, "ntXXXXXX");
6286 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6287 S_IREAD | S_IWRITE);
6289 strcat(outfname, ".nkftmpXXXXXX");
6290 fd = mkstemp(outfname);
/* Keep a duplicate of the real stdout and point stdout at the temporary file. */
6293 || (fd_backup = dup(fileno(stdout))) < 0
6294 || dup2(fd, fileno(stdout)) < 0
6305 outfname = "nkf.out";
6308 if(freopen(outfname, "w", stdout) == NULL) {
6312 if (binmode_f == TRUE) {
6313 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6314 if (freopen("","wb",stdout) == NULL)
6321 if (binmode_f == TRUE)
6322 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6323 if (freopen("","rb",fin) == NULL)
6328 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
/* The file name is only passed to the guess report when several files are given. */
6332 char *filename = NULL;
6334 if (nfiles > 1) filename = origfname;
6335 if (guess_f) print_guessed_code(filename);
6341 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Put the saved descriptor back on stdout. */
6349 if (dup2(fd_backup, fileno(stdout)) < 0){
6352 if (stat(origfname, &sb)) {
6353 fprintf(stderr, "Can't stat %s\n", origfname);
6355 /* Restore the permissions: copy the original file's mode onto the output file. */
6356 if (chmod(outfname, sb.st_mode)) {
6357 fprintf(stderr, "Can't set permission %s\n", outfname);
6360 /* Restore the time stamp: copy the original file's times onto the output file. */
6361 if(preserve_time_f){
6362 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6363 tb[0] = tb[1] = sb.st_mtime;
6364 if (utime(outfname, tb)) {
6365 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6368 tb.actime = sb.st_atime;
6369 tb.modtime = sb.st_mtime;
6370 if (utime(outfname, &tb)) {
6371 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* With a backup suffix the original is renamed to the backup name. */
6376 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6378 unlink(backup_filename);
6380 if (rename(origfname, backup_filename)) {
6381 perror(backup_filename);
6382 fprintf(stderr, "Can't rename %s to %s\n",
6383 origfname, backup_filename);
6387 if (unlink(origfname)){
/* Finally the converted file takes the original's name. */
6392 if (rename(outfname, origfname)) {
6394 fprintf(stderr, "Can't rename %s to %s\n",
6395 outfname, origfname);
6402 if (is_argument_error)
6405 #ifdef EASYWIN /*Easy Win */
6406 if (file_out_f == FALSE)
6407 scanf("%d",&end_check);
6410 #else /* for Other OS */
6411 if (file_out_f == TRUE)
6413 #endif /*Easy Win */
6416 #endif /* WIN32DLL */