1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.174 2008/02/07 19:25:29 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-02-07"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
45 /* state of output_mode and input_mode
124 NKF_ENCODING_TABLE_SIZE,
125 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
126 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
127 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
128 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
129 JIS_X_0208 = 0x1168, /* @B */
130 JIS_X_0212 = 0x1159, /* D */
131 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
132 JIS_X_0213_2 = 0x1229, /* P */
133 JIS_X_0213_1 = 0x1233, /* Q */
136 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
139 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
140 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
141 void j_oconv(nkf_char c2, nkf_char c1);
142 void s_oconv(nkf_char c2, nkf_char c1);
143 void e_oconv(nkf_char c2, nkf_char c1);
144 void w_oconv(nkf_char c2, nkf_char c1);
145 void w_oconv16(nkf_char c2, nkf_char c1);
146 void w_oconv32(nkf_char c2, nkf_char c1);
150 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
151 void (*oconv)(nkf_char c2, nkf_char c1);
152 } nkf_native_encoding;
154 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
155 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
156 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
157 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
158 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
159 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
160 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
165 const nkf_native_encoding *base_encoding;
168 nkf_encoding nkf_encoding_table[] = {
169 {ASCII, "US-ASCII", &NkfEncodingASCII},
170 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
171 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
172 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
173 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
174 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
177 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
178 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
179 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
180 {CP10001, "CP10001", &NkfEncodingShift_JIS},
181 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
182 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
183 {CP51932, "CP51932", &NkfEncodingEUC_JP},
184 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
185 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
186 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
187 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
188 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
189 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
190 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
191 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
192 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
193 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
194 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
195 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
196 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
197 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
198 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
199 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
200 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
201 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
202 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
203 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
204 {BINARY, "BINARY", &NkfEncodingASCII},
211 } encoding_name_to_id_table[] = {
214 {"ISO-2022-JP", ISO_2022_JP},
215 {"ISO2022JP-CP932", CP50220},
216 {"CP50220", CP50220},
217 {"CP50221", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_EUC)
266 #define DEFAULT_ENCIDX EUC_JP
267 #elif defined(DEFAULT_CODE_UTF8)
268 #define DEFAULT_ENCIDX UTF_8
/*
 * Locale-independent ASCII character classification and conversion helpers.
 *
 * Every macro argument is parenthesized so that compound expressions
 * (e.g. bin2hex(a | b)) expand with the intended precedence.
 * Most macros still evaluate their argument more than once, so do not pass
 * expressions with side effects (e.g. *p++).
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* value of one hexadecimal digit; yields 0 for a non-hex character */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* upper-case hexadecimal digit for the low 4 bits of c */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* true when the second-lowest byte of c2 (taken as unsigned short) equals SS3 */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* CR, LF, or a character strictly between SP and DEL other than ? = _ ( ) . " */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* true when c2 lies within [CP932_TABLE_BEGIN, CP932_TABLE_END] */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
297 #define HOLD_SIZE 1024
298 #if defined(INT_IS_SHORT)
299 #define IOBUF_SIZE 2048
301 #define IOBUF_SIZE 16384
304 #define DEFAULT_J 'B'
305 #define DEFAULT_R 'B'
312 /* MIME preprocessor */
314 #ifdef EASYWIN /*Easy Win */
315 extern POINT _BufferSize;
324 void (*status_func)(struct input_code *, nkf_char);
325 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
329 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
330 static nkf_encoding *input_encoding = NULL;
331 static nkf_encoding *output_encoding = NULL;
333 static int kanji_convert(FILE *f);
334 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
336 * 0: Shift_JIS, eucJP-ascii
341 #define UCS_MAP_ASCII 0
343 #define UCS_MAP_CP932 2
344 #define UCS_MAP_CP10001 3
345 static int ms_ucs_map_f = UCS_MAP_ASCII;
347 #ifdef UTF8_INPUT_ENABLE
348 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
349 static int no_cp932ext_f = FALSE;
350 /* ignore ZERO WIDTH NO-BREAK SPACE */
351 static int no_best_fit_chars_f = FALSE;
352 static int input_endian = ENDIAN_BIG;
353 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
354 static void (*encode_fallback)(nkf_char c) = NULL;
355 static void w_status(struct input_code *, nkf_char);
357 #ifdef UTF8_OUTPUT_ENABLE
358 static int output_bom_f = FALSE;
359 static int output_endian = ENDIAN_BIG;
362 static void std_putc(nkf_char c);
363 static nkf_char std_getc(FILE *f);
364 static nkf_char std_ungetc(nkf_char c,FILE *f);
366 static nkf_char broken_getc(FILE *f);
367 static nkf_char broken_ungetc(nkf_char c,FILE *f);
369 static nkf_char mime_getc(FILE *f);
371 static void mime_putc(nkf_char c);
375 #if !defined(PERL_XS) && !defined(WIN32DLL)
376 static unsigned char stdibuf[IOBUF_SIZE];
377 static unsigned char stdobuf[IOBUF_SIZE];
381 static int unbuf_f = FALSE;
382 static int estab_f = FALSE;
383 static int nop_f = FALSE;
384 static int binmode_f = TRUE; /* binary mode */
385 static int rot_f = FALSE; /* rot14/43 mode */
386 static int hira_f = FALSE; /* hira/kata henkan */
387 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
388 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
389 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
390 static int mimebuf_f = FALSE; /* MIME buffered input */
391 static int broken_f = FALSE; /* convert ESC-less broken JIS */
392 static int iso8859_f = FALSE; /* ISO8859 through */
393 static int mimeout_f = FALSE; /* base64 mode */
394 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
395 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
397 #ifdef UNICODE_NORMALIZATION
398 static int nfc_f = FALSE;
399 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
400 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
404 static int cap_f = FALSE;
405 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
406 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
408 static int url_f = FALSE;
409 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
410 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Tagging scheme for nkf_char values.
 *
 * CLASS_MASK selects the top 8 bits of a 32-bit value and VALUE_MASK the
 * low 24 bits; a value whose class bits equal CLASS_UNICODE carries a
 * Unicode code point in its value bits.  PREFIX_EUCG3 is OR-ed into a
 * value by nkf_char_euc3_new().
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/*
 * The limits are already wrapped in NKF_INT32_C() where they are defined;
 * wrapping them again would break if NKF_INT32_C() pastes a suffix onto
 * its argument, so they are used directly here.
 */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
425 #ifdef NUMCHAR_OPTION
426 static int numchar_f = FALSE;
427 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
428 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
432 static int noout_f = FALSE;
433 static void no_putc(nkf_char c);
434 static int debug_f = FALSE;
435 static void debug(const char *str);
436 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
439 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
440 static void set_input_codename(char *codename);
443 static int exec_f = 0;
446 #ifdef SHIFTJIS_CP932
447 /* invert IBM extended characters to others */
448 static int cp51932_f = FALSE;
450 /* invert NEC-selected IBM extended characters to IBM extended characters */
451 static int cp932inv_f = TRUE;
453 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
454 #endif /* SHIFTJIS_CP932 */
456 static int x0212_f = FALSE;
457 static int x0213_f = FALSE;
459 static unsigned char prefix_table[256];
461 static void e_status(struct input_code *, nkf_char);
462 static void s_status(struct input_code *, nkf_char);
464 struct input_code input_code_list[] = {
465 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
466 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
467 #ifdef UTF8_INPUT_ENABLE
468 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
473 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
474 static int base64_count = 0;
476 /* X0208 -> ASCII converter */
479 static int f_line = 0; /* chars in line */
480 static int f_prev = 0;
481 static int fold_preserve_f = FALSE; /* preserve new lines */
482 static int fold_f = FALSE;
483 static int fold_len = 0;
486 static unsigned char kanji_intro = DEFAULT_J;
487 static unsigned char ascii_intro = DEFAULT_R;
491 #define FOLD_MARGIN 10
492 #define DEFAULT_FOLD 60
494 static int fold_margin = FOLD_MARGIN;
496 /* process default */
498 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
500 fprintf(stderr,"nkf internal module connection failure.\n");
505 void no_connection(nkf_char c2, nkf_char c1)
507 no_connection2(c2,c1,0);
510 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
511 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
513 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
514 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
515 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
516 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
517 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
518 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
521 /* static redirections */
523 static void (*o_putc)(nkf_char c) = std_putc;
525 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
526 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
528 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
529 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
531 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
533 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
534 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
536 /* for strict mime */
537 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
538 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
541 static int output_mode = ASCII; /* output kanji mode */
542 static int input_mode = ASCII; /* input kanji mode */
543 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
545 /* X0201 / X0208 conversion tables */
547 /* X0201 kana conversion table */
549 static const unsigned char cv[]= {
550 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
551 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
552 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
553 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
554 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
555 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
556 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
557 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
558 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
559 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
560 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
561 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
562 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
563 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
564 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
565 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
569 /* X0201 kana conversion table for dakuten (voiced sound mark) */
571 static const unsigned char dv[]= {
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
576 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
577 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
578 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
579 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
580 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
581 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
583 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 /* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
592 static const unsigned char ev[]= {
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
604 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 /* X0208 kigou conversion table */
613 /* 0x8140 - 0x819e */
614 static const unsigned char fv[] = {
616 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
617 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
618 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
619 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
620 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
621 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
622 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
624 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
632 static int option_mode = 0;
633 static int file_out_f = FALSE;
635 static int overwrite_f = FALSE;
636 static int preserve_time_f = FALSE;
637 static int backup_f = FALSE;
638 static char *backup_suffix = "";
641 static int eolmode_f = 0; /* CR, LF, CRLF */
642 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
643 static nkf_char prev_cr = 0; /* CR or 0 */
644 #ifdef EASYWIN /*Easy Win */
645 static int end_check;
648 #define STD_GC_BUFSIZE (256)
649 nkf_char std_gc_buf[STD_GC_BUFSIZE];
652 char* nkf_strcpy(const char *str)
654 char* result = malloc(strlen(str) + 1);
663 static void nkf_str_upcase(const char *src, char *dest, size_t length)
666 for (; i < length && src[i]; i++) {
667 dest[i] = nkf_toupper(src[i]);
672 static nkf_encoding *nkf_enc_from_index(int idx)
674 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
677 return &nkf_encoding_table[idx];
680 static int nkf_enc_find_index(const char *name)
683 if (*name == 'X' && *(name+1) == '-') name += 2;
684 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
685 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
686 return encoding_name_to_id_table[i].id;
692 static nkf_encoding *nkf_enc_find(const char *name)
695 idx = nkf_enc_find_index(name);
696 if (idx < 0) return 0;
697 return nkf_enc_from_index(idx);
/* Field accessors; `e` is a pointer to an nkf_encoding. */
#define nkf_enc_name(e)			((e)->name)
#define nkf_enc_to_index(e)		((e)->id)
#define nkf_enc_to_base_encoding(e)	((e)->base_encoding)

/* Converter functions of the native encoding that `e` is based on. */
#define nkf_enc_to_iconv(e)	(nkf_enc_to_base_encoding(e)->iconv)
#define nkf_enc_to_oconv(e)	(nkf_enc_to_base_encoding(e)->oconv)

/* True when `e` is based on the ASCII or the ISO-2022-JP native encoding. */
#define nkf_enc_asciicompat(e) \
    (nkf_enc_to_base_encoding(e) == &NkfEncodingASCII \
     || nkf_enc_to_base_encoding(e) == &NkfEncodingISO_2022_JP)

/* True when `e` is based on one of the UTF-8/16/32 native encodings. */
#define nkf_enc_unicode_p(e) \
    (nkf_enc_to_base_encoding(e) == &NkfEncodingUTF_8 \
     || nkf_enc_to_base_encoding(e) == &NkfEncodingUTF_16 \
     || nkf_enc_to_base_encoding(e) == &NkfEncodingUTF_32)

/* True when the id of `e` is CP50220, CP50221 or CP50222. */
#define nkf_enc_cp5022x_p(e) \
    (nkf_enc_to_index(e) == CP50220 \
     || nkf_enc_to_index(e) == CP50221 \
     || nkf_enc_to_index(e) == CP50222)
#ifdef DEFAULT_CODE_LOCALE
/*
 * Return the name of the character set of the current locale, or NULL when
 * the platform offers no way to query it.
 *
 * With HAVE_LANGINFO_H the result is whatever nl_langinfo(CODESET) returns.
 * On __WIN32__ the result is "CP<n>" for the code page number reported by
 * GetACP(); it lives in a static buffer that is overwritten by the next call.
 */
static char* nkf_locale_charmap()
{
#ifdef HAVE_LANGINFO_H
    return nl_langinfo(CODESET);
#elif defined(__WIN32__)
    /* "CP" + at most 10 decimal digits + NUL fits in 16 bytes */
    static char buf[16];
    sprintf(buf, "CP%u", (unsigned int)GetACP());
    return buf;
#else
    return NULL;
#endif
}

/*
 * Look up the nkf_encoding matching the locale's character set.
 * Returns 0 when the character set is unknown or not found by nkf_enc_find().
 */
static nkf_encoding* nkf_locale_encoding()
{
    nkf_encoding *enc = 0;
    char *encname = nkf_locale_charmap();
    if (encname)
	enc = nkf_enc_find(encname);
    return enc;
}
#endif /* DEFAULT_CODE_LOCALE */
740 static nkf_encoding* nkf_default_encoding()
742 nkf_encoding *enc = 0;
743 #ifdef DEFAULT_CODE_LOCALE
744 enc = nkf_locale_encoding();
746 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
753 #define fprintf dllprintf
758 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
764 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
766 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
767 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
768 #ifdef UTF8_OUTPUT_ENABLE
769 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
771 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
772 #ifdef UTF8_INPUT_ENABLE
773 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
776 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
777 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
778 "r {de/en}crypt ROT13/47\n"
779 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
780 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
781 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
782 "l ISO8859-1 (Latin-1) support\n"
783 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
784 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
785 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
786 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
787 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
788 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
790 "T Text mode output\n"
792 "O Output to File (DEFAULT 'nkf.out')\n"
793 "I Convert non ISO-2022-JP charactor to GETA\n"
794 "d,c Convert line breaks -d: LF -c: CRLF\n"
795 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
796 "v, V Show this usage. V: show configuration\n"
798 "Long name options\n"
799 " --ic=<input codeset> --oc=<output codeset>\n"
800 " Specify the input or output codeset\n"
801 " --fj --unix --mac --windows\n"
802 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
803 " Convert for the system or code\n"
804 " --hiragana --katakana --katakana-hiragana\n"
805 " To Hiragana/Katakana Conversion\n"
806 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
808 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
810 #ifdef NUMCHAR_OPTION
811 " --numchar-input Convert Unicode Character Reference\n"
813 #ifdef UTF8_INPUT_ENABLE
814 " --fb-{skip, html, xml, perl, java, subchar}\n"
815 " Specify how nkf handles unassigned characters\n"
818 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
819 " Overwrite original listed files by filtered result\n"
820 " --overwrite preserves timestamp of original files\n"
822 " -g --guess Guess the input code\n"
823 " --help --version Show this help/the version\n"
824 " For more information, see also man nkf\n"
829 void show_configuration(void)
832 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
835 " Compile-time options:\n"
836 " Compiled at: " __DATE__ " " __TIME__ "\n"
839 " Default output encoding: "
840 #ifdef DEFAULT_CODE_LOCALE
841 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
843 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
849 " Default output end of line: "
850 #if DEFAULT_NEWLINE == CR
852 #elif DEFAULT_NEWLINE == CRLF
858 " Decode MIME encoded string: "
859 #if MIME_DECODE_DEFAULT
865 " Convert JIS X 0201 Katakana: "
872 " --help, --version output: "
873 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the name of the backup file for `filename` from the pattern `suffix`.
 *
 * When `suffix` contains one or more '*', every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied unchanged.
 * Otherwise `suffix` is simply appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free(),
 * or NULL when the allocation fails (the failure is reported via perror()).
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total_length;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* every '*' disappears and is replaced by one copy of filename */
	total_length = suffix_length - asterisk_count
	    + asterisk_count * filename_length;
    } else {
	total_length = filename_length + suffix_length;
    }

    backup_filename = malloc(total_length + 1);
    if (!backup_filename) {
	perror("Can't malloc backup filename.");
	return NULL;
    }

    if (asterisk_count) {
	for (i = 0, j = 0; i < suffix_length; i++) {
	    if (suffix[i] == '*') {
		memcpy(backup_filename + j, filename, filename_length);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
    } else {
	memcpy(backup_filename, filename, filename_length);
	memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[total_length] = '\0';

    return backup_filename;
}
923 #ifdef UTF8_INPUT_ENABLE
924 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
931 (*f)(0, bin2hex(c>>shift));
941 void encode_fallback_html(nkf_char c)
946 if(c >= NKF_INT32_C(1000000))
947 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
948 if(c >= NKF_INT32_C(100000))
949 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
951 (*oconv)(0, 0x30+(c/10000 )%10);
953 (*oconv)(0, 0x30+(c/1000 )%10);
955 (*oconv)(0, 0x30+(c/100 )%10);
957 (*oconv)(0, 0x30+(c/10 )%10);
959 (*oconv)(0, 0x30+ c %10);
964 void encode_fallback_xml(nkf_char c)
969 nkf_each_char_to_hex(oconv, c);
974 void encode_fallback_java(nkf_char c)
978 if(!nkf_char_unicode_bmp_p(c)){
982 (*oconv)(0, bin2hex(c>>20));
983 (*oconv)(0, bin2hex(c>>16));
987 (*oconv)(0, bin2hex(c>>12));
988 (*oconv)(0, bin2hex(c>> 8));
989 (*oconv)(0, bin2hex(c>> 4));
990 (*oconv)(0, bin2hex(c ));
994 void encode_fallback_perl(nkf_char c)
999 nkf_each_char_to_hex(oconv, c);
1004 void encode_fallback_subchar(nkf_char c)
1006 c = unicode_subchar;
1007 (*oconv)((c>>8)&0xFF, c&0xFF);
1012 static const struct {
1036 {"katakana-hiragana","h3"},
1044 #ifdef UTF8_OUTPUT_ENABLE
1054 {"fb-subchar=", ""},
1056 #ifdef UTF8_INPUT_ENABLE
1057 {"utf8-input", "W"},
1058 {"utf16-input", "W16"},
1059 {"no-cp932ext", ""},
1060 {"no-best-fit-chars",""},
1062 #ifdef UNICODE_NORMALIZATION
1063 {"utf8mac-input", ""},
1075 #ifdef NUMCHAR_OPTION
1076 {"numchar-input", ""},
1082 #ifdef SHIFTJIS_CP932
1092 static void set_input_encoding(nkf_encoding *enc)
1094 switch (nkf_enc_to_index(enc)) {
1101 #ifdef SHIFTJIS_CP932
1104 #ifdef UTF8_OUTPUT_ENABLE
1105 ms_ucs_map_f = UCS_MAP_CP932;
1115 case ISO_2022_JP_2004:
1122 #ifdef SHIFTJIS_CP932
1125 #ifdef UTF8_OUTPUT_ENABLE
1126 ms_ucs_map_f = UCS_MAP_CP932;
1131 #ifdef SHIFTJIS_CP932
1134 #ifdef UTF8_OUTPUT_ENABLE
1135 ms_ucs_map_f = UCS_MAP_CP10001;
1143 #ifdef SHIFTJIS_CP932
1146 #ifdef UTF8_OUTPUT_ENABLE
1147 ms_ucs_map_f = UCS_MAP_CP932;
1151 #ifdef SHIFTJIS_CP932
1154 #ifdef UTF8_OUTPUT_ENABLE
1155 ms_ucs_map_f = UCS_MAP_MS;
1159 #ifdef SHIFTJIS_CP932
1162 #ifdef UTF8_OUTPUT_ENABLE
1163 ms_ucs_map_f = UCS_MAP_ASCII;
1166 case SHIFT_JISX0213:
1167 case SHIFT_JIS_2004:
1169 #ifdef SHIFTJIS_CP932
1176 #ifdef SHIFTJIS_CP932
1180 #ifdef UTF8_INPUT_ENABLE
1181 #ifdef UNICODE_NORMALIZATION
1189 input_endian = ENDIAN_BIG;
1193 input_endian = ENDIAN_LITTLE;
1198 input_endian = ENDIAN_BIG;
1202 input_endian = ENDIAN_LITTLE;
1208 static void set_output_encoding(nkf_encoding *enc)
1210 switch (nkf_enc_to_index(enc)) {
1213 #ifdef SHIFTJIS_CP932
1214 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1216 #ifdef UTF8_OUTPUT_ENABLE
1217 ms_ucs_map_f = UCS_MAP_CP932;
1221 #ifdef SHIFTJIS_CP932
1222 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1224 #ifdef UTF8_OUTPUT_ENABLE
1225 ms_ucs_map_f = UCS_MAP_CP932;
1230 #ifdef SHIFTJIS_CP932
1231 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1237 #ifdef SHIFTJIS_CP932
1238 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1244 #ifdef UTF8_OUTPUT_ENABLE
1245 ms_ucs_map_f = UCS_MAP_CP932;
1249 #ifdef UTF8_OUTPUT_ENABLE
1250 ms_ucs_map_f = UCS_MAP_CP10001;
1255 #ifdef SHIFTJIS_CP932
1256 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1258 #ifdef UTF8_OUTPUT_ENABLE
1259 ms_ucs_map_f = UCS_MAP_ASCII;
1264 #ifdef SHIFTJIS_CP932
1265 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1267 #ifdef UTF8_OUTPUT_ENABLE
1268 ms_ucs_map_f = UCS_MAP_ASCII;
1272 #ifdef SHIFTJIS_CP932
1273 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1275 #ifdef UTF8_OUTPUT_ENABLE
1276 ms_ucs_map_f = UCS_MAP_CP932;
1281 #ifdef UTF8_OUTPUT_ENABLE
1282 ms_ucs_map_f = UCS_MAP_MS;
1287 #ifdef UTF8_OUTPUT_ENABLE
1288 ms_ucs_map_f = UCS_MAP_ASCII;
1291 case SHIFT_JISX0213:
1292 case SHIFT_JIS_2004:
1294 #ifdef SHIFTJIS_CP932
1295 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1302 #ifdef SHIFTJIS_CP932
1303 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1306 #ifdef UTF8_OUTPUT_ENABLE
1308 output_bom_f = TRUE;
1312 output_bom_f = TRUE;
1315 output_endian = ENDIAN_LITTLE;
1316 output_bom_f = FALSE;
1319 output_endian = ENDIAN_LITTLE;
1320 output_bom_f = TRUE;
1323 output_bom_f = TRUE;
1326 output_endian = ENDIAN_LITTLE;
1327 output_bom_f = FALSE;
1330 output_endian = ENDIAN_LITTLE;
1331 output_bom_f = TRUE;
1337 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1340 struct input_code *p = input_code_list;
1342 if (iconv_func == p->iconv_func){
1351 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1353 #ifdef INPUT_CODE_FIX
1354 if (f || !input_encoding)
1361 #ifdef INPUT_CODE_FIX
1362 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1368 if (estab_f && iconv_for_check != iconv){
1369 struct input_code *p = find_inputcode_byfunc(iconv);
1371 set_input_codename(p->name);
1374 iconv_for_check = iconv;
1380 nkf_char x0212_shift(nkf_char c)
1385 if (0x75 <= c && c <= 0x7f){
1386 ret = c + (0x109 - 0x75);
1389 if (0x75 <= c && c <= 0x7f){
1390 ret = c + (0x113 - 0x75);
1397 nkf_char x0212_unshift(nkf_char c)
1400 if (0x7f <= c && c <= 0x88){
1401 ret = c + (0x75 - 0x7f);
1402 }else if (0x89 <= c && c <= 0x92){
1403 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1407 #endif /* X0212_ENABLE */
1409 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1415 if((0x21 <= ndx && ndx <= 0x2F)){
1416 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1417 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1419 }else if(0x6E <= ndx && ndx <= 0x7E){
1420 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1421 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1427 else if(nkf_isgraph(ndx)){
1429 const unsigned short *ptr;
1430 ptr = x0212_shiftjis[ndx - 0x21];
1432 val = ptr[(c1 & 0x7f) - 0x21];
1441 c2 = x0212_shift(c2);
1443 #endif /* X0212_ENABLE */
1445 if(0x7F < c2) return 1;
1446 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1447 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1451 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1453 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1456 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1457 #ifdef SHIFTJIS_CP932
1458 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1459 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1466 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1467 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1473 #endif /* SHIFTJIS_CP932 */
1475 if (!x0213_f && is_ibmext_in_sjis(c2)){
1476 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1479 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1492 if(x0213_f && c2 >= 0xF0){
1493 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1494 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1495 }else{ /* 78<=k<=94 */
1496 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1497 if (0x9E < c1) c2++;
1500 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1501 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1502 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1503 if (0x9E < c1) c2++;
1506 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1513 c2 = x0212_unshift(c2);
1520 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
1521 void nkf_unicode_to_utf8(nkf_char val, int *p1, int *p2, int *p3, int *p4)
1529 }else if (val < 0x800){
1530 *p1 = 0xc0 | (val >> 6);
1531 *p2 = 0x80 | (val & 0x3f);
1534 } else if (nkf_char_unicode_bmp_p(val)) {
1535 *p1 = 0xe0 | (val >> 12);
1536 *p2 = 0x80 | ((val >> 6) & 0x3f);
1537 *p3 = 0x80 | ( val & 0x3f);
1539 } else if (nkf_char_unicode_value_p(val)) {
1540 *p1 = 0xe0 | (val >> 16);
1541 *p2 = 0x80 | ((val >> 12) & 0x3f);
1542 *p3 = 0x80 | ((val >> 6) & 0x3f);
1543 *p4 = 0x80 | ( val & 0x3f);
1552 nkf_char nkf_utf8_to_unicode(int c1, int c2, int c3, int c4)
1559 else if (c1 <= 0xC3) {
1560 /* trail byte or invalid */
1563 else if (c1 <= 0xDF) {
1565 wc = (c1 & 0x1F) << 6;
1568 else if (c1 <= 0xEF) {
1570 wc = (c1 & 0x0F) << 12;
1571 wc |= (c2 & 0x3F) << 6;
1574 else if (c2 <= 0xF4) {
1576 wc = (c1 & 0x0F) << 18;
1577 wc |= (c2 & 0x3F) << 12;
1578 wc |= (c3 & 0x3F) << 6;
1588 #ifdef UTF8_INPUT_ENABLE
1589 static int unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1590 const unsigned short *const *pp, nkf_char psize,
1591 nkf_char *p2, nkf_char *p1)
1594 const unsigned short *p;
1597 if (pp == 0) return 1;
1600 if (c1 < 0 || psize <= c1) return 1;
1602 if (p == 0) return 1;
1605 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1607 if (val == 0) return 1;
1608 if (no_cp932ext_f && (
1609 (val>>8) == 0x2D || /* NEC special characters */
1610 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1618 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1625 static nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1627 const unsigned short *const *pp;
1628 const unsigned short *const *const *ppp;
1629 static const char no_best_fit_chars_table_C2[] =
1630 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1631 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1632 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1633 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1634 static const char no_best_fit_chars_table_C2_ms[] =
1635 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1636 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1637 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1638 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1639 static const char no_best_fit_chars_table_932_C2[] =
1640 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1641 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1642 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1643 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1644 static const char no_best_fit_chars_table_932_C3[] =
1645 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1646 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1647 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1648 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1654 }else if(c2 < 0xe0){
1655 if(no_best_fit_chars_f){
1656 if(ms_ucs_map_f == UCS_MAP_CP932){
1659 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1662 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1665 }else if(!cp932inv_f){
1668 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1671 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1674 }else if(ms_ucs_map_f == UCS_MAP_MS){
1675 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1676 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1694 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1695 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1696 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1698 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1699 }else if(c0 < 0xF0){
1700 if(no_best_fit_chars_f){
1701 if(ms_ucs_map_f == UCS_MAP_CP932){
1702 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1703 }else if(ms_ucs_map_f == UCS_MAP_MS){
1708 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1711 if(c0 == 0x92) return 1;
1716 if(c1 == 0x80 || c0 == 0x9C) return 1;
1719 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1724 if(c0 == 0x94) return 1;
1727 if(c0 == 0xBB) return 1;
1737 if(c0 == 0x95) return 1;
1740 if(c0 == 0xA5) return 1;
1747 if(c0 == 0x8D) return 1;
1750 if(c0 == 0x9E && !cp932inv_f) return 1;
1753 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1761 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1762 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1763 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1765 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1767 #ifdef SHIFTJIS_CP932
1768 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1770 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1771 s2e_conv(s2, s1, p2, p1);
1780 #ifdef UTF8_OUTPUT_ENABLE
1781 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
1783 const unsigned short *p;
1785 if (c2 == JIS_X_0201_1976_K) {
1786 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1794 p = euc_to_utf8_1byte;
1796 } else if (is_eucg3(c2)){
1797 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1800 c2 = (c2&0x7f) - 0x21;
1801 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1802 p = x0212_to_utf8_2bytes[c2];
1808 c2 = (c2&0x7f) - 0x21;
1809 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1811 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1812 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1813 euc_to_utf8_2bytes_ms[c2];
1818 c1 = (c1 & 0x7f) - 0x21;
1819 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1825 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1832 }else if (0xc0 <= c2 && c2 <= 0xef) {
1833 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1834 #ifdef NUMCHAR_OPTION
1837 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1845 #ifdef UTF8_INPUT_ENABLE
1846 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1855 else if (nkf_char_unicode_bmp_p(val)){
1856 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1857 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1860 *p1 = nkf_char_unicode_new(val);
1866 *p1 = nkf_char_unicode_new(val);
1872 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1874 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1875 if (iso2022jp_f && !x0201_f) {
1876 c2 = GETA1; c1 = GETA2;
1878 c2 = JIS_X_0201_1976_K;
1882 }else if (c2 == 0x8f){
1886 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1887 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1888 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1891 c2 = (c2 << 8) | (c1 & 0x7f);
1893 #ifdef SHIFTJIS_CP932
1896 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1897 s2e_conv(s2, s1, &c2, &c1);
1904 #endif /* SHIFTJIS_CP932 */
1906 #endif /* X0212_ENABLE */
1907 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
1910 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1911 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1912 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1917 #ifdef SHIFTJIS_CP932
1918 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1920 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1921 s2e_conv(s2, s1, &c2, &c1);
1928 #endif /* SHIFTJIS_CP932 */
1935 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1937 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
1938 if (iso2022jp_f && !x0201_f) {
1939 c2 = GETA1; c1 = GETA2;
1943 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
1945 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
1947 if(c1 == 0x7F) return 0;
1948 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
1951 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
1952 if (ret) return ret;
1958 nkf_char w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
1960 nkf_char ret = 0, c4 = 0;
1961 static const char w_iconv_utf8_1st_byte[] =
1963 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1964 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1965 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
1966 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
1973 if (c1 < 0 || 0xff < c1) {
1974 }else if (c1 == 0) { /* 0 : 1 byte*/
1976 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
1979 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
1981 if (c2 < 0x80 || 0xBF < c2) return 0;
1984 if (c3 == 0) return -1;
1985 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
1990 if (c3 == 0) return -1;
1991 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
1995 if (c3 == 0) return -1;
1996 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2000 if (c3 == 0) return -2;
2001 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2005 if (c3 == 0) return -2;
2006 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2010 if (c3 == 0) return -2;
2011 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2019 if (c1 == 0 || c1 == EOF){
2020 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2021 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2024 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2032 #define NKF_ICONV_INVALID_CODE_RANGE -13
2033 static size_t unicode_iconv(nkf_char wc)
2041 }else if ((wc>>11) == 27) {
2042 /* unpaired surrogate */
2043 return NKF_ICONV_INVALID_CODE_RANGE;
2044 }else if (wc < 0xFFFF) {
2045 ret = w16e_conv(wc, &c2, &c1);
2046 if (ret) return ret;
2047 }else if (wc < 0x10FFFF) {
2049 c1 = nkf_char_unicode_new(wc);
2051 return NKF_ICONV_INVALID_CODE_RANGE;
2057 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2058 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2059 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2060 size_t nkf_iconv_utf_16(int c1, int c2, int c3, int c4)
2069 if (input_endian == ENDIAN_BIG) {
2070 if (0xD8 <= c1 && c1 <= 0xDB) {
2071 if (0xDC <= c3 && c3 <= 0xDF) {
2072 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2073 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2078 if (0xD8 <= c2 && c2 <= 0xDB) {
2079 if (0xDC <= c4 && c4 <= 0xDF) {
2080 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2081 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2087 return (*unicode_iconv)(wc);
2090 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2095 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2100 size_t nkf_iconv_utf_32(int c1, int c2, int c3, int c4)
2109 switch(input_endian){
2111 wc = c2 << 16 | c3 << 8 | c4;
2114 wc = c3 << 16 | c2 << 8 | c1;
2117 wc = c1 << 16 | c4 << 8 | c3;
2120 wc = c4 << 16 | c1 << 8 | c2;
2123 return NKF_ICONV_INVALID_CODE_RANGE;
2126 return (*unicode_iconv)(wc);
2130 #define output_ascii_escape_sequence(mode) do { \
2131 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2134 (*o_putc)(ascii_intro); \
2135 output_mode = mode; \
2139 void output_escape_sequence(int mode)
2141 if (output_mode == mode)
2149 case JIS_X_0201_1976_K:
2157 (*o_putc)(kanji_intro);
2181 void j_oconv(nkf_char c2, nkf_char c1)
2183 #ifdef NUMCHAR_OPTION
2184 if (c2 == 0 && nkf_char_unicode_p(c1)){
2185 w16e_conv(c1, &c2, &c1);
2186 if (c2 == 0 && nkf_char_unicode_p(c1)){
2187 c2 = c1 & VALUE_MASK;
2188 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2191 c2 = 0x7F + c1 / 94;
2192 c1 = 0x21 + c1 % 94;
2194 if (encode_fallback) (*encode_fallback)(c1);
2201 output_ascii_escape_sequence(ASCII);
2204 else if (c2 == EOF) {
2205 output_ascii_escape_sequence(ASCII);
2208 else if (c2 == ISO_8859_1) {
2209 output_ascii_escape_sequence(ISO_8859_1);
2212 else if (c2 == JIS_X_0201_1976_K) {
2213 output_escape_sequence(JIS_X_0201_1976_K);
2216 } else if (is_eucg3(c2)){
2217 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2218 (*o_putc)(c2 & 0x7f);
2223 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2224 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2225 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2231 void e_oconv(nkf_char c2, nkf_char c1)
2233 if (c2 == 0 && nkf_char_unicode_p(c1)){
2234 w16e_conv(c1, &c2, &c1);
2235 if (c2 == 0 && nkf_char_unicode_p(c1)){
2236 c2 = c1 & VALUE_MASK;
2237 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2241 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2242 c1 = 0x21 + c1 % 94;
2245 (*o_putc)((c2 & 0x7f) | 0x080);
2246 (*o_putc)(c1 | 0x080);
2248 (*o_putc)((c2 & 0x7f) | 0x080);
2249 (*o_putc)(c1 | 0x080);
2253 if (encode_fallback) (*encode_fallback)(c1);
2261 } else if (c2 == 0) {
2262 output_mode = ASCII;
2264 } else if (c2 == JIS_X_0201_1976_K) {
2265 output_mode = EUC_JP;
2266 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2267 } else if (c2 == ISO_8859_1) {
2268 output_mode = ISO_8859_1;
2269 (*o_putc)(c1 | 0x080);
2271 } else if (is_eucg3(c2)){
2272 output_mode = EUC_JP;
2273 #ifdef SHIFTJIS_CP932
2276 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2277 s2e_conv(s2, s1, &c2, &c1);
2282 output_mode = ASCII;
2284 }else if (is_eucg3(c2)){
2287 (*o_putc)((c2 & 0x7f) | 0x080);
2288 (*o_putc)(c1 | 0x080);
2291 (*o_putc)((c2 & 0x7f) | 0x080);
2292 (*o_putc)(c1 | 0x080);
2296 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2297 set_iconv(FALSE, 0);
2298 return; /* too late to rescue this char */
2300 output_mode = EUC_JP;
2301 (*o_putc)(c2 | 0x080);
2302 (*o_putc)(c1 | 0x080);
2306 void s_oconv(nkf_char c2, nkf_char c1)
2308 #ifdef NUMCHAR_OPTION
2309 if (c2 == 0 && nkf_char_unicode_p(c1)){
2310 w16e_conv(c1, &c2, &c1);
2311 if (c2 == 0 && nkf_char_unicode_p(c1)){
2312 c2 = c1 & VALUE_MASK;
2313 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2316 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2318 c1 += 0x40 + (c1 > 0x3e);
2323 if(encode_fallback)(*encode_fallback)(c1);
2332 } else if (c2 == 0) {
2333 output_mode = ASCII;
2335 } else if (c2 == JIS_X_0201_1976_K) {
2336 output_mode = SHIFT_JIS;
2338 } else if (c2 == ISO_8859_1) {
2339 output_mode = ISO_8859_1;
2340 (*o_putc)(c1 | 0x080);
2342 } else if (is_eucg3(c2)){
2343 output_mode = SHIFT_JIS;
2344 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2350 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2351 set_iconv(FALSE, 0);
2352 return; /* too late to rescue this char */
2354 output_mode = SHIFT_JIS;
2355 e2s_conv(c2, c1, &c2, &c1);
2357 #ifdef SHIFTJIS_CP932
2359 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2360 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2366 #endif /* SHIFTJIS_CP932 */
2369 if (prefix_table[(unsigned char)c1]){
2370 (*o_putc)(prefix_table[(unsigned char)c1]);
2376 #ifdef UTF8_OUTPUT_ENABLE
2377 void w_oconv(nkf_char c2, nkf_char c1)
2383 output_bom_f = FALSE;
2394 if (c2 == 0 && nkf_char_unicode_p(c1)){
2395 val = c1 & VALUE_MASK;
2396 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2398 if (c2) (*o_putc)(c2);
2399 if (c3) (*o_putc)(c3);
2400 if (c4) (*o_putc)(c4);
2407 val = e2w_conv(c2, c1);
2409 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2411 if (c2) (*o_putc)(c2);
2412 if (c3) (*o_putc)(c3);
2413 if (c4) (*o_putc)(c4);
2418 void w_oconv16(nkf_char c2, nkf_char c1)
2421 output_bom_f = FALSE;
2422 if (output_endian == ENDIAN_LITTLE){
2436 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2437 if (nkf_char_unicode_bmp_p(c1)) {
2438 c2 = (c1 >> 8) & 0xff;
2442 if (c1 <= UNICODE_MAX) {
2443 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2444 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2445 if (output_endian == ENDIAN_LITTLE){
2446 (*o_putc)(c2 & 0xff);
2447 (*o_putc)((c2 >> 8) & 0xff);
2448 (*o_putc)(c1 & 0xff);
2449 (*o_putc)((c1 >> 8) & 0xff);
2451 (*o_putc)((c2 >> 8) & 0xff);
2452 (*o_putc)(c2 & 0xff);
2453 (*o_putc)((c1 >> 8) & 0xff);
2454 (*o_putc)(c1 & 0xff);
2460 nkf_char val = e2w_conv(c2, c1);
2461 c2 = (val >> 8) & 0xff;
2465 if (output_endian == ENDIAN_LITTLE){
2474 void w_oconv32(nkf_char c2, nkf_char c1)
2477 output_bom_f = FALSE;
2478 if (output_endian == ENDIAN_LITTLE){
2496 if (c2 == ISO_8859_1) {
2498 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2501 c1 = e2w_conv(c2, c1);
2504 if (output_endian == ENDIAN_LITTLE){
2505 (*o_putc)( c1 & 0xFF);
2506 (*o_putc)((c1 >> 8) & 0xFF);
2507 (*o_putc)((c1 >> 16) & 0xFF);
2511 (*o_putc)((c1 >> 16) & 0xFF);
2512 (*o_putc)((c1 >> 8) & 0xFF);
2513 (*o_putc)( c1 & 0xFF);
2518 #define SCORE_L2 (1) /* JIS level-2 kanji */
2519 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2520 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2521 #define SCORE_CP932 (SCORE_DEPEND << 1) /* re-mapping via CP932 (IBM extended characters) */
2522 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2523 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent characters */
2524 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2525 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
2527 #define SCORE_INIT (SCORE_iMIME)
2529 static const char score_table_A0[] = {
2532 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2533 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2536 static const char score_table_F0[] = {
2537 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2538 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2539 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2540 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2543 void set_code_score(struct input_code *ptr, nkf_char score)
2546 ptr->score |= score;
2550 void clr_code_score(struct input_code *ptr, nkf_char score)
2553 ptr->score &= ~score;
2557 void code_score(struct input_code *ptr)
2559 nkf_char c2 = ptr->buf[0];
2560 #ifdef UTF8_OUTPUT_ENABLE
2561 nkf_char c1 = ptr->buf[1];
2564 set_code_score(ptr, SCORE_ERROR);
2565 }else if (c2 == SS2){
2566 set_code_score(ptr, SCORE_KANA);
2567 }else if (c2 == 0x8f){
2568 set_code_score(ptr, SCORE_X0212);
2569 #ifdef UTF8_OUTPUT_ENABLE
2570 }else if (!e2w_conv(c2, c1)){
2571 set_code_score(ptr, SCORE_NO_EXIST);
2573 }else if ((c2 & 0x70) == 0x20){
2574 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2575 }else if ((c2 & 0x70) == 0x70){
2576 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2577 }else if ((c2 & 0x70) >= 0x50){
2578 set_code_score(ptr, SCORE_L2);
2582 void status_disable(struct input_code *ptr)
2587 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2590 void status_push_ch(struct input_code *ptr, nkf_char c)
2592 ptr->buf[ptr->index++] = c;
2595 void status_clear(struct input_code *ptr)
2601 void status_reset(struct input_code *ptr)
2604 ptr->score = SCORE_INIT;
2607 void status_reinit(struct input_code *ptr)
2610 ptr->_file_stat = 0;
2613 void status_check(struct input_code *ptr, nkf_char c)
2615 if (c <= DEL && estab_f){
2620 void s_status(struct input_code *ptr, nkf_char c)
2624 status_check(ptr, c);
2629 }else if (nkf_char_unicode_p(c)){
2631 }else if (0xa1 <= c && c <= 0xdf){
2632 status_push_ch(ptr, SS2);
2633 status_push_ch(ptr, c);
2636 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2638 status_push_ch(ptr, c);
2639 }else if (0xed <= c && c <= 0xee){
2641 status_push_ch(ptr, c);
2642 #ifdef SHIFTJIS_CP932
2643 }else if (is_ibmext_in_sjis(c)){
2645 status_push_ch(ptr, c);
2646 #endif /* SHIFTJIS_CP932 */
2648 }else if (0xf0 <= c && c <= 0xfc){
2650 status_push_ch(ptr, c);
2651 #endif /* X0212_ENABLE */
2653 status_disable(ptr);
2657 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2658 status_push_ch(ptr, c);
2659 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2663 status_disable(ptr);
2667 #ifdef SHIFTJIS_CP932
2668 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2669 status_push_ch(ptr, c);
2670 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2671 set_code_score(ptr, SCORE_CP932);
2676 #endif /* SHIFTJIS_CP932 */
2677 status_disable(ptr);
2680 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2681 status_push_ch(ptr, c);
2682 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2683 set_code_score(ptr, SCORE_CP932);
2686 status_disable(ptr);
2692 void e_status(struct input_code *ptr, nkf_char c)
2696 status_check(ptr, c);
2701 }else if (nkf_char_unicode_p(c)){
2703 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2705 status_push_ch(ptr, c);
2707 }else if (0x8f == c){
2709 status_push_ch(ptr, c);
2710 #endif /* X0212_ENABLE */
2712 status_disable(ptr);
2716 if (0xa1 <= c && c <= 0xfe){
2717 status_push_ch(ptr, c);
2721 status_disable(ptr);
2726 if (0xa1 <= c && c <= 0xfe){
2728 status_push_ch(ptr, c);
2730 status_disable(ptr);
2732 #endif /* X0212_ENABLE */
2736 #ifdef UTF8_INPUT_ENABLE
2737 void w_status(struct input_code *ptr, nkf_char c)
2741 status_check(ptr, c);
2746 }else if (nkf_char_unicode_p(c)){
2748 }else if (0xc0 <= c && c <= 0xdf){
2750 status_push_ch(ptr, c);
2751 }else if (0xe0 <= c && c <= 0xef){
2753 status_push_ch(ptr, c);
2754 }else if (0xf0 <= c && c <= 0xf4){
2756 status_push_ch(ptr, c);
2758 status_disable(ptr);
2763 if (0x80 <= c && c <= 0xbf){
2764 status_push_ch(ptr, c);
2765 if (ptr->index > ptr->stat){
2766 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2767 && ptr->buf[2] == 0xbf);
2768 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2769 &ptr->buf[0], &ptr->buf[1]);
2776 status_disable(ptr);
2780 if (0x80 <= c && c <= 0xbf){
2781 if (ptr->index < ptr->stat){
2782 status_push_ch(ptr, c);
2787 status_disable(ptr);
2794 void code_status(nkf_char c)
2796 int action_flag = 1;
2797 struct input_code *result = 0;
2798 struct input_code *p = input_code_list;
2800 if (!p->status_func) {
2804 if (!p->status_func)
2806 (p->status_func)(p, c);
2809 }else if(p->stat == 0){
2820 if (result && !estab_f){
2821 set_iconv(TRUE, result->iconv_func);
2822 }else if (c <= DEL){
2823 struct input_code *ptr = input_code_list;
2833 nkf_char std_getc(FILE *f)
2836 return std_gc_buf[--std_gc_ndx];
2842 nkf_char std_ungetc(nkf_char c, FILE *f)
2844 if (std_gc_ndx == STD_GC_BUFSIZE){
2847 std_gc_buf[std_gc_ndx++] = c;
2852 void std_putc(nkf_char c)
2859 static unsigned char hold_buf[HOLD_SIZE*2];
2860 static int hold_count = 0;
2861 nkf_char push_hold_buf(nkf_char c2)
2863 if (hold_count >= HOLD_SIZE*2)
2865 hold_buf[hold_count++] = (unsigned char)c2;
2866 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2869 static int h_conv(FILE *f, int c1, int c2)
2875 /** it must NOT be in the kanji shift sequence */
2876 /** it must NOT be written in JIS7 */
2877 /** and it must be after 2 byte 8bit code */
2883 while ((c2 = (*i_getc)(f)) != EOF) {
2889 if (push_hold_buf(c2) == EOF || estab_f) {
2895 struct input_code *p = input_code_list;
2896 struct input_code *result = p;
2901 if (p->status_func && p->score < result->score) {
2906 set_iconv(TRUE, result->iconv_func);
2911 ** 1) EOF is detected, or
2912 ** 2) Code is established, or
2913 ** 3) Buffer is FULL (but last word is pushed)
2915 ** in 1) and 3) cases, we continue to use
2916 ** Kanji codes by oconv and leave estab_f unchanged.
2921 while (hold_index < hold_count){
2922 c1 = hold_buf[hold_index++];
2926 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
2927 (*iconv)(JIS_X_0201_1976_K, c1, 0);
2930 if (hold_index < hold_count){
2931 c2 = hold_buf[hold_index++];
2941 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
2944 if (hold_index < hold_count){
2945 c3 = hold_buf[hold_index++];
2946 } else if ((c3 = (*i_getc)(f)) == EOF) {
2951 if (hold_index < hold_count){
2952 c4 = hold_buf[hold_index++];
2953 } else if ((c4 = (*i_getc)(f)) == EOF) {
2958 (*iconv)(c1, c2, (c3<<8)|c4);
2963 /* 3 bytes EUC or UTF-8 */
2964 if (hold_index < hold_count){
2965 c3 = hold_buf[hold_index++];
2966 } else if ((c3 = (*i_getc)(f)) == EOF) {
2972 (*iconv)(c1, c2, c3);
2975 if (c3 == EOF) break;
2981 * Check and Ignore BOM
2983 void check_bom(FILE *f)
2986 switch(c2 = (*i_getc)(f)){
2988 if((c2 = (*i_getc)(f)) == 0x00){
2989 if((c2 = (*i_getc)(f)) == 0xFE){
2990 if((c2 = (*i_getc)(f)) == 0xFF){
2991 if(!input_encoding){
2992 set_iconv(TRUE, w_iconv32);
2994 if (iconv == w_iconv32) {
2995 input_endian = ENDIAN_BIG;
2998 (*i_ungetc)(0xFF,f);
2999 }else (*i_ungetc)(c2,f);
3000 (*i_ungetc)(0xFE,f);
3001 }else if(c2 == 0xFF){
3002 if((c2 = (*i_getc)(f)) == 0xFE){
3003 if(!input_encoding){
3004 set_iconv(TRUE, w_iconv32);
3006 if (iconv == w_iconv32) {
3007 input_endian = ENDIAN_2143;
3010 (*i_ungetc)(0xFF,f);
3011 }else (*i_ungetc)(c2,f);
3012 (*i_ungetc)(0xFF,f);
3013 }else (*i_ungetc)(c2,f);
3014 (*i_ungetc)(0x00,f);
3015 }else (*i_ungetc)(c2,f);
3016 (*i_ungetc)(0x00,f);
3019 if((c2 = (*i_getc)(f)) == 0xBB){
3020 if((c2 = (*i_getc)(f)) == 0xBF){
3021 if(!input_encoding){
3022 set_iconv(TRUE, w_iconv);
3024 if (iconv == w_iconv) {
3027 (*i_ungetc)(0xBF,f);
3028 }else (*i_ungetc)(c2,f);
3029 (*i_ungetc)(0xBB,f);
3030 }else (*i_ungetc)(c2,f);
3031 (*i_ungetc)(0xEF,f);
3034 if((c2 = (*i_getc)(f)) == 0xFF){
3035 if((c2 = (*i_getc)(f)) == 0x00){
3036 if((c2 = (*i_getc)(f)) == 0x00){
3037 if(!input_encoding){
3038 set_iconv(TRUE, w_iconv32);
3040 if (iconv == w_iconv32) {
3041 input_endian = ENDIAN_3412;
3044 (*i_ungetc)(0x00,f);
3045 }else (*i_ungetc)(c2,f);
3046 (*i_ungetc)(0x00,f);
3047 }else (*i_ungetc)(c2,f);
3048 if(!input_encoding){
3049 set_iconv(TRUE, w_iconv16);
3051 if (iconv == w_iconv16) {
3052 input_endian = ENDIAN_BIG;
3055 (*i_ungetc)(0xFF,f);
3056 }else (*i_ungetc)(c2,f);
3057 (*i_ungetc)(0xFE,f);
3060 if((c2 = (*i_getc)(f)) == 0xFE){
3061 if((c2 = (*i_getc)(f)) == 0x00){
3062 if((c2 = (*i_getc)(f)) == 0x00){
3063 if(!input_encoding){
3064 set_iconv(TRUE, w_iconv32);
3066 if (iconv == w_iconv32) {
3067 input_endian = ENDIAN_LITTLE;
3070 (*i_ungetc)(0x00,f);
3071 }else (*i_ungetc)(c2,f);
3072 (*i_ungetc)(0x00,f);
3073 }else (*i_ungetc)(c2,f);
3074 if(!input_encoding){
3075 set_iconv(TRUE, w_iconv16);
3077 if (iconv == w_iconv16) {
3078 input_endian = ENDIAN_LITTLE;
3081 (*i_ungetc)(0xFE,f);
3082 }else (*i_ungetc)(c2,f);
3083 (*i_ungetc)(0xFF,f);
3097 static void init_broken_state(void)
3099 memset(&broken_state, 0, sizeof(broken_state));
3102 static void push_broken_buf(c)
3104 broken_state.buf[broken_state.count++] = c;
3107 static nkf_char pop_broken_buf(void)
3109 return broken_state.buf[--broken_state.count];
3112 nkf_char broken_getc(FILE *f)
3116 if (broken_state.count > 0) {
3117 return pop_broken_buf();
3120 if (c=='$' && broken_state.status != ESC
3121 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3123 broken_state.status = 0;
3124 if (c1=='@'|| c1=='B') {
3125 push_broken_buf(c1);
3132 } else if (c=='(' && broken_state.status != ESC
3133 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3135 broken_state.status = 0;
3136 if (c1=='J'|| c1=='B') {
3137 push_broken_buf(c1);
3145 broken_state.status = c;
3150 nkf_char broken_ungetc(nkf_char c, FILE *f)
3152 if (broken_state.count < 2)
3157 void eol_conv(nkf_char c2, nkf_char c1)
3159 if (guess_f && input_eol != EOF) {
3160 if (c2 == 0 && c1 == LF) {
3161 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3162 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3163 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3165 else if (!input_eol) input_eol = CR;
3166 else if (input_eol != CR) input_eol = EOF;
3168 if (prev_cr || (c2 == 0 && c1 == LF)) {
3170 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3171 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3173 if (c2 == 0 && c1 == CR) prev_cr = CR;
3174 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3178 Return value of fold_conv()
3180 LF add newline and output char
3181 CR add newline and output nothing
3184 1 (or else) normal output
3186 fold state in prev (previous character)
3188 >0x80 Japanese (X0208/X0201)
3193 This fold algorithm does not preserve leading spaces in a line.
3194 This is the main difference from fmt.
3197 #define char_size(c2,c1) (c2?2:1)
3199 void fold_conv(nkf_char c2, nkf_char c1)
3202 nkf_char fold_state;
3204 if (c1== CR && !fold_preserve_f) {
3205 fold_state=0; /* ignore cr */
3206 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3208 fold_state=0; /* ignore cr */
3209 } else if (c1== BS) {
3210 if (f_line>0) f_line--;
3212 } else if (c2==EOF && f_line != 0) { /* close open last line */
3214 } else if ((c1==LF && !fold_preserve_f)
3215 || ((c1==CR||(c1==LF&&f_prev!=CR))
3216 && fold_preserve_f)) {
3218 if (fold_preserve_f) {
3222 } else if ((f_prev == c1 && !fold_preserve_f)
3223 || (f_prev == LF && fold_preserve_f)
3224 ) { /* duplicate newline */
3227 fold_state = LF; /* output two newline */
3233 if (f_prev&0x80) { /* Japanese? */
3235 fold_state = 0; /* ignore given single newline */
3236 } else if (f_prev==SP) {
3240 if (++f_line<=fold_len)
3244 fold_state = CR; /* fold and output nothing */
3248 } else if (c1=='\f') {
3251 fold_state = LF; /* output newline and clear */
3252 } else if ( (c2==0 && c1==SP)||
3253 (c2==0 && c1==TAB)||
3254 (c2=='!'&& c1=='!')) {
3255 /* X0208 kankaku or ascii space */
3257 fold_state = 0; /* remove duplicate spaces */
3260 if (++f_line<=fold_len)
3261 fold_state = SP; /* output ASCII space only */
3263 f_prev = SP; f_line = 0;
3264 fold_state = CR; /* fold and output nothing */
3268 prev0 = f_prev; /* we still need this one... , but almost done */
3270 if (c2 || c2 == JIS_X_0201_1976_K)
3271 f_prev |= 0x80; /* this is Japanese */
3272 f_line += char_size(c2,c1);
3273 if (f_line<=fold_len) { /* normal case */
3276 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3277 f_line = char_size(c2,c1);
3278 fold_state = LF; /* We can't wait, do fold now */
3279 } else if (c2 == JIS_X_0201_1976_K) {
3280 /* simple kinsoku rules return 1 means no folding */
3281 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3282 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3283 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3284 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3285 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3286 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3287 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3289 fold_state = LF;/* add one new f_line before this character */
3292 fold_state = LF;/* add one new f_line before this character */
3295 /* kinsoku point in ASCII */
3296 if ( c1==')'|| /* { [ ( */
3307 /* just after special */
3308 } else if (!is_alnum(prev0)) {
3309 f_line = char_size(c2,c1);
3311 } else if ((prev0==SP) || /* ignored new f_line */
3312 (prev0==LF)|| /* ignored new f_line */
3313 (prev0&0x80)) { /* X0208 - ASCII */
3314 f_line = char_size(c2,c1);
3315 fold_state = LF;/* add one new f_line before this character */
3317 fold_state = 1; /* default no fold in ASCII */
3321 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3322 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3323 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3324 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3325 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3326 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3327 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3328 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3329 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3330 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3331 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3332 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3333 /* default no fold in kinsoku */
3336 f_line = char_size(c2,c1);
3337 /* add one new f_line before this character */
3340 f_line = char_size(c2,c1);
3342 /* add one new f_line before this character */
3347 /* terminator process */
3348 switch(fold_state) {
3350 OCONV_NEWLINE((*o_fconv));
3356 OCONV_NEWLINE((*o_fconv));
3367 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion stage of the output chain.  Every result is
 * handed on through o_zconv.
 *
 * Only part of this function's text is present in this listing (the
 * embedded numbering skips lines), so the notes describe the visible
 * statements only:
 *
 *  - JIS X 0201 kana are looked up in the cv / dv / ev tables, indexed by
 *    (code - SP) * 2, the two output bytes sitting in consecutive slots.
 *    When the previous character (z_prev2 / z_prev1) was such a kana, a
 *    following voiced mark (0xde & 0x7f) selects dv, a semi-voiced mark
 *    (0xdf & 0x7f) selects ev when an ev entry exists, and the plain cv
 *    entry is used otherwise.
 *  - alpha_f bits 1 and 4 gate the JIS X 0208 alphabet (row 0x23) and symbol
 *    (row 0x21) branches.
 *  - alpha_f bit 8 replaces the single-byte characters > < " & by the HTML
 *    entities &gt; &lt; &quot; &amp;, emitted one byte at a time as
 *    (*o_zconv)(0, ch).
 *  - JIS X 0208 row 0x25 (katakana) is mapped through fullwidth_to_halfwidth.
 *    Each entry packs the half-width kana in the high byte and an optional
 *    second code (0x5E / 0x5F) in the low byte; the high byte is emitted
 *    first.
 *
 * c2: first byte / character-set selector, c1: second byte.
 */
3369 void z_conv(nkf_char c2, nkf_char c1)
3372 /* if (c2) c1 &= 0x7f; assertion */
3374 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3380 if (z_prev2 == JIS_X_0201_1976_K) {
3381 if (c2 == JIS_X_0201_1976_K) {
3382 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3384 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3386 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3388 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3393 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3395 if (c2 == JIS_X_0201_1976_K) {
3396 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3397 /* wait for dakuten or handakuten */
3402 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3413 if (alpha_f&1 && c2 == 0x23) {
3414 /* JISX0208 Alphabet */
3416 } else if (c2 == 0x21) {
3417 /* JISX0208 Kigou (symbols) */
3422 } else if (alpha_f&4) {
3427 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3433 if (alpha_f&8 && c2 == 0) {
/* HTML entity replacement: the literals must be the entity names themselves. */
3437 case '>': entity = "&gt;"; break;
3438 case '<': entity = "&lt;"; break;
3439 case '\"': entity = "&quot;"; break;
3440 case '&': entity = "&amp;"; break;
3443 while (*entity) (*o_zconv)(0, *entity++);
3449 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3454 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3458 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3462 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3466 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3470 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3474 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3478 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3482 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3487 (*o_zconv)(JIS_X_0201_1976_K, c);
3490 } else if (c2 == 0x25) {
3491 /* JISX0208 Katakana */
3492 static const int fullwidth_to_halfwidth[] =
3494 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3495 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3496 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3497 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3498 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3499 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3500 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3501 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3502 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3503 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3504 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3505 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
/* A zero entry means "no half-width form"; c2 is reused to hold the packed pair. */
3507 if (fullwidth_to_halfwidth[c1-0x20]){
3508 c2 = fullwidth_to_halfwidth[c1-0x20];
3509 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3511 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/*
 * rot13(c) / rot47(c): character rotation helpers built as chained
 * conditional expressions.  In the visible arms rot13 adds 13 to values up to
 * 'M' / 'm' and subtracts 13 from values up to 'Z' / 'z'; rot47 adds 47 to
 * values up to 'O' and subtracts 47 from values up to '~'.  The lower-bound
 * guards and the final fall-through arm are on lines not present in this
 * listing.  The argument is evaluated more than once.
 */
3521 #define rot13(c) ( \
3523 (c <= 'M') ? (c + 13): \
3524 (c <= 'Z') ? (c - 13): \
3526 (c <= 'm') ? (c + 13): \
3527 (c <= 'z') ? (c - 13): \
3531 #define rot47(c) ( \
3533 ( c <= 'O') ? (c + 47) : \
3534 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: output-chain stage that forwards (c2, c1) to o_rot_conv.
 * The test distinguishes the single-byte character sets (c2 == 0,
 * JIS X 0201 kana, ISO 8859-1) from everything else; the statements of the
 * two branches are not visible here -- presumably rot13() on c1 in the
 * single-byte case and rot47() on both bytes otherwise (TODO confirm).
 */
3538 void rot_conv(nkf_char c2, nkf_char c1)
3540 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3546 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: hiragana/katakana conversion stage; results are forwarded to
 * o_hira_conv.
 *
 * The visible tests come in two groups.  The first handles second bytes
 * 0x21..0x73, the code 0x74 when the output encoding is Unicode (it is
 * replaced by U+3094), and the row-0x21 symbols 0x33/0x34.  The second
 * handles the reverse cases: a bare U+3094, row 0x24 with second byte
 * 0x21..0x73, and the row-0x21 symbols 0x35/0x36.  The enclosing conditions
 * and the assignments that change c2/c1 are on lines not present in this
 * listing (presumably selected by a direction flag -- TODO confirm).
 */
3549 void hira_conv(nkf_char c2, nkf_char c1)
3553 if (0x20 < c1 && c1 < 0x74) {
3555 (*o_hira_conv)(c2,c1);
3557 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
/* 0x74 is only convertible when the output can carry U+3094. */
3559 c1 = nkf_char_unicode_new(0x3094);
3560 (*o_hira_conv)(c2,c1);
3563 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3565 (*o_hira_conv)(c2,c1);
3570 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3573 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3575 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3579 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: filter stage that inspects (c2, c1) before passing
 * it to o_iso2022jp_check_conv.
 *
 * Three checks are visible: a single-byte check (c2 in 0x00..0x20 with c1 in
 * 0x7f..0xff), a row check (c2 in 0x29..0x2f or 0x75..0x7e), and a scan of
 * the RANGE_NUM_MAX [start, end] pairs in `range` against a combined code c.
 * What happens on a hit, and how c is built, is on lines not present in this
 * listing (presumably the character is replaced by a substitute -- TODO
 * confirm).
 */
3583 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3585 #define RANGE_NUM_MAX 18
3586 static const nkf_char range[RANGE_NUM_MAX][2] = {
3607 nkf_char start, end, c;
3609 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3613 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3618 for (i = 0; i < RANGE_NUM_MAX; i++) {
3619 start = range[i][0];
3622 if (c >= start && c <= end) {
3627 (*o_iso2022jp_check_conv)(c2,c1);
3631 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Parallel tables describing the recognised MIME encoded-word prefixes.
 * Entry i of mime_pattern, mime_priority_func, mime_encode and
 * mime_encode_method belong together.  "\075" is '=' written as an octal
 * escape.  The UTF-8 entries exist only when UTF8_INPUT_ENABLE is defined.
 */
3633 static const unsigned char *mime_pattern[] = {
3634 (const unsigned char *)"\075?EUC-JP?B?",
3635 (const unsigned char *)"\075?SHIFT_JIS?B?",
3636 (const unsigned char *)"\075?ISO-8859-1?Q?",
3637 (const unsigned char *)"\075?ISO-8859-1?B?",
3638 (const unsigned char *)"\075?ISO-2022-JP?B?",
3639 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3640 #if defined(UTF8_INPUT_ENABLE)
3641 (const unsigned char *)"\075?UTF-8?B?",
3642 (const unsigned char *)"\075?UTF-8?Q?",
3644 (const unsigned char *)"\075?US-ASCII?Q?",
3649 /* Marker used to raise the priority of the matching input encoding */
3650 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3651 e_iconv, s_iconv, 0, 0, 0, 0,
3652 #if defined(UTF8_INPUT_ENABLE)
/* Character set used by open_mime() to pick the pattern for an output mode. */
3658 static const nkf_char mime_encode[] = {
3659 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3660 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding of each pattern: 'B' or 'Q'. */
3667 static const nkf_char mime_encode_method[] = {
3668 'B', 'B','Q', 'B', 'B', 'Q',
3669 #if defined(UTF8_INPUT_ENABLE)
3677 /* MIME preprocessor fifo */
/*
 * The FIFO is a ring of MIME_BUF_SIZE bytes; indices are free-running
 * unsigned counters and are reduced with MIME_BUF_MASK on every access via
 * mime_input_buf(n).  The struct header and its `top` member are on lines
 * not present in this listing.
 */
3679 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3680 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3681 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3683 unsigned char buf[MIME_BUF_SIZE];
3685 unsigned int last; /* decoded */
3686 unsigned int input; /* undecoded */
/* Input converter saved by mime_begin_strict() and restored by unswitch_mime_getc(). */
3688 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Size of the recovery buffer and upper bound on the preamble scan in mime_begin(). */
3690 #define MAXRECOVER 20
/* Push byte c back onto the front of the MIME FIFO by pre-decrementing `top`. */
3692 static void mime_input_buf_unshift(nkf_char c)
3694 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/*
 * ungetc counterpart of mime_getc: returns c to the MIME FIFO.  The stream
 * argument f is not used on the visible line; the return statement is not
 * present in this listing.
 */
3697 nkf_char mime_ungetc(nkf_char c, FILE *f)
3699 mime_input_buf_unshift(c);
/*
 * ungetc for the undecoded side of the FIFO.  Two alternatives are visible:
 * delegate to the saved i_mungetc_buf, or store c by pre-decrementing
 * `input`.  The condition choosing between them is not present in this
 * listing (mime_getc_buf selects on mimebuf_f -- presumably the same here).
 */
3703 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
3706 (*i_mungetc_buf)(c,f);
3708 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/*
 * Fetch the next undecoded byte: from the underlying getter i_mgetc_buf when
 * mimebuf_f is set, otherwise from the FIFO at `input` (post-incremented).
 */
3712 nkf_char mime_getc_buf(FILE *f)
3714 /* we don't keep eof of mime_input_buf, because it contains ?= as
3715 a terminator. It was checked in mime_integrity. */
3716 return ((mimebuf_f)?
3717 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/*
 * Install mime_getc / mime_ungetc as the active input functions, saving the
 * previous pair in i_mgetc / i_mungetc.  Does nothing if mime_getc is
 * already installed.  In STRICT_MIME mode a second layer is added: the saved
 * pair moves to i_mgetc_buf / i_mungetc_buf and the buffered variants take
 * its place.
 */
3720 void switch_mime_getc(void)
3722 if (i_getc!=mime_getc) {
3723 i_mgetc = i_getc; i_getc = mime_getc;
3724 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3725 if(mime_f==STRICT_MIME) {
3726 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3727 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undo switch_mime_getc(): peel off the STRICT_MIME buffering layer if
 * present, restore i_ungetc (the i_getc restore is on a line not present in
 * this listing), and reinstall the input converter saved in mime_iconv_back,
 * clearing the saved pointer afterwards.
 */
3732 void unswitch_mime_getc(void)
3734 if(mime_f==STRICT_MIME) {
3735 i_mgetc = i_mgetc_buf;
3736 i_mungetc = i_mungetc_buf;
3739 i_ungetc = i_mungetc;
3740 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3741 mime_iconv_back = NULL;
/*
 * mime_integrity: pre-scan an encoded-word into the MIME FIFO to decide
 * whether it is well formed before decoding starts.
 *
 * f: input stream, read through i_getc.
 * p: the matched prefix from mime_pattern; it is copied into the FIFO first
 *    so the text can be replayed verbatim if the check fails.
 *
 * The scan stops when the FIFO is full, when the terminator is seen (current
 * byte '=' with d == '?', d being set on a line not present in this
 * listing -- presumably the previous byte), or when a byte outside
 * + / = ? and alphanumerics appears.  On success `input` is rewound to q,
 * the position just after the prefix, so decoding starts at the payload.
 * On failure `last` is moved up to `input`, mime_decode_mode is set to 1 and
 * the buffered getc is switched in, so the collected bytes are replayed
 * undecoded.  Return statements are not present in this listing.
 */
3744 nkf_char mime_integrity(FILE *f, const unsigned char *p)
3748 /* In buffered mode, read until =? or NL or buffer full */
3750 mime_input_state.input = mime_input_state.top;
3751 mime_input_state.last = mime_input_state.top;
3753 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3755 q = mime_input_state.input;
3756 while((c=(*i_getc)(f))!=EOF) {
/* input has wrapped all the way round to top: no room left */
3757 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3758 break; /* buffer full */
3760 if (c=='=' && d=='?') {
3761 /* checked. skip header, start decode */
3762 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3763 /* mime_last_input = mime_input_state.input; */
3764 mime_input_state.input = q;
3768 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3770 /* Should we check length mod 4? */
3771 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3774 /* In case of Incomplete MIME, no MIME decode */
3775 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3776 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3777 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3778 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * mime_begin_strict: called after "=?" has been consumed; matches the rest
 * of the preamble against mime_pattern, comparing input case-insensitively
 * (via nkf_toupper) against the upper-case patterns.
 *
 * Every byte read is kept in r[] so it can be replayed if no pattern fits.
 * When the current pattern p fails at position i, later patterns are tried:
 * a candidate must agree with the first pattern tried (q, assigned on a line
 * not present in this listing) on positions 2..i-1 and with the current byte
 * at position i.  On a full match mime_decode_mode is taken from the
 * pattern's method letter (p[i-2]), the current input converter is saved in
 * mime_iconv_back and replaced by the pattern's priority function, and for
 * 'B' the result of mime_integrity() is returned.
 */
3782 nkf_char mime_begin_strict(FILE *f)
3786 const unsigned char *p,*q;
3787 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3789 mime_decode_mode = FALSE;
3790 /* =? has been checked */
3792 p = mime_pattern[j];
3795 for(i=2;p[i]>SP;i++) { /* start at =? */
3796 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3797 /* pattern fails, try next one */
3799 while (mime_pattern[++j]) {
3800 p = mime_pattern[j];
3801 for(k=2;k<i;k++) /* assume length(p) > i */
3802 if (p[k]!=q[k]) break;
3803 if (k==i && nkf_toupper(c1)==p[k]) break;
/* p is NULL here when the table was exhausted without a match */
3805 p = mime_pattern[j];
3806 if (p) continue; /* found next one, continue */
3807 /* all fails, output from recovery buffer */
/* p[i-2] is the 'B' or 'Q' letter just before the final '?' of the pattern */
3815 mime_decode_mode = p[i-2];
3817 mime_iconv_back = iconv;
3818 set_iconv(FALSE, mime_priority_func[j]);
3819 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3821 if (mime_decode_mode=='B') {
3822 mimebuf_f = unbuf_f;
3824 /* do MIME integrity check */
3825 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_begin: non-strict recognition of a MIME preamble after "=?" has been
 * seen.
 *
 * Every byte read is appended to the FIFO at `last`, so a false start can be
 * replayed.  Up to MAXRECOVER bytes are scanned: charset characters
 * (alphanumerics, '-', '_', and LF/SP/CR) are skipped, then a 'B'/'b' or
 * 'Q'/'q' method letter sets mime_decode_mode.  The tests for the '?'
 * separators are on lines not present in this listing.  If no method was
 * recognised, mime_decode_mode is set to 1 so the buffered text is re-read
 * undecoded; otherwise `last` is reset to its entry value k, discarding the
 * preamble.  Returns the last byte read (callers only check it for EOF).
 */
3833 nkf_char mime_begin(FILE *f)
3838 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3839 /* re-read and convert again from mime_buffer. */
3841 /* =? has been checked */
3842 k = mime_input_state.last;
3843 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3844 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3845 /* We accept any character type even if it is broken by new lines */
3846 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3847 if (c1==LF||c1==SP||c1==CR||
3848 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3850 /* Failed. But this could be another MIME preamble */
3852 mime_input_state.last--;
3858 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3859 if (!(++i<MAXRECOVER) || c1==EOF) break;
3860 if (c1=='b'||c1=='B') {
3861 mime_decode_mode = 'B';
3862 } else if (c1=='q'||c1=='Q') {
3863 mime_decode_mode = 'Q';
3867 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3868 if (!(++i<MAXRECOVER) || c1==EOF) break;
3870 mime_decode_mode = FALSE;
3876 if (!mime_decode_mode) {
3877 /* false MIME preamble, restart from mime_buffer */
3878 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3879 /* Since we are in MIME mode until buffer becomes empty, */
3880 /* we never go into mime_begin again for a while. */
3883 /* discard mime preamble, and goto MIME mode */
3884 mime_input_state.last = k;
3885 /* do no MIME integrity check */
3886 return c1; /* used only for checking EOF */
/*
 * Output callback taking one character.  The body is not present in this
 * listing; by its name it presumably discards c (TODO confirm).
 */
3890 void no_putc(nkf_char c)
/*
 * Write str followed by a newline to stderr; a NULL pointer is printed as
 * the text "NULL".  Any guard around the fprintf is not present in this
 * listing.
 */
3895 void debug(const char *str)
3898 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding.  The first call stores
 * codename (the pointer, not a copy).  A later call with a different name
 * replaces it with the empty string, which get_guessed_code() reports as
 * "BINARY".
 */
3903 void set_input_codename(char *codename)
3905 if (!input_codename) {
3906 input_codename = codename;
3907 } else if (strcmp(codename, input_codename) != 0) {
3908 input_codename = "";
/*
 * Refine input_codename into the name reported to the user and return it.
 *
 *  - empty string (conflicting detections)  -> "BINARY"
 *  - NULL (nothing detected)                -> "ASCII"
 *  - "Shift_JIS", "EUC-JP", "ISO-2022-JP"   -> a vendor variant (CP932,
 *    EUCJP-MS / CP51932, CP50221 / CP50220) when the score bits of the
 *    current input converter's entry show the corresponding features.
 *
 * The global input_codename is updated as a side effect.
 */
3912 static char* get_guessed_code(void)
3914 if (input_codename && !*input_codename) {
3915 input_codename = "BINARY";
3917 struct input_code *p = find_inputcode_byfunc(iconv);
3918 if (!input_codename) {
3919 input_codename = "ASCII";
3920 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
3921 if (p->score & (SCORE_DEPEND|SCORE_CP932))
3922 input_codename = "CP932";
3923 } else if (strcmp(input_codename, "EUC-JP") == 0) {
3924 if (p->score & (SCORE_X0212))
3925 input_codename = "EUCJP-MS";
3926 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3927 input_codename = "CP51932";
3928 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
3929 if (p->score & (SCORE_KANA))
3930 input_codename = "CP50221";
3931 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3932 input_codename = "CP50220";
3935 return input_codename;
3938 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding on stdout, prefixed by "filename: " when
 * a file name is given.  The visible conditional expression maps input_eol
 * to a line-ending suffix: " (CR)", " (LF)", " (CRLF)", or " (MIXED NL)"
 * when input_eol is EOF.  The statement that prints the suffix and the
 * branch taken for the empty code name are not present in this listing.
 */
3939 void print_guessed_code(char *filename)
3941 if (filename != NULL) printf("%s: ", filename);
3942 if (input_codename && !*input_codename) {
3945 input_codename = get_guessed_code();
3947 printf("%s\n", input_codename);
3951 input_eol == CR ? " (CR)" :
3952 input_eol == LF ? " (LF)" :
3953 input_eol == CRLF ? " (CRLF)" :
3954 input_eol == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared decoder for "<ch>XX" hexadecimal escapes.
 *
 * ch: the escape introducer (':' for CAP, '%' for URL encoding).
 * f:  input stream.
 * g:  underlying getc.   u: matching ungetc.
 *
 * When both following characters are hex digits the decoded byte
 * (hex2bin(c2) << 4) | hex2bin(c3) is returned.  The reads and the recovery
 * done when a digit test fails are on lines not present in this listing
 * (presumably the characters are pushed back with u -- TODO confirm).
 */
3963 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
3965 nkf_char c1, c2, c3;
3971 if (!nkf_isxdigit(c2)){
3976 if (!nkf_isxdigit(c3)){
3981 return (hex2bin(c2) << 4) | hex2bin(c3);
/* getc layer decoding ":XX" escapes on top of the saved i_cgetc / i_cungetc pair. */
3984 nkf_char cap_getc(FILE *f)
3986 return hex_getc(':', f, i_cgetc, i_cungetc);
/* ungetc for the CAP layer: passes c straight through to the saved i_cungetc. */
3989 nkf_char cap_ungetc(nkf_char c, FILE *f)
3991 return (*i_cungetc)(c, f);
/* getc layer decoding "%XX" escapes on top of the saved i_ugetc / i_uungetc pair. */
3994 nkf_char url_getc(FILE *f)
3996 return hex_getc('%', f, i_ugetc, i_uungetc);
/* ungetc for the URL layer: passes c straight through to the saved i_uungetc. */
3999 nkf_char url_ungetc(nkf_char c, FILE *f)
4001 return (*i_uungetc)(c, f);
4005 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: getc layer that decodes numeric character references read
 * through i_ngetc.
 *
 * After an 'x' or 'X' up to 7 hexadecimal digits are accumulated; otherwise
 * up to 8 decimal digits.  The resulting value is returned wrapped by
 * nkf_char_unicode_new().  Recognition of the leading and trailing
 * delimiters, the shift/multiply of the accumulator and the push-back on
 * malformed input are on lines not present in this listing.
 */
4006 nkf_char numchar_getc(FILE *f)
4008 nkf_char (*g)(FILE *) = i_ngetc;
4009 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4020 if (buf[i] == 'x' || buf[i] == 'X'){
4021 for (j = 0; j < 7; j++){
4023 if (!nkf_isxdigit(buf[i])){
4030 c |= hex2bin(buf[i]);
4033 for (j = 0; j < 8; j++){
4037 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used for decimal digits; buf[i] passed nkf_isdigit above */
4044 c += hex2bin(buf[i]);
4050 return nkf_char_unicode_new(c);
/* ungetc for the numeric-reference layer: passes c through to the saved i_nungetc. */
4059 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4061 return (*i_nungetc)(c, f);
4065 #ifdef UNICODE_NORMALIZATION
4067 /* Normalization Form C */
/*
 * nfc_getc: getc layer that replaces decomposed (NFD) byte sequences by
 * their composed (NFC) form using normalization_table.
 *
 * The table is binary-searched on its .nfd byte strings: lower/upper bracket
 * the range, j is the midpoint, and the first differing byte decides which
 * half to keep.  When an entry matches, its .nfc bytes are copied over the
 * start of buf.  The outer loop only proceeds while buf[i] is not a 10xxxxxx
 * byte.  Filling buf from g, the exit from the search on a match, and the
 * push-back / return of bytes are on lines not present in this listing.
 */
4068 nkf_char nfc_getc(FILE *f)
4070 nkf_char (*g)(FILE *f) = i_nfc_getc;
4071 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4072 int i=0, j, k=1, lower, upper;
4074 const unsigned char *array;
4077 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4078 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4079 while (upper >= lower) {
4080 j = (lower+upper) / 2;
4081 array = normalization_table[j].nfd;
4082 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4083 if (array[k] != buf[k]){
/* table entry sorts before the input: search the upper half, else the lower */
4084 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4091 array = normalization_table[j].nfc;
4092 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4093 buf[i] = (nkf_char)(array[i]);
/* ungetc for the normalization layer: passes c through to the saved i_nfc_ungetc. */
4104 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4106 return (*i_nfc_ungetc)(c, f);
4108 #endif /* UNICODE_NORMALIZATION */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 *
 * The results are written with character constants whose ASCII codes equal
 * the wanted numbers: 'G' is 'a' - 26, '4' is 52, '>' is 62 and '?' is 63.
 * Besides the standard alphabet, '-' is accepted for 62 and '_' for 63.
 * The range tests for the letter branches and the return statement are on
 * lines not present in this listing.
 */
4111 static nkf_char base64decode(nkf_char c)
4116 i = c - 'A'; /* A..Z 0-25 */
4117 } else if (c == '_') {
4118 i = '?' /* 63 */ ; /* _ 63 */
4120 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4122 } else if (c > '/') {
4123 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4124 } else if (c == '+' || c == '-') {
4125 i = '>' /* 62 */ ; /* + and - 62 */
4127 i = '?' /* 63 */ ; /* / 63 */
4135 nkf_char c1, c2, c3, c4, cc;
4136 nkf_char t1, t2, t3, t4, mode, exit_mode;
4137 nkf_char lwsp_count;
4140 nkf_char lwsp_size = 128;
4142 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4143 return mime_input_buf(mime_input_state.top++);
4145 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4146 mime_decode_mode=FALSE;
4147 unswitch_mime_getc();
4148 return (*i_getc)(f);
4151 if (mimebuf_f == FIXED_MIME)
4152 exit_mode = mime_decode_mode;
4155 if (mime_decode_mode == 'Q') {
4156 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4158 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4159 if (c1<=SP || DEL<=c1) {
4160 mime_decode_mode = exit_mode; /* prepare for quit */
4163 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4167 mime_decode_mode = exit_mode; /* prepare for quit */
4168 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4169 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4170 /* end Q encoding */
4171 input_mode = exit_mode;
4173 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4174 if (lwsp_buf==NULL) {
4175 perror("can't malloc");
4178 while ((c1=(*i_getc)(f))!=EOF) {
4183 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4191 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4192 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4207 lwsp_buf[lwsp_count] = (unsigned char)c1;
4208 if (lwsp_count++>lwsp_size){
4210 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4211 if (lwsp_buf_new==NULL) {
4213 perror("can't realloc");
4216 lwsp_buf = lwsp_buf_new;
4222 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4224 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4225 i_ungetc(lwsp_buf[lwsp_count],f);
4231 if (c1=='='&&c2<SP) { /* this is soft wrap */
4232 while((c1 = (*i_mgetc)(f)) <=SP) {
4233 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4235 mime_decode_mode = 'Q'; /* still in MIME */
4236 goto restart_mime_q;
4239 mime_decode_mode = 'Q'; /* still in MIME */
4243 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4244 if (c2<=SP) return c2;
4245 mime_decode_mode = 'Q'; /* still in MIME */
4246 return ((hex2bin(c2)<<4) + hex2bin(c3));
4249 if (mime_decode_mode != 'B') {
4250 mime_decode_mode = FALSE;
4251 return (*i_mgetc)(f);
4255 /* Base64 encoding */
4257 MIME allows line break in the middle of
4258 Base64, but we are very pessimistic in decoding
4259 in unbuf mode because MIME encoded code may broken by
4260 less or editor's control sequence (such as ESC-[-K in unbuffered
4261 mode. ignore incomplete MIME.
4263 mode = mime_decode_mode;
4264 mime_decode_mode = exit_mode; /* prepare for quit */
4266 while ((c1 = (*i_mgetc)(f))<=SP) {
4271 if ((c2 = (*i_mgetc)(f))<=SP) {
4274 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4275 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4278 if ((c1 == '?') && (c2 == '=')) {
4281 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4282 if (lwsp_buf==NULL) {
4283 perror("can't malloc");
4286 while ((c1=(*i_getc)(f))!=EOF) {
4291 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4299 if ((c1=(*i_getc)(f))!=EOF) {
4303 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4318 lwsp_buf[lwsp_count] = (unsigned char)c1;
4319 if (lwsp_count++>lwsp_size){
4321 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4322 if (lwsp_buf_new==NULL) {
4324 perror("can't realloc");
4327 lwsp_buf = lwsp_buf_new;
4333 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4335 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4336 i_ungetc(lwsp_buf[lwsp_count],f);
4343 if ((c3 = (*i_mgetc)(f))<=SP) {
4346 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4347 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4351 if ((c4 = (*i_mgetc)(f))<=SP) {
4354 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4355 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4359 mime_decode_mode = mode; /* still in MIME sigh... */
4361 /* BASE 64 decoding */
4363 t1 = 0x3f & base64decode(c1);
4364 t2 = 0x3f & base64decode(c2);
4365 t3 = 0x3f & base64decode(c3);
4366 t4 = 0x3f & base64decode(c4);
4367 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4369 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4370 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4372 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4373 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4375 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4380 return mime_input_buf(mime_input_state.top++);
/* Base64 output alphabet, indexed by 6-bit value. */
4383 static const char basis_64[] =
4384 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Capacity of the pending-output buffer used by the MIME encoder.  The
 * struct that owns `buf` (accessed elsewhere as mimeout_state) has its
 * header and other members on lines not present in this listing.
 */
4386 #define MIMEOUT_BUF_LENGTH (60)
4388 char buf[MIMEOUT_BUF_LENGTH+1];
4393 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * open_mime: start a MIME encoded-word for output character set `mode`.
 *
 * The pattern whose mime_encode entry equals mode is chosen (falling back
 * to mime_pattern[0]), and mimeout_mode is set from the parallel
 * mime_encode_method table.  If more than 45 columns are already used
 * (base64_count), a line break is emitted first, with a pending blank
 * written before it.  Leading white space held in mimeout_state.buf is
 * written out raw through o_mputc.  The remaining buffered characters are
 * then re-fed through mime_putc() after the buffer count has been reset.
 * Emission of the pattern text itself is on lines not present in this
 * listing.
 */
4395 static void open_mime(nkf_char mode)
4397 const unsigned char *p;
4400 p = mime_pattern[0];
4401 for(i=0;mime_pattern[i];i++) {
4402 if (mode == mime_encode[i]) {
4403 p = mime_pattern[i];
4407 mimeout_mode = mime_encode_method[i];
4409 if (base64_count>45) {
4410 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4411 (*o_mputc)(mimeout_state.buf[i]);
4414 PUT_NEWLINE((*o_mputc));
4417 if (mimeout_state.count>0
4418 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4419 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4423 for (;i<mimeout_state.count;i++) {
4424 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4425 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4426 (*o_mputc)(mimeout_state.buf[i]);
/* snapshot the count and clear it so the mime_putc() calls below refill the buffer */
4436 j = mimeout_state.count;
4437 mimeout_state.count = 0;
4439 mime_putc(mimeout_state.buf[i]);
/*
 * mime_prechar: line-length guard run before each character is handed to
 * the MIME encoder.
 *
 * The projected width is base64_count plus the encoded size of the pending
 * bytes (count / 3 * 4).  While an encoded-word is open (mimeout_mode > 0)
 * the limits 73 and 66 apply; the condition separating the two cases is not
 * present in this listing.  Otherwise, past column 60 with c2 != EOF, a new
 * encoded-word is opened first ('Q' for ASCII / ISO 8859-1 output, 'B' for
 * anything else).  In each case an EOF marker, a newline and a space are
 * then sent down o_base64conv to continue on a fresh line.
 */
4443 static void mime_prechar(nkf_char c2, nkf_char c1)
4445 if (mimeout_mode > 0){
4447 if (base64_count + mimeout_state.count/3*4> 73){
4448 (*o_base64conv)(EOF,0);
4449 OCONV_NEWLINE((*o_base64conv));
4450 (*o_base64conv)(0,SP);
4454 if (base64_count + mimeout_state.count/3*4> 66) {
4455 (*o_base64conv)(EOF,0);
4456 OCONV_NEWLINE((*o_base64conv));
4457 (*o_base64conv)(0,SP);
4463 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4464 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4465 open_mime(output_mode);
4466 (*o_base64conv)(EOF,0);
4467 OCONV_NEWLINE((*o_base64conv));
4468 (*o_base64conv)(0,SP);
/*
 * Counterpart of open_mime().  The body is not present in this listing;
 * presumably it writes the closing "?=" and resets the encoder state
 * (TODO confirm).
 */
4475 static void close_mime(void)
/*
 * eof_mime: flush the Base64 encoder at end of data.
 *
 * Depending on mimeout_mode, the bits still held in mimeout_state.state are
 * emitted as a final alphabet character: the low 2 bits shifted left by 4,
 * or the low 4 bits shifted left by 2.  The case labels and any '=' padding
 * are on lines not present in this listing.  The trailing test on
 * mimeout_mode > 0 distinguishes the non-FIXED_MIME case from FIXED_MIME
 * with a mode other than 'Q'; the actions taken are not visible here.
 */
4483 static void eof_mime(void)
4485 switch(mimeout_mode) {
4490 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4496 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4501 if (mimeout_mode > 0) {
4502 if (mimeout_f!=FIXED_MIME) {
4504 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c in the current mimeout_mode and write
 * the result through o_mputc.
 *
 * Q branch: a non-alphanumeric byte is written as two hex digits (high
 * nibble first); the '=' prefix and the first alternative of the if-chain
 * are on lines not present in this listing.
 * Base64 branches: the three visible groups handle the first, second and
 * third byte of a 3-byte group.  The previous byte is kept in
 * mimeout_state.state so its leftover low bits can be combined with the high
 * bits of the next one.  Case labels and mode transitions are not visible
 * here.
 */
4509 static void mimeout_addchar(nkf_char c)
4511 switch(mimeout_mode) {
4516 } else if(!nkf_isalnum(c)) {
4518 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4519 (*o_mputc)(bin2hex((c&0xf)));
/* first byte of a group: emit its top 6 bits, keep the byte for the low 2 */
4527 mimeout_state.state=c;
4528 (*o_mputc)(basis_64[c>>2]);
/* second byte: low 2 bits of the previous byte + top 4 bits of this one */
4533 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4534 mimeout_state.state=c;
/* third byte: low 4 bits of the previous byte + top 2 bits, then the low 6 bits */
4539 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4540 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc: main entry of the MIME output encoder; c is the next output
 * byte, or EOF to flush.
 *
 * Large parts of this function are on lines not present in this listing, so
 * only the visible structure is described:
 *
 *  1. mimeout_f == FIXED_MIME: plain transfer encoding with a line break once
 *     base64_count exceeds 71.
 *  2. c == EOF: the pending bytes in mimeout_state.buf are flushed, through
 *     mimeout_addchar() while an encoded-word is open.
 *  3. mimeout_mode == 'Q': ASCII / ISO 8859-1 bytes are examined for line
 *     ends, blanks and a column limit of 70; other output modes call
 *     open_mime().
 *  4. mimeout_mode <= 0 (no encoded-word open): plain bytes are collected in
 *     mimeout_state.buf and written raw; open_mime() is called when the
 *     buffer exceeds MIMEOUT_BUF_LENGTH or a byte needing encoding arrives.
 *  5. Otherwise (comment in the code: mode 'B', 1, 2): bytes are buffered and
 *     either encoded with mimeout_addchar() or written raw, depending on
 *     the previous character (lastchar) and on whether c is white space.
 */
4551 static void mime_putc(nkf_char c)
4556 if (mimeout_f == FIXED_MIME){
4557 if (mimeout_mode == 'Q'){
4558 if (base64_count > 71){
4559 if (c!=CR && c!=LF) {
4561 PUT_NEWLINE((*o_mputc));
4566 if (base64_count > 71){
4568 PUT_NEWLINE((*o_mputc));
4571 if (c == EOF) { /* c==EOF */
4575 if (c != EOF) { /* c!=EOF */
4581 /* mimeout_f != FIXED_MIME */
4583 if (c == EOF) { /* c==EOF */
4584 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4585 j = mimeout_state.count;
4586 mimeout_state.count = 0;
4588 if (mimeout_mode > 0) {
4589 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4591 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4594 mimeout_addchar(mimeout_state.buf[i]);
4598 mimeout_addchar(mimeout_state.buf[i]);
4602 mimeout_addchar(mimeout_state.buf[i]);
4608 mimeout_addchar(mimeout_state.buf[i]);
/* remember the most recently buffered byte for the line-end tests below */
4614 if (mimeout_state.count > 0){
4615 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4620 if (mimeout_mode=='Q') {
4621 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4622 if (c == CR || c == LF) {
4627 } else if (c <= SP) {
4629 if (base64_count > 70) {
4630 PUT_NEWLINE((*o_mputc));
4633 if (!nkf_isblank(c)) {
4638 if (base64_count > 70) {
4640 PUT_NEWLINE((*o_mputc));
4643 open_mime(output_mode);
4645 if (!nkf_noescape_mime(c)) {
4656 if (mimeout_mode <= 0) {
4657 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4658 if (nkf_isspace(c)) {
4660 if (mimeout_mode == -1) {
4663 if (c==CR || c==LF) {
4665 open_mime(output_mode);
4671 for (i=0;i<mimeout_state.count;i++) {
4672 (*o_mputc)(mimeout_state.buf[i]);
4673 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4684 mimeout_state.buf[0] = (char)c;
4685 mimeout_state.count = 1;
4687 if (base64_count > 1
4688 && base64_count + mimeout_state.count > 76
4689 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4690 PUT_NEWLINE((*o_mputc));
4692 if (!nkf_isspace(mimeout_state.buf[0])){
4697 mimeout_state.buf[mimeout_state.count++] = (char)c;
4698 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4699 open_mime(output_mode);
4704 if (lastchar==CR || lastchar == LF){
4705 for (i=0;i<mimeout_state.count;i++) {
4706 (*o_mputc)(mimeout_state.buf[i]);
4709 mimeout_state.count = 0;
4712 for (i=0;i<mimeout_state.count-1;i++) {
4713 (*o_mputc)(mimeout_state.buf[i]);
4716 mimeout_state.buf[0] = SP;
4717 mimeout_state.count = 1;
4719 open_mime(output_mode);
4722 /* mimeout_mode == 'B', 1, 2 */
4723 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4724 if (lastchar == CR || lastchar == LF){
4725 if (nkf_isblank(c)) {
4726 for (i=0;i<mimeout_state.count;i++) {
4727 mimeout_addchar(mimeout_state.buf[i]);
4729 mimeout_state.count = 0;
4730 } else if (SP<c && c<DEL) {
4732 for (i=0;i<mimeout_state.count;i++) {
4733 (*o_mputc)(mimeout_state.buf[i]);
4736 mimeout_state.count = 0;
4738 mimeout_state.buf[mimeout_state.count++] = (char)c;
4741 if (c==SP || c==TAB || c==CR || c==LF) {
4742 for (i=0;i<mimeout_state.count;i++) {
4743 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4745 for (i=0;i<mimeout_state.count;i++) {
4746 (*o_mputc)(mimeout_state.buf[i]);
4749 mimeout_state.count = 0;
4752 mimeout_state.buf[mimeout_state.count++] = (char)c;
4753 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4755 for (i=0;i<mimeout_state.count;i++) {
4756 (*o_mputc)(mimeout_state.buf[i]);
4759 mimeout_state.count = 0;
4763 if (mimeout_state.count>0 && SP<c && c!='=') {
4764 mimeout_state.buf[mimeout_state.count++] = (char)c;
4765 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4766 j = mimeout_state.count;
4767 mimeout_state.count = 0;
4769 mimeout_addchar(mimeout_state.buf[i]);
4776 if (mimeout_state.count>0) {
4777 j = mimeout_state.count;
4778 mimeout_state.count = 0;
4780 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4782 mimeout_addchar(mimeout_state.buf[i]);
4788 (*o_mputc)(mimeout_state.buf[i]);
4790 open_mime(output_mode);
/*
 * base64_conv: output-chain stage for MIME encoding.  Runs the line-length
 * guard mime_prechar() and then forwards (c2, c1) unchanged to o_base64conv.
 */
4796 void base64_conv(nkf_char c2, nkf_char c1)
4798 mime_prechar(c2, c1);
4799 (*o_base64conv)(c2,c1);
/*
 * State for an iconv(3)-based converter: an input and an output byte buffer
 * with their sizes.  The `cd` descriptor and `input_buffer` pointer used by
 * the functions below are declared on lines not present in this listing.
 */
4803 typedef struct nkf_iconv_t {
4806 size_t input_buffer_size;
4807 char *output_buffer;
4808 size_t output_buffer_size;
/*
 * nkf_iconv_new: build a converter from `fromcode` to `tocode`.
 *
 * Allocates an input buffer of IOBUF_SIZE bytes and an output buffer twice
 * that size, then opens the iconv descriptor.  Failures are reported with
 * perror(); what follows each report (exit or return) is on lines not
 * present in this listing.
 *
 * NOTE(review): `converter` is declared as an nkf_iconv_t object but is
 * accessed with `->` throughout -- confirm whether a pointer plus allocation
 * was intended.
 * NOTE(review): perror(fprintf(...)) passes fprintf's int result to perror
 * and calls fprintf without a FILE* argument; this cannot compile as written.
 */
4811 nkf_iconv_t nkf_iconv_new(char *tocode, char *fromcode)
4813 nkf_iconv_t converter;
4815 converter->input_buffer_size = IOBUF_SIZE;
4816 converter->input_buffer = malloc(converter->input_buffer_size);
4817 if (converter->input_buffer == NULL)
4818 perror("can't malloc");
4820 converter->output_buffer_size = IOBUF_SIZE * 2;
4821 converter->output_buffer = malloc(converter->output_buffer_size);
4822 if (converter->output_buffer == NULL)
4823 perror("can't malloc");
4825 converter->cd = iconv_open(tocode, fromcode);
/* iconv_open reports failure by returning (iconv_t)-1 */
4826 if (converter->cd == (iconv_t)-1)
4830 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
4833 perror("can't iconv_open");
/*
 * nkf_iconv_convert: read bytes through i_getc into the converter's input
 * buffer, run iconv() on them, write the converted bytes through o_putc, and
 * handle the iconv error cases.  Visible recovery actions: move unconverted
 * input to the front of the buffer, and double the output buffer.  The
 * errno tests selecting them and the return statement are on lines not
 * present in this listing.
 *
 * NOTE(review): the read loop uses `f`, but the stream parameter is named
 * `input`.
 * NOTE(review): `if (input_length < input_buffer_size) break;` leaves the
 * loop after the first byte; the comparison looks inverted.
 * NOTE(review): realloc is given converter->outbuf, while every other
 * reference uses converter->output_buffer.
 * NOTE(review): the output loop indexes with
 * output_buffer_size - output_length after iconv() has advanced
 * output_buffer and reduced output_length; confirm the intended range.
 */
4838 size_t nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4840 size_t invalid = (size_t)0;
4841 char *input_buffer = converter->input_buffer;
4842 size_t input_length = (size_t)0;
4843 char *output_buffer = converter->output_buffer;
4844 size_t output_length = converter->output_buffer_size;
4849 while ((c = (*i_getc)(f)) != EOF) {
4850 input_buffer[input_length++] = c;
4851 if (input_length < converter->input_buffer_size) break;
4855 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
4856 while (output_length-- > 0) {
4857 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
4859 if (ret == (size_t) - 1) {
/* keep the not-yet-converted tail at the start of the input buffer */
4862 if (input_buffer != converter->input_buffer)
4863 memmove(converter->input_buffer, input_buffer, input_length);
4866 converter->output_buffer_size *= 2;
4867 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
4868 if (output_buffer == NULL) {
4869 perror("can't realloc");
4872 converter->output_buffer = output_buffer;
4875 perror("can't iconv");
/*
 * nkf_iconv_close: release a converter created by nkf_iconv_new().
 *
 * convert: converter whose input and output buffers are freed and whose
 *          iconv descriptor is closed.  The structure itself is not freed
 *          here.
 *
 * The body refers to the parameter by its declared name and to the buffers
 * by the member names used everywhere else (input_buffer / output_buffer).
 */
4887 void nkf_iconv_close(nkf_iconv_t *convert)
4889 free(convert->input_buffer);
4890 free(convert->output_buffer);
4891 iconv_close(convert->cd);
4899 struct input_code *p = input_code_list;
4911 mime_f = MIME_DECODE_DEFAULT;
4912 mime_decode_f = FALSE;
4917 x0201_f = X0201_DEFAULT;
4918 iso2022jp_f = FALSE;
4919 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
4920 ms_ucs_map_f = UCS_MAP_ASCII;
4922 #ifdef UTF8_INPUT_ENABLE
4923 no_cp932ext_f = FALSE;
4924 no_best_fit_chars_f = FALSE;
4925 encode_fallback = NULL;
4926 unicode_subchar = '?';
4927 input_endian = ENDIAN_BIG;
4929 #ifdef UTF8_OUTPUT_ENABLE
4930 output_bom_f = FALSE;
4931 output_endian = ENDIAN_BIG;
4933 #ifdef UNICODE_NORMALIZATION
4949 #ifdef SHIFTJIS_CP932
4959 for (i = 0; i < 256; i++){
4960 prefix_table[i] = 0;
4964 mimeout_state.count = 0;
4969 fold_preserve_f = FALSE;
4972 kanji_intro = DEFAULT_J;
4973 ascii_intro = DEFAULT_R;
4974 fold_margin = FOLD_MARGIN;
4975 o_zconv = no_connection;
4976 o_fconv = no_connection;
4977 o_eol_conv = no_connection;
4978 o_rot_conv = no_connection;
4979 o_hira_conv = no_connection;
4980 o_base64conv = no_connection;
4981 o_iso2022jp_check_conv = no_connection;
4984 i_ungetc = std_ungetc;
4986 i_bungetc = std_ungetc;
4989 i_mungetc = std_ungetc;
4990 i_mgetc_buf = std_getc;
4991 i_mungetc_buf = std_ungetc;
4992 output_mode = ASCII;
4994 mime_decode_mode = FALSE;
5000 init_broken_state();
5001 z_prev2=0,z_prev1=0;
5003 iconv_for_check = 0;
5005 input_codename = NULL;
5006 input_encoding = NULL;
5007 output_encoding = NULL;
/*
 * module_connection: wire up the conversion pipeline from the current option
 * flags.
 *
 * Output side: starting from the encoder chosen for output_encoding, each
 * enabled filter is pushed in front of `oconv`, saving the previous head in
 * that filter's own o_* pointer.  Filters installed later therefore run
 * earlier; the visible order of installation is base64_conv, eol_conv,
 * rot_conv, iso2022jp_check_conv, hira_conv, fold_conv, z_conv.
 * Input side: getc/ungetc layers are stacked the same way on i_getc /
 * i_ungetc (CAP, URL, numeric references, NFC, MIME, "broken" recovery).
 * Finally the input converter is set from input_encoding, or defaults to
 * e_iconv.
 *
 * When no output encoding is configured, nkf_default_encoding() is used; if
 * that also yields nothing, ISO-2022-JP is used only when output is
 * suppressed or guessing is requested.  The conditions guarding several of
 * the installations, and the return statements, are on lines not present in
 * this listing.
 */
5013 int module_connection(void)
5015 if (input_encoding) set_input_encoding(input_encoding);
5016 if (!output_encoding) {
5017 output_encoding = nkf_default_encoding();
5019 if (!output_encoding) {
5020 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5023 set_output_encoding(output_encoding);
5024 oconv = nkf_enc_to_oconv(output_encoding);
5027 /* replace continuation module, from output side */
5029 /* output redirection */
5031 if (noout_f || guess_f){
5038 if (mimeout_f == TRUE) {
5039 o_base64conv = oconv; oconv = base64_conv;
5041 /* base64_count = 0; */
5044 if (eolmode_f || guess_f) {
5045 o_eol_conv = oconv; oconv = eol_conv;
5048 o_rot_conv = oconv; oconv = rot_conv;
5051 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5054 o_hira_conv = oconv; oconv = hira_conv;
5057 o_fconv = oconv; oconv = fold_conv;
5060 if (alpha_f || x0201_f) {
5061 o_zconv = oconv; oconv = z_conv;
5065 i_ungetc = std_ungetc;
5066 /* input redirection */
5069 i_cgetc = i_getc; i_getc = cap_getc;
5070 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5073 i_ugetc = i_getc; i_getc = url_getc;
5074 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5077 #ifdef NUMCHAR_OPTION
5079 i_ngetc = i_getc; i_getc = numchar_getc;
5080 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5083 #ifdef UNICODE_NORMALIZATION
5085 i_nfc_getc = i_getc; i_getc = nfc_getc;
5086 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5089 if (mime_f && mimebuf_f==FIXED_MIME) {
5090 i_mgetc = i_getc; i_getc = mime_getc;
5091 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5094 i_bgetc = i_getc; i_getc = broken_getc;
5095 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5097 if (input_encoding) {
5098 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5100 set_iconv(FALSE, e_iconv);
5104 struct input_code *p = input_code_list;
5113 Conversion main loop. Code detection only.
5116 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert -- connect the modules with module_connection(), then pull
 * characters from f through the i_getc hook until EOF is returned.
 */
5117 nkf_char noconvert(FILE *f)
5122 module_connection();
5123 while ((c = (*i_getc)(f)) != EOF)
/*
 * kanji_convert -- main conversion loop: read the whole of f through the
 * i_getc hook and feed it to the input converter iconv / the output chain
 * oconv.
 *
 * The pipeline is (re)built with module_connection() first; a negative
 * result is reported as "no output encoding given" on stderr (except in
 * the PERL_XS / WIN32DLL builds).
 *
 * Three reading strategies are used, selected by the current iconv:
 *   - w_iconv32: four bytes are read per call to nkf_iconv_utf_32();
 *   - w_iconv16: two bytes are read per call to nkf_iconv_utf_16(); when
 *     that returns -2 two more bytes are read and the call is repeated
 *     with all four (presumably a surrogate pair -- TODO confirm);
 *   - otherwise: one byte at a time, tracking input_mode / shift_mode and
 *     recognising ESC sequences, SI/SO, the MIME "=?" introducer and
 *     line ends.
 *
 * After the loop the converter is flushed with (*iconv)(EOF, 0, 0) and,
 * if no input code name has been recorded yet, the entry of
 * input_code_list with the lowest score is recorded via
 * set_input_codename().
 */
5130 int kanji_convert(FILE *f)
5132 nkf_char c1=0, c2=0, c3=0, c4=0;
5133 int shift_mode = FALSE; /* TRUE or FALSE or JIS_X_0201_1976_K */
5134 int is_8bit = FALSE;
5136 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5141 output_mode = ASCII;
/* loop-control shorthands used by the byte-at-a-time loop below */
5143 #define NEXT continue /* no output, get next */
5144 #define SKIP c2=0;continue /* no output, get next */
5145 #define MORE c2=c1;continue /* need one more byte */
5146 #define SEND ; /* output c1 and c2, get next */
5147 #define LAST break /* end of loop, go closing */
5149 if (module_connection() < 0) {
5150 #if !defined(PERL_XS) && !defined(WIN32DLL)
5151 fprintf(stderr, "no output encoding given\n");
5157 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: consume four bytes per converter call */
5158 if(iconv == w_iconv32){
5159 while ((c1 = (*i_getc)(f)) != EOF &&
5160 (c2 = (*i_getc)(f)) != EOF &&
5161 (c3 = (*i_getc)(f)) != EOF &&
5162 (c4 = (*i_getc)(f)) != EOF) {
5163 nkf_iconv_utf_32(c1, c2, c3, c4);
5165 (*i_ungetc)(EOF, f);
/* UTF-16 input: two bytes per call, two more when the converter asks
   for them by returning -2 */
5167 else if (iconv == w_iconv16) {
5168 while ((c1 = (*i_getc)(f)) != EOF &&
5169 (c2 = (*i_getc)(f)) != EOF) {
5170 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5171 (c3 = (*i_getc)(f)) != EOF &&
5172 (c4 = (*i_getc)(f)) != EOF) {
5173 nkf_iconv_utf_16(c1, c2, c3, c4);
5176 (*i_ungetc)(EOF, f);
/* all other input: byte-at-a-time loop */
5180 while ((c1 = (*i_getc)(f)) != EOF) {
5181 #ifdef INPUT_CODE_FIX
5182 if (!input_encoding)
5188 /* in case of 8th bit is on */
5189 if (!estab_f&&!mime_decode_mode) {
5190 /* in case of not established yet */
5191 /* It is still ambiguous */
5192 if (h_conv(f, c2, c1)==EOF)
5197 /* in case of already established */
5199 /* ignore bogus code */
5206 /* 2nd byte of 7 bit code or SJIS */
5211 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5214 } else if (c1 > DEL) {
5216 if (!estab_f && !iso8859_f) {
5217 /* not established yet */
5219 } else { /* estab_f==TRUE */
5225 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5226 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5228 c2 = JIS_X_0201_1976_K;
5233 /* already established */
5237 } else if (SP < c1 && c1 < DEL) {
5238 /* in case of Roman characters */
5240 /* output 1 shifted byte */
5244 } else if (SP <= c1 && c1 < (0xE0&0x7F)){
5245 /* output 1 shifted byte */
5246 c2 = JIS_X_0201_1976_K;
5249 /* look like bogus code */
5252 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5253 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5254 /* in case of Kanji shifted */
5256 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5257 /* Check MIME code */
5258 if ((c1 = (*i_getc)(f)) == EOF) {
5261 } else if (c1 == '?') {
5262 /* =? is mime conversion start sequence */
5263 if(mime_f == STRICT_MIME) {
5264 /* check in real detail */
5265 if (mime_begin_strict(f) == EOF)
5268 } else if (mime_begin(f) == EOF)
5277 /* normal ASCII code */
5280 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5283 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5286 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5287 if ((c1 = (*i_getc)(f)) == EOF) {
5288 /* (*oconv)(0, ESC); don't send bogus code */
5290 } else if (c1 == '$') {
5291 if ((c1 = (*i_getc)(f)) == EOF) {
5293 (*oconv)(0, ESC); don't send bogus code
5294 (*oconv)(0, '$'); */
5296 } else if (c1 == '@'|| c1 == 'B') {
5297 /* This is kanji introduction */
5298 input_mode = JIS_X_0208;
5300 set_input_codename("ISO-2022-JP");
5302 debug("ISO-2022-JP");
5305 } else if (c1 == '(') {
5306 if ((c1 = (*i_getc)(f)) == EOF) {
5307 /* don't send bogus code
5313 } else if (c1 == '@'|| c1 == 'B') {
5314 /* This is kanji introduction */
5315 input_mode = JIS_X_0208;
5319 } else if (c1 == 'D'){
5320 input_mode = JIS_X_0212;
5323 #endif /* X0212_ENABLE */
5324 } else if (c1 == 'O' || c1 == 'Q'){
5325 input_mode = JIS_X_0213_1;
5328 } else if (c1 == 'P'){
5329 input_mode = JIS_X_0213_2;
5333 /* could be some special code */
5340 } else if (broken_f&0x2) {
5341 /* accept any ESC-(-x as broken code ... */
5342 input_mode = JIS_X_0208;
5351 } else if (c1 == '(') {
5352 if ((c1 = (*i_getc)(f)) == EOF) {
5353 /* don't send bogus code
5355 (*oconv)(0, '('); */
5359 /* This is X0201 kana introduction */
5360 input_mode = JIS_X_0201_1976_K; shift_mode = JIS_X_0201_1976_K;
5362 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5363 /* This is X0208 kanji introduction */
5364 input_mode = ASCII; shift_mode = FALSE;
5366 } else if (broken_f&0x2) {
5367 input_mode = ASCII; shift_mode = FALSE;
5372 /* maintain various input_mode here */
5376 } else if ( c1 == 'N' || c1 == 'n'){
5378 c4 = (*i_getc)(f); /* skip SS2 */
5379 if ( (SP<=c4 && c4 < 0x60) || (0xa0<=c4 && c4 < 0xe0)){
5381 c2 = JIS_X_0201_1976_K;
5394 } else if (c1 == ESC && iconv == s_iconv) {
5395 /* ESC in Shift_JIS */
5396 if ((c1 = (*i_getc)(f)) == EOF) {
5397 /* (*oconv)(0, ESC); don't send bogus code */
5399 } else if (c1 == '$') {
5401 if ((c1 = (*i_getc)(f)) == EOF) {
5403 (*oconv)(0, ESC); don't send bogus code
5404 (*oconv)(0, '$'); */
5407 if (('E' <= c1 && c1 <= 'G') ||
5408 ('O' <= c1 && c1 <= 'Q')) {
5416 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
5417 c3 = nkf_char_unicode_new((jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000);
5418 while ((c1 = (*i_getc)(f)) != EOF) {
5419 if (SP <= c1 && c1 <= 'z') {
5420 (*oconv)(0, c1 + c3);
5421 } else break; /* c1 == SO */
5425 if (c1 == EOF) LAST;
5432 } else if (c1 == LF || c1 == CR) {
5434 input_mode = ASCII; set_iconv(FALSE, 0);
5436 } else if (mime_decode_f && !mime_decode_mode){
5438 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5446 } else { /* if (c1 == CR)*/
5447 if ((c1=(*i_getc)(f))!=EOF) {
5451 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5471 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5474 if ((c3 = (*i_getc)(f)) != EOF) {
5477 if ((c4 = (*i_getc)(f)) != EOF) {
5479 (*iconv)(c2, c1, c3|c4);
5484 /* 3 bytes EUC or UTF-8 */
5485 if ((c3 = (*i_getc)(f)) != EOF) {
5487 (*iconv)(c2, c1, c3);
5495 0x7F <= c2 && c2 <= 0x92 &&
5496 0x21 <= c1 && c1 <= 0x7E) {
5500 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5503 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5507 (*oconv)(PREFIX_EUCG3 | c2, c1);
5509 #endif /* X0212_ENABLE */
5511 (*oconv)(PREFIX_EUCG3 | c2, c1);
5514 (*oconv)(input_mode, c1); /* other special case */
5520 /* goto next_word */
5524 (*iconv)(EOF, 0, 0);
5525 if (!input_codename)
5528 struct input_code *p = input_code_list;
5529 struct input_code *result = p;
5531 if (p->score < result->score) result = p;
5534 set_input_codename(result->name);
5536 debug(result->name);
5544 * int options(unsigned char *cp)
/*
 * options -- parse one option string and update the global conversion
 * settings accordingly.
 *
 * cp: NUL-terminated option text such as "-w16L" or "--ic=UTF-8".
 *     Everything up to and including the first '-' is skipped; the
 *     characters that follow are interpreted one option letter at a
 *     time, so several short options may share one string.
 *
 * A leading "--" introduces a long option, which is looked up in
 * long_option[].  An entry with a non-empty alias is handled by pointing
 * cp at the alias text (cp_back is presumably used to resume the original
 * string afterwards -- TODO confirm); the remaining long options are
 * handled by name below.  Unknown options are reported on stderr except
 * in the PERL_XS / WIN32DLL builds.
 */
5550 int options(unsigned char *cp)
5554 unsigned char *cp_back = NULL;
/* skip to just past the first '-' */
5560 while(*cp && *cp++!='-');
5561 while (*cp || cp_back) {
5569 case '-': /* literal options */
5570 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* match cp against each long option name; a name ending in '=' takes an
   argument, which p points at after the match */
5574 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5575 p = (unsigned char *)long_option[i].name;
5576 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5577 if (*p == cp[j] || cp[j] == SP){
5584 #if !defined(PERL_XS) && !defined(WIN32DLL)
5585 fprintf(stderr, "unknown long option: --%s\n", cp);
5589 while(*cp && *cp != SP && cp++);
5590 if (long_option[i].alias[0]){
5592 cp = (unsigned char *)long_option[i].alias;
5594 if (strcmp(long_option[i].name, "ic=") == 0){
5595 nkf_str_upcase((char *)p, codeset, 32);
5596 enc = nkf_enc_find(codeset);
5598 input_encoding = enc;
5601 if (strcmp(long_option[i].name, "oc=") == 0){
5602 nkf_str_upcase((char *)p, codeset, 32);
5603 enc = nkf_enc_find(codeset);
/* enc is a pointer: test it for NULL instead of ordering it against 0 */
5604 if (!enc) continue;
5605 output_encoding = enc;
5608 if (strcmp(long_option[i].name, "guess=") == 0){
5609 if (p[0] == '0' || p[0] == '1') {
5617 if (strcmp(long_option[i].name, "overwrite") == 0){
5620 preserve_time_f = TRUE;
5623 if (strcmp(long_option[i].name, "overwrite=") == 0){
5626 preserve_time_f = TRUE;
/* NOTE(review): the malloc() result is passed to strcpy() unchecked */
5628 backup_suffix = malloc(strlen((char *) p) + 1);
5629 strcpy(backup_suffix, (char *) p);
5632 if (strcmp(long_option[i].name, "in-place") == 0){
5635 preserve_time_f = FALSE;
5638 if (strcmp(long_option[i].name, "in-place=") == 0){
5641 preserve_time_f = FALSE;
/* NOTE(review): the malloc() result is passed to strcpy() unchecked */
5643 backup_suffix = malloc(strlen((char *) p) + 1);
5644 strcpy(backup_suffix, (char *) p);
5649 if (strcmp(long_option[i].name, "cap-input") == 0){
5653 if (strcmp(long_option[i].name, "url-input") == 0){
5658 #ifdef NUMCHAR_OPTION
5659 if (strcmp(long_option[i].name, "numchar-input") == 0){
5665 if (strcmp(long_option[i].name, "no-output") == 0){
5669 if (strcmp(long_option[i].name, "debug") == 0){
5674 if (strcmp(long_option[i].name, "cp932") == 0){
5675 #ifdef SHIFTJIS_CP932
5679 #ifdef UTF8_OUTPUT_ENABLE
5680 ms_ucs_map_f = UCS_MAP_CP932;
5684 if (strcmp(long_option[i].name, "no-cp932") == 0){
5685 #ifdef SHIFTJIS_CP932
5689 #ifdef UTF8_OUTPUT_ENABLE
5690 ms_ucs_map_f = UCS_MAP_ASCII;
5694 #ifdef SHIFTJIS_CP932
5695 if (strcmp(long_option[i].name, "cp932inv") == 0){
5702 if (strcmp(long_option[i].name, "x0212") == 0){
5709 if (strcmp(long_option[i].name, "exec-in") == 0){
5713 if (strcmp(long_option[i].name, "exec-out") == 0){
5718 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5719 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5720 no_cp932ext_f = TRUE;
5723 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5724 no_best_fit_chars_f = TRUE;
5727 if (strcmp(long_option[i].name, "fb-skip") == 0){
5728 encode_fallback = NULL;
5731 if (strcmp(long_option[i].name, "fb-html") == 0){
5732 encode_fallback = encode_fallback_html;
5735 if (strcmp(long_option[i].name, "fb-xml") == 0){
5736 encode_fallback = encode_fallback_xml;
5739 if (strcmp(long_option[i].name, "fb-java") == 0){
5740 encode_fallback = encode_fallback_java;
5743 if (strcmp(long_option[i].name, "fb-perl") == 0){
5744 encode_fallback = encode_fallback_perl;
5747 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5748 encode_fallback = encode_fallback_subchar;
/* --fb-subchar=N: N is parsed by hand as decimal, 0x-prefixed hexadecimal
   or octal; note that i is reused as the digit index here */
5751 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5752 encode_fallback = encode_fallback_subchar;
5753 unicode_subchar = 0;
5755 /* decimal number */
5756 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5757 unicode_subchar *= 10;
5758 unicode_subchar += hex2bin(p[i]);
5760 }else if(p[1] == 'x' || p[1] == 'X'){
5761 /* hexadecimal number */
5762 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5763 unicode_subchar <<= 4;
5764 unicode_subchar |= hex2bin(p[i]);
5768 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5769 unicode_subchar *= 8;
5770 unicode_subchar += hex2bin(p[i]);
5773 w16e_conv(unicode_subchar, &i, &j);
5774 unicode_subchar = i<<8 | j;
5778 #ifdef UTF8_OUTPUT_ENABLE
5779 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5780 ms_ucs_map_f = UCS_MAP_MS;
5784 #ifdef UNICODE_NORMALIZATION
5785 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=Xabc...: the first graphic character X is stored in
   prefix_table[] for each of the graphic characters that follow it */
5790 if (strcmp(long_option[i].name, "prefix=") == 0){
5791 if (nkf_isgraph(p[0])){
5792 for (i = 1; nkf_isgraph(p[i]); i++){
5793 prefix_table[p[i]] = p[0];
5798 #if !defined(PERL_XS) && !defined(WIN32DLL)
5799 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5804 case 'b': /* buffered mode */
5807 case 'u': /* non buffered mode */
5810 case 't': /* transparent mode */
5815 } else if (*cp=='2') {
5819 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
5827 case 'j': /* JIS output */
5829 output_encoding = nkf_enc_from_index(ISO_2022_JP);
5831 case 'e': /* AT&T EUC output */
5832 output_encoding = nkf_enc_from_index(EUCJP_NKF);
5834 case 's': /* SJIS output */
5835 output_encoding = nkf_enc_from_index(WINDOWS_31J);
5837 case 'l': /* ISO8859 Latin-1 support, no conversion */
5838 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
5839 input_encoding = nkf_enc_from_index(ISO_8859_1);
5841 case 'i': /* Kanji IN ESC-$-@/B */
5842 if (*cp=='@'||*cp=='B')
5843 kanji_intro = *cp++;
5845 case 'o': /* ASCII IN ESC-(-J/B */
5846 if (*cp=='J'||*cp=='B'||*cp=='H')
5847 ascii_intro = *cp++;
5851 bit:1 katakana->hiragana
5852 bit:2 hiragana->katakana
5854 if ('9'>= *cp && *cp>='0')
5855 hira_f |= (*cp++ -'0');
5862 #if defined(MSDOS) || defined(__OS2__)
5869 show_configuration();
5877 #ifdef UTF8_OUTPUT_ENABLE
5878 case 'w': /* UTF-8 output */
5883 output_encoding = nkf_enc_from_index(UTF_8N);
5885 output_bom_f = TRUE;
5886 output_encoding = nkf_enc_from_index(UTF_8_BOM);
5890 if ('1'== cp[0] && '6'==cp[1]) {
5893 } else if ('3'== cp[0] && '2'==cp[1]) {
5897 output_encoding = nkf_enc_from_index(UTF_8);
5902 output_endian = ENDIAN_LITTLE;
5903 } else if (cp[0] == 'B') {
5906 output_encoding = nkf_enc_from_index(enc_idx);
5911 enc_idx = enc_idx == UTF_16
5912 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5913 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5915 output_bom_f = TRUE;
5916 enc_idx = enc_idx == UTF_16
5917 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
5918 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
5920 output_encoding = nkf_enc_from_index(enc_idx);
5924 #ifdef UTF8_INPUT_ENABLE
5925 case 'W': /* UTF input */
5928 input_encoding = nkf_enc_from_index(UTF_8);
5931 if ('1'== cp[0] && '6'==cp[1]) {
5933 input_endian = ENDIAN_BIG;
5935 } else if ('3'== cp[0] && '2'==cp[1]) {
5937 input_endian = ENDIAN_BIG;
5940 input_encoding = nkf_enc_from_index(UTF_8);
5945 input_endian = ENDIAN_LITTLE;
5946 } else if (cp[0] == 'B') {
5948 input_endian = ENDIAN_BIG;
/* choose the concrete input encoding from the byte order parsed just
   above for the input side; consulting output_endian here would ignore an
   explicit -W16L / -W32L */
5950 enc_idx = enc_idx == UTF_16
5951 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5952 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5953 input_encoding = nkf_enc_from_index(enc_idx);
5957 /* Input code assumption */
5958 case 'J': /* ISO-2022-JP input */
5959 input_encoding = nkf_enc_from_index(ISO_2022_JP);
5961 case 'E': /* EUC-JP input */
5962 input_encoding = nkf_enc_from_index(EUCJP_NKF);
5964 case 'S': /* Windows-31J input */
5965 input_encoding = nkf_enc_from_index(WINDOWS_31J);
5967 case 'Z': /* Convert X0208 alphabet to ascii */
5969 bit:0 Convert JIS X 0208 Alphabet to ASCII
5970 bit:1 Convert Kankaku to one space
5971 bit:2 Convert Kankaku to two spaces
5972 bit:3 Convert HTML Entity
5973 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
5975 while ('0'<= *cp && *cp <='9') {
5976 alpha_f |= 1 << (*cp++ - '0');
5978 if (!alpha_f) alpha_f = 1;
5980 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
5981 x0201_f = FALSE; /* No X0201->X0208 conversion */
5983 ESC-(-I in JIS, EUC, MS Kanji
5984 SI/SO in JIS, EUC, MS Kanji
5985 SS2 in EUC, JIS, not in MS Kanji
5986 MS Kanji (0xa0-0xdf)
5988 ESC-(-I in JIS (0x20-0x5f)
5989 SS2 in EUC (0xa0-0xdf)
5990 0xa0-0xd in MS Kanji (0xa0-0xdf)
5993 case 'X': /* Convert X0201 kana to X0208 */
5996 case 'F': /* preserve new lines */
5997 fold_preserve_f = TRUE;
5998 case 'f': /* folding -f60 or -f */
6001 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6003 fold_len += *cp++ - '0';
/* out-of-range widths fall back to the default fold length */
6005 if (!(0<fold_len && fold_len<BUFSIZ))
6006 fold_len = DEFAULT_FOLD;
6010 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6012 fold_margin += *cp++ - '0';
6016 case 'm': /* MIME support */
6017 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6018 if (*cp=='B'||*cp=='Q') {
6019 mime_decode_mode = *cp++;
6020 mimebuf_f = FIXED_MIME;
6021 } else if (*cp=='N') {
6022 mime_f = TRUE; cp++;
6023 } else if (*cp=='S') {
6024 mime_f = STRICT_MIME; cp++;
6025 } else if (*cp=='0') {
6026 mime_decode_f = FALSE;
6027 mime_f = FALSE; cp++;
6029 mime_f = STRICT_MIME;
6032 case 'M': /* MIME output */
6035 mimeout_f = FIXED_MIME; cp++;
6036 } else if (*cp=='Q') {
6038 mimeout_f = FIXED_MIME; cp++;
6043 case 'B': /* Broken JIS support */
6045 bit:1 allow any x on ESC-(-x or ESC-$-x
6046 bit:2 reset to ascii on NL
6048 if ('9'>= *cp && *cp>='0')
6049 broken_f |= 1<<(*cp++ -'0');
6054 case 'O':/* for Output file */
6058 case 'c':/* add cr code */
6061 case 'd':/* delete cr code */
6064 case 'I': /* ISO-2022-JP output */
6067 case 'L': /* line mode */
6068 if (*cp=='u') { /* unix */
6069 eolmode_f = LF; cp++;
6070 } else if (*cp=='m') { /* mac */
6071 eolmode_f = CR; cp++;
6072 } else if (*cp=='w') { /* windows */
6073 eolmode_f = CRLF; cp++;
6074 } else if (*cp=='0') { /* no conversion */
6075 eolmode_f = 0; cp++;
6080 if ('2' <= *cp && *cp <= '9') {
6083 } else if (*cp == '0' || *cp == '1') {
6092 /* multiple options in a string are allowed for Perl module */
6093 while(*cp && *cp++!='-');
6096 #if !defined(PERL_XS) && !defined(WIN32DLL)
6097 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6099 /* bogus option but ignored */
6107 #include "nkf32dll.c"
6108 #elif defined(PERL_XS)
6109 #else /* WIN32DLL */
/*
 * main -- command-line entry point (built only when neither WIN32DLL nor
 * PERL_XS is defined).
 *
 * Leading arguments that start with '-' are consumed as options
 * (presumably handed to options() -- the call is not adjacent to the
 * loop header; TODO confirm).  A pipe()/fork()/execvp() sequence is
 * present for running the remaining arguments as a command.
 *
 * Without file arguments stdin is converted with kanji_convert() and, in
 * guess mode, the detected code is printed.
 *
 * With file arguments each file is opened for reading in turn; a file
 * that cannot be opened sets is_argument_error.  When file_out_f is TRUE
 * stdout is redirected: a temporary file is created next to the original
 * (mkstemp() with a ".nkftmpXXXXXX" suffix, or open() with an "ntXXXXXX"
 * suffix), the old stdout descriptor is kept in fd_backup and the
 * temporary file is dup2()'ed onto stdout; a fixed "nkf.out" target
 * reopened with freopen() also exists.  Afterwards stdout is restored
 * from fd_backup, the original file's mode (and, with preserve_time_f,
 * its timestamp) is copied onto the new file, the original is either
 * renamed to a backup name or unlinked, and the new file is renamed to
 * the original name.
 */
6110 int main(int argc, char **argv)
6115 char *outfname = NULL;
6118 #ifdef EASYWIN /*Easy Win */
6119 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6121 #ifdef DEFAULT_CODE_LOCALE
6122 setlocale(LC_CTYPE, "");
/* walk the leading option arguments (those beginning with '-') */
6124 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6125 cp = (unsigned char *)*argv;
6130 if (pipe(fds) < 0 || (pid = fork()) < 0){
6141 execvp(argv[1], &argv[1]);
/* these flags are saved and written back below, presumably around a
   state reset that would otherwise clear them -- TODO confirm */
6158 int debug_f_back = debug_f;
6161 int exec_f_back = exec_f;
6164 int x0212_f_back = x0212_f;
6166 int x0213_f_back = x0213_f;
6167 int guess_f_back = guess_f;
6169 guess_f = guess_f_back;
6172 debug_f = debug_f_back;
6175 exec_f = exec_f_back;
6177 x0212_f = x0212_f_back;
6178 x0213_f = x0213_f_back;
6181 if (binmode_f == TRUE)
6182 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6183 if (freopen("","wb",stdout) == NULL)
6190 setbuf(stdout, (char *) NULL);
6192 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6195 if (binmode_f == TRUE)
6196 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6197 if (freopen("","rb",stdin) == NULL) return (-1);
6201 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* no file arguments: filter stdin */
6205 kanji_convert(stdin);
6206 if (guess_f) print_guessed_code(NULL);
6210 int is_argument_error = FALSE;
6212 input_codename = NULL;
6215 iconv_for_check = 0;
6217 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6219 is_argument_error = TRUE;
6227 /* reopen file for stdout */
6228 if (file_out_f == TRUE) {
/* room for the original name plus the temporary-file suffix */
6231 outfname = malloc(strlen(origfname)
6232 + strlen(".nkftmpXXXXXX")
6238 strcpy(outfname, origfname);
/* scan backwards for the last path separator ('/' or '\\') */
6242 for (i = strlen(outfname); i; --i){
6243 if (outfname[i - 1] == '/'
6244 || outfname[i - 1] == '\\'){
6250 strcat(outfname, "ntXXXXXX");
6252 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6253 S_IREAD | S_IWRITE);
6255 strcat(outfname, ".nkftmpXXXXXX");
6256 fd = mkstemp(outfname);
/* keep the real stdout in fd_backup, then point stdout at the new file */
6259 || (fd_backup = dup(fileno(stdout))) < 0
6260 || dup2(fd, fileno(stdout)) < 0
6271 outfname = "nkf.out";
6274 if(freopen(outfname, "w", stdout) == NULL) {
6278 if (binmode_f == TRUE) {
6279 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6280 if (freopen("","wb",stdout) == NULL)
6287 if (binmode_f == TRUE)
6288 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6289 if (freopen("","rb",fin) == NULL)
6294 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6298 char *filename = NULL;
/* the file name is only passed to the guess report when several files
   were given */
6300 if (nfiles > 1) filename = origfname;
6301 if (guess_f) print_guessed_code(filename);
6307 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* put the saved descriptor back on stdout */
6315 if (dup2(fd_backup, fileno(stdout)) < 0){
6318 if (stat(origfname, &sb)) {
6319 fprintf(stderr, "Can't stat %s\n", origfname);
6321 /* restore the original permissions */
6322 if (chmod(outfname, sb.st_mode)) {
6323 fprintf(stderr, "Can't set permission %s\n", outfname);
6326 /* restore the original timestamp */
6327 if(preserve_time_f){
6328 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6329 tb[0] = tb[1] = sb.st_mtime;
6330 if (utime(outfname, tb)) {
6331 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6334 tb.actime = sb.st_atime;
6335 tb.modtime = sb.st_mtime;
6336 if (utime(outfname, &tb)) {
6337 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* move the original aside under its backup name (any stale backup is
   unlinked first) */
6342 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6344 unlink(backup_filename);
6346 if (rename(origfname, backup_filename)) {
6347 perror(backup_filename);
6348 fprintf(stderr, "Can't rename %s to %s\n",
6349 origfname, backup_filename);
6353 if (unlink(origfname)){
/* finally put the converted file in place of the original */
6358 if (rename(outfname, origfname)) {
6360 fprintf(stderr, "Can't rename %s to %s\n",
6361 outfname, origfname);
6368 if (is_argument_error)
6371 #ifdef EASYWIN /*Easy Win */
6372 if (file_out_f == FALSE)
6373 scanf("%d",&end_check);
6376 #else /* for Other OS */
6377 if (file_out_f == TRUE)
6379 #endif /*Easy Win */
6382 #endif /* WIN32DLL */