2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.0"
24 #define NKF_RELEASE_DATE "2009-11-17"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2009, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
213 {"ISO-2022-JP", ISO_2022_JP},
214 {"ISO2022JP-CP932", CP50220},
215 {"CP50220", CP50220},
216 {"CP50221", CP50221},
217 {"CSISO2022JP", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_WINDOWS_31J)
266 #define DEFAULT_ENCIDX WINDOWS_31J
267 #elif defined(DEFAULT_CODE_EUC)
268 #define DEFAULT_ENCIDX EUC_JP
269 #elif defined(DEFAULT_CODE_UTF8)
270 #define DEFAULT_ENCIDX UTF_8
/* True when c is an ASCII letter or an ASCII decimal digit.
 * The argument is parenthesized at every use so that compound expressions
 * are classified as a whole; it is still evaluated more than once, so it
 * must not have side effects. */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))
/* ASCII-only character classification helpers.
 * They are hand-rolled because the portability of toupper() is not trusted.
 * Each argument is parenthesized at every use so compound expressions are
 * handled as a whole; arguments are evaluated more than once and therefore
 * must not have side effects. */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* hex2bin(c): numeric value (0-15) of the ASCII hexadecimal digit c, or 0
 *             when c is not a hexadecimal digit. Evaluates c more than once.
 * bin2hex(c): upper-case hexadecimal digit for the low four bits of c.
 * The argument is parenthesized so that the "& 15" mask applies to the whole
 * argument expression and the index always stays inside the digit string. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
		    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
		    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/* True when bits 8-15 of c2 equal SS3. The cast now applies to the whole
 * argument expression rather than only its first operand. */
#define is_eucg3(c2) ((((unsigned short)(c2)) >> 8) == SS3)
/* True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and the double quote (0x22).
 * Evaluates c more than once. */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* True when c2 lies in the inclusive range CP932_TABLE_BEGIN..CP932_TABLE_END. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* True when c lies in the inclusive range SP..0x5F. */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
300 #define HOLD_SIZE 1024
301 #if defined(INT_IS_SHORT)
302 #define IOBUF_SIZE 2048
304 #define IOBUF_SIZE 16384
307 #define DEFAULT_J 'B'
308 #define DEFAULT_R 'B'
315 /* MIME preprocessor */
317 #ifdef EASYWIN /*Easy Win */
318 extern POINT _BufferSize;
327 void (*status_func)(struct input_code *, nkf_char);
328 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
332 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
333 static nkf_encoding *input_encoding = NULL;
334 static nkf_encoding *output_encoding = NULL;
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
/* File-scope option flags with their initial values. */
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* rot14/43 mode */
388 static int hira_f = FALSE; /* hiragana/katakana conversion */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
400 static int nfc_f = FALSE;
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/* Bit layout used to tag nkf_char values.
 *   PREFIX_EUCG3            bits OR-ed in by nkf_char_euc3_new()
 *   CLASS_MASK              top eight bits, holding the value's class
 *   CLASS_UNICODE           class bits set by nkf_char_unicode_new()
 *   VALUE_MASK              low 24 bits, holding the value itself
 *   UNICODE_BMP_MAX / UNICODE_MAX
 *                           upper bounds tested by the *_bmp_p / *_value_p
 *                           predicates
 * Macro arguments are parenthesized so that the masks apply to the whole
 * argument expression. */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* True when the class bits of c equal CLASS_UNICODE. */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the low 24 bits of c do not exceed UNICODE_BMP_MAX. */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the low 24 bits of c do not exceed UNICODE_MAX. */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
428 static int numchar_f = FALSE;
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(const char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
471 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
472 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
477 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
478 static int base64_count = 0;
480 /* X0208 -> ASCII converter */
483 static int f_line = 0; /* chars in line */
484 static int f_prev = 0;
485 static int fold_preserve_f = FALSE; /* preserve new lines */
486 static int fold_f = FALSE;
487 static int fold_len = 0;
490 static unsigned char kanji_intro = DEFAULT_J;
491 static unsigned char ascii_intro = DEFAULT_R;
495 #define FOLD_MARGIN 10
496 #define DEFAULT_FOLD 60
498 static int fold_margin = FOLD_MARGIN;
500 /* process default */
503 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
505 fprintf(stderr,"nkf internal module connection failure.\n");
511 no_connection(nkf_char c2, nkf_char c1)
513 no_connection2(c2,c1,0);
516 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
517 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
524 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
525 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
527 /* static redirections */
529 static void (*o_putc)(nkf_char c) = std_putc;
531 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
532 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
534 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
535 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
537 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
539 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
540 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
542 /* for strict mime */
543 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
544 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
547 static int output_mode = ASCII; /* output kanji mode */
548 static int input_mode = ASCII; /* input kanji mode */
549 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
551 /* X0201 / X0208 conversion tables */
553 /* X0201 kana conversion table */
555 static const unsigned char cv[]= {
556 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
557 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
558 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
559 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
560 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
561 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
562 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
563 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
564 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
565 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
566 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
567 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
568 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
569 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
570 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
571 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
575 /* X0201 kana conversion table for dakuten (voiced sound mark) */
577 static const unsigned char dv[]= {
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
583 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
584 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
585 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
586 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
587 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
589 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
598 static const unsigned char ev[]= {
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
610 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 /* X0208 kigou conversion table */
619 /* 0x8140 - 0x819e */
620 static const unsigned char fv[] = {
622 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
623 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
624 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
626 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
627 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
628 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
630 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
638 static int option_mode = 0;
639 static int file_out_f = FALSE;
641 static int overwrite_f = FALSE;
642 static int preserve_time_f = FALSE;
643 static int backup_f = FALSE;
644 static char *backup_suffix = "";
647 static int eolmode_f = 0; /* CR, LF, CRLF */
648 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
649 static nkf_char prev_cr = 0; /* CR or 0 */
650 #ifdef EASYWIN /*Easy Win */
651 static int end_check;
655 nkf_xmalloc(size_t size)
659 if (size == 0) size = 1;
663 perror("can't malloc");
671 nkf_xrealloc(void *ptr, size_t size)
673 if (size == 0) size = 1;
675 ptr = realloc(ptr, size);
677 perror("can't realloc");
684 #define nkf_xfree(ptr) free(ptr)
687 nkf_str_caseeql(const char *src, const char *target)
690 for (i = 0; src[i] && target[i]; i++) {
691 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
693 if (src[i] || target[i]) return FALSE;
698 nkf_enc_from_index(int idx)
700 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
703 return &nkf_encoding_table[idx];
707 nkf_enc_find_index(const char *name)
710 if (name[0] == 'X' && *(name+1) == '-') name += 2;
711 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
712 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
713 return encoding_name_to_id_table[i].id;
720 nkf_enc_find(const char *name)
723 idx = nkf_enc_find_index(name);
724 if (idx < 0) return 0;
725 return nkf_enc_from_index(idx);
/* Accessors and predicates for a pointer to an nkf_encoding entry.
 * Every replacement list is fully parenthesized so the macros can be used
 * inside any larger expression. The predicate macros evaluate enc more
 * than once, so enc must not have side effects. */
#define nkf_enc_name(enc) ((enc)->name)
#define nkf_enc_to_index(enc) ((enc)->id)
#define nkf_enc_to_base_encoding(enc) ((enc)->base_encoding)
#define nkf_enc_to_iconv(enc) (nkf_enc_to_base_encoding(enc)->iconv)
#define nkf_enc_to_oconv(enc) (nkf_enc_to_base_encoding(enc)->oconv)
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
#define nkf_enc_asciicompat(enc) (\
    (nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII) ||\
    (nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP))
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 entries. */
#define nkf_enc_unicode_p(enc) (\
    (nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8) ||\
    (nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16) ||\
    (nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32))
/* True when the encoding id is CP50220, CP50221 or CP50222. */
#define nkf_enc_cp5022x_p(enc) (\
    (nkf_enc_to_index(enc) == CP50220) ||\
    (nkf_enc_to_index(enc) == CP50221) ||\
    (nkf_enc_to_index(enc) == CP50222))
745 #ifdef DEFAULT_CODE_LOCALE
749 #ifdef HAVE_LANGINFO_H
750 return nl_langinfo(CODESET);
751 #elif defined(__WIN32__)
753 sprintf(buf, "CP%d", GetACP());
755 #elif defined(__OS2__)
756 # if defined(INT_IS_SHORT)
762 ULONG ulCP[1], ulncp;
763 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
764 if (ulCP[0] == 932 || ulCP[0] == 943)
765 strcpy(buf, "Shift_JIS");
767 sprintf(buf, "CP%lu", ulCP[0]);
775 nkf_locale_encoding()
777 nkf_encoding *enc = 0;
778 const char *encname = nkf_locale_charmap();
780 enc = nkf_enc_find(encname);
783 #endif /* DEFAULT_CODE_LOCALE */
788 return &nkf_encoding_table[UTF_8];
792 nkf_default_encoding()
794 nkf_encoding *enc = 0;
795 #ifdef DEFAULT_CODE_LOCALE
796 enc = nkf_locale_encoding();
797 #elif defined(DEFAULT_ENCIDX)
798 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
800 if (!enc) enc = nkf_utf8_encoding();
811 nkf_buf_new(int length)
813 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
814 buf->ptr = nkf_xmalloc(length);
822 nkf_buf_dispose(nkf_buf_t *buf)
829 #define nkf_buf_length(buf) ((buf)->len)
830 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
833 nkf_buf_at(nkf_buf_t *buf, int index)
/* Returns the element stored at position index of buf->ptr.
 * NOTE(review): nkf_buf_push() stores at ptr[len] and then increments len,
 * so the last stored element is ptr[len - 1]; this assertion also accepts
 * index == len (one past the last stored element) and does not reject
 * negative values. Confirm whether `0 <= index && index < buf->len` was
 * intended. */
835 assert(index <= buf->len);
836 return buf->ptr[index];
840 nkf_buf_clear(nkf_buf_t *buf)
846 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
848 if (buf->capa <= buf->len) {
851 buf->ptr[buf->len++] = c;
855 nkf_buf_pop(nkf_buf_t *buf)
857 assert(!nkf_buf_empty_p(buf));
858 return buf->ptr[--buf->len];
861 /* Normalization Form C */
864 #define fprintf dllprintf
870 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
877 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
878 #ifdef UTF8_OUTPUT_ENABLE
879 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
880 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
883 #ifdef UTF8_INPUT_ENABLE
884 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
885 " UTF option is -W[8,[16,32][B,L]]\n"
887 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
891 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
892 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
893 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
896 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
897 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
898 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
899 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
902 " O Output to File (DEFAULT 'nkf.out')\n"
903 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
906 " --ic=<encoding> Specify the input encoding\n"
907 " --oc=<encoding> Specify the output encoding\n"
908 " --hiragana --katakana Hiragana/Katakana Conversion\n"
909 " --katakana-hiragana Converts each other\n"
913 " --{cap, url}-input Convert hex after ':' or '%%'\n"
915 #ifdef NUMCHAR_OPTION
916 " --numchar-input Convert Unicode Character Reference\n"
918 #ifdef UTF8_INPUT_ENABLE
919 " --fb-{skip, html, xml, perl, java, subchar}\n"
920 " Specify unassigned character's replacement\n"
925 " --in-place[=SUF] Overwrite original files\n"
926 " --overwrite[=SUF] Preserve timestamp of original files\n"
928 " -g --guess Guess the input code\n"
929 " -v --version Print the version\n"
930 " --help/-V Print this help / configuration\n"
936 show_configuration(void)
939 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
940 " Compile-time options:\n"
941 " Compiled at: " __DATE__ " " __TIME__ "\n"
944 " Default output encoding: "
945 #ifdef DEFAULT_CODE_LOCALE
946 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
947 #elif defined(DEFAULT_ENCIDX)
948 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
954 " Default output end of line: "
955 #if DEFAULT_NEWLINE == CR
957 #elif DEFAULT_NEWLINE == CRLF
963 " Decode MIME encoded string: "
964 #if MIME_DECODE_DEFAULT
970 " Convert JIS X 0201 Katakana: "
977 " --help, --version output: "
978 #if HELP_OUTPUT_HELP_OUTPUT
989 get_backup_filename(const char *suffix, const char *filename)
991 char *backup_filename;
992 int asterisk_count = 0;
994 int filename_length = strlen(filename);
996 for(i = 0; suffix[i]; i++){
997 if(suffix[i] == '*') asterisk_count++;
1001 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1002 for(i = 0, j = 0; suffix[i];){
1003 if(suffix[i] == '*'){
1004 backup_filename[j] = '\0';
1005 strncat(backup_filename, filename, filename_length);
1007 j += filename_length;
1009 backup_filename[j++] = suffix[i++];
1012 backup_filename[j] = '\0';
1014 j = filename_length + strlen(suffix);
1015 backup_filename = nkf_xmalloc(j + 1);
1016 strcpy(backup_filename, filename);
1017 strcat(backup_filename, suffix);
1018 backup_filename[j] = '\0';
1020 return backup_filename;
1024 #ifdef UTF8_INPUT_ENABLE
1026 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1033 (*f)(0, bin2hex(c>>shift));
1044 encode_fallback_html(nkf_char c)
1049 if(c >= NKF_INT32_C(1000000))
1050 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1051 if(c >= NKF_INT32_C(100000))
1052 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1054 (*oconv)(0, 0x30+(c/10000 )%10);
1056 (*oconv)(0, 0x30+(c/1000 )%10);
1058 (*oconv)(0, 0x30+(c/100 )%10);
1060 (*oconv)(0, 0x30+(c/10 )%10);
1062 (*oconv)(0, 0x30+ c %10);
1068 encode_fallback_xml(nkf_char c)
1073 nkf_each_char_to_hex(oconv, c);
1079 encode_fallback_java(nkf_char c)
1083 if(!nkf_char_unicode_bmp_p(c)){
1087 (*oconv)(0, bin2hex(c>>20));
1088 (*oconv)(0, bin2hex(c>>16));
1092 (*oconv)(0, bin2hex(c>>12));
1093 (*oconv)(0, bin2hex(c>> 8));
1094 (*oconv)(0, bin2hex(c>> 4));
1095 (*oconv)(0, bin2hex(c ));
1100 encode_fallback_perl(nkf_char c)
1105 nkf_each_char_to_hex(oconv, c);
1111 encode_fallback_subchar(nkf_char c)
1113 c = unicode_subchar;
1114 (*oconv)((c>>8)&0xFF, c&0xFF);
1119 static const struct {
1143 {"katakana-hiragana","h3"},
1151 #ifdef UTF8_OUTPUT_ENABLE
1161 {"fb-subchar=", ""},
1163 #ifdef UTF8_INPUT_ENABLE
1164 {"utf8-input", "W"},
1165 {"utf16-input", "W16"},
1166 {"no-cp932ext", ""},
1167 {"no-best-fit-chars",""},
1169 #ifdef UNICODE_NORMALIZATION
1170 {"utf8mac-input", ""},
1182 #ifdef NUMCHAR_OPTION
1183 {"numchar-input", ""},
1189 #ifdef SHIFTJIS_CP932
1200 set_input_encoding(nkf_encoding *enc)
1202 switch (nkf_enc_to_index(enc)) {
1210 #ifdef SHIFTJIS_CP932
1213 #ifdef UTF8_OUTPUT_ENABLE
1214 ms_ucs_map_f = UCS_MAP_CP932;
1224 case ISO_2022_JP_2004:
1232 #ifdef SHIFTJIS_CP932
1235 #ifdef UTF8_OUTPUT_ENABLE
1236 ms_ucs_map_f = UCS_MAP_CP932;
1241 #ifdef SHIFTJIS_CP932
1244 #ifdef UTF8_OUTPUT_ENABLE
1245 ms_ucs_map_f = UCS_MAP_CP10001;
1254 #ifdef SHIFTJIS_CP932
1257 #ifdef UTF8_OUTPUT_ENABLE
1258 ms_ucs_map_f = UCS_MAP_CP932;
1262 #ifdef SHIFTJIS_CP932
1265 #ifdef UTF8_OUTPUT_ENABLE
1266 ms_ucs_map_f = UCS_MAP_MS;
1270 #ifdef SHIFTJIS_CP932
1273 #ifdef UTF8_OUTPUT_ENABLE
1274 ms_ucs_map_f = UCS_MAP_ASCII;
1277 case SHIFT_JISX0213:
1278 case SHIFT_JIS_2004:
1280 #ifdef SHIFTJIS_CP932
1287 #ifdef SHIFTJIS_CP932
1291 #ifdef UTF8_INPUT_ENABLE
1292 #ifdef UNICODE_NORMALIZATION
1300 input_endian = ENDIAN_BIG;
1304 input_endian = ENDIAN_LITTLE;
1309 input_endian = ENDIAN_BIG;
1313 input_endian = ENDIAN_LITTLE;
1320 set_output_encoding(nkf_encoding *enc)
1322 switch (nkf_enc_to_index(enc)) {
1325 #ifdef SHIFTJIS_CP932
1326 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1328 #ifdef UTF8_OUTPUT_ENABLE
1329 ms_ucs_map_f = UCS_MAP_CP932;
1334 #ifdef SHIFTJIS_CP932
1335 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1337 #ifdef UTF8_OUTPUT_ENABLE
1338 ms_ucs_map_f = UCS_MAP_CP932;
1342 #ifdef SHIFTJIS_CP932
1343 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1348 #ifdef SHIFTJIS_CP932
1349 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1355 #ifdef SHIFTJIS_CP932
1356 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1363 #ifdef UTF8_OUTPUT_ENABLE
1364 ms_ucs_map_f = UCS_MAP_CP932;
1368 #ifdef UTF8_OUTPUT_ENABLE
1369 ms_ucs_map_f = UCS_MAP_CP10001;
1374 #ifdef SHIFTJIS_CP932
1375 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1377 #ifdef UTF8_OUTPUT_ENABLE
1378 ms_ucs_map_f = UCS_MAP_ASCII;
1383 #ifdef SHIFTJIS_CP932
1384 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1386 #ifdef UTF8_OUTPUT_ENABLE
1387 ms_ucs_map_f = UCS_MAP_ASCII;
1392 #ifdef SHIFTJIS_CP932
1393 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1395 #ifdef UTF8_OUTPUT_ENABLE
1396 ms_ucs_map_f = UCS_MAP_CP932;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_MS;
1407 #ifdef UTF8_OUTPUT_ENABLE
1408 ms_ucs_map_f = UCS_MAP_ASCII;
1411 case SHIFT_JISX0213:
1412 case SHIFT_JIS_2004:
1414 #ifdef SHIFTJIS_CP932
1415 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1422 #ifdef SHIFTJIS_CP932
1423 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1426 #ifdef UTF8_OUTPUT_ENABLE
1428 output_bom_f = TRUE;
1432 output_bom_f = TRUE;
1435 output_endian = ENDIAN_LITTLE;
1436 output_bom_f = FALSE;
1439 output_endian = ENDIAN_LITTLE;
1440 output_bom_f = TRUE;
1444 output_bom_f = TRUE;
1447 output_endian = ENDIAN_LITTLE;
1448 output_bom_f = FALSE;
1451 output_endian = ENDIAN_LITTLE;
1452 output_bom_f = TRUE;
1458 static struct input_code*
1459 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1462 struct input_code *p = input_code_list;
1464 if (iconv_func == p->iconv_func){
1474 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1476 #ifdef INPUT_CODE_FIX
1477 if (f || !input_encoding)
1484 #ifdef INPUT_CODE_FIX
1485 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1491 if (estab_f && iconv_for_check != iconv){
1492 struct input_code *p = find_inputcode_byfunc(iconv);
1494 set_input_codename(p->name);
1497 iconv_for_check = iconv;
1504 x0212_shift(nkf_char c)
1509 if (0x75 <= c && c <= 0x7f){
1510 ret = c + (0x109 - 0x75);
1513 if (0x75 <= c && c <= 0x7f){
1514 ret = c + (0x113 - 0x75);
1522 x0212_unshift(nkf_char c)
1525 if (0x7f <= c && c <= 0x88){
1526 ret = c + (0x75 - 0x7f);
1527 }else if (0x89 <= c && c <= 0x92){
1528 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1532 #endif /* X0212_ENABLE */
1535 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1541 if((0x21 <= ndx && ndx <= 0x2F)){
1542 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1543 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1545 }else if(0x6E <= ndx && ndx <= 0x7E){
1546 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1547 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1553 else if(nkf_isgraph(ndx)){
1555 const unsigned short *ptr;
1556 ptr = x0212_shiftjis[ndx - 0x21];
1558 val = ptr[(c1 & 0x7f) - 0x21];
1567 c2 = x0212_shift(c2);
1569 #endif /* X0212_ENABLE */
1571 if(0x7F < c2) return 1;
1572 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1573 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1578 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1580 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1583 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1584 if (0xFC < c1) return 1;
1585 #ifdef SHIFTJIS_CP932
1586 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1587 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1594 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1595 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1601 #endif /* SHIFTJIS_CP932 */
1603 if (!x0213_f && is_ibmext_in_sjis(c2)){
1604 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1607 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1620 if(x0213_f && c2 >= 0xF0){
1621 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1622 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1623 }else{ /* 78<=k<=94 */
1624 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1625 if (0x9E < c1) c2++;
1628 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1629 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1630 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1631 if (0x9E < c1) c2++;
1634 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1641 c2 = x0212_unshift(c2);
1648 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the code point val as UTF-8.  Each output pointer receives one byte
 * value: *p1 is the lead byte, *p2..*p4 are continuation bytes (10xxxxxx).
 * The branches below cover the two-, three- and four-byte forms.
 */
1650 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
/* val < 0x800: two bytes, 110xxxxx 10xxxxxx */
1658 }else if (val < 0x800){
1659 *p1 = 0xc0 | (val >> 6);
1660 *p2 = 0x80 | (val & 0x3f);
/* BMP value: three bytes, 1110xxxx 10xxxxxx 10xxxxxx */
1663 } else if (nkf_char_unicode_bmp_p(val)) {
1664 *p1 = 0xe0 | (val >> 12);
1665 *p2 = 0x80 | ((val >> 6) & 0x3f);
1666 *p3 = 0x80 | ( val & 0x3f);
/* any other value accepted by nkf_char_unicode_value_p(): four bytes, 11110xxx + 3 continuation bytes */
1668 } else if (nkf_char_unicode_value_p(val)) {
1669 *p1 = 0xf0 | (val >> 18);
1670 *p2 = 0x80 | ((val >> 12) & 0x3f);
1671 *p3 = 0x80 | ((val >> 6) & 0x3f);
1672 *p4 = 0x80 | ( val & 0x3f);
/*
 * Decode one UTF-8 sequence into a code point.  c1 is the lead byte and
 * c2..c4 are the following bytes; the branch is chosen by the lead byte and
 * the payload bits of each byte are assembled into wc.
 */
1682 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/*
 * 0x80-0xBF are continuation bytes and 0xC0/0xC1 can only start overlong
 * forms.  0xC2 and 0xC3 are valid two-byte lead bytes (U+0080..U+00FF), so
 * the cut-off is 0xC1 and they fall through to the two-byte branch.
 */
1689 else if (c1 <= 0xC1) {
1690 /* trail byte or invalid */
/* two-byte form: 5 payload bits in the lead byte */
1693 else if (c1 <= 0xDF) {
1695 wc = (c1 & 0x1F) << 6;
/* three-byte form: 4 payload bits in the lead byte */
1698 else if (c1 <= 0xEF) {
1700 wc = (c1 & 0x0F) << 12;
1701 wc |= (c2 & 0x3F) << 6;
/*
 * four-byte form: classify on the lead byte like every other branch.  A
 * second byte never exceeds 0xBF, so testing c2 here accepted any lead byte
 * from 0xF0 up to 0xFF.
 */
1704 else if (c1 <= 0xF4) {
1706 wc = (c1 & 0x0F) << 18;
1707 wc |= (c2 & 0x3F) << 12;
1708 wc |= (c3 & 0x3F) << 6;
1718 #ifdef UTF8_INPUT_ENABLE
1720 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1721 const unsigned short *const *pp, nkf_char psize,
1722 nkf_char *p2, nkf_char *p1)
1725 const unsigned short *p;
1728 if (pp == 0) return 1;
1731 if (c1 < 0 || psize <= c1) return 1;
1733 if (p == 0) return 1;
1736 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1738 if (val == 0) return 1;
1739 if (no_cp932ext_f && (
1740 (val>>8) == 0x2D || /* NEC special characters */
1741 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1749 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1757 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1759 const unsigned short *const *pp;
1760 const unsigned short *const *const *ppp;
1761 static const char no_best_fit_chars_table_C2[] =
1762 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1763 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1764 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1765 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1766 static const char no_best_fit_chars_table_C2_ms[] =
1767 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1768 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1769 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1770 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1771 static const char no_best_fit_chars_table_932_C2[] =
1772 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1773 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1774 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1775 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1776 static const char no_best_fit_chars_table_932_C3[] =
1777 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1778 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1779 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1780 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1786 }else if(c2 < 0xe0){
1787 if(no_best_fit_chars_f){
1788 if(ms_ucs_map_f == UCS_MAP_CP932){
1791 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1794 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1797 }else if(!cp932inv_f){
1800 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1803 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1806 }else if(ms_ucs_map_f == UCS_MAP_MS){
1807 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1808 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1826 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1827 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1828 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1830 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1831 }else if(c0 < 0xF0){
1832 if(no_best_fit_chars_f){
1833 if(ms_ucs_map_f == UCS_MAP_CP932){
1834 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1835 }else if(ms_ucs_map_f == UCS_MAP_MS){
1840 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1843 if(c0 == 0x92) return 1;
1848 if(c1 == 0x80 || c0 == 0x9C) return 1;
1851 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1856 if(c0 == 0x94) return 1;
1859 if(c0 == 0xBB) return 1;
1869 if(c0 == 0x95) return 1;
1872 if(c0 == 0xA5) return 1;
1879 if(c0 == 0x8D) return 1;
1882 if(c0 == 0x9E && !cp932inv_f) return 1;
1885 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1893 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1894 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1895 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1897 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1899 #ifdef SHIFTJIS_CP932
1900 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1902 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1903 s2e_conv(s2, s1, p2, p1);
1912 #ifdef UTF8_OUTPUT_ENABLE
1914 e2w_conv(nkf_char c2, nkf_char c1)
1916 const unsigned short *p;
1918 if (c2 == JIS_X_0201_1976_K) {
1919 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1927 p = euc_to_utf8_1byte;
1929 } else if (is_eucg3(c2)){
1930 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1933 c2 = (c2&0x7f) - 0x21;
1934 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1935 p = x0212_to_utf8_2bytes[c2];
1941 c2 = (c2&0x7f) - 0x21;
1942 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1944 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1945 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1946 euc_to_utf8_2bytes_ms[c2];
1951 c1 = (c1 & 0x7f) - 0x21;
1952 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1959 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1966 }else if (0xc0 <= c2 && c2 <= 0xef) {
1967 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1968 #ifdef NUMCHAR_OPTION
1971 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1979 #ifdef UTF8_INPUT_ENABLE
1981 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1983 nkf_char c1, c2, c3, c4;
1990 else if (nkf_char_unicode_bmp_p(val)){
1991 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1992 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1995 *p1 = nkf_char_unicode_new(val);
2001 *p1 = nkf_char_unicode_new(val);
2008 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2010 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
2011 if (iso2022jp_f && !x0201_f) {
2012 c2 = GETA1; c1 = GETA2;
2014 c2 = JIS_X_0201_1976_K;
2018 }else if (c2 == 0x8f){
2022 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2023 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2024 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2027 c2 = (c2 << 8) | (c1 & 0x7f);
2029 #ifdef SHIFTJIS_CP932
2032 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2033 s2e_conv(s2, s1, &c2, &c1);
2040 #endif /* SHIFTJIS_CP932 */
2042 #endif /* X0212_ENABLE */
2043 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2046 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2047 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2048 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2053 #ifdef SHIFTJIS_CP932
2054 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2056 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2057 s2e_conv(s2, s1, &c2, &c1);
2064 #endif /* SHIFTJIS_CP932 */
2072 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2074 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2075 if (iso2022jp_f && !x0201_f) {
2076 c2 = GETA1; c1 = GETA2;
2080 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2082 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2084 if(c1 == 0x7F) return 0;
2085 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2088 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2089 if (ret) return ret;
2096 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2098 nkf_char ret = 0, c4 = 0;
2099 static const char w_iconv_utf8_1st_byte[] =
2101 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2102 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2103 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2104 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2111 if (c1 < 0 || 0xff < c1) {
2112 }else if (c1 == 0) { /* 0 : 1 byte*/
2114 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2117 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2119 if (c2 < 0x80 || 0xBF < c2) return 0;
2122 if (c3 == 0) return -1;
2123 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2128 if (c3 == 0) return -1;
2129 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2133 if (c3 == 0) return -1;
2134 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2138 if (c3 == 0) return -2;
2139 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2143 if (c3 == 0) return -2;
2144 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2148 if (c3 == 0) return -2;
2149 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2157 if (c1 == 0 || c1 == EOF){
2158 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2159 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2162 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2170 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * Convert one Unicode code point wc (as produced by the UTF-16/UTF-32
 * readers) into the internal (c2, c1) pair.  Surrogate code points and values
 * above U+10FFFF yield NKF_ICONV_INVALID_CODE_RANGE.
 */
2172 unicode_iconv(nkf_char wc)
/* (wc >> 11) == 27 selects 0xD800..0xDFFF, the surrogate range */
2180 }else if ((wc>>11) == 27) {
2181 /* unpaired surrogate */
2182 return NKF_ICONV_INVALID_CODE_RANGE;
/* values below 0xFFFF go through the table-driven converter; U+FFFF itself takes the next branch */
2183 }else if (wc < 0xFFFF) {
2184 ret = w16e_conv(wc, &c2, &c1);
2185 if (ret) return ret;
/* U+10FFFF is the last valid code point, so the bound is inclusive */
2186 }else if (wc <= 0x10FFFF) {
2188 c1 = nkf_char_unicode_new(wc);
2190 return NKF_ICONV_INVALID_CODE_RANGE;
/* Negative status codes used by the input converters to ask for more input bytes. */
2196 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2197 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/*
 * Combine a UTF-16 surrogate pair into a code point.
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, i.e. the two surrogate
 * bases folded together with the 0x10000 supplementary-plane offset.
 */
2198 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * Assemble a code point from UTF-16 input bytes c1..c4 according to
 * input_endian and hand it to unicode_iconv.  When the first unit is a lead
 * surrogate (high byte 0xD8..0xDB) the second unit must be a trail surrogate
 * (high byte 0xDC..0xDF); otherwise NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 */
2200 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* big endian: c1 and c3 are the high bytes of the two 16-bit units */
2209 if (input_endian == ENDIAN_BIG) {
2210 if (0xD8 <= c1 && c1 <= 0xDB) {
2211 if (0xDC <= c3 && c3 <= 0xDF) {
2212 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2213 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* other byte order: c2 and c4 are the high bytes */
2218 if (0xD8 <= c2 && c2 <= 0xDB) {
2219 if (0xDC <= c4 && c4 <= 0xDF) {
2220 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2221 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2227 return (*unicode_iconv)(wc);
2231 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2237 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2243 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2252 switch(input_endian){
2254 wc = c2 << 16 | c3 << 8 | c4;
2257 wc = c3 << 16 | c2 << 8 | c1;
2260 wc = c1 << 16 | c4 << 8 | c3;
2263 wc = c4 << 16 | c1 << 8 | c2;
2266 return NKF_ICONV_INVALID_CODE_RANGE;
2269 return (*unicode_iconv)(wc);
2273 #define output_ascii_escape_sequence(mode) do { \
2274 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2277 (*o_putc)(ascii_intro); \
2278 output_mode = mode; \
2283 output_escape_sequence(int mode)
2285 if (output_mode == mode)
2293 case JIS_X_0201_1976_K:
2301 (*o_putc)(kanji_intro);
2326 j_oconv(nkf_char c2, nkf_char c1)
2328 #ifdef NUMCHAR_OPTION
2329 if (c2 == 0 && nkf_char_unicode_p(c1)){
2330 w16e_conv(c1, &c2, &c1);
2331 if (c2 == 0 && nkf_char_unicode_p(c1)){
2332 c2 = c1 & VALUE_MASK;
2333 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2336 c2 = 0x7F + c1 / 94;
2337 c1 = 0x21 + c1 % 94;
2339 if (encode_fallback) (*encode_fallback)(c1);
2346 output_ascii_escape_sequence(ASCII);
2349 else if (c2 == EOF) {
2350 output_ascii_escape_sequence(ASCII);
2353 else if (c2 == ISO_8859_1) {
2354 output_ascii_escape_sequence(ISO_8859_1);
2357 else if (c2 == JIS_X_0201_1976_K) {
2358 output_escape_sequence(JIS_X_0201_1976_K);
2361 } else if (is_eucg3(c2)){
2362 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2363 (*o_putc)(c2 & 0x7f);
2368 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2369 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2370 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2377 e_oconv(nkf_char c2, nkf_char c1)
2379 if (c2 == 0 && nkf_char_unicode_p(c1)){
2380 w16e_conv(c1, &c2, &c1);
2381 if (c2 == 0 && nkf_char_unicode_p(c1)){
2382 c2 = c1 & VALUE_MASK;
2383 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2387 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2388 c1 = 0x21 + c1 % 94;
2391 (*o_putc)((c2 & 0x7f) | 0x080);
2392 (*o_putc)(c1 | 0x080);
2394 (*o_putc)((c2 & 0x7f) | 0x080);
2395 (*o_putc)(c1 | 0x080);
2399 if (encode_fallback) (*encode_fallback)(c1);
2407 } else if (c2 == 0) {
2408 output_mode = ASCII;
2410 } else if (c2 == JIS_X_0201_1976_K) {
2411 output_mode = EUC_JP;
2412 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2413 } else if (c2 == ISO_8859_1) {
2414 output_mode = ISO_8859_1;
2415 (*o_putc)(c1 | 0x080);
2417 } else if (is_eucg3(c2)){
2418 output_mode = EUC_JP;
2419 #ifdef SHIFTJIS_CP932
2422 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2423 s2e_conv(s2, s1, &c2, &c1);
2428 output_mode = ASCII;
2430 }else if (is_eucg3(c2)){
2433 (*o_putc)((c2 & 0x7f) | 0x080);
2434 (*o_putc)(c1 | 0x080);
2437 (*o_putc)((c2 & 0x7f) | 0x080);
2438 (*o_putc)(c1 | 0x080);
2442 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2443 set_iconv(FALSE, 0);
2444 return; /* too late to rescue this char */
2446 output_mode = EUC_JP;
2447 (*o_putc)(c2 | 0x080);
2448 (*o_putc)(c1 | 0x080);
2453 s_oconv(nkf_char c2, nkf_char c1)
2455 #ifdef NUMCHAR_OPTION
2456 if (c2 == 0 && nkf_char_unicode_p(c1)){
2457 w16e_conv(c1, &c2, &c1);
2458 if (c2 == 0 && nkf_char_unicode_p(c1)){
2459 c2 = c1 & VALUE_MASK;
2460 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2463 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2465 c1 += 0x40 + (c1 > 0x3e);
2470 if(encode_fallback)(*encode_fallback)(c1);
2479 } else if (c2 == 0) {
2480 output_mode = ASCII;
2482 } else if (c2 == JIS_X_0201_1976_K) {
2483 output_mode = SHIFT_JIS;
2485 } else if (c2 == ISO_8859_1) {
2486 output_mode = ISO_8859_1;
2487 (*o_putc)(c1 | 0x080);
2489 } else if (is_eucg3(c2)){
2490 output_mode = SHIFT_JIS;
2491 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2497 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2498 set_iconv(FALSE, 0);
2499 return; /* too late to rescue this char */
2501 output_mode = SHIFT_JIS;
2502 e2s_conv(c2, c1, &c2, &c1);
2504 #ifdef SHIFTJIS_CP932
2506 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2507 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2513 #endif /* SHIFTJIS_CP932 */
2516 if (prefix_table[(unsigned char)c1]){
2517 (*o_putc)(prefix_table[(unsigned char)c1]);
2523 #ifdef UTF8_OUTPUT_ENABLE
2525 w_oconv(nkf_char c2, nkf_char c1)
2531 output_bom_f = FALSE;
2542 if (c2 == 0 && nkf_char_unicode_p(c1)){
2543 val = c1 & VALUE_MASK;
2544 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2546 if (c2) (*o_putc)(c2);
2547 if (c3) (*o_putc)(c3);
2548 if (c4) (*o_putc)(c4);
2555 val = e2w_conv(c2, c1);
2557 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2559 if (c2) (*o_putc)(c2);
2560 if (c3) (*o_putc)(c3);
2561 if (c4) (*o_putc)(c4);
2567 w_oconv16(nkf_char c2, nkf_char c1)
2570 output_bom_f = FALSE;
2571 if (output_endian == ENDIAN_LITTLE){
2585 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2586 if (nkf_char_unicode_bmp_p(c1)) {
2587 c2 = (c1 >> 8) & 0xff;
2591 if (c1 <= UNICODE_MAX) {
2592 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2593 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2594 if (output_endian == ENDIAN_LITTLE){
2595 (*o_putc)(c2 & 0xff);
2596 (*o_putc)((c2 >> 8) & 0xff);
2597 (*o_putc)(c1 & 0xff);
2598 (*o_putc)((c1 >> 8) & 0xff);
2600 (*o_putc)((c2 >> 8) & 0xff);
2601 (*o_putc)(c2 & 0xff);
2602 (*o_putc)((c1 >> 8) & 0xff);
2603 (*o_putc)(c1 & 0xff);
2609 nkf_char val = e2w_conv(c2, c1);
2610 c2 = (val >> 8) & 0xff;
2615 if (output_endian == ENDIAN_LITTLE){
2625 w_oconv32(nkf_char c2, nkf_char c1)
2628 output_bom_f = FALSE;
2629 if (output_endian == ENDIAN_LITTLE){
2647 if (c2 == ISO_8859_1) {
2649 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2652 c1 = e2w_conv(c2, c1);
2655 if (output_endian == ENDIAN_LITTLE){
2656 (*o_putc)( c1 & 0xFF);
2657 (*o_putc)((c1 >> 8) & 0xFF);
2658 (*o_putc)((c1 >> 16) & 0xFF);
2662 (*o_putc)((c1 >> 16) & 0xFF);
2663 (*o_putc)((c1 >> 8) & 0xFF);
2664 (*o_putc)( c1 & 0xFF);
2669 #define SCORE_L2 (1) /* Kanji Level 2 */
2670 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2671 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2672 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2673 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2674 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2675 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2676 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
2678 #define SCORE_INIT (SCORE_iMIME)
2680 static const nkf_char score_table_A0[] = {
2683 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2684 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2687 static const nkf_char score_table_F0[] = {
2688 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2689 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2690 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2691 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2695 set_code_score(struct input_code *ptr, nkf_char score)
2698 ptr->score |= score;
2703 clr_code_score(struct input_code *ptr, nkf_char score)
2706 ptr->score &= ~score;
2711 code_score(struct input_code *ptr)
2713 nkf_char c2 = ptr->buf[0];
2714 #ifdef UTF8_OUTPUT_ENABLE
2715 nkf_char c1 = ptr->buf[1];
2718 set_code_score(ptr, SCORE_ERROR);
2719 }else if (c2 == SS2){
2720 set_code_score(ptr, SCORE_KANA);
2721 }else if (c2 == 0x8f){
2722 set_code_score(ptr, SCORE_X0212);
2723 #ifdef UTF8_OUTPUT_ENABLE
2724 }else if (!e2w_conv(c2, c1)){
2725 set_code_score(ptr, SCORE_NO_EXIST);
2727 }else if ((c2 & 0x70) == 0x20){
2728 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2729 }else if ((c2 & 0x70) == 0x70){
2730 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2731 }else if ((c2 & 0x70) >= 0x50){
2732 set_code_score(ptr, SCORE_L2);
2737 status_disable(struct input_code *ptr)
2742 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2746 status_push_ch(struct input_code *ptr, nkf_char c)
2748 ptr->buf[ptr->index++] = c;
2752 status_clear(struct input_code *ptr)
2759 status_reset(struct input_code *ptr)
2762 ptr->score = SCORE_INIT;
2766 status_reinit(struct input_code *ptr)
2769 ptr->_file_stat = 0;
2773 status_check(struct input_code *ptr, nkf_char c)
2775 if (c <= DEL && estab_f){
2781 s_status(struct input_code *ptr, nkf_char c)
2785 status_check(ptr, c);
2790 }else if (nkf_char_unicode_p(c)){
2792 }else if (0xa1 <= c && c <= 0xdf){
2793 status_push_ch(ptr, SS2);
2794 status_push_ch(ptr, c);
2797 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2799 status_push_ch(ptr, c);
2800 }else if (0xed <= c && c <= 0xee){
2802 status_push_ch(ptr, c);
2803 #ifdef SHIFTJIS_CP932
2804 }else if (is_ibmext_in_sjis(c)){
2806 status_push_ch(ptr, c);
2807 #endif /* SHIFTJIS_CP932 */
2809 }else if (0xf0 <= c && c <= 0xfc){
2811 status_push_ch(ptr, c);
2812 #endif /* X0212_ENABLE */
2814 status_disable(ptr);
2818 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2819 status_push_ch(ptr, c);
2820 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2824 status_disable(ptr);
2828 #ifdef SHIFTJIS_CP932
2829 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2830 status_push_ch(ptr, c);
2831 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2832 set_code_score(ptr, SCORE_CP932);
2837 #endif /* SHIFTJIS_CP932 */
2838 status_disable(ptr);
2841 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2842 status_push_ch(ptr, c);
2843 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2844 set_code_score(ptr, SCORE_CP932);
2847 status_disable(ptr);
2854 e_status(struct input_code *ptr, nkf_char c)
2858 status_check(ptr, c);
2863 }else if (nkf_char_unicode_p(c)){
2865 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2867 status_push_ch(ptr, c);
2869 }else if (0x8f == c){
2871 status_push_ch(ptr, c);
2872 #endif /* X0212_ENABLE */
2874 status_disable(ptr);
2878 if (0xa1 <= c && c <= 0xfe){
2879 status_push_ch(ptr, c);
2883 status_disable(ptr);
2888 if (0xa1 <= c && c <= 0xfe){
2890 status_push_ch(ptr, c);
2892 status_disable(ptr);
2894 #endif /* X0212_ENABLE */
2898 #ifdef UTF8_INPUT_ENABLE
2900 w_status(struct input_code *ptr, nkf_char c)
2904 status_check(ptr, c);
2909 }else if (nkf_char_unicode_p(c)){
2911 }else if (0xc0 <= c && c <= 0xdf){
2913 status_push_ch(ptr, c);
2914 }else if (0xe0 <= c && c <= 0xef){
2916 status_push_ch(ptr, c);
2917 }else if (0xf0 <= c && c <= 0xf4){
2919 status_push_ch(ptr, c);
2921 status_disable(ptr);
2926 if (0x80 <= c && c <= 0xbf){
2927 status_push_ch(ptr, c);
2928 if (ptr->index > ptr->stat){
2929 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2930 && ptr->buf[2] == 0xbf);
2931 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2932 &ptr->buf[0], &ptr->buf[1]);
2939 status_disable(ptr);
2943 if (0x80 <= c && c <= 0xbf){
2944 if (ptr->index < ptr->stat){
2945 status_push_ch(ptr, c);
2950 status_disable(ptr);
2958 code_status(nkf_char c)
2960 int action_flag = 1;
2961 struct input_code *result = 0;
2962 struct input_code *p = input_code_list;
2964 if (!p->status_func) {
2968 if (!p->status_func)
2970 (p->status_func)(p, c);
2973 }else if(p->stat == 0){
2984 if (result && !estab_f){
2985 set_iconv(TRUE, result->iconv_func);
2986 }else if (c <= DEL){
2987 struct input_code *ptr = input_code_list;
2997 nkf_buf_t *std_gc_buf;
2998 nkf_char broken_state;
2999 nkf_buf_t *broken_buf;
3000 nkf_char mimeout_state;
3004 static nkf_state_t *nkf_state = NULL;
3006 #define STD_GC_BUFSIZE (256)
3009 nkf_state_init(void)
3012 nkf_buf_clear(nkf_state->std_gc_buf);
3013 nkf_buf_clear(nkf_state->broken_buf);
3014 nkf_buf_clear(nkf_state->nfc_buf);
3017 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3018 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3019 nkf_state->broken_buf = nkf_buf_new(3);
3020 nkf_state->nfc_buf = nkf_buf_new(9);
3022 nkf_state->broken_state = 0;
3023 nkf_state->mimeout_state = 0;
3030 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3031 return nkf_buf_pop(nkf_state->std_gc_buf);
3038 std_ungetc(nkf_char c, FILE *f)
3040 nkf_buf_push(nkf_state->std_gc_buf, c);
3046 std_putc(nkf_char c)
3053 static unsigned char hold_buf[HOLD_SIZE*2];
3054 static int hold_count = 0;
3056 push_hold_buf(nkf_char c2)
3058 if (hold_count >= HOLD_SIZE*2)
3060 hold_buf[hold_count++] = (unsigned char)c2;
3061 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3065 h_conv(FILE *f, int c1, int c2)
3071 /** it must NOT be in the kanji shift sequence */
3072 /** it must NOT be written in JIS7 */
3073 /** and it must be after 2 byte 8bit code */
3079 while ((c2 = (*i_getc)(f)) != EOF) {
3085 if (push_hold_buf(c2) == EOF || estab_f) {
3091 struct input_code *p = input_code_list;
3092 struct input_code *result = p;
3097 if (p->status_func && p->score < result->score) {
3102 set_iconv(TRUE, result->iconv_func);
3107 ** 1) EOF is detected, or
3108 ** 2) Code is established, or
3109 ** 3) Buffer is FULL (but last word is pushed)
3111 ** in 1) and 3) cases, we continue to use
3112 ** Kanji codes by oconv and leave estab_f unchanged.
3117 while (hold_index < hold_count){
3118 c1 = hold_buf[hold_index++];
3122 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3123 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3126 if (hold_index < hold_count){
3127 c2 = hold_buf[hold_index++];
3137 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3140 if (hold_index < hold_count){
3141 c3 = hold_buf[hold_index++];
3142 } else if ((c3 = (*i_getc)(f)) == EOF) {
3147 if (hold_index < hold_count){
3148 c4 = hold_buf[hold_index++];
3149 } else if ((c4 = (*i_getc)(f)) == EOF) {
3154 (*iconv)(c1, c2, (c3<<8)|c4);
3157 /* 3 bytes EUC or UTF-8 */
3158 if (hold_index < hold_count){
3159 c3 = hold_buf[hold_index++];
3160 } else if ((c3 = (*i_getc)(f)) == EOF) {
3166 (*iconv)(c1, c2, c3);
3169 if (c3 == EOF) break;
3175 * Check and Ignore BOM
3181 switch(c2 = (*i_getc)(f)){
3183 if((c2 = (*i_getc)(f)) == 0x00){
3184 if((c2 = (*i_getc)(f)) == 0xFE){
3185 if((c2 = (*i_getc)(f)) == 0xFF){
3186 if(!input_encoding){
3187 set_iconv(TRUE, w_iconv32);
3189 if (iconv == w_iconv32) {
3190 input_endian = ENDIAN_BIG;
3193 (*i_ungetc)(0xFF,f);
3194 }else (*i_ungetc)(c2,f);
3195 (*i_ungetc)(0xFE,f);
3196 }else if(c2 == 0xFF){
3197 if((c2 = (*i_getc)(f)) == 0xFE){
3198 if(!input_encoding){
3199 set_iconv(TRUE, w_iconv32);
3201 if (iconv == w_iconv32) {
3202 input_endian = ENDIAN_2143;
3205 (*i_ungetc)(0xFF,f);
3206 }else (*i_ungetc)(c2,f);
3207 (*i_ungetc)(0xFF,f);
3208 }else (*i_ungetc)(c2,f);
3209 (*i_ungetc)(0x00,f);
3210 }else (*i_ungetc)(c2,f);
3211 (*i_ungetc)(0x00,f);
3214 if((c2 = (*i_getc)(f)) == 0xBB){
3215 if((c2 = (*i_getc)(f)) == 0xBF){
3216 if(!input_encoding){
3217 set_iconv(TRUE, w_iconv);
3219 if (iconv == w_iconv) {
3222 (*i_ungetc)(0xBF,f);
3223 }else (*i_ungetc)(c2,f);
3224 (*i_ungetc)(0xBB,f);
3225 }else (*i_ungetc)(c2,f);
3226 (*i_ungetc)(0xEF,f);
3229 if((c2 = (*i_getc)(f)) == 0xFF){
3230 if((c2 = (*i_getc)(f)) == 0x00){
3231 if((c2 = (*i_getc)(f)) == 0x00){
3232 if(!input_encoding){
3233 set_iconv(TRUE, w_iconv32);
3235 if (iconv == w_iconv32) {
3236 input_endian = ENDIAN_3412;
3239 (*i_ungetc)(0x00,f);
3240 }else (*i_ungetc)(c2,f);
3241 (*i_ungetc)(0x00,f);
3242 }else (*i_ungetc)(c2,f);
3243 if(!input_encoding){
3244 set_iconv(TRUE, w_iconv16);
3246 if (iconv == w_iconv16) {
3247 input_endian = ENDIAN_BIG;
3250 (*i_ungetc)(0xFF,f);
3251 }else (*i_ungetc)(c2,f);
3252 (*i_ungetc)(0xFE,f);
3255 if((c2 = (*i_getc)(f)) == 0xFE){
3256 if((c2 = (*i_getc)(f)) == 0x00){
3257 if((c2 = (*i_getc)(f)) == 0x00){
3258 if(!input_encoding){
3259 set_iconv(TRUE, w_iconv32);
3261 if (iconv == w_iconv32) {
3262 input_endian = ENDIAN_LITTLE;
3265 (*i_ungetc)(0x00,f);
3266 }else (*i_ungetc)(c2,f);
3267 (*i_ungetc)(0x00,f);
3268 }else (*i_ungetc)(c2,f);
3269 if(!input_encoding){
3270 set_iconv(TRUE, w_iconv16);
3272 if (iconv == w_iconv16) {
3273 input_endian = ENDIAN_LITTLE;
3276 (*i_ungetc)(0xFE,f);
3277 }else (*i_ungetc)(c2,f);
3278 (*i_ungetc)(0xFF,f);
3287 broken_getc(FILE *f)
3291 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3292 return nkf_buf_pop(nkf_state->broken_buf);
3295 if (c=='$' && nkf_state->broken_state != ESC
3296 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3298 nkf_state->broken_state = 0;
3299 if (c1=='@'|| c1=='B') {
3300 nkf_buf_push(nkf_state->broken_buf, c1);
3301 nkf_buf_push(nkf_state->broken_buf, c);
3307 } else if (c=='(' && nkf_state->broken_state != ESC
3308 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3310 nkf_state->broken_state = 0;
3311 if (c1=='J'|| c1=='B') {
3312 nkf_buf_push(nkf_state->broken_buf, c1);
3313 nkf_buf_push(nkf_state->broken_buf, c);
3320 nkf_state->broken_state = c;
3326 broken_ungetc(nkf_char c, FILE *f)
3328 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3329 nkf_buf_push(nkf_state->broken_buf, c);
3334 eol_conv(nkf_char c2, nkf_char c1)
3336 if (guess_f && input_eol != EOF) {
3337 if (c2 == 0 && c1 == LF) {
3338 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3339 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3340 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3342 else if (!input_eol) input_eol = CR;
3343 else if (input_eol != CR) input_eol = EOF;
3345 if (prev_cr || (c2 == 0 && c1 == LF)) {
3347 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3348 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3350 if (c2 == 0 && c1 == CR) prev_cr = CR;
3351 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3355 Return value of fold_conv()
3357 LF add newline and output char
3358 CR add newline and output nothing
3361 1 (or else) normal output
3363 fold state in prev (previous character)
3365 >0x80 Japanese (X0208/X0201)
3370 This fold algorithm does not preserve leading spaces in a line.
3371 This is the main difference from fmt.
/*
 * Width that the character (c2, c1) adds to the fold line counter:
 * 2 when c2 is non-zero, otherwise 1.  c1 is accepted to match the (c2, c1)
 * calling convention but is not evaluated.  The argument is parenthesized so
 * that an expression such as `a || b` is tested as a whole.
 */
#define char_size(c2,c1) ((c2) ? 2 : 1)
/*
 * Line-folding stage.  Each (c2, c1) pair is classified into fold_state,
 * which the switch at the end of the function acts on.  f_line is the running
 * width of the current line, compared against fold_len and
 * fold_len+fold_margin; f_prev remembers the previous character, with bit
 * 0x80 set when that character was Japanese.
 */
3377 fold_conv(nkf_char c2, nkf_char c1)
3380 nkf_char fold_state;
3382 if (c1== CR && !fold_preserve_f) {
3383 fold_state=0; /* ignore cr */
3384 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3386 fold_state=0; /* ignore cr */
3387 } else if (c1== BS) {
/* a backspace takes one column back, never below zero */
3388 if (f_line>0) f_line--;
3390 } else if (c2==EOF && f_line != 0) { /* close open last line */
/* newline handling: LF when not preserving, CR or a lone LF when preserving */
3392 } else if ((c1==LF && !fold_preserve_f)
3393 || ((c1==CR||(c1==LF&&f_prev!=CR))
3394 && fold_preserve_f)) {
3396 if (fold_preserve_f) {
3400 } else if ((f_prev == c1 && !fold_preserve_f)
3401 || (f_prev == LF && fold_preserve_f)
3402 ) { /* duplicate newline */
3405 fold_state = LF; /* output two newline */
3411 if (f_prev&0x80) { /* Japanese? */
3413 fold_state = 0; /* ignore given single newline */
3414 } else if (f_prev==SP) {
3418 if (++f_line<=fold_len)
3422 fold_state = CR; /* fold and output nothing */
3426 } else if (c1=='\f') {
3429 fold_state = LF; /* output newline and clear */
3430 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3431 /* X0208 kankaku (ideographic space, 0x2121) or ascii space */
3433 fold_state = 0; /* remove duplicate spaces */
3436 if (++f_line<=fold_len)
3437 fold_state = SP; /* output ASCII space only */
3439 f_prev = SP; f_line = 0;
3440 fold_state = CR; /* fold and output nothing */
3444 prev0 = f_prev; /* we still need this one... , but almost done */
/* NOTE(review): the second operand is redundant -- JIS_X_0201_1976_K is non-zero, so "c2" alone already covers it */
3446 if (c2 || c2 == JIS_X_0201_1976_K)
3447 f_prev |= 0x80; /* this is Japanese */
3448 f_line += char_size(c2,c1);
3449 if (f_line<=fold_len) { /* normal case */
3452 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3453 f_line = char_size(c2,c1);
3454 fold_state = LF; /* We can't wait, do fold now */
3455 } else if (c2 == JIS_X_0201_1976_K) {
3456 /* simple kinsoku rules return 1 means no folding */
/* c1 is compared in 7-bit form; the 8-bit JIS X 0201 code is named in each note */
3457 if (c1==(0xde&0x7f)) fold_state = 1; /* 0xDE: halfwidth dakuten (voiced sound mark) */
3458 else if (c1==(0xdf&0x7f)) fold_state = 1; /* 0xDF: halfwidth handakuten (semi-voiced sound mark) */
3459 else if (c1==(0xa4&0x7f)) fold_state = 1; /* 0xA4: halfwidth ideographic comma; NOTE(review): the earlier ISO-2022-JP note named the ideographic full stop (0x2123) -- confirm intent */
3460 else if (c1==(0xa3&0x7f)) fold_state = 1; /* 0xA3: halfwidth closing corner bracket; NOTE(review): the earlier note named the fullwidth comma (0x2124) -- confirm intent */
3461 else if (c1==(0xa1&0x7f)) fold_state = 1; /* 0xA1: halfwidth ideographic full stop; NOTE(review): the earlier note named the closing corner bracket (0x2157) -- confirm intent */
3462 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - (0xB0: halfwidth prolonged sound mark) */
3463 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3465 fold_state = LF;/* add one new f_line before this character */
3468 fold_state = LF;/* add one new f_line before this character */
3471 /* kinsoku point in ASCII */
3472 if ( c1==')'|| /* { [ ( */
3483 /* just after special */
3484 } else if (!is_alnum(prev0)) {
3485 f_line = char_size(c2,c1);
3487 } else if ((prev0==SP) || /* ignored new f_line */
3488 (prev0==LF)|| /* ignored new f_line */
3489 (prev0&0x80)) { /* X0208 - ASCII */
3490 f_line = char_size(c2,c1);
3491 fold_state = LF;/* add one new f_line before this character */
3493 fold_state = 1; /* default no fold in ASCII */
/*
 * Kinsoku characters that must not start a line.  Each note names the
 * JIS X 0208 character 0x21xx whose second byte is the compared c1.
 * NOTE(review): presumably this chain is reached only for c2 == 0x21
 * (JIS X 0208 row 1) -- confirm against the enclosing condition.
 */
3497 if (c1=='"') fold_state = 1; /* 0x2122: ideographic comma */
3498 else if (c1=='#') fold_state = 1; /* 0x2123: ideographic full stop */
3499 else if (c1=='W') fold_state = 1; /* 0x2157: closing corner bracket */
3500 else if (c1=='K') fold_state = 1; /* 0x214B: fullwidth right parenthesis */
3501 else if (c1=='$') fold_state = 1; /* 0x2124: fullwidth comma */
3502 else if (c1=='%') fold_state = 1; /* 0x2125: fullwidth full stop */
3503 else if (c1=='\'') fold_state = 1; /* 0x2127: fullwidth colon; NOTE(review): the earlier ISO-2022-JP note decoded to 0x215C (fullwidth plus sign) -- confirm intent */
3504 else if (c1=='(') fold_state = 1; /* 0x2128: fullwidth semicolon */
3505 else if (c1==')') fold_state = 1; /* 0x2129: fullwidth question mark */
3506 else if (c1=='*') fold_state = 1; /* 0x212A: fullwidth exclamation mark */
3507 else if (c1=='+') fold_state = 1; /* 0x212B: dakuten (voiced sound mark) */
3508 else if (c1==',') fold_state = 1; /* 0x212C: handakuten (semi-voiced sound mark) */
3509 /* default no fold in kinsoku */
3512 f_line = char_size(c2,c1);
3513 /* add one new f_line before this character */
3516 f_line = char_size(c2,c1);
3518 /* add one new f_line before this character */
3523 /* terminator process */
3524 switch(fold_state) {
3526 OCONV_NEWLINE((*o_fconv));
3532 OCONV_NEWLINE((*o_fconv));
3543 static nkf_char z_prev2=0,z_prev1=0;
3546 z_conv(nkf_char c2, nkf_char c1)
3549 /* if (c2) c1 &= 0x7f; assertion */
3551 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3557 if (z_prev2 == JIS_X_0201_1976_K) {
3558 if (c2 == JIS_X_0201_1976_K) {
3559 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
3561 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3563 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
3565 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3570 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3572 if (c2 == JIS_X_0201_1976_K) {
3573 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3574 /* wait for dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
3579 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3590 if (alpha_f&1 && c2 == 0x23) {
3591 /* JISX0208 Alphabet */
3593 } else if (c2 == 0x21) {
3594 /* JISX0208 Kigou */
3599 } else if (alpha_f&4) {
3604 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3610 if (alpha_f&8 && c2 == 0) {
3612 const char *entity = 0;
3614 case '>': entity = ">"; break;
3615 case '<': entity = "<"; break;
3616 case '\"': entity = """; break;
3617 case '&': entity = "&"; break;
3620 while (*entity) (*o_zconv)(0, *entity++);
3626 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3631 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3635 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3639 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3643 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3647 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3651 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3655 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3659 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3664 (*o_zconv)(JIS_X_0201_1976_K, c);
3667 } else if (c2 == 0x25) {
3668 /* JISX0208 Katakana */
3669 static const int fullwidth_to_halfwidth[] =
3671 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3672 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3673 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3674 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3675 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3676 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3677 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3678 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3679 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3680 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3681 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3682 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3684 if (fullwidth_to_halfwidth[c1-0x20]){
3685 c2 = fullwidth_to_halfwidth[c1-0x20];
3686 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3688 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3698 #define rot13(c) ( \
3700 (c <= 'M') ? (c + 13): \
3701 (c <= 'Z') ? (c - 13): \
3703 (c <= 'm') ? (c + 13): \
3704 (c <= 'z') ? (c - 13): \
3708 #define rot47(c) ( \
3710 ( c <= 'O') ? (c + 47) : \
3711 ( c <= '~') ? (c - 47) : \
3716 rot_conv(nkf_char c2, nkf_char c1)
3718 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3724 (*o_rot_conv)(c2,c1);
3728 hira_conv(nkf_char c2, nkf_char c1)
3732 if (0x20 < c1 && c1 < 0x74) {
3734 (*o_hira_conv)(c2,c1);
3736 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3738 c1 = nkf_char_unicode_new(0x3094);
3739 (*o_hira_conv)(c2,c1);
3742 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3744 (*o_hira_conv)(c2,c1);
3749 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3752 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3754 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3758 (*o_hira_conv)(c2,c1);
3763 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3765 #define RANGE_NUM_MAX 18
3766 static const nkf_char range[RANGE_NUM_MAX][2] = {
3787 nkf_char start, end, c;
3789 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3793 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3798 for (i = 0; i < RANGE_NUM_MAX; i++) {
3799 start = range[i][0];
3802 if (c >= start && c <= end) {
3807 (*o_iso2022jp_check_conv)(c2,c1);
3811 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3813 static const unsigned char *mime_pattern[] = {
3814 (const unsigned char *)"\075?EUC-JP?B?",
3815 (const unsigned char *)"\075?SHIFT_JIS?B?",
3816 (const unsigned char *)"\075?ISO-8859-1?Q?",
3817 (const unsigned char *)"\075?ISO-8859-1?B?",
3818 (const unsigned char *)"\075?ISO-2022-JP?B?",
3819 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3820 #if defined(UTF8_INPUT_ENABLE)
3821 (const unsigned char *)"\075?UTF-8?B?",
3822 (const unsigned char *)"\075?UTF-8?Q?",
3824 (const unsigned char *)"\075?US-ASCII?Q?",
3829 /* Marker used to raise the priority of the matching encoding */
3830 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3831 e_iconv, s_iconv, 0, 0, 0, 0,
3832 #if defined(UTF8_INPUT_ENABLE)
3838 static const nkf_char mime_encode[] = {
3839 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3840 #if defined(UTF8_INPUT_ENABLE)
3847 static const nkf_char mime_encode_method[] = {
3848 'B', 'B','Q', 'B', 'B', 'Q',
3849 #if defined(UTF8_INPUT_ENABLE)
3857 /* MIME preprocessor fifo */
/* Size of the MIME input FIFO buffer; a power of two so that an index can be wrapped with a bit mask. */
3859 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
/* Bit mask that reduces any index to the range 0..MIME_BUF_SIZE-1. */
3860 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
/* Slot n of mime_input_state.buf, with n wrapped by MIME_BUF_MASK; expands to an lvalue, so it can be assigned to. */
3861 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3863 unsigned char buf[MIME_BUF_SIZE];
3865 unsigned int last; /* decoded */
3866 unsigned int input; /* undecoded */
3868 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Upper bound on the characters examined while matching a MIME header; also the size of the recovery buffer. */
3870 #define MAXRECOVER 20
3873 mime_input_buf_unshift(nkf_char c)
3875 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3879 mime_ungetc(nkf_char c, FILE *f)
3881 mime_input_buf_unshift(c);
3886 mime_ungetc_buf(nkf_char c, FILE *f)
3889 (*i_mungetc_buf)(c,f);
3891 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3896 mime_getc_buf(FILE *f)
3898 /* we don't keep eof of mime_input_buf, because it contains ?= as
3899 a terminator. It was checked in mime_integrity. */
3900 return ((mimebuf_f)?
3901 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3905 switch_mime_getc(void)
3907 if (i_getc!=mime_getc) {
3908 i_mgetc = i_getc; i_getc = mime_getc;
3909 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3910 if(mime_f==STRICT_MIME) {
3911 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3912 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3918 unswitch_mime_getc(void)
3920 if(mime_f==STRICT_MIME) {
3921 i_mgetc = i_mgetc_buf;
3922 i_mungetc = i_mungetc_buf;
3925 i_ungetc = i_mungetc;
3926 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3927 mime_iconv_back = NULL;
3931 mime_integrity(FILE *f, const unsigned char *p)
3935 /* In buffered mode, read until =? or NL or buffer full
3937 mime_input_state.input = mime_input_state.top;
3938 mime_input_state.last = mime_input_state.top;
3940 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3942 q = mime_input_state.input;
3943 while((c=(*i_getc)(f))!=EOF) {
3944 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3945 break; /* buffer full */
3947 if (c=='=' && d=='?') {
3948 /* checked. skip header, start decode */
3949 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3950 /* mime_last_input = mime_input_state.input; */
3951 mime_input_state.input = q;
3955 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3957 /* Should we check length mod 4? */
3958 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3961 /* In case of Incomplete MIME, no MIME decode */
3962 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3963 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3964 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3965 switch_mime_getc(); /* anyway we need buffered getc */
3970 mime_begin_strict(FILE *f)
3974 const unsigned char *p,*q;
3975 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3977 mime_decode_mode = FALSE;
3978 /* =? has been checked */
3980 p = mime_pattern[j];
3983 for(i=2;p[i]>SP;i++) { /* start at =? */
3984 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3985 /* pattern fails, try next one */
3987 while (mime_pattern[++j]) {
3988 p = mime_pattern[j];
3989 for(k=2;k<i;k++) /* assume length(p) > i */
3990 if (p[k]!=q[k]) break;
3991 if (k==i && nkf_toupper(c1)==p[k]) break;
3993 p = mime_pattern[j];
3994 if (p) continue; /* found next one, continue */
3995 /* all fails, output from recovery buffer */
4003 mime_decode_mode = p[i-2];
4005 mime_iconv_back = iconv;
4006 set_iconv(FALSE, mime_priority_func[j]);
4007 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4009 if (mime_decode_mode=='B') {
4010 mimebuf_f = unbuf_f;
4012 /* do MIME integrity check */
4013 return mime_integrity(f,mime_pattern[j]);
4027 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4028 /* re-read and convert again from mime_buffer. */
4030 /* =? has been checked */
4031 k = mime_input_state.last;
4032 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4033 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4034 /* We accept any character type even if it is broken by new lines */
4035 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4036 if (c1==LF||c1==SP||c1==CR||
4037 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4039 /* Failed. But this could be another MIME preamble */
4041 mime_input_state.last--;
4047 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4048 if (!(++i<MAXRECOVER) || c1==EOF) break;
4049 if (c1=='b'||c1=='B') {
4050 mime_decode_mode = 'B';
4051 } else if (c1=='q'||c1=='Q') {
4052 mime_decode_mode = 'Q';
4056 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4057 if (!(++i<MAXRECOVER) || c1==EOF) break;
4059 mime_decode_mode = FALSE;
4065 if (!mime_decode_mode) {
4066 /* false MIME preamble, restart from mime_buffer */
4067 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4068 /* Since we are in MIME mode until buffer becomes empty, */
4069 /* we never go into mime_begin again for a while. */
4072 /* discard mime preamble, and goto MIME mode */
4073 mime_input_state.last = k;
4074 /* do no MIME integrity check */
4075 return c1; /* used only for checking EOF */
4086 debug(const char *str)
4089 fprintf(stderr, "%s\n", str ? str : "NULL");
4095 set_input_codename(const char *codename)
4097 if (!input_codename) {
4098 input_codename = codename;
4099 } else if (strcmp(codename, input_codename) != 0) {
4100 input_codename = "";
4105 get_guessed_code(void)
4107 if (input_codename && !*input_codename) {
4108 input_codename = "BINARY";
4110 struct input_code *p = find_inputcode_byfunc(iconv);
4111 if (!input_codename) {
4112 input_codename = "ASCII";
4113 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4114 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4115 input_codename = "CP932";
4116 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4117 if (p->score & (SCORE_X0212))
4118 input_codename = "EUCJP-MS";
4119 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4120 input_codename = "CP51932";
4121 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4122 if (p->score & (SCORE_KANA))
4123 input_codename = "CP50221";
4124 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4125 input_codename = "CP50220";
4128 return input_codename;
4131 #if !defined(PERL_XS) && !defined(WIN32DLL)
4133 print_guessed_code(char *filename)
4135 if (filename != NULL) printf("%s: ", filename);
4136 if (input_codename && !*input_codename) {
4139 input_codename = get_guessed_code();
4141 printf("%s\n", input_codename);
4145 input_eol == CR ? " (CR)" :
4146 input_eol == LF ? " (LF)" :
4147 input_eol == CRLF ? " (CRLF)" :
4148 input_eol == EOF ? " (MIXED NL)" :
4158 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4160 nkf_char c1, c2, c3;
4166 if (!nkf_isxdigit(c2)){
4171 if (!nkf_isxdigit(c3)){
4176 return (hex2bin(c2) << 4) | hex2bin(c3);
4182 return hex_getc(':', f, i_cgetc, i_cungetc);
4186 cap_ungetc(nkf_char c, FILE *f)
4188 return (*i_cungetc)(c, f);
4194 return hex_getc('%', f, i_ugetc, i_uungetc);
4198 url_ungetc(nkf_char c, FILE *f)
4200 return (*i_uungetc)(c, f);
4204 #ifdef NUMCHAR_OPTION
4206 numchar_getc(FILE *f)
4208 nkf_char (*g)(FILE *) = i_ngetc;
4209 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4220 if (buf[i] == 'x' || buf[i] == 'X'){
4221 for (j = 0; j < 7; j++){
4223 if (!nkf_isxdigit(buf[i])){
4230 c |= hex2bin(buf[i]);
4233 for (j = 0; j < 8; j++){
4237 if (!nkf_isdigit(buf[i])){
4244 c += hex2bin(buf[i]);
4250 return nkf_char_unicode_new(c);
4260 numchar_ungetc(nkf_char c, FILE *f)
4262 return (*i_nungetc)(c, f);
4266 #ifdef UNICODE_NORMALIZATION
4271 nkf_char (*g)(FILE *f) = i_nfc_getc;
4272 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4273 nkf_buf_t *buf = nkf_state->nfc_buf;
4274 const unsigned char *array;
4275 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4276 nkf_char c = (*g)(f);
4278 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4280 nkf_buf_push(buf, c);
4282 while (lower <= upper) {
4283 int mid = (lower+upper) / 2;
4285 array = normalization_table[mid].nfd;
4286 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4287 if (len >= nkf_buf_length(buf)) {
4291 lower = 1, upper = 0;
4294 nkf_buf_push(buf, c);
4296 if (array[len] != nkf_buf_at(buf, len)) {
4297 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4298 else upper = mid - 1;
4305 array = normalization_table[mid].nfc;
4307 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4308 nkf_buf_push(buf, array[i]);
4312 } while (lower <= upper);
4314 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4315 c = nkf_buf_pop(buf);
4321 nfc_ungetc(nkf_char c, FILE *f)
4323 return (*i_nfc_ungetc)(c, f);
4325 #endif /* UNICODE_NORMALIZATION */
4329 base64decode(nkf_char c)
4334 i = c - 'A'; /* A..Z 0-25 */
4335 } else if (c == '_') {
4336 i = '?' /* 63 */ ; /* _ 63 */
4338 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4340 } else if (c > '/') {
4341 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4342 } else if (c == '+' || c == '-') {
4343 i = '>' /* 62 */ ; /* + and - 62 */
4345 i = '?' /* 63 */ ; /* / 63 */
4353 nkf_char c1, c2, c3, c4, cc;
4354 nkf_char t1, t2, t3, t4, mode, exit_mode;
4355 nkf_char lwsp_count;
4358 nkf_char lwsp_size = 128;
4360 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4361 return mime_input_buf(mime_input_state.top++);
4363 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4364 mime_decode_mode=FALSE;
4365 unswitch_mime_getc();
4366 return (*i_getc)(f);
4369 if (mimebuf_f == FIXED_MIME)
4370 exit_mode = mime_decode_mode;
4373 if (mime_decode_mode == 'Q') {
4374 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4376 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4377 if (c1<=SP || DEL<=c1) {
4378 mime_decode_mode = exit_mode; /* prepare for quit */
4381 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4385 mime_decode_mode = exit_mode; /* prepare for quit */
4386 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4387 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4388 /* end Q encoding */
4389 input_mode = exit_mode;
4391 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4392 while ((c1=(*i_getc)(f))!=EOF) {
4397 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4405 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4406 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4421 lwsp_buf[lwsp_count] = (unsigned char)c1;
4422 if (lwsp_count++>lwsp_size){
4424 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4425 lwsp_buf = lwsp_buf_new;
4431 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4433 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4434 i_ungetc(lwsp_buf[lwsp_count],f);
4437 nkf_xfree(lwsp_buf);
4440 if (c1=='='&&c2<SP) { /* this is soft wrap */
4441 while((c1 = (*i_mgetc)(f)) <=SP) {
4442 if (c1 == EOF) return (EOF);
4444 mime_decode_mode = 'Q'; /* still in MIME */
4445 goto restart_mime_q;
4448 mime_decode_mode = 'Q'; /* still in MIME */
4452 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4453 if (c2<=SP) return c2;
4454 mime_decode_mode = 'Q'; /* still in MIME */
4455 return ((hex2bin(c2)<<4) + hex2bin(c3));
4458 if (mime_decode_mode != 'B') {
4459 mime_decode_mode = FALSE;
4460 return (*i_mgetc)(f);
4464 /* Base64 encoding */
4466 MIME allows line breaks in the middle of
4467 Base64, but we are very pessimistic in decoding
4468 in unbuf mode because MIME encoded code may be broken by
4469 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4470 mode. Ignore incomplete MIME.
4472 mode = mime_decode_mode;
4473 mime_decode_mode = exit_mode; /* prepare for quit */
4475 while ((c1 = (*i_mgetc)(f))<=SP) {
4480 if ((c2 = (*i_mgetc)(f))<=SP) {
4483 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4484 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4487 if ((c1 == '?') && (c2 == '=')) {
4490 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4491 while ((c1=(*i_getc)(f))!=EOF) {
4496 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4504 if ((c1=(*i_getc)(f))!=EOF) {
4508 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4523 lwsp_buf[lwsp_count] = (unsigned char)c1;
4524 if (lwsp_count++>lwsp_size){
4526 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4527 lwsp_buf = lwsp_buf_new;
4533 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4535 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4536 i_ungetc(lwsp_buf[lwsp_count],f);
4539 nkf_xfree(lwsp_buf);
4543 if ((c3 = (*i_mgetc)(f))<=SP) {
4546 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4547 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4551 if ((c4 = (*i_mgetc)(f))<=SP) {
4554 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4555 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4559 mime_decode_mode = mode; /* still in MIME sigh... */
4561 /* BASE 64 decoding */
4563 t1 = 0x3f & base64decode(c1);
4564 t2 = 0x3f & base64decode(c2);
4565 t3 = 0x3f & base64decode(c3);
4566 t4 = 0x3f & base64decode(c4);
4567 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4569 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4570 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4572 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4573 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4575 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4580 return mime_input_buf(mime_input_state.top++);
/* Base64 alphabet: basis_64[v] is the output character for the 6-bit value v (0..63). */
4583 static const char basis_64[] =
4584 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Capacity of the MIME output staging buffer; the buffer itself is declared with one extra byte. */
4586 #define MIMEOUT_BUF_LENGTH 74
4588 char buf[MIMEOUT_BUF_LENGTH+1];
4592 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4595 open_mime(nkf_char mode)
4597 const unsigned char *p;
4600 p = mime_pattern[0];
4601 for(i=0;mime_pattern[i];i++) {
4602 if (mode == mime_encode[i]) {
4603 p = mime_pattern[i];
4607 mimeout_mode = mime_encode_method[i];
4609 if (base64_count>45) {
4610 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4611 (*o_mputc)(mimeout_state.buf[i]);
4614 PUT_NEWLINE((*o_mputc));
4617 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4621 for (;i<mimeout_state.count;i++) {
4622 if (nkf_isspace(mimeout_state.buf[i])) {
4623 (*o_mputc)(mimeout_state.buf[i]);
4633 j = mimeout_state.count;
4634 mimeout_state.count = 0;
4636 mime_putc(mimeout_state.buf[i]);
4641 mime_prechar(nkf_char c2, nkf_char c1)
4643 if (mimeout_mode > 0){
4645 if (base64_count + mimeout_state.count/3*4> 73){
4646 (*o_base64conv)(EOF,0);
4647 OCONV_NEWLINE((*o_base64conv));
4648 (*o_base64conv)(0,SP);
4652 if (!(c2 == 0 && (c1 == CR || c1 == LF)) &&
4653 base64_count + mimeout_state.count/3*4> 66) {
4654 (*o_base64conv)(EOF,0);
4655 OCONV_NEWLINE((*o_base64conv));
4656 (*o_base64conv)(0,SP);
4662 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4663 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4664 open_mime(output_mode);
4665 (*o_base64conv)(EOF,0);
4666 OCONV_NEWLINE((*o_base64conv));
4667 (*o_base64conv)(0,SP);
4686 switch(mimeout_mode) {
4691 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4697 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4702 if (mimeout_mode > 0) {
4703 if (mimeout_f!=FIXED_MIME) {
4705 } else if (mimeout_mode != 'Q')
4711 mimeout_addchar(nkf_char c)
4713 switch(mimeout_mode) {
4718 } else if(!nkf_isalnum(c)) {
4720 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4721 (*o_mputc)(bin2hex((c&0xf)));
4729 nkf_state->mimeout_state=c;
4730 (*o_mputc)(basis_64[c>>2]);
4735 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4736 nkf_state->mimeout_state=c;
4741 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4742 (*o_mputc)(basis_64[c & 0x3F]);
4754 mime_putc(nkf_char c)
4759 if (mimeout_f == FIXED_MIME){
4760 if (mimeout_mode == 'Q'){
4761 if (base64_count > 71){
4762 if (c!=CR && c!=LF) {
4764 PUT_NEWLINE((*o_mputc));
4769 if (base64_count > 71){
4771 PUT_NEWLINE((*o_mputc));
4774 if (c == EOF) { /* c==EOF */
4778 if (c != EOF) { /* c==EOF */
4784 /* mimeout_f != FIXED_MIME */
4786 if (c == EOF) { /* c==EOF */
4787 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4788 j = mimeout_state.count;
4789 mimeout_state.count = 0;
4791 if (mimeout_mode > 0) {
4792 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4794 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4797 mimeout_addchar(mimeout_state.buf[i]);
4801 mimeout_addchar(mimeout_state.buf[i]);
4805 mimeout_addchar(mimeout_state.buf[i]);
4811 mimeout_addchar(mimeout_state.buf[i]);
4817 if (mimeout_state.count > 0){
4818 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4823 if (mimeout_mode=='Q') {
4824 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4825 if (c == CR || c == LF) {
4830 } else if (c <= SP) {
4832 if (base64_count > 70) {
4833 PUT_NEWLINE((*o_mputc));
4836 if (!nkf_isblank(c)) {
4841 if (base64_count > 70) {
4843 PUT_NEWLINE((*o_mputc));
4846 open_mime(output_mode);
4848 if (!nkf_noescape_mime(c)) {
4861 if (mimeout_mode <= 0) {
4862 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
4863 output_mode == UTF_8)) {
4864 if (nkf_isspace(c)) {
4866 if (mimeout_mode == -1) {
4869 if (c==CR || c==LF) {
4871 open_mime(output_mode);
4877 for (i=0;i<mimeout_state.count;i++) {
4878 (*o_mputc)(mimeout_state.buf[i]);
4879 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4890 mimeout_state.buf[0] = (char)c;
4891 mimeout_state.count = 1;
4893 if (base64_count > 1
4894 && base64_count + mimeout_state.count > 76
4895 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4896 static const char *str = "boundary=\"";
4897 static int len = 10;
4900 for (; i < mimeout_state.count - len; ++i) {
4901 if (!strncmp(mimeout_state.buf+i, str, len)) {
4907 if (i == 0 || i == mimeout_state.count - len) {
4908 PUT_NEWLINE((*o_mputc));
4910 if (!nkf_isspace(mimeout_state.buf[0])){
4917 for (j = 0; j <= i; ++j) {
4918 (*o_mputc)(mimeout_state.buf[j]);
4920 PUT_NEWLINE((*o_mputc));
4922 for (; j <= mimeout_state.count; ++j) {
4923 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4925 mimeout_state.count -= i;
4928 mimeout_state.buf[mimeout_state.count++] = (char)c;
4929 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4930 open_mime(output_mode);
4935 if (lastchar==CR || lastchar == LF){
4936 for (i=0;i<mimeout_state.count;i++) {
4937 (*o_mputc)(mimeout_state.buf[i]);
4940 mimeout_state.count = 0;
4943 for (i=0;i<mimeout_state.count-1;i++) {
4944 (*o_mputc)(mimeout_state.buf[i]);
4947 mimeout_state.buf[0] = SP;
4948 mimeout_state.count = 1;
4950 open_mime(output_mode);
4953 /* mimeout_mode == 'B', 1, 2 */
4954 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
4955 output_mode == UTF_8)) {
4956 if (lastchar == CR || lastchar == LF){
4957 if (nkf_isblank(c)) {
4958 for (i=0;i<mimeout_state.count;i++) {
4959 mimeout_addchar(mimeout_state.buf[i]);
4961 mimeout_state.count = 0;
4964 for (i=0;i<mimeout_state.count;i++) {
4965 (*o_mputc)(mimeout_state.buf[i]);
4968 mimeout_state.count = 0;
4970 mimeout_state.buf[mimeout_state.count++] = (char)c;
4973 if (nkf_isspace(c)) {
4974 for (i=0;i<mimeout_state.count;i++) {
4975 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4977 for (i=0;i<mimeout_state.count;i++) {
4978 (*o_mputc)(mimeout_state.buf[i]);
4981 mimeout_state.count = 0;
4984 mimeout_state.buf[mimeout_state.count++] = (char)c;
4985 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4987 for (i=0;i<mimeout_state.count;i++) {
4988 (*o_mputc)(mimeout_state.buf[i]);
4991 mimeout_state.count = 0;
4995 if (mimeout_state.count>0 && SP<c && c!='=') {
4996 mimeout_state.buf[mimeout_state.count++] = (char)c;
4997 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4998 j = mimeout_state.count;
4999 mimeout_state.count = 0;
5001 mimeout_addchar(mimeout_state.buf[i]);
5008 if (mimeout_state.count>0) {
5009 j = mimeout_state.count;
5010 mimeout_state.count = 0;
5012 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5014 mimeout_addchar(mimeout_state.buf[i]);
5020 (*o_mputc)(mimeout_state.buf[i]);
5022 open_mime(output_mode);
5029 base64_conv(nkf_char c2, nkf_char c1)
5031 mime_prechar(c2, c1);
5032 (*o_base64conv)(c2,c1);
5036 typedef struct nkf_iconv_t {
5039 size_t input_buffer_size;
5040 char *output_buffer;
5041 size_t output_buffer_size;
5045 nkf_iconv_new(char *tocode, char *fromcode)
5047 nkf_iconv_t converter;
5049 converter->input_buffer_size = IOBUF_SIZE;
5050 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5051 converter->output_buffer_size = IOBUF_SIZE * 2;
5052 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5053 converter->cd = iconv_open(tocode, fromcode);
5054 if (converter->cd == (iconv_t)-1)
5058 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5061 perror("can't iconv_open");
5067 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5069 size_t invalid = (size_t)0;
5070 char *input_buffer = converter->input_buffer;
5071 size_t input_length = (size_t)0;
5072 char *output_buffer = converter->output_buffer;
5073 size_t output_length = converter->output_buffer_size;
5078 while ((c = (*i_getc)(f)) != EOF) {
5079 input_buffer[input_length++] = c;
5080 if (input_length < converter->input_buffer_size) break;
5084 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5085 while (output_length-- > 0) {
5086 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5088 if (ret == (size_t) - 1) {
5091 if (input_buffer != converter->input_buffer)
5092 memmove(converter->input_buffer, input_buffer, input_length);
5095 converter->output_buffer_size *= 2;
5096 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5097 if (output_buffer == NULL) {
5098 perror("can't realloc");
5101 converter->output_buffer = output_buffer;
5104 perror("can't iconv");
5117 nkf_iconv_close(nkf_iconv_t *convert)
5119 nkf_xfree(converter->inbuf);
5120 nkf_xfree(converter->outbuf);
5121 iconv_close(converter->cd);
5130 struct input_code *p = input_code_list;
5142 mime_f = MIME_DECODE_DEFAULT;
5143 mime_decode_f = FALSE;
5148 x0201_f = X0201_DEFAULT;
5149 iso2022jp_f = FALSE;
5150 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5151 ms_ucs_map_f = UCS_MAP_ASCII;
5153 #ifdef UTF8_INPUT_ENABLE
5154 no_cp932ext_f = FALSE;
5155 no_best_fit_chars_f = FALSE;
5156 encode_fallback = NULL;
5157 unicode_subchar = '?';
5158 input_endian = ENDIAN_BIG;
5160 #ifdef UTF8_OUTPUT_ENABLE
5161 output_bom_f = FALSE;
5162 output_endian = ENDIAN_BIG;
5164 #ifdef UNICODE_NORMALIZATION
5180 #ifdef SHIFTJIS_CP932
5190 for (i = 0; i < 256; i++){
5191 prefix_table[i] = 0;
5195 mimeout_state.count = 0;
5200 fold_preserve_f = FALSE;
5203 kanji_intro = DEFAULT_J;
5204 ascii_intro = DEFAULT_R;
5205 fold_margin = FOLD_MARGIN;
5206 o_zconv = no_connection;
5207 o_fconv = no_connection;
5208 o_eol_conv = no_connection;
5209 o_rot_conv = no_connection;
5210 o_hira_conv = no_connection;
5211 o_base64conv = no_connection;
5212 o_iso2022jp_check_conv = no_connection;
5215 i_ungetc = std_ungetc;
5217 i_bungetc = std_ungetc;
5220 i_mungetc = std_ungetc;
5221 i_mgetc_buf = std_getc;
5222 i_mungetc_buf = std_ungetc;
5223 output_mode = ASCII;
5225 mime_decode_mode = FALSE;
5231 z_prev2=0,z_prev1=0;
5233 iconv_for_check = 0;
5235 input_codename = NULL;
5236 input_encoding = NULL;
5237 output_encoding = NULL;
5245 module_connection(void)
5247 if (input_encoding) set_input_encoding(input_encoding);
5248 if (!output_encoding) {
5249 output_encoding = nkf_default_encoding();
5251 if (!output_encoding) {
5252 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5255 set_output_encoding(output_encoding);
5256 oconv = nkf_enc_to_oconv(output_encoding);
5258 if (nkf_enc_unicode_p(output_encoding))
5259 output_mode = UTF_8;
5261 /* replace continuation module, from output side */
5263 /* output redirection */
5265 if (noout_f || guess_f){
5272 if (mimeout_f == TRUE) {
5273 o_base64conv = oconv; oconv = base64_conv;
5275 /* base64_count = 0; */
5278 if (eolmode_f || guess_f) {
5279 o_eol_conv = oconv; oconv = eol_conv;
5282 o_rot_conv = oconv; oconv = rot_conv;
5285 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5288 o_hira_conv = oconv; oconv = hira_conv;
5291 o_fconv = oconv; oconv = fold_conv;
5294 if (alpha_f || x0201_f) {
5295 o_zconv = oconv; oconv = z_conv;
5299 i_ungetc = std_ungetc;
5300 /* input redirection */
5303 i_cgetc = i_getc; i_getc = cap_getc;
5304 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5307 i_ugetc = i_getc; i_getc = url_getc;
5308 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5311 #ifdef NUMCHAR_OPTION
5313 i_ngetc = i_getc; i_getc = numchar_getc;
5314 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5317 #ifdef UNICODE_NORMALIZATION
5319 i_nfc_getc = i_getc; i_getc = nfc_getc;
5320 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5323 if (mime_f && mimebuf_f==FIXED_MIME) {
5324 i_mgetc = i_getc; i_getc = mime_getc;
5325 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5328 i_bgetc = i_getc; i_getc = broken_getc;
5329 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5331 if (input_encoding) {
5332 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5334 set_iconv(FALSE, e_iconv);
5338 struct input_code *p = input_code_list;
5347 Conversion main loop. Code detection only.
5350 #if !defined(PERL_XS) && !defined(WIN32DLL)
5357 module_connection();
5358 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands; LAST and set_input_mode() are used by
 * kanji_convert() below.
 * NOTE: SKIP and MORE expand to two statements (an assignment followed by
 * continue), so an unbraced "if (x) SKIP;" would guard only the assignment.
 */
5365 #define NEXT continue /* no output, get next */
5366 #define SKIP c2=0;continue /* clear the pending byte c2, no output, get next */
5367 #define MORE c2=c1;continue /* keep c1 as pending byte c2; need one more byte */
5368 #define SEND ; /* output c1 and c2, get next */
5369 #define LAST break /* end of loop, go closing */
/* set_input_mode(mode): stores mode in input_mode; the visible body also records the ISO-2022-JP codename and passes it to debug(). */
5370 #define set_input_mode(mode) do { \
5371 input_mode = mode; \
5373 set_input_codename("ISO-2022-JP"); \
5374 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop.
 *
 * f: input stream; every byte is obtained with (*i_getc)(f) and pushed back
 *    with (*i_ungetc)(..., f).
 *
 * After module_connection() has built the pipeline, input is consumed in one
 * of three ways:
 *   - iconv == w_iconv32: four bytes at a time via nkf_iconv_utf_32();
 *   - iconv == w_iconv16: two bytes at a time via nkf_iconv_utf_16(), with
 *     two more bytes fetched when it returns -2;
 *   - otherwise: a byte-at-a-time loop that tracks input_mode, handles
 *     SI/SO/ESC sequences and MIME starts, and hands characters to
 *     (*iconv)() / (*oconv)().
 * At the end (*iconv)(EOF, 0, 0) is called and, when no input codename has
 * been recorded, the input_code_list entry with the lowest score is used.
 */
5378 kanji_convert(FILE *f)
5380 nkf_char c1=0, c2=0, c3=0, c4=0;
5381 int shift_mode = 0; /* 0, 1, 2, 3 */
5383 int is_8bit = FALSE;
5385 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5390 output_mode = ASCII;
/* a negative result from module_connection() is reported as a missing output encoding */
5392 if (module_connection() < 0) {
5393 #if !defined(PERL_XS) && !defined(WIN32DLL)
5394 fprintf(stderr, "no output encoding given\n");
5400 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: four bytes are read for every call to nkf_iconv_utf_32() */
5401 if(iconv == w_iconv32){
5402 while ((c1 = (*i_getc)(f)) != EOF &&
5403 (c2 = (*i_getc)(f)) != EOF &&
5404 (c3 = (*i_getc)(f)) != EOF &&
5405 (c4 = (*i_getc)(f)) != EOF) {
5406 nkf_iconv_utf_32(c1, c2, c3, c4);
5408 (*i_ungetc)(EOF, f);
/* UTF-16 input: two bytes per call; a -2 result makes the loop read two more
   bytes and call again with all four (presumably a surrogate pair -- confirm
   in nkf_iconv_utf_16) */
5410 else if (iconv == w_iconv16) {
5411 while ((c1 = (*i_getc)(f)) != EOF &&
5412 (c2 = (*i_getc)(f)) != EOF) {
5413 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5414 (c3 = (*i_getc)(f)) != EOF &&
5415 (c4 = (*i_getc)(f)) != EOF) {
5416 nkf_iconv_utf_16(c1, c2, c3, c4);
5419 (*i_ungetc)(EOF, f);
/* every other input converter: byte-at-a-time loop */
5423 while ((c1 = (*i_getc)(f)) != EOF) {
5424 #ifdef INPUT_CODE_FIX
5425 if (!input_encoding)
5431 /* in case of 8th bit is on */
5432 if (!estab_f&&!mime_decode_mode) {
5433 /* in case of not established yet */
5434 /* It is still ambiguous */
5435 if (h_conv(f, c2, c1)==EOF) {
5443 /* in case of already established */
5445 /* ignore bogus code */
5453 /* 2nd byte of 7 bit code or SJIS */
5457 else if (nkf_char_unicode_p(c1)) {
5463 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5466 }else if (input_codename && input_codename[0] == 'I' &&
5467 0xA1 <= c1 && c1 <= 0xDF) {
5468 /* JIS X 0201 Katakana in 8bit JIS */
5469 c2 = JIS_X_0201_1976_K;
5472 } else if (c1 > DEL) {
5474 if (!estab_f && !iso8859_f) {
5475 /* not established yet */
5477 } else { /* estab_f==TRUE */
5483 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5484 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5486 c2 = JIS_X_0201_1976_K;
5491 /* already established */
5495 } else if (SP < c1 && c1 < DEL) {
5496 /* in case of Roman characters */
5498 /* output 1 shifted byte */
5502 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5503 /* output 1 shifted byte */
5504 c2 = JIS_X_0201_1976_K;
5507 /* look like bogus code */
5510 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5511 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5512 /* in case of Kanji shifted */
5514 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5515 /* Check MIME code */
5516 if ((c1 = (*i_getc)(f)) == EOF) {
5519 } else if (c1 == '?') {
5520 /* =? is mime conversion start sequence */
5521 if(mime_f == STRICT_MIME) {
5522 /* check in real detail */
5523 if (mime_begin_strict(f) == EOF)
5526 } else if (mime_begin(f) == EOF)
5535 /* normal ASCII code */
/* SI, SO and ESC are only interpreted while no 8-bit byte has been seen, or inside MIME decoding */
5538 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5541 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5544 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5545 if ((c1 = (*i_getc)(f)) == EOF) {
5549 else if (c1 == '&') {
5551 if ((c1 = (*i_getc)(f)) == EOF) {
5557 else if (c1 == '$') {
5559 if ((c1 = (*i_getc)(f)) == EOF) {
5560 /* don't send bogus code
5562 (*oconv)(0, '$'); */
5564 } else if (c1 == '@' || c1 == 'B') {
5566 set_input_mode(JIS_X_0208);
5568 } else if (c1 == '(') {
5570 if ((c1 = (*i_getc)(f)) == EOF) {
5571 /* don't send bogus code
5577 } else if (c1 == '@'|| c1 == 'B') {
5579 set_input_mode(JIS_X_0208);
5582 } else if (c1 == 'D'){
5583 set_input_mode(JIS_X_0212);
5585 #endif /* X0212_ENABLE */
5586 } else if (c1 == 'O' || c1 == 'Q'){
5587 set_input_mode(JIS_X_0213_1);
5589 } else if (c1 == 'P'){
5590 set_input_mode(JIS_X_0213_2);
5593 /* could be some special code */
5600 } else if (broken_f&0x2) {
5601 /* accept any ESC-(-x as broken code ... */
5602 input_mode = JIS_X_0208;
5611 } else if (c1 == '(') {
5613 if ((c1 = (*i_getc)(f)) == EOF) {
5614 /* don't send bogus code
5616 (*oconv)(0, '('); */
5619 else if (c1 == 'I') {
5620 /* JIS X 0201 Katakana */
5621 set_input_mode(JIS_X_0201_1976_K);
5624 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5625 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5626 set_input_mode(ASCII);
5629 else if (broken_f&0x2) {
5630 set_input_mode(ASCII);
5639 else if (c1 == '.') {
5641 if ((c1 = (*i_getc)(f)) == EOF) {
5644 else if (c1 == 'A') {
5655 else if (c1 == 'N') {
5658 if (g2 == ISO_8859_1) {
5673 } else if (c1 == ESC && iconv == s_iconv) {
5674 /* ESC in Shift_JIS */
5675 if ((c1 = (*i_getc)(f)) == EOF) {
5678 } else if (c1 == '$') {
5680 if ((c1 = (*i_getc)(f)) == EOF) {
5682 } else if (('E' <= c1 && c1 <= 'G') ||
5683 ('O' <= c1 && c1 <= 'Q')) {
/* c1 % 7 yields 6,0,1 for 'E','F','G' and 2,3,4 for 'O','P','Q';
   table slot 5 is never selected by these letters */
5691 static const nkf_char jphone_emoji_first_table[7] =
5692 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5693 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5694 if ((c1 = (*i_getc)(f)) == EOF) LAST;
/* every following byte in SP..'z' is emitted as base value c3 plus the byte */
5695 while (SP <= c1 && c1 <= 'z') {
5696 (*oconv)(0, c1 + c3);
5697 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5712 } else if (c1 == LF || c1 == CR) {
5714 input_mode = ASCII; set_iconv(FALSE, 0);
5716 } else if (mime_decode_f && !mime_decode_mode){
5718 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5726 } else { /* if (c1 == CR)*/
5727 if ((c1=(*i_getc)(f))!=EOF) {
5731 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5751 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5754 if ((c3 = (*i_getc)(f)) != EOF) {
5757 if ((c4 = (*i_getc)(f)) != EOF) {
5759 (*iconv)(c2, c1, c3|c4);
5764 /* 3 bytes EUC or UTF-8 */
5765 if ((c3 = (*i_getc)(f)) != EOF) {
5767 (*iconv)(c2, c1, c3);
5775 0x7F <= c2 && c2 <= 0x92 &&
5776 0x21 <= c1 && c1 <= 0x7E) {
/* rows 0x7F..0x92 with cells 0x21..0x7E are mapped linearly (94 cells per row) onto code points from 0xE000 upward */
5778 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5781 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5785 (*oconv)(PREFIX_EUCG3 | c2, c1);
5787 #endif /* X0212_ENABLE */
5789 (*oconv)(PREFIX_EUCG3 | c2, c1);
5792 (*oconv)(input_mode, c1); /* other special case */
5798 /* goto next_word */
/* tell the input converter that the input has ended */
5802 (*iconv)(EOF, 0, 0);
5803 if (!input_codename)
/* no codename recorded yet: pick the candidate with the lowest score */
5806 struct input_code *p = input_code_list;
5807 struct input_code *result = p;
5809 if (p->score < result->score) result = p;
5812 set_input_codename(result->name);
5814 debug(result->name);
5822 * int options(unsigned char *cp)
/*
 * options: parse one option string and update the global conversion flags.
 *
 * cp: NUL-terminated option text. Scanning begins after the first '-'.
 *     "--name" and "--name=value" are looked up in long_option[]; an entry
 *     with a non-empty alias continues parsing from the alias text, the
 *     remaining long options are handled by the strcmp() chain. Single
 *     letters are handled by the case labels further down.
 */
5829 options(unsigned char *cp)
5833 unsigned char *cp_back = NULL;
/* skip up to and including the first '-' */
5838 while(*cp && *cp++!='-');
5839 while (*cp || cp_back) {
5847 case '-': /* literal options */
5848 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5852 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5853 p = (unsigned char *)long_option[i].name;
/* match the argument against the option name, stopping at '=' or the end of the name */
5854 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5855 if (*p == cp[j] || cp[j] == SP){
5862 #if !defined(PERL_XS) && !defined(WIN32DLL)
5863 fprintf(stderr, "unknown long option: --%s\n", cp);
/* advance cp to the end of this argument (NUL or SP) */
5867 while(*cp && *cp != SP && cp++);
5868 if (long_option[i].alias[0]){
/* continue parsing from the alias text */
5870 cp = (unsigned char *)long_option[i].alias;
5873 if (strcmp(long_option[i].name, "help") == 0){
5878 if (strcmp(long_option[i].name, "ic=") == 0){
5879 enc = nkf_enc_find((char *)p);
5881 input_encoding = enc;
5884 if (strcmp(long_option[i].name, "oc=") == 0){
5885 enc = nkf_enc_find((char *)p);
5886 /* if (enc <= 0) continue; */
5888 output_encoding = enc;
5891 if (strcmp(long_option[i].name, "guess=") == 0){
5892 if (p[0] == '0' || p[0] == '1') {
5900 if (strcmp(long_option[i].name, "overwrite") == 0){
5903 preserve_time_f = TRUE;
5906 if (strcmp(long_option[i].name, "overwrite=") == 0){
5909 preserve_time_f = TRUE;
5911 backup_suffix = (char *)p;
5914 if (strcmp(long_option[i].name, "in-place") == 0){
5917 preserve_time_f = FALSE;
5920 if (strcmp(long_option[i].name, "in-place=") == 0){
5923 preserve_time_f = FALSE;
5925 backup_suffix = (char *)p;
5930 if (strcmp(long_option[i].name, "cap-input") == 0){
5934 if (strcmp(long_option[i].name, "url-input") == 0){
5939 #ifdef NUMCHAR_OPTION
5940 if (strcmp(long_option[i].name, "numchar-input") == 0){
5946 if (strcmp(long_option[i].name, "no-output") == 0){
5950 if (strcmp(long_option[i].name, "debug") == 0){
5955 if (strcmp(long_option[i].name, "cp932") == 0){
5956 #ifdef SHIFTJIS_CP932
5960 #ifdef UTF8_OUTPUT_ENABLE
5961 ms_ucs_map_f = UCS_MAP_CP932;
5965 if (strcmp(long_option[i].name, "no-cp932") == 0){
5966 #ifdef SHIFTJIS_CP932
5970 #ifdef UTF8_OUTPUT_ENABLE
5971 ms_ucs_map_f = UCS_MAP_ASCII;
5975 #ifdef SHIFTJIS_CP932
5976 if (strcmp(long_option[i].name, "cp932inv") == 0){
5983 if (strcmp(long_option[i].name, "x0212") == 0){
5990 if (strcmp(long_option[i].name, "exec-in") == 0){
5994 if (strcmp(long_option[i].name, "exec-out") == 0){
5999 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6000 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6001 no_cp932ext_f = TRUE;
6004 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6005 no_best_fit_chars_f = TRUE;
6008 if (strcmp(long_option[i].name, "fb-skip") == 0){
6009 encode_fallback = NULL;
6012 if (strcmp(long_option[i].name, "fb-html") == 0){
6013 encode_fallback = encode_fallback_html;
6016 if (strcmp(long_option[i].name, "fb-xml") == 0){
6017 encode_fallback = encode_fallback_xml;
6020 if (strcmp(long_option[i].name, "fb-java") == 0){
6021 encode_fallback = encode_fallback_java;
6024 if (strcmp(long_option[i].name, "fb-perl") == 0){
6025 encode_fallback = encode_fallback_perl;
6028 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6029 encode_fallback = encode_fallback_subchar;
6032 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6033 encode_fallback = encode_fallback_subchar;
6034 unicode_subchar = 0;
/* NOTE(review): i (the long_option index) is reused below as a digit index
   and as an output of w16e_conv(); long_option[i] must not be consulted
   again after this point in the branch -- confirm the branch leaves the
   lookup before doing so */
6036 /* decimal number */
6037 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6038 unicode_subchar *= 10;
6039 unicode_subchar += hex2bin(p[i]);
6041 }else if(p[1] == 'x' || p[1] == 'X'){
6042 /* hexadecimal number */
6043 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6044 unicode_subchar <<= 4;
6045 unicode_subchar |= hex2bin(p[i]);
6049 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6050 unicode_subchar *= 8;
6051 unicode_subchar += hex2bin(p[i]);
6054 w16e_conv(unicode_subchar, &i, &j);
6055 unicode_subchar = i<<8 | j;
6059 #ifdef UTF8_OUTPUT_ENABLE
6060 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6061 ms_ucs_map_f = UCS_MAP_MS;
6065 #ifdef UNICODE_NORMALIZATION
6066 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6071 if (strcmp(long_option[i].name, "prefix=") == 0){
/* the first character of the value is stored in prefix_table for each following graphic character */
6072 if (nkf_isgraph(p[0])){
6073 for (i = 1; nkf_isgraph(p[i]); i++){
6074 prefix_table[p[i]] = p[0];
6079 #if !defined(PERL_XS) && !defined(WIN32DLL)
6080 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6085 case 'b': /* buffered mode */
6088 case 'u': /* non-buffered mode */
6091 case 't': /* transparent mode */
6096 } else if (*cp=='2') {
6100 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6108 case 'j': /* JIS output */
6110 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6112 case 'e': /* AT&T EUC output */
6113 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6115 case 's': /* SJIS output */
6116 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6118 case 'l': /* ISO8859 Latin-1 support, no conversion */
6119 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6120 input_encoding = nkf_enc_from_index(ISO_8859_1);
6122 case 'i': /* Kanji IN ESC-$-@/B */
6123 if (*cp=='@'||*cp=='B')
6124 kanji_intro = *cp++;
6126 case 'o': /* ASCII IN ESC-(-J/B/H */
6127 /* ESC ( H was used in initial JUNET messages */
6128 if (*cp=='J'||*cp=='B'||*cp=='H')
6129 ascii_intro = *cp++;
6133 bit:1 katakana->hiragana
6134 bit:2 hiragana->katakana
6136 if ('9'>= *cp && *cp>='0')
6137 hira_f |= (*cp++ -'0');
6144 #if defined(MSDOS) || defined(__OS2__)
6151 show_configuration();
6159 #ifdef UTF8_OUTPUT_ENABLE
6160 case 'w': /* UTF-8 output */
6165 output_encoding = nkf_enc_from_index(UTF_8N);
6167 output_bom_f = TRUE;
6168 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6172 if ('1'== cp[0] && '6'==cp[1]) {
6175 } else if ('3'== cp[0] && '2'==cp[1]) {
6179 output_encoding = nkf_enc_from_index(UTF_8);
6184 output_endian = ENDIAN_LITTLE;
6185 } else if (cp[0] == 'B') {
/* pick the concrete UTF-16/UTF-32 variant from the endianness, without BOM ... */
6190 enc_idx = enc_idx == UTF_16
6191 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6192 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
/* ... or with BOM */
6194 output_bom_f = TRUE;
6195 enc_idx = enc_idx == UTF_16
6196 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6197 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6199 output_encoding = nkf_enc_from_index(enc_idx);
6203 #ifdef UTF8_INPUT_ENABLE
6204 case 'W': /* UTF input */
6207 input_encoding = nkf_enc_from_index(UTF_8);
6210 if ('1'== cp[0] && '6'==cp[1]) {
6212 input_endian = ENDIAN_BIG;
6214 } else if ('3'== cp[0] && '2'==cp[1]) {
6216 input_endian = ENDIAN_BIG;
6219 input_encoding = nkf_enc_from_index(UTF_8);
6224 input_endian = ENDIAN_LITTLE;
6225 } else if (cp[0] == 'B') {
6227 input_endian = ENDIAN_BIG;
6229 enc_idx = (enc_idx == UTF_16
6230 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6231 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6232 input_encoding = nkf_enc_from_index(enc_idx);
6236 /* Input code assumption */
6237 case 'J': /* ISO-2022-JP input */
6238 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6240 case 'E': /* EUC-JP input */
6241 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6243 case 'S': /* Shift_JIS input */
6244 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6246 case 'Z': /* Convert X0208 alphabet to ASCII */
6248 bit:0 Convert JIS X 0208 Alphabet to ASCII
6249 bit:1 Convert Kankaku to one space
6250 bit:2 Convert Kankaku to two spaces
6251 bit:3 Convert HTML Entity
6252 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6254 while ('0'<= *cp && *cp <='4') {
6255 alpha_f |= 1 << (*cp++ - '0');
6259 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6260 x0201_f = FALSE; /* No X0201->X0208 conversion */
6262 ESC-(-I in JIS, EUC, MS Kanji
6263 SI/SO in JIS, EUC, MS Kanji
6264 SS2 in EUC, JIS, not in MS Kanji
6265 MS Kanji (0xa0-0xdf)
6267 ESC-(-I in JIS (0x20-0x5f)
6268 SS2 in EUC (0xa0-0xdf)
6269 0xa0-0xd in MS Kanji (0xa0-0xdf)
6272 case 'X': /* Convert X0201 kana to X0208 */
6275 case 'F': /* preserve new lines */
6276 fold_preserve_f = TRUE;
/* fall through: 'F' continues into the 'f' handling */
6277 case 'f': /* folding -f60 or -f */
6280 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6282 fold_len += *cp++ - '0';
/* a fold length outside 1..BUFSIZ-1 is replaced by DEFAULT_FOLD */
6284 if (!(0<fold_len && fold_len<BUFSIZ))
6285 fold_len = DEFAULT_FOLD;
6289 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6291 fold_margin += *cp++ - '0';
6295 case 'm': /* MIME support */
6296 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6297 if (*cp=='B'||*cp=='Q') {
6298 mime_decode_mode = *cp++;
6299 mimebuf_f = FIXED_MIME;
6300 } else if (*cp=='N') {
6301 mime_f = TRUE; cp++;
6302 } else if (*cp=='S') {
6303 mime_f = STRICT_MIME; cp++;
6304 } else if (*cp=='0') {
6305 mime_decode_f = FALSE;
6306 mime_f = FALSE; cp++;
6308 mime_f = STRICT_MIME;
6311 case 'M': /* MIME output */
6314 mimeout_f = FIXED_MIME; cp++;
6315 } else if (*cp=='Q') {
6317 mimeout_f = FIXED_MIME; cp++;
6322 case 'B': /* Broken JIS support */
6324 bit:1 allow any x on ESC-(-x or ESC-$-x
6325 bit:2 reset to ascii on NL
6327 if ('9'>= *cp && *cp>='0')
6328 broken_f |= 1<<(*cp++ -'0');
6333 case 'O':/* for Output file */
6337 case 'c':/* add cr code */
6340 case 'd':/* delete cr code */
6343 case 'I': /* ISO-2022-JP output */
6346 case 'L': /* line mode */
6347 if (*cp=='u') { /* unix */
6348 eolmode_f = LF; cp++;
6349 } else if (*cp=='m') { /* mac */
6350 eolmode_f = CR; cp++;
6351 } else if (*cp=='w') { /* windows */
6352 eolmode_f = CRLF; cp++;
6353 } else if (*cp=='0') { /* no conversion */
6354 eolmode_f = 0; cp++;
6359 if ('2' <= *cp && *cp <= '9') {
6362 } else if (*cp == '0' || *cp == '1') {
6371 /* multiple options in a string are allowed for the Perl module */
6372 while(*cp && *cp++!='-');
6375 #if !defined(PERL_XS) && !defined(WIN32DLL)
6376 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6378 /* bogus option but ignored */
6386 #include "nkf32dll.c"
6387 #elif defined(PERL_XS)
6388 #else /* WIN32DLL */
/*
 * main: command-line entry point.
 *
 * argc/argv: standard program arguments. Leading arguments that begin with
 * '-' are consumed as options. stdin is converted with kanji_convert(stdin);
 * named files are opened one by one with fopen(). When file_out_f is TRUE,
 * stdout is redirected to a temporary file created next to the original
 * (mkstemp() on the ".nkftmpXXXXXX" name); afterwards the original's
 * permissions are restored, and its timestamp too when preserve_time_f is
 * set, an optional backup is made via get_backup_filename(), and the
 * temporary file is renamed over the original.
 */
6390 main(int argc, char **argv)
6395 char *outfname = NULL;
6398 #ifdef EASYWIN /*Easy Win */
6399 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6401 #ifdef DEFAULT_CODE_LOCALE
6402 setlocale(LC_CTYPE, "");
/* consume leading arguments that start with '-' */
6406 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6407 cp = (unsigned char *)*argv;
6412 if (pipe(fds) < 0 || (pid = fork()) < 0){
6423 execvp(argv[1], &argv[1]);
6440 int debug_f_back = debug_f;
6443 int exec_f_back = exec_f;
6446 int x0212_f_back = x0212_f;
6448 int x0213_f_back = x0213_f;
6449 int guess_f_back = guess_f;
6451 guess_f = guess_f_back;
6454 debug_f = debug_f_back;
6457 exec_f = exec_f_back;
6459 x0212_f = x0212_f_back;
6460 x0213_f = x0213_f_back;
6463 if (binmode_f == TRUE)
6464 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6465 if (freopen("","wb",stdout) == NULL)
6472 setbuf(stdout, (char *) NULL);
6474 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6477 if (binmode_f == TRUE)
6478 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6479 if (freopen("","rb",stdin) == NULL) return (-1);
6483 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* stdin is converted here; when guess_f is set the guessed code is printed without a file name */
6487 kanji_convert(stdin);
6488 if (guess_f) print_guessed_code(NULL);
6492 int is_argument_error = FALSE;
6494 input_codename = NULL;
6497 iconv_for_check = 0;
/* a file that cannot be opened is recorded in is_argument_error */
6499 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6501 is_argument_error = TRUE;
6509 /* reopen file for stdout */
6510 if (file_out_f == TRUE) {
/* temp file name: the original path plus a suffix */
6513 outfname = nkf_xmalloc(strlen(origfname)
6514 + strlen(".nkftmpXXXXXX")
6516 strcpy(outfname, origfname);
6520 for (i = strlen(outfname); i; --i){
6521 if (outfname[i - 1] == '/'
6522 || outfname[i - 1] == '\\'){
6528 strcat(outfname, "ntXXXXXX");
6530 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6531 S_IREAD | S_IWRITE);
6533 strcat(outfname, ".nkftmpXXXXXX");
6534 fd = mkstemp(outfname);
/* keep a duplicate of the real stdout in fd_backup, then point stdout at the temp file */
6537 || (fd_backup = dup(fileno(stdout))) < 0
6538 || dup2(fd, fileno(stdout)) < 0
6549 outfname = "nkf.out";
6552 if(freopen(outfname, "w", stdout) == NULL) {
6556 if (binmode_f == TRUE) {
6557 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6558 if (freopen("","wb",stdout) == NULL)
6565 if (binmode_f == TRUE)
6566 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6567 if (freopen("","rb",fin) == NULL)
6572 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6576 char *filename = NULL;
/* the file name is passed to print_guessed_code() only when more than one file is processed */
6578 if (nfiles > 1) filename = origfname;
6579 if (guess_f) print_guessed_code(filename);
6585 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* put the saved stdout descriptor back */
6593 if (dup2(fd_backup, fileno(stdout)) < 0){
6596 if (stat(origfname, &sb)) {
6597 fprintf(stderr, "Can't stat %s\n", origfname);
6599 /* restore the permissions */
6600 if (chmod(outfname, sb.st_mode)) {
6601 fprintf(stderr, "Can't set permission %s\n", outfname);
6604 /* restore the timestamp */
6605 if(preserve_time_f){
6606 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6607 tb[0] = tb[1] = sb.st_mtime;
6608 if (utime(outfname, tb)) {
6609 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6612 tb.actime = sb.st_atime;
6613 tb.modtime = sb.st_mtime;
6614 if (utime(outfname, &tb)) {
6615 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* move the original to its backup name, removing any existing backup first */
6620 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6622 unlink(backup_filename);
6624 if (rename(origfname, backup_filename)) {
6625 perror(backup_filename);
6626 fprintf(stderr, "Can't rename %s to %s\n",
6627 origfname, backup_filename);
6629 nkf_xfree(backup_filename);
6632 if (unlink(origfname)){
/* move the converted temp file over the original name */
6637 if (rename(outfname, origfname)) {
6639 fprintf(stderr, "Can't rename %s to %s\n",
6640 outfname, origfname);
6642 nkf_xfree(outfname);
6647 if (is_argument_error)
6650 #ifdef EASYWIN /*Easy Win */
6651 if (file_out_f == FALSE)
6652 scanf("%d",&end_check);
6655 #else /* for Other OS */
6656 if (file_out_f == TRUE)
6658 #endif /*Easy Win */
6661 #endif /* WIN32DLL */