2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2013, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.3"
24 #define NKF_RELEASE_DATE "2013-02-20"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2013, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
226 {"MS_Kanji", SHIFT_JIS},
228 {"WINDOWS-31J", WINDOWS_31J},
229 {"CSWINDOWS31J", WINDOWS_31J},
230 {"CP932", WINDOWS_31J},
231 {"MS932", WINDOWS_31J},
232 {"CP10001", CP10001},
235 {"EUCJP-NKF", EUCJP_NKF},
236 {"CP51932", CP51932},
237 {"EUC-JP-MS", EUCJP_MS},
238 {"EUCJP-MS", EUCJP_MS},
239 {"EUCJPMS", EUCJP_MS},
240 {"EUC-JP-ASCII", EUCJP_ASCII},
241 {"EUCJP-ASCII", EUCJP_ASCII},
242 {"SHIFT_JISX0213", SHIFT_JISX0213},
243 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
244 {"EUC-JISX0213", EUC_JISX0213},
245 {"EUC-JIS-2004", EUC_JIS_2004},
248 {"UTF-8-BOM", UTF_8_BOM},
249 {"UTF8-MAC", UTF8_MAC},
250 {"UTF-8-MAC", UTF8_MAC},
252 {"UTF-16BE", UTF_16BE},
253 {"UTF-16BE-BOM", UTF_16BE_BOM},
254 {"UTF-16LE", UTF_16LE},
255 {"UTF-16LE-BOM", UTF_16LE_BOM},
257 {"UTF-32BE", UTF_32BE},
258 {"UTF-32BE-BOM", UTF_32BE_BOM},
259 {"UTF-32LE", UTF_32LE},
260 {"UTF-32LE-BOM", UTF_32LE_BOM},
265 #if defined(DEFAULT_CODE_JIS)
266 #define DEFAULT_ENCIDX ISO_2022_JP
267 #elif defined(DEFAULT_CODE_SJIS)
268 #define DEFAULT_ENCIDX SHIFT_JIS
269 #elif defined(DEFAULT_CODE_WINDOWS_31J)
270 #define DEFAULT_ENCIDX WINDOWS_31J
271 #elif defined(DEFAULT_CODE_EUC)
272 #define DEFAULT_ENCIDX EUC_JP
273 #elif defined(DEFAULT_CODE_UTF8)
274 #define DEFAULT_ENCIDX UTF_8
/*
 * Character classification and conversion helpers.
 *
 * These are function-like macros, so every argument is evaluated more than
 * once: never pass an expression with side effects (e.g. *p++).  Each
 * argument is parenthesized in the expansion so that compound expressions
 * such as nkf_toupper(flag ? a : b) or bin2hex(x ^ y) group the way the
 * caller wrote them.
 */

/* True for ASCII letters and decimal digits. */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
/* Maps 'a'..'z' to 'A'..'Z'; any other value is returned unchanged. */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')

/* Value of a hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)

/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])

/* True when the second-lowest byte of c2 (taken as unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

/*
 * True for CR, LF, and for bytes strictly between SP and DEL other than
 * '?', '=', '_', '(', ')', '.' and 0x22 (double quote).  Judging by the
 * name, these are the bytes the MIME encoder leaves unescaped.
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* True when c lies in [SP, 0x5F]. */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
304 #define HOLD_SIZE 1024
305 #if defined(INT_IS_SHORT)
306 #define IOBUF_SIZE 2048
308 #define IOBUF_SIZE 16384
311 #define DEFAULT_J 'B'
312 #define DEFAULT_R 'B'
319 /* MIME preprocessor */
321 #ifdef EASYWIN /*Easy Win */
322 extern POINT _BufferSize;
331 void (*status_func)(struct input_code *, nkf_char);
332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
337 static nkf_encoding *input_encoding = NULL;
338 static nkf_encoding *output_encoding = NULL;
340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
342 * 0: Shift_JIS, eucJP-ascii
347 #define UCS_MAP_ASCII 0
349 #define UCS_MAP_CP932 2
350 #define UCS_MAP_CP10001 3
351 static int ms_ucs_map_f = UCS_MAP_ASCII;
353 #ifdef UTF8_INPUT_ENABLE
354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
355 static int no_cp932ext_f = FALSE;
356 /* ignore ZERO WIDTH NO-BREAK SPACE */
357 static int no_best_fit_chars_f = FALSE;
358 static int input_endian = ENDIAN_BIG;
359 static int input_bom_f = FALSE;
360 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
361 static void (*encode_fallback)(nkf_char c) = NULL;
362 static void w_status(struct input_code *, nkf_char);
364 #ifdef UTF8_OUTPUT_ENABLE
365 static int output_bom_f = FALSE;
366 static int output_endian = ENDIAN_BIG;
369 static void std_putc(nkf_char c);
370 static nkf_char std_getc(FILE *f);
371 static nkf_char std_ungetc(nkf_char c,FILE *f);
373 static nkf_char broken_getc(FILE *f);
374 static nkf_char broken_ungetc(nkf_char c,FILE *f);
376 static nkf_char mime_getc(FILE *f);
378 static void mime_putc(nkf_char c);
382 #if !defined(PERL_XS) && !defined(WIN32DLL)
383 static unsigned char stdibuf[IOBUF_SIZE];
384 static unsigned char stdobuf[IOBUF_SIZE];
387 #define NKF_UNSPECIFIED (-TRUE)
/*
 * Conversion option flags (file-scope state, one value per process).
 * x0201_f starts as NKF_UNSPECIFIED; set_input_encoding() and
 * set_output_encoding() replace that with FALSE for some encodings
 * ("-x specified implicitly").
 */
390 static int unbuf_f = FALSE;
391 static int estab_f = FALSE;
392 static int nop_f = FALSE;
393 static int binmode_f = TRUE; /* binary mode */
394 static int rot_f = FALSE; /* rot14/43 mode (sic; NOTE(review): probably means ROT13/47 -- confirm) */
395 static int hira_f = FALSE; /* hiragana/katakana conversion ("hira/kata henkan") */
396 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
397 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
398 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
399 static int mimebuf_f = FALSE; /* MIME buffered input */
400 static int broken_f = FALSE; /* convert ESC-less broken JIS */
401 static int iso8859_f = FALSE; /* ISO8859 through */
402 static int mimeout_f = FALSE; /* base64 mode */
403 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */
404 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
406 #ifdef UNICODE_NORMALIZATION
407 static int nfc_f = FALSE;
408 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
409 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int cap_f = FALSE;
414 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
415 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
417 static int url_f = FALSE;
418 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
419 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Tagging scheme for nkf_char values.
 *
 * The top byte (CLASS_MASK) is a class tag and the low 24 bits (VALUE_MASK)
 * hold the value.  CLASS_UNICODE tags a value as a Unicode code point;
 * PREFIX_EUCG3 is OR-ed into a value by nkf_char_euc3_new().
 *
 * All macro arguments are parenthesized in the expansion so that compound
 * arguments such as nkf_char_unicode_bmp_p(tag | cp) are masked as a whole.
 * Arguments are still evaluated textually, so avoid side effects.
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c)        ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c)     ((c) | CLASS_UNICODE)
/* True when the class tag of c is exactly CLASS_UNICODE. */
#define nkf_char_unicode_p(c)       (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value part of c is at most U+FFFF. */
#define nkf_char_unicode_bmp_p(c)   (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the value part of c is at most U+10FFFF. */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)

/*
 * Combine a UTF-16 surrogate pair into a code point.
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, i.e. it removes both
 * surrogate bases and adds the supplementary-plane offset in one step.
 */
#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
436 #ifdef NUMCHAR_OPTION
437 static int numchar_f = FALSE;
438 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
439 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
443 static int noout_f = FALSE;
444 static void no_putc(nkf_char c);
445 static int debug_f = FALSE;
446 static void debug(const char *str);
447 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
450 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
451 static void set_input_codename(const char *codename);
454 static int exec_f = 0;
457 #ifdef SHIFTJIS_CP932
458 /* invert IBM extended characters to others */
459 static int cp51932_f = FALSE;
461 /* invert NEC-selected IBM extended characters to IBM extended characters */
462 static int cp932inv_f = TRUE;
464 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
465 #endif /* SHIFTJIS_CP932 */
467 static int x0212_f = FALSE;
468 static int x0213_f = FALSE;
470 static unsigned char prefix_table[256];
472 static void e_status(struct input_code *, nkf_char);
473 static void s_status(struct input_code *, nkf_char);
475 struct input_code input_code_list[] = {
476 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
477 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
478 #ifdef UTF8_INPUT_ENABLE
479 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
480 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
481 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
483 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0}
486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
487 static int base64_count = 0;
489 /* X0208 -> ASCII converter */
492 static int f_line = 0; /* chars in line */
493 static int f_prev = 0;
494 static int fold_preserve_f = FALSE; /* preserve new lines */
495 static int fold_f = FALSE;
496 static int fold_len = 0;
499 static unsigned char kanji_intro = DEFAULT_J;
500 static unsigned char ascii_intro = DEFAULT_R;
504 #define FOLD_MARGIN 10
505 #define DEFAULT_FOLD 60
507 static int fold_margin = FOLD_MARGIN;
509 /* process default */
512 no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0)
514 fprintf(stderr,"nkf internal module connection failure.\n");
520 no_connection(nkf_char c2, nkf_char c1)
522 no_connection2(c2,c1,0);
525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
536 /* static redirections */
538 static void (*o_putc)(nkf_char c) = std_putc;
540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
551 /* for strict mime */
552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
556 static int output_mode = ASCII; /* output kanji mode */
557 static int input_mode = ASCII; /* input kanji mode */
558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
560 /* X0201 / X0208 conversion tables */
562 /* X0201 kana conversion table */
564 static const unsigned char cv[]= {
565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
584 /* X0201 kana conversion table for dakuten */
586 static const unsigned char dv[]= {
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 /* X0201 kana conversion table for han-dakuten */
607 static const unsigned char ev[]= {
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 /* X0201 kana to X0213 conversion table for han-dakuten */
628 static const unsigned char ev_x0213[]= {
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78,
635 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00,
638 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 /* X0208 kigou conversion table */
649 /* 0x8140 - 0x819e */
650 static const unsigned char fv[] = {
652 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
653 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
654 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
656 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
657 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
658 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
660 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
668 static int option_mode = 0;
669 static int file_out_f = FALSE;
671 static int overwrite_f = FALSE;
672 static int preserve_time_f = FALSE;
673 static int backup_f = FALSE;
674 static char *backup_suffix = "";
677 static int eolmode_f = 0; /* CR, LF, CRLF */
678 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
679 static nkf_char prev_cr = 0; /* CR or 0 */
680 #ifdef EASYWIN /*Easy Win */
681 static int end_check;
685 nkf_xmalloc(size_t size)
689 if (size == 0) size = 1;
693 perror("can't malloc");
701 nkf_xrealloc(void *ptr, size_t size)
703 if (size == 0) size = 1;
705 ptr = realloc(ptr, size);
707 perror("can't realloc");
714 #define nkf_xfree(ptr) free(ptr)
717 nkf_str_caseeql(const char *src, const char *target)
720 for (i = 0; src[i] && target[i]; i++) {
721 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
723 if (src[i] || target[i]) return FALSE;
728 nkf_enc_from_index(int idx)
730 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
733 return &nkf_encoding_table[idx];
737 nkf_enc_find_index(const char *name)
740 if (name[0] == 'X' && *(name+1) == '-') name += 2;
741 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
742 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
743 return encoding_name_to_id_table[i].id;
750 nkf_enc_find(const char *name)
753 idx = nkf_enc_find_index(name);
754 if (idx < 0) return 0;
755 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for a pointer to an encoding entry.
 * `enc` is expanded textually (several times in the predicates), so pass a
 * side-effect-free expression.
 */
/* The entry's name member. */
758 #define nkf_enc_name(enc) (enc)->name
/* The entry's id member. */
759 #define nkf_enc_to_index(enc) (enc)->id
/* The entry's base_encoding member. */
760 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input / output converter members (iconv / oconv) of the entry's base encoding. */
761 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
762 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
763 #define nkf_enc_asciicompat(enc) (\
764 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
765 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 bases. */
766 #define nkf_enc_unicode_p(enc) (\
767 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
768 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
769 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the entry's id is CP50220, CP50221 or CP50222. */
770 #define nkf_enc_cp5022x_p(enc) (\
771 nkf_enc_to_index(enc) == CP50220 ||\
772 nkf_enc_to_index(enc) == CP50221 ||\
773 nkf_enc_to_index(enc) == CP50222)
775 #ifdef DEFAULT_CODE_LOCALE
779 #ifdef HAVE_LANGINFO_H
780 return nl_langinfo(CODESET);
781 #elif defined(__WIN32__)
783 sprintf(buf, "CP%d", GetACP());
785 #elif defined(__OS2__)
786 # if defined(INT_IS_SHORT)
792 ULONG ulCP[1], ulncp;
793 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
794 if (ulCP[0] == 932 || ulCP[0] == 943)
795 strcpy(buf, "Shift_JIS");
797 sprintf(buf, "CP%lu", ulCP[0]);
805 nkf_locale_encoding()
807 nkf_encoding *enc = 0;
808 const char *encname = nkf_locale_charmap();
810 enc = nkf_enc_find(encname);
813 #endif /* DEFAULT_CODE_LOCALE */
818 return &nkf_encoding_table[UTF_8];
822 nkf_default_encoding()
824 nkf_encoding *enc = 0;
825 #ifdef DEFAULT_CODE_LOCALE
826 enc = nkf_locale_encoding();
827 #elif defined(DEFAULT_ENCIDX)
828 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
830 if (!enc) enc = nkf_utf8_encoding();
841 nkf_buf_new(int length)
843 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
844 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length);
852 nkf_buf_dispose(nkf_buf_t *buf)
859 #define nkf_buf_length(buf) ((buf)->len)
860 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
863 nkf_buf_at(nkf_buf_t *buf, int index)
865 assert(index <= buf->len);
866 return buf->ptr[index];
870 nkf_buf_clear(nkf_buf_t *buf)
876 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
878 if (buf->capa <= buf->len) {
881 buf->ptr[buf->len++] = c;
885 nkf_buf_pop(nkf_buf_t *buf)
887 assert(!nkf_buf_empty_p(buf));
888 return buf->ptr[--buf->len];
891 /* Normalization Form C */
894 #define fprintf dllprintf
900 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
907 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
908 #ifdef UTF8_OUTPUT_ENABLE
909 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
910 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
913 #ifdef UTF8_INPUT_ENABLE
914 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
915 " UTF option is -W[8,[16,32][B,L]]\n"
917 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
921 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
922 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
923 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
926 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
927 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
928 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
929 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n"
932 " O Output to File (DEFAULT 'nkf.out')\n"
933 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
936 " --ic=<encoding> Specify the input encoding\n"
937 " --oc=<encoding> Specify the output encoding\n"
938 " --hiragana --katakana Hiragana/Katakana Conversion\n"
939 " --katakana-hiragana Converts each other\n"
943 " --{cap, url}-input Convert hex after ':' or '%%'\n"
945 #ifdef NUMCHAR_OPTION
946 " --numchar-input Convert Unicode Character Reference\n"
948 #ifdef UTF8_INPUT_ENABLE
949 " --fb-{skip, html, xml, perl, java, subchar}\n"
950 " Specify unassigned character's replacement\n"
955 " --in-place[=SUF] Overwrite original files\n"
956 " --overwrite[=SUF] Preserve timestamp of original files\n"
958 " -g --guess Guess the input code\n"
959 " -v --version Print the version\n"
960 " --help/-V Print this help / configuration\n"
966 show_configuration(void)
969 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
970 " Compile-time options:\n"
971 " Compiled at: " __DATE__ " " __TIME__ "\n"
974 " Default output encoding: "
975 #ifdef DEFAULT_CODE_LOCALE
976 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
977 #elif defined(DEFAULT_ENCIDX)
978 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
984 " Default output end of line: "
985 #if DEFAULT_NEWLINE == CR
987 #elif DEFAULT_NEWLINE == CRLF
993 " Decode MIME encoded string: "
994 #if MIME_DECODE_DEFAULT
1000 " Convert JIS X 0201 Katakana: "
1007 " --help, --version output: "
1008 #if HELP_OUTPUT_HELP_OUTPUT
1019 get_backup_filename(const char *suffix, const char *filename)
1021 char *backup_filename;
1022 int asterisk_count = 0;
1024 int filename_length = strlen(filename);
1026 for(i = 0; suffix[i]; i++){
1027 if(suffix[i] == '*') asterisk_count++;
1031 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1032 for(i = 0, j = 0; suffix[i];){
1033 if(suffix[i] == '*'){
1034 backup_filename[j] = '\0';
1035 strncat(backup_filename, filename, filename_length);
1037 j += filename_length;
1039 backup_filename[j++] = suffix[i++];
1042 backup_filename[j] = '\0';
1044 j = filename_length + strlen(suffix);
1045 backup_filename = nkf_xmalloc(j + 1);
1046 strcpy(backup_filename, filename);
1047 strcat(backup_filename, suffix);
1048 backup_filename[j] = '\0';
1050 return backup_filename;
1054 #ifdef UTF8_INPUT_ENABLE
1056 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1061 if(c >= NKF_INT32_C(1)<<shift){
1063 (*f)(0, bin2hex(c>>shift));
1074 encode_fallback_html(nkf_char c)
1079 if(c >= NKF_INT32_C(1000000))
1080 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1081 if(c >= NKF_INT32_C(100000))
1082 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1084 (*oconv)(0, 0x30+(c/10000 )%10);
1086 (*oconv)(0, 0x30+(c/1000 )%10);
1088 (*oconv)(0, 0x30+(c/100 )%10);
1090 (*oconv)(0, 0x30+(c/10 )%10);
1092 (*oconv)(0, 0x30+ c %10);
1098 encode_fallback_xml(nkf_char c)
1103 nkf_each_char_to_hex(oconv, c);
1109 encode_fallback_java(nkf_char c)
1113 if(!nkf_char_unicode_bmp_p(c)){
1117 (*oconv)(0, bin2hex(c>>20));
1118 (*oconv)(0, bin2hex(c>>16));
1122 (*oconv)(0, bin2hex(c>>12));
1123 (*oconv)(0, bin2hex(c>> 8));
1124 (*oconv)(0, bin2hex(c>> 4));
1125 (*oconv)(0, bin2hex(c ));
1130 encode_fallback_perl(nkf_char c)
1135 nkf_each_char_to_hex(oconv, c);
1141 encode_fallback_subchar(nkf_char c)
1143 c = unicode_subchar;
1144 (*oconv)((c>>8)&0xFF, c&0xFF);
1149 static const struct {
1173 {"katakana-hiragana","h3"},
1181 #ifdef UTF8_OUTPUT_ENABLE
1191 {"fb-subchar=", ""},
1193 #ifdef UTF8_INPUT_ENABLE
1194 {"utf8-input", "W"},
1195 {"utf16-input", "W16"},
1196 {"no-cp932ext", ""},
1197 {"no-best-fit-chars",""},
1199 #ifdef UNICODE_NORMALIZATION
1200 {"utf8mac-input", ""},
1212 #ifdef NUMCHAR_OPTION
1213 {"numchar-input", ""},
1219 #ifdef SHIFTJIS_CP932
1230 set_input_encoding(nkf_encoding *enc)
1232 switch (nkf_enc_to_index(enc)) {
1238 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1240 #ifdef SHIFTJIS_CP932
1243 #ifdef UTF8_OUTPUT_ENABLE
1244 ms_ucs_map_f = UCS_MAP_CP932;
1254 case ISO_2022_JP_2004:
1261 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1262 #ifdef SHIFTJIS_CP932
1265 #ifdef UTF8_OUTPUT_ENABLE
1266 ms_ucs_map_f = UCS_MAP_CP932;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_CP10001;
1283 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1284 #ifdef SHIFTJIS_CP932
1287 #ifdef UTF8_OUTPUT_ENABLE
1288 ms_ucs_map_f = UCS_MAP_CP932;
1292 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1293 #ifdef SHIFTJIS_CP932
1296 #ifdef UTF8_OUTPUT_ENABLE
1297 ms_ucs_map_f = UCS_MAP_MS;
1301 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1302 #ifdef SHIFTJIS_CP932
1305 #ifdef UTF8_OUTPUT_ENABLE
1306 ms_ucs_map_f = UCS_MAP_ASCII;
1309 case SHIFT_JISX0213:
1310 case SHIFT_JIS_2004:
1312 #ifdef SHIFTJIS_CP932
1314 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1320 #ifdef SHIFTJIS_CP932
1324 #ifdef UTF8_INPUT_ENABLE
1325 #ifdef UNICODE_NORMALIZATION
1333 input_endian = ENDIAN_BIG;
1337 input_endian = ENDIAN_LITTLE;
1342 input_endian = ENDIAN_BIG;
1346 input_endian = ENDIAN_LITTLE;
1353 set_output_encoding(nkf_encoding *enc)
1355 switch (nkf_enc_to_index(enc)) {
1357 #ifdef SHIFTJIS_CP932
1358 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP932;
1365 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1366 #ifdef SHIFTJIS_CP932
1367 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1369 #ifdef UTF8_OUTPUT_ENABLE
1370 ms_ucs_map_f = UCS_MAP_CP932;
1374 #ifdef SHIFTJIS_CP932
1375 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef SHIFTJIS_CP932
1381 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1385 case ISO_2022_JP_2004:
1388 #ifdef SHIFTJIS_CP932
1389 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1395 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1396 #ifdef UTF8_OUTPUT_ENABLE
1397 ms_ucs_map_f = UCS_MAP_CP932;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_CP10001;
1407 #ifdef SHIFTJIS_CP932
1408 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 ms_ucs_map_f = UCS_MAP_ASCII;
1416 #ifdef SHIFTJIS_CP932
1417 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1419 #ifdef UTF8_OUTPUT_ENABLE
1420 ms_ucs_map_f = UCS_MAP_ASCII;
1424 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1425 #ifdef SHIFTJIS_CP932
1426 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1428 #ifdef UTF8_OUTPUT_ENABLE
1429 ms_ucs_map_f = UCS_MAP_CP932;
1433 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1435 #ifdef UTF8_OUTPUT_ENABLE
1436 ms_ucs_map_f = UCS_MAP_MS;
1440 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1442 #ifdef UTF8_OUTPUT_ENABLE
1443 ms_ucs_map_f = UCS_MAP_ASCII;
1446 case SHIFT_JISX0213:
1447 case SHIFT_JIS_2004:
1449 #ifdef SHIFTJIS_CP932
1450 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1457 #ifdef SHIFTJIS_CP932
1458 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1461 #ifdef UTF8_OUTPUT_ENABLE
1463 output_bom_f = TRUE;
1467 output_bom_f = TRUE;
1470 output_endian = ENDIAN_LITTLE;
1471 output_bom_f = FALSE;
1474 output_endian = ENDIAN_LITTLE;
1475 output_bom_f = TRUE;
1479 output_bom_f = TRUE;
1482 output_endian = ENDIAN_LITTLE;
1483 output_bom_f = FALSE;
1486 output_endian = ENDIAN_LITTLE;
1487 output_bom_f = TRUE;
/*
 * find_inputcode_byfunc: look through input_code_list for the entry whose
 * iconv_func member is the given converter function.
 *
 * iconv_func: converter to search for; compared by pointer identity.
 * NOTE(review): presumably returns the matching entry, or 0 when none
 * matches -- confirm against the loop and return statements.
 */
1493 static struct input_code*
1494 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
/* start scanning at the head of the input code table */
1497 struct input_code *p = input_code_list;
/* an entry matches when it uses exactly this converter function */
1499 if (iconv_func == p->iconv_func){
1509 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1511 #ifdef INPUT_CODE_FIX
1512 if (f || !input_encoding)
1519 #ifdef INPUT_CODE_FIX
1520 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1526 if (estab_f && iconv_for_check != iconv){
1527 struct input_code *p = find_inputcode_byfunc(iconv);
1529 set_input_codename(p->name);
1532 iconv_for_check = iconv;
1539 x0212_shift(nkf_char c)
1544 if (0x75 <= c && c <= 0x7f){
1545 ret = c + (0x109 - 0x75);
1548 if (0x75 <= c && c <= 0x7f){
1549 ret = c + (0x113 - 0x75);
1557 x0212_unshift(nkf_char c)
1560 if (0x7f <= c && c <= 0x88){
1561 ret = c + (0x75 - 0x7f);
1562 }else if (0x89 <= c && c <= 0x92){
1563 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1567 #endif /* X0212_ENABLE */
1570 is_x0213_2_in_x0212(nkf_char c1)
1572 static const char x0213_2_table[] =
1573 {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
1576 return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */
1577 if (78 <= ku && ku <= 94)
1583 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1588 if (x0213_f && is_x0213_2_in_x0212(ndx)){
1589 if((0x21 <= ndx && ndx <= 0x2F)){
1590 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1591 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1593 }else if(0x6E <= ndx && ndx <= 0x7E){
1594 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1595 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1601 else if(nkf_isgraph(ndx)){
1603 const unsigned short *ptr;
1604 ptr = x0212_shiftjis[ndx - 0x21];
1606 val = ptr[(c1 & 0x7f) - 0x21];
1615 c2 = x0212_shift(c2);
1617 #endif /* X0212_ENABLE */
1619 if(0x7F < c2) return 1;
1620 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1621 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1626 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1628 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1631 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1632 if (0xFC < c1) return 1;
1633 #ifdef SHIFTJIS_CP932
1634 if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){
1635 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1642 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1643 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1649 #endif /* SHIFTJIS_CP932 */
1651 if (!x0213_f && is_ibmext_in_sjis(c2)){
1652 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1655 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1668 if(x0213_f && c2 >= 0xF0){
1669 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1670 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1671 }else{ /* 78<=k<=94 */
1672 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1673 if (0x9E < c1) c2++;
1676 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1677 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1678 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1679 if (0x9E < c1) c2++;
1682 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1689 c2 = x0212_unshift(c2);
1696 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8 bytes, storing the
 * lead byte through p1 and the continuation bytes through p2..p4.
 *
 * Each continuation byte is 0x80 plus six payload bits; the lead byte is the
 * length marker (0xc0 / 0xe0 / 0xf0) plus the remaining high bits.
 * NOTE(review): the handling of val < 0x80, of values that fail
 * nkf_char_unicode_value_p(), and of the unused output slots is not shown
 * here -- confirm before relying on it.
 */
1698 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
/* two-byte form: 5 + 6 payload bits */
1706 }else if (val < 0x800){
1707 *p1 = 0xc0 | (val >> 6);
1708 *p2 = 0x80 | (val & 0x3f);
/* three-byte form: 4 + 6 + 6 payload bits */
1711 } else if (nkf_char_unicode_bmp_p(val)) {
1712 *p1 = 0xe0 | (val >> 12);
1713 *p2 = 0x80 | ((val >> 6) & 0x3f);
1714 *p3 = 0x80 | ( val & 0x3f);
/* four-byte form: 3 + 6 + 6 + 6 payload bits */
1716 } else if (nkf_char_unicode_value_p(val)) {
1717 *p1 = 0xf0 | (val >> 18);
1718 *p2 = 0x80 | ((val >> 12) & 0x3f);
1719 *p3 = 0x80 | ((val >> 6) & 0x3f);
1720 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: assemble a code point from up to four UTF-8 bytes.
 *
 * c1 is the lead byte and selects the sequence length; c2..c4 are the
 * continuation bytes, each contributing its low six bits.
 * NOTE(review): continuation bytes are only masked with 0x3F in the lines
 * below, not range-checked -- callers appear to be expected to validate them.
 */
1730 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* 0x80..0xC1 cannot start a valid sequence */
1737 else if (c1 <= 0xC1) {
1738 /* trail byte or invalid */
/* lead byte 0xC2..0xDF: two-byte sequence, 5 bits from c1 */
1741 else if (c1 <= 0xDF) {
1743 wc = (c1 & 0x1F) << 6;
/* lead byte 0xE0..0xEF: three-byte sequence, 4 bits from c1 */
1746 else if (c1 <= 0xEF) {
1748 wc = (c1 & 0x0F) << 12;
1749 wc |= (c2 & 0x3F) << 6;
/*
 * NOTE(review): this branch tests c2 while every other branch tests c1.
 * It looks like a typo for (c1 <= 0xF4): as written, a lead byte above 0xF4
 * with any c2 <= 0xF4 is decoded as a four-byte sequence. Confirm.
 * NOTE(review): the lead-byte mask is 0x0F; a four-byte lead carries only
 * three payload bits (0x07). The two masks agree for 0xF0..0xF4 only.
 */
1752 else if (c2 <= 0xF4) {
1754 wc = (c1 & 0x0F) << 18;
1755 wc |= (c2 & 0x3F) << 12;
1756 wc |= (c3 & 0x3F) << 6;
1766 #ifdef UTF8_INPUT_ENABLE
1768 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1769 const unsigned short *const *pp, nkf_char psize,
1770 nkf_char *p2, nkf_char *p1)
1773 const unsigned short *p;
1776 if (pp == 0) return 1;
1779 if (c1 < 0 || psize <= c1) return 1;
1781 if (p == 0) return 1;
1784 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1786 if (val == 0) return 1;
1787 if (no_cp932ext_f && (
1788 (val>>8) == 0x2D || /* NEC special characters */
1789 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1797 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1805 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1807 const unsigned short *const *pp;
1808 const unsigned short *const *const *ppp;
1809 static const char no_best_fit_chars_table_C2[] =
1810 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1811 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1812 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1813 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1814 static const char no_best_fit_chars_table_C2_ms[] =
1815 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1816 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1817 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1818 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1819 static const char no_best_fit_chars_table_932_C2[] =
1820 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1821 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1822 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1823 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1824 static const char no_best_fit_chars_table_932_C3[] =
1825 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1826 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1827 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1828 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1834 }else if(c2 < 0xe0){
1835 if(no_best_fit_chars_f){
1836 if(ms_ucs_map_f == UCS_MAP_CP932){
1839 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1842 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1845 }else if(!cp932inv_f){
1848 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1851 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1854 }else if(ms_ucs_map_f == UCS_MAP_MS){
1855 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1856 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1874 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1875 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1876 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1877 x0213_f ? utf8_to_euc_2bytes_x0213 :
1879 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1880 }else if(c0 < 0xF0){
1881 if(no_best_fit_chars_f){
1882 if(ms_ucs_map_f == UCS_MAP_CP932){
1883 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1884 }else if(ms_ucs_map_f == UCS_MAP_MS){
1889 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1892 if(c0 == 0x92) return 1;
1897 if(c1 == 0x80 || c0 == 0x9C) return 1;
1900 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1905 if(c0 == 0x94) return 1;
1908 if(c0 == 0xBB) return 1;
1918 if(c0 == 0x95) return 1;
1921 if(c0 == 0xA5) return 1;
1928 if(c0 == 0x8D) return 1;
1931 if(c0 == 0x9E && !cp932inv_f) return 1;
1934 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1942 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1943 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1944 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1945 x0213_f ? utf8_to_euc_3bytes_x0213 :
1947 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1949 #ifdef SHIFTJIS_CP932
1950 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1952 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1953 s2e_conv(s2, s1, p2, p1);
1962 #ifdef UTF8_OUTPUT_ENABLE
1963 #define X0213_SURROGATE_FIND(tbl, size, euc) do { \
1965 for (i = 0; i < size; i++) \
1966 if (tbl[i][0] == euc) { \
1973 e2w_conv(nkf_char c2, nkf_char c1)
1975 const unsigned short *p;
1977 if (c2 == JIS_X_0201_1976_K) {
1978 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1986 p = euc_to_utf8_1byte;
1988 } else if (is_eucg3(c2)){
1989 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1992 c2 = (c2&0x7f) - 0x21;
1993 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1995 x0213_f ? x0212_to_utf8_2bytes_x0213[c2] :
1996 x0212_to_utf8_2bytes[c2];
2002 c2 = (c2&0x7f) - 0x21;
2003 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2005 x0213_f ? euc_to_utf8_2bytes_x0213[c2] :
2006 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
2007 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
2008 euc_to_utf8_2bytes_ms[c2];
2013 c1 = (c1 & 0x7f) - 0x21;
2014 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) {
2015 nkf_char val = p[c1];
2016 if (x0213_f && 0xD800<=val && val<=0xDBFF) {
2017 nkf_char euc = (c2+0x21)<<8 | (c1+0x21);
2019 if (p==x0212_to_utf8_2bytes_x0213[c2]) {
2020 X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc);
2022 X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc);
2025 return UTF16_TO_UTF32(val, low);
/*
 * e2w_combining: for a JIS X 0213 character (c2, c1) and a Unicode value
 * comb, look up the value stored in column [1] of x0213_combining_table for
 * that character.
 *
 * comb must be one of x0213_combining_chars for the lookup to proceed.
 * The table is keyed in column [0] by the 7-bit row/cell pair packed as
 * (row << 8 | cell).
 * NOTE(review): presumably returns 0 when comb is not a combining character
 * or no table entry matches -- confirm against the return statements.
 */
2034 e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1)
/* first check that comb is a known combining character */
2038 for (i = 0; i < sizeof_x0213_combining_chars; i++)
2039 if (x0213_combining_chars[i] == comb)
/* i ran off the end: comb was not found */
2041 if (i >= sizeof_x0213_combining_chars)
/* strip the high bits so the key matches the table's 7-bit encoding */
2043 euc = (c2&0x7f)<<8 | (c1&0x7f);
2044 for (i = 0; i < sizeof_x0213_combining_table; i++)
2045 if (x0213_combining_table[i][0] == euc)
2046 return x0213_combining_table[i][1];
2052 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
2059 }else if (0xc0 <= c2 && c2 <= 0xef) {
2060 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2061 #ifdef NUMCHAR_OPTION
2064 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
2072 #ifdef UTF8_INPUT_ENABLE
2074 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
2076 nkf_char c1, c2, c3, c4;
2083 else if (nkf_char_unicode_bmp_p(val)){
2084 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2085 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
2088 *p1 = nkf_char_unicode_new(val);
2095 c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2096 c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2097 for (i = 0; i < sizeof_x0213_1_surrogate_table; i++)
2098 if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) {
2099 val = x0213_1_surrogate_table[i][0];
2104 for (i = 0; i < sizeof_x0213_2_surrogate_table; i++)
2105 if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) {
2106 val = x0213_2_surrogate_table[i][0];
2107 *p2 = PREFIX_EUCG3 | (val >> 8);
2113 *p1 = nkf_char_unicode_new(val);
2120 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2122 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
2123 if (iso2022jp_f && !x0201_f) {
2124 c2 = GETA1; c1 = GETA2;
2126 c2 = JIS_X_0201_1976_K;
2130 }else if (c2 == 0x8f){
2134 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2135 /* encoding is eucJP-ms, so convert to the Unicode Private Use Area */
2136 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2139 c2 = (c2 << 8) | (c1 & 0x7f);
2141 #ifdef SHIFTJIS_CP932
2144 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2145 s2e_conv(s2, s1, &c2, &c1);
2152 #endif /* SHIFTJIS_CP932 */
2154 #endif /* X0212_ENABLE */
2155 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2158 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2159 /* encoding is eucJP-ms, so convert to the Unicode Private Use Area */
2160 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2165 #ifdef SHIFTJIS_CP932
2166 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2168 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2169 s2e_conv(s2, s1, &c2, &c1);
2176 #endif /* SHIFTJIS_CP932 */
2184 s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2186 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2187 if (iso2022jp_f && !x0201_f) {
2188 c2 = GETA1; c1 = GETA2;
2192 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2194 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2196 if(c1 == 0x7F) return 0;
2197 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2200 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2201 if (ret) return ret;
/*
 * x0213_wait_combining_p: test whether wc appears in column [1] of
 * x0213_combining_table, i.e. whether it is the first code point of some
 * two-code-point sequence that the table maps to a single JIS X 0213
 * character. Callers use the result to decide whether to wait for a
 * following combining character before converting wc.
 */
2208 x0213_wait_combining_p(nkf_char wc)
2211 for (i = 0; i < sizeof_x0213_combining_table; i++) {
/* column [1] holds the first (base) code point of each pair */
2212 if (x0213_combining_table[i][1] == wc) {
/*
 * x0213_combining_p: test whether wc is one of the code points listed in
 * x0213_combining_chars (linear scan of the whole list).
 */
2220 x0213_combining_p(nkf_char wc)
2223 for (i = 0; i < sizeof_x0213_combining_chars; i++) {
2224 if (x0213_combining_chars[i] == wc) {
2232 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2234 nkf_char ret = 0, c4 = 0;
2235 static const char w_iconv_utf8_1st_byte[] =
2237 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2238 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2239 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2240 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2247 if (c1 < 0 || 0xff < c1) {
2248 }else if (c1 == 0) { /* 0 : 1 byte*/
2250 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2253 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2255 if (c2 < 0x80 || 0xBF < c2) return 0;
2258 if (c3 == 0) return -1;
2259 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2264 if (c3 == 0) return -1;
2265 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2269 if (c3 == 0) return -1;
2270 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2274 if (c3 == 0) return -2;
2275 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2279 if (c3 == 0) return -2;
2280 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2284 if (c3 == 0) return -2;
2285 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2293 if (c1 == 0 || c1 == EOF){
2294 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2295 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2298 if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4)))
2300 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2309 w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3)
2311 /* continue from the line below 'return -3;' in w_iconv() */
2312 nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2);
/*
 * Negative result codes shared by the Unicode input converters:
 *   INVALID_CODE_RANGE  - the value is a surrogate or lies outside the
 *                         accepted code point range;
 *   WAIT_COMBINING_CHAR - the code point may be the first half of a
 *                         combined sequence, so the caller should supply the
 *                         next code point before converting;
 *   NOT_COMBINED        - the pair of code points does not form a combined
 *                         character.
 * NOTE(review): the expansions are unparenthesized negative literals.
 */
2319 #define NKF_ICONV_INVALID_CODE_RANGE -13
2320 #define NKF_ICONV_WAIT_COMBINING_CHAR -14
2321 #define NKF_ICONV_NOT_COMBINED -15
/*
 * unicode_iconv: convert one Unicode code point wc to the internal
 * (c2, c1) representation.
 *
 * wc:        code point to convert.
 * nocombine: when non-zero, skip the "wait for a combining character" check
 *            and convert wc on its own.
 * Returns NKF_ICONV_INVALID_CODE_RANGE for surrogates and out-of-range
 * values, NKF_ICONV_WAIT_COMBINING_CHAR when more input is needed, or a
 * non-zero result of w16e_conv() when that conversion fails.
 */
2323 unicode_iconv(nkf_char wc, int nocombine)
/* wc >> 11 == 27 selects exactly 0xD800..0xDFFF, the surrogate block */
2331 }else if ((wc>>11) == 27) {
2332 /* unpaired surrogate */
2333 return NKF_ICONV_INVALID_CODE_RANGE;
/* NOTE(review): strict '<' sends U+FFFF to the next branch -- confirm intent */
2334 }else if (wc < 0xFFFF) {
/* in JIS X 0213 mode a possible base character is held back for its pair */
2335 if (!nocombine && x0213_f && x0213_wait_combining_p(wc))
2336 return NKF_ICONV_WAIT_COMBINING_CHAR;
2337 ret = w16e_conv(wc, &c2, &c1);
2338 if (ret) return ret;
/* NOTE(review): strict '<' rejects U+10FFFF itself as out of range -- confirm */
2339 }else if (wc < 0x10FFFF) {
/* no table lookup in this branch: the code point is passed through tagged */
2341 c1 = nkf_char_unicode_new(wc);
2343 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * unicode_iconv_combine: try to convert the code point pair (wc, wc2) to a
 * single JIS X 0213 character using x0213_combining_table.
 *
 * wc:  first (base) code point.
 * wc2: following code point, expected to be a combining character.
 * Returns NKF_ICONV_NOT_COMBINED when the pair is not a known combination,
 * or NKF_ICONV_INVALID_CODE_RANGE when wc2 is a surrogate or out of range.
 * NOTE(review): the success path after the table hit is not shown here --
 * confirm what is returned and how (c2, c1) are delivered.
 */
2350 unicode_iconv_combine(nkf_char wc, nkf_char wc2)
2356 return NKF_ICONV_NOT_COMBINED;
/* wc2 >> 11 == 27 selects exactly 0xD800..0xDFFF, the surrogate block */
2357 }else if ((wc2>>11) == 27) {
2358 /* unpaired surrogate */
2359 return NKF_ICONV_INVALID_CODE_RANGE;
/* NOTE(review): strict '<' sends U+FFFF to the next branch -- confirm intent */
2360 }else if (wc2 < 0xFFFF) {
/* cheap rejection before scanning the pair table */
2361 if (!x0213_combining_p(wc2))
2362 return NKF_ICONV_NOT_COMBINED;
/* table columns: [0] packed JIS code, [1] base code point, [2] combining code point */
2363 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2364 if (x0213_combining_table[i][1] == wc &&
2365 x0213_combining_table[i][2] == wc2) {
/* unpack the JIS code: upper part is the row, low 7 bits the cell */
2366 c2 = x0213_combining_table[i][0] >> 8;
2367 c1 = x0213_combining_table[i][0] & 0x7f;
/* NOTE(review): strict '<' treats U+10FFFF itself as out of range -- confirm */
2372 }else if (wc2 < 0x10FFFF) {
2373 return NKF_ICONV_NOT_COMBINED;
2375 return NKF_ICONV_INVALID_CODE_RANGE;
2377 return NKF_ICONV_NOT_COMBINED;
2381 w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6)
2384 wc = nkf_utf8_to_unicode(c1, c2, c3, 0);
2385 wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0);
2388 return unicode_iconv_combine(wc, wc2);
/*
 * "Input incomplete" results of the UTF-16/UTF-32 input converters, encoded
 * as the two largest size_t values. nkf_iconv_utf_16() returns
 * NKF_ICONV_NEED_TWO_MORE_BYTES when a high-surrogate unit is not followed
 * by a low-surrogate unit.
 * NOTE(review): the expansions are not wrapped in an outer pair of
 * parentheses.
 */
2391 #define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1
2392 #define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2
2394 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2403 if (input_endian == ENDIAN_BIG) {
2404 if (0xD8 <= c1 && c1 <= 0xDB) {
2405 if (0xDC <= c3 && c3 <= 0xDF) {
2406 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2407 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2412 if (0xD8 <= c2 && c2 <= 0xDB) {
2413 if (0xDC <= c4 && c4 <= 0xDF) {
2414 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2415 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2421 return (*unicode_iconv)(wc, FALSE);
2425 nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2429 if (input_endian == ENDIAN_BIG) {
2430 if (0xD8 <= c3 && c3 <= 0xDB) {
2431 return NKF_ICONV_NOT_COMBINED;
2437 if (0xD8 <= c2 && c2 <= 0xDB) {
2438 return NKF_ICONV_NOT_COMBINED;
2445 return unicode_iconv_combine(wc, wc2);
2449 nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2)
2452 if (input_endian == ENDIAN_BIG)
2456 return (*unicode_iconv)(wc, TRUE);
2460 w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2463 return 16; /* different from w_iconv32 */
2467 w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2470 return 32; /* different from w_iconv16 */
/*
 * utf32_to_nkf_char: assemble a code point from the four bytes of one
 * UTF-32 unit, in the byte order selected by input_endian.
 *
 * c1..c4 are the bytes in stream order. Only three of them contribute to
 * the value in each ordering; the fourth is the most significant byte of
 * the unit.
 * NOTE(review): the unused byte does not appear to be checked for zero in
 * the lines below -- confirm.
 * Returns NKF_ICONV_INVALID_CODE_RANGE for an input_endian value that none
 * of the orderings handles.
 */
2474 utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2478 switch(input_endian){
/* most significant byte first: c1 c2 c3 c4 */
2480 wc = c2 << 16 | c3 << 8 | c4;
/* least significant byte first: c4 c3 c2 c1 */
2483 wc = c3 << 16 | c2 << 8 | c1;
/* byte-swapped 16-bit halves, high half first: c2 c1 c4 c3 */
2486 wc = c1 << 16 | c4 << 8 | c3;
/* 16-bit halves swapped, bytes in order within each half: c3 c4 c1 c2 */
2489 wc = c4 << 16 | c1 << 8 | c2;
2492 return NKF_ICONV_INVALID_CODE_RANGE;
2498 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2507 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2511 return (*unicode_iconv)(wc, FALSE);
2515 nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8)
2519 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2522 wc2 = utf32_to_nkf_char(c5, c6, c7, c8);
2526 return unicode_iconv_combine(wc, wc2);
2530 nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2534 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2535 return (*unicode_iconv)(wc, TRUE);
2539 #define output_ascii_escape_sequence(mode) do { \
2540 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2543 (*o_putc)(ascii_intro); \
2544 output_mode = mode; \
2549 output_escape_sequence(int mode)
2551 if (output_mode == mode)
2559 case JIS_X_0201_1976_K:
2567 (*o_putc)(kanji_intro);
2592 j_oconv(nkf_char c2, nkf_char c1)
2594 #ifdef NUMCHAR_OPTION
2595 if (c2 == 0 && nkf_char_unicode_p(c1)){
2596 w16e_conv(c1, &c2, &c1);
2597 if (c2 == 0 && nkf_char_unicode_p(c1)){
2598 c2 = c1 & VALUE_MASK;
2599 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2602 c2 = 0x7F + c1 / 94;
2603 c1 = 0x21 + c1 % 94;
2605 if (encode_fallback) (*encode_fallback)(c1);
2612 output_ascii_escape_sequence(ASCII);
2615 else if (c2 == EOF) {
2616 output_ascii_escape_sequence(ASCII);
2619 else if (c2 == ISO_8859_1) {
2620 output_ascii_escape_sequence(ISO_8859_1);
2623 else if (c2 == JIS_X_0201_1976_K) {
2624 output_escape_sequence(JIS_X_0201_1976_K);
2627 } else if (is_eucg3(c2)){
2628 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2629 (*o_putc)(c2 & 0x7f);
2634 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2635 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2636 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2643 e_oconv(nkf_char c2, nkf_char c1)
2645 if (c2 == 0 && nkf_char_unicode_p(c1)){
2646 w16e_conv(c1, &c2, &c1);
2647 if (c2 == 0 && nkf_char_unicode_p(c1)){
2648 c2 = c1 & VALUE_MASK;
2649 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2653 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2654 c1 = 0x21 + c1 % 94;
2657 (*o_putc)((c2 & 0x7f) | 0x080);
2658 (*o_putc)(c1 | 0x080);
2660 (*o_putc)((c2 & 0x7f) | 0x080);
2661 (*o_putc)(c1 | 0x080);
2665 if (encode_fallback) (*encode_fallback)(c1);
2673 } else if (c2 == 0) {
2674 output_mode = ASCII;
2676 } else if (c2 == JIS_X_0201_1976_K) {
2677 output_mode = EUC_JP;
2678 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2679 } else if (c2 == ISO_8859_1) {
2680 output_mode = ISO_8859_1;
2681 (*o_putc)(c1 | 0x080);
2683 } else if (is_eucg3(c2)){
2684 output_mode = EUC_JP;
2685 #ifdef SHIFTJIS_CP932
2688 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2689 s2e_conv(s2, s1, &c2, &c1);
2694 output_mode = ASCII;
2696 }else if (is_eucg3(c2)){
2699 (*o_putc)((c2 & 0x7f) | 0x080);
2700 (*o_putc)(c1 | 0x080);
2703 (*o_putc)((c2 & 0x7f) | 0x080);
2704 (*o_putc)(c1 | 0x080);
2708 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2709 set_iconv(FALSE, 0);
2710 return; /* too late to rescue this char */
2712 output_mode = EUC_JP;
2713 (*o_putc)(c2 | 0x080);
2714 (*o_putc)(c1 | 0x080);
2719 s_oconv(nkf_char c2, nkf_char c1)
2721 #ifdef NUMCHAR_OPTION
2722 if (c2 == 0 && nkf_char_unicode_p(c1)){
2723 w16e_conv(c1, &c2, &c1);
2724 if (c2 == 0 && nkf_char_unicode_p(c1)){
2725 c2 = c1 & VALUE_MASK;
2726 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2729 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2731 c1 += 0x40 + (c1 > 0x3e);
2736 if(encode_fallback)(*encode_fallback)(c1);
2745 } else if (c2 == 0) {
2746 output_mode = ASCII;
2748 } else if (c2 == JIS_X_0201_1976_K) {
2749 output_mode = SHIFT_JIS;
2751 } else if (c2 == ISO_8859_1) {
2752 output_mode = ISO_8859_1;
2753 (*o_putc)(c1 | 0x080);
2755 } else if (is_eucg3(c2)){
2756 output_mode = SHIFT_JIS;
2757 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2763 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2764 set_iconv(FALSE, 0);
2765 return; /* too late to rescue this char */
2767 output_mode = SHIFT_JIS;
2768 e2s_conv(c2, c1, &c2, &c1);
2770 #ifdef SHIFTJIS_CP932
2772 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2773 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2779 #endif /* SHIFTJIS_CP932 */
2782 if (prefix_table[(unsigned char)c1]){
2783 (*o_putc)(prefix_table[(unsigned char)c1]);
2789 #ifdef UTF8_OUTPUT_ENABLE
2790 #define OUTPUT_UTF8(val) do { \
2791 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \
2793 if (c2) (*o_putc)(c2); \
2794 if (c3) (*o_putc)(c3); \
2795 if (c4) (*o_putc)(c4); \
2799 w_oconv(nkf_char c2, nkf_char c1)
2805 output_bom_f = FALSE;
2816 if (c2 == 0 && nkf_char_unicode_p(c1)){
2817 val = c1 & VALUE_MASK;
2825 val = e2w_conv(c2, c1);
2827 val2 = e2w_combining(val, c2, c1);
2835 #define OUTPUT_UTF16_BYTES(c1, c2) do { \
2836 if (output_endian == ENDIAN_LITTLE){ \
2845 #define OUTPUT_UTF16(val) do { \
2846 if (nkf_char_unicode_bmp_p(val)) { \
2847 c2 = (val >> 8) & 0xff; \
2849 OUTPUT_UTF16_BYTES(c1, c2); \
2851 val &= VALUE_MASK; \
2852 if (val <= UNICODE_MAX) { \
2853 c2 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ \
2854 c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \
2855 OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \
2856 OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \
2862 w_oconv16(nkf_char c2, nkf_char c1)
2865 output_bom_f = FALSE;
2866 OUTPUT_UTF16_BYTES(0xFF, 0xFE);
2874 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2878 val = e2w_conv(c2, c1);
2880 val2 = e2w_combining(val, c2, c1);
2885 OUTPUT_UTF16_BYTES(c1, c2);
/*
 * OUTPUT_UTF32(c): write the code point c through o_putc one byte at a time.
 * When output_endian is ENDIAN_LITTLE the low byte goes first, otherwise the
 * bytes are written from high to low. The argument is evaluated once per
 * emitted byte, so it must be free of side effects.
 * NOTE(review): the emission of the fourth (top) byte is not shown in the
 * lines below -- confirm.
 */
2889 #define OUTPUT_UTF32(c) do { \
2890 if (output_endian == ENDIAN_LITTLE){ \
2891 (*o_putc)( (c) & 0xFF); \
2892 (*o_putc)(((c) >> 8) & 0xFF); \
2893 (*o_putc)(((c) >> 16) & 0xFF); \
2897 (*o_putc)(((c) >> 16) & 0xFF); \
2898 (*o_putc)(((c) >> 8) & 0xFF); \
2899 (*o_putc)( (c) & 0xFF); \
2904 w_oconv32(nkf_char c2, nkf_char c1)
2907 output_bom_f = FALSE;
2908 if (output_endian == ENDIAN_LITTLE){
2926 if (c2 == ISO_8859_1) {
2928 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2932 val = e2w_conv(c2, c1);
2934 val2 = e2w_combining(val, c2, c1);
/*
 * Penalty flags for input encoding detection. Each flag is a distinct bit
 * (every definition is the previous one shifted left by one), so they can be
 * OR-ed into and masked out of input_code.score independently. Because the
 * bits ascend in this order, a flag defined later outweighs all earlier
 * ones when two scores are compared numerically.
 * NOTE(review): h_conv() appears to prefer the candidate with the lowest
 * score -- confirm.
 */
2943 #define SCORE_L2 (1) /* Kanji Level 2 */
2944 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2945 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2946 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2947 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2948 #define SCORE_X0213 (SCORE_X0212 << 1) /* JIS X 0213 */
2949 #define SCORE_NO_EXIST (SCORE_X0213 << 1) /* Undefined Characters */
2950 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2951 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
/* score given to a candidate by status_reset() */
2953 #define SCORE_INIT (SCORE_iMIME)
2955 static const nkf_char score_table_A0[] = {
2958 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2959 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213,
2962 static const nkf_char score_table_F0[] = {
2963 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2964 SCORE_L2, SCORE_DEPEND, SCORE_X0213, SCORE_X0213,
2965 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2966 SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR,
2969 static const nkf_char score_table_8FA0[] = {
2970 0, SCORE_X0213, SCORE_X0212, SCORE_X0213,
2971 SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212,
2972 SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2973 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2976 static const nkf_char score_table_8FE0[] = {
2977 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2978 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2979 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2980 SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213,
2983 static const nkf_char score_table_8FF0[] = {
2984 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212,
2985 SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2986 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2987 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
/*
 * set_code_score: add the SCORE_* bits in score to ptr->score.
 * Bits already set are left as they are.
 */
2991 set_code_score(struct input_code *ptr, nkf_char score)
2994 ptr->score |= score;
/*
 * clr_code_score: remove the SCORE_* bits in score from ptr->score.
 * Other bits are left as they are.
 */
2999 clr_code_score(struct input_code *ptr, nkf_char score)
3002 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf[0..1] and add the
 * matching SCORE_* penalty to ptr->score.
 *
 * The first matching rule wins: error, half-width kana (SS2), three-byte
 * 0x8f characters, characters with no Unicode mapping, then penalties
 * selected by the row byte.
 */
3007 code_score(struct input_code *ptr)
3009 nkf_char c2 = ptr->buf[0];
3010 nkf_char c1 = ptr->buf[1];
3012 set_code_score(ptr, SCORE_ERROR);
3013 }else if (c2 == SS2){
3014 set_code_score(ptr, SCORE_KANA);
3015 }else if (c2 == 0x8f){
/* bits 4-6 of c1 pick the per-row table, the low nibble indexes it */
3016 if ((c1 & 0x70) == 0x20){
3017 set_code_score(ptr, score_table_8FA0[c1 & 0x0f]);
3018 }else if ((c1 & 0x70) == 0x60){
3019 set_code_score(ptr, score_table_8FE0[c1 & 0x0f]);
3020 }else if ((c1 & 0x70) == 0x70){
3021 set_code_score(ptr, score_table_8FF0[c1 & 0x0f]);
/* rows without a dedicated table get the plain JIS X 0212 penalty */
3023 set_code_score(ptr, SCORE_X0212);
3025 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() yields 0 for a character with no Unicode mapping */
3026 }else if (!e2w_conv(c2, c1)){
3027 set_code_score(ptr, SCORE_NO_EXIST);
/* two-byte characters: same nibble scheme, applied to the row byte c2 */
3029 }else if ((c2 & 0x70) == 0x20){
3030 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
3031 }else if ((c2 & 0x70) == 0x70){
3032 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* remaining rows 0x50..0x6f */
3033 }else if ((c2 & 0x70) >= 0x50){
3034 set_code_score(ptr, SCORE_L2);
3039 status_disable(struct input_code *ptr)
3044 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
3048 status_push_ch(struct input_code *ptr, nkf_char c)
3050 ptr->buf[ptr->index++] = c;
3054 status_clear(struct input_code *ptr)
3061 status_reset(struct input_code *ptr)
3064 ptr->score = SCORE_INIT;
3068 status_reinit(struct input_code *ptr)
3071 ptr->_file_stat = 0;
3075 status_check(struct input_code *ptr, nkf_char c)
3077 if (c <= DEL && estab_f){
3083 s_status(struct input_code *ptr, nkf_char c)
3087 status_check(ptr, c);
3092 }else if (nkf_char_unicode_p(c)){
3094 }else if (0xa1 <= c && c <= 0xdf){
3095 status_push_ch(ptr, SS2);
3096 status_push_ch(ptr, c);
3099 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
3101 status_push_ch(ptr, c);
3102 }else if (0xed <= c && c <= 0xee){
3104 status_push_ch(ptr, c);
3105 #ifdef SHIFTJIS_CP932
3106 }else if (is_ibmext_in_sjis(c)){
3108 status_push_ch(ptr, c);
3109 #endif /* SHIFTJIS_CP932 */
3111 }else if (0xf0 <= c && c <= 0xfc){
3113 status_push_ch(ptr, c);
3114 #endif /* X0212_ENABLE */
3116 status_disable(ptr);
3120 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3121 status_push_ch(ptr, c);
3122 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3126 status_disable(ptr);
3130 #ifdef SHIFTJIS_CP932
3131 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
3132 status_push_ch(ptr, c);
3133 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
3134 set_code_score(ptr, SCORE_CP932);
3139 #endif /* SHIFTJIS_CP932 */
3140 status_disable(ptr);
3143 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3144 status_push_ch(ptr, c);
3145 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3146 set_code_score(ptr, SCORE_CP932);
3149 status_disable(ptr);
3156 e_status(struct input_code *ptr, nkf_char c)
3160 status_check(ptr, c);
3165 }else if (nkf_char_unicode_p(c)){
3167 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
3169 status_push_ch(ptr, c);
3171 }else if (0x8f == c){
3173 status_push_ch(ptr, c);
3174 #endif /* X0212_ENABLE */
3176 status_disable(ptr);
3180 if (0xa1 <= c && c <= 0xfe){
3181 status_push_ch(ptr, c);
3185 status_disable(ptr);
3190 if (0xa1 <= c && c <= 0xfe){
3192 status_push_ch(ptr, c);
3194 status_disable(ptr);
3196 #endif /* X0212_ENABLE */
3200 #ifdef UTF8_INPUT_ENABLE
3202 w_status(struct input_code *ptr, nkf_char c)
3206 status_check(ptr, c);
3211 }else if (nkf_char_unicode_p(c)){
3213 }else if (0xc0 <= c && c <= 0xdf){
3215 status_push_ch(ptr, c);
3216 }else if (0xe0 <= c && c <= 0xef){
3218 status_push_ch(ptr, c);
3219 }else if (0xf0 <= c && c <= 0xf4){
3221 status_push_ch(ptr, c);
3223 status_disable(ptr);
3228 if (0x80 <= c && c <= 0xbf){
3229 status_push_ch(ptr, c);
3230 if (ptr->index > ptr->stat){
3231 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
3232 && ptr->buf[2] == 0xbf);
3233 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
3234 &ptr->buf[0], &ptr->buf[1]);
3241 status_disable(ptr);
3245 if (0x80 <= c && c <= 0xbf){
3246 if (ptr->index < ptr->stat){
3247 status_push_ch(ptr, c);
3252 status_disable(ptr);
3260 code_status(nkf_char c)
3262 int action_flag = 1;
3263 struct input_code *result = 0;
3264 struct input_code *p = input_code_list;
3266 if (!p->status_func) {
3270 if (!p->status_func)
3272 (p->status_func)(p, c);
3275 }else if(p->stat == 0){
3286 if (result && !estab_f){
3287 set_iconv(TRUE, result->iconv_func);
3288 }else if (c <= DEL){
3289 struct input_code *ptr = input_code_list;
3299 nkf_buf_t *std_gc_buf;
3300 nkf_char broken_state;
3301 nkf_buf_t *broken_buf;
3302 nkf_char mimeout_state;
3306 static nkf_state_t *nkf_state = NULL;
3308 #define STD_GC_BUFSIZE (256)
/*
 * nkf_state_init: bring the file-scope nkf_state object into its initial
 * condition. One path empties the three existing buffers; the other
 * allocates the state object and creates the buffers.
 * NOTE(review): presumably the choice depends on whether nkf_state is
 * already non-NULL -- confirm against the condition.
 */
3311 nkf_state_init(void)
/* reuse path: keep the allocations, drop any buffered content */
3314 nkf_buf_clear(nkf_state->std_gc_buf);
3315 nkf_buf_clear(nkf_state->broken_buf);
3316 nkf_buf_clear(nkf_state->nfc_buf);
/* allocation path: create the state object and its buffers */
3319 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3320 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3321 nkf_state->broken_buf = nkf_buf_new(3);
3322 nkf_state->nfc_buf = nkf_buf_new(9);
/* both state machines start from state 0 */
3324 nkf_state->broken_state = 0;
3325 nkf_state->mimeout_state = 0;
3332 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3333 return nkf_buf_pop(nkf_state->std_gc_buf);
3340 std_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3342 nkf_buf_push(nkf_state->std_gc_buf, c);
3348 std_putc(nkf_char c)
/*
 * Hold buffer: input characters read while the input encoding is still
 * undecided are kept here so h_conv() can replay them afterwards.
 * hold_count is the number of characters currently stored.
 */
3355 static nkf_char hold_buf[HOLD_SIZE*2];
3356 static int hold_count = 0;
/*
 * push_hold_buf: append c2 to hold_buf.
 * Returns EOF when the buffer is full after the push, otherwise the new
 * number of stored characters.
 * NOTE(review): when the buffer is already full on entry the push is
 * presumably refused -- confirm against the statement under the guard.
 */
3358 push_hold_buf(nkf_char c2)
3360 if (hold_count >= HOLD_SIZE*2)
3362 hold_buf[hold_count++] = c2;
3363 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3367 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3374 /** it must NOT be in the kanji shift sequence */
3375 /** it must NOT be written in JIS7 */
3376 /** and it must be after 2 byte 8bit code */
3382 while ((c2 = (*i_getc)(f)) != EOF) {
3388 if (push_hold_buf(c2) == EOF || estab_f) {
3394 struct input_code *p = input_code_list;
3395 struct input_code *result = p;
3400 if (p->status_func && p->score < result->score) {
3405 set_iconv(TRUE, result->iconv_func);
3410 ** 1) EOF is detected, or
3411 ** 2) Code is established, or
3412 ** 3) Buffer is FULL (but last word is pushed)
3414 ** in 1) and 3) cases, we continue to use
3415 ** Kanji codes by oconv and leave estab_f unchanged.
3420 while (hold_index < hold_count){
3421 c1 = hold_buf[hold_index++];
3422 if (nkf_char_unicode_p(c1)) {
3426 else if (c1 <= DEL){
3429 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3430 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3434 if (hold_index < hold_count){
3435 c2 = hold_buf[hold_index++];
3446 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3449 if (hold_index < hold_count){
3450 c3 = hold_buf[hold_index++];
3451 } else if ((c3 = (*i_getc)(f)) == EOF) {
3456 if (hold_index < hold_count){
3457 c4 = hold_buf[hold_index++];
3458 } else if ((c4 = (*i_getc)(f)) == EOF) {
3463 (*iconv)(c1, c2, (c3<<8)|c4);
3466 /* 4 bytes UTF-8 (check combining character) */
3467 if (hold_index < hold_count){
3468 c3 = hold_buf[hold_index++];
3470 } else if ((c3 = (*i_getc)(f)) == EOF) {
3471 w_iconv_nocombine(c1, c2, 0);
3474 if (hold_index < hold_count){
3475 c4 = hold_buf[hold_index++];
3477 } else if ((c4 = (*i_getc)(f)) == EOF) {
3478 w_iconv_nocombine(c1, c2, 0);
3479 if (fromhold_count <= 2)
3485 if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) {
3486 w_iconv_nocombine(c1, c2, 0);
3487 if (fromhold_count <= 2) {
3490 } else if (fromhold_count == 3) {
3499 /* 3 bytes EUC or UTF-8 */
3500 if (hold_index < hold_count){
3501 c3 = hold_buf[hold_index++];
3503 } else if ((c3 = (*i_getc)(f)) == EOF) {
3509 if ((*iconv)(c1, c2, c3) == -3) {
3510 /* 6 bytes UTF-8 (check combining character) */
3512 if (hold_index < hold_count){
3513 c4 = hold_buf[hold_index++];
3515 } else if ((c4 = (*i_getc)(f)) == EOF) {
3516 w_iconv_nocombine(c1, c2, c3);
3519 if (hold_index < hold_count){
3520 c5 = hold_buf[hold_index++];
3522 } else if ((c5 = (*i_getc)(f)) == EOF) {
3523 w_iconv_nocombine(c1, c2, c3);
3524 if (fromhold_count == 4)
3530 if (hold_index < hold_count){
3531 c6 = hold_buf[hold_index++];
3533 } else if ((c6 = (*i_getc)(f)) == EOF) {
3534 w_iconv_nocombine(c1, c2, c3);
3535 if (fromhold_count == 5) {
3537 } else if (fromhold_count == 4) {
3546 if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) {
3547 w_iconv_nocombine(c1, c2, c3);
3548 if (fromhold_count == 6) {
3550 } else if (fromhold_count == 5) {
3553 } else if (fromhold_count == 4) {
3566 if (c3 == EOF) break;
3572 * Check and Ignore BOM
3578 switch(c2 = (*i_getc)(f)){
3580 if((c2 = (*i_getc)(f)) == 0x00){
3581 if((c2 = (*i_getc)(f)) == 0xFE){
3582 if((c2 = (*i_getc)(f)) == 0xFF){
3583 if(!input_encoding){
3584 set_iconv(TRUE, w_iconv32);
3586 if (iconv == w_iconv32) {
3588 input_endian = ENDIAN_BIG;
3591 (*i_ungetc)(0xFF,f);
3592 }else (*i_ungetc)(c2,f);
3593 (*i_ungetc)(0xFE,f);
3594 }else if(c2 == 0xFF){
3595 if((c2 = (*i_getc)(f)) == 0xFE){
3596 if(!input_encoding){
3597 set_iconv(TRUE, w_iconv32);
3599 if (iconv == w_iconv32) {
3600 input_endian = ENDIAN_2143;
3603 (*i_ungetc)(0xFF,f);
3604 }else (*i_ungetc)(c2,f);
3605 (*i_ungetc)(0xFF,f);
3606 }else (*i_ungetc)(c2,f);
3607 (*i_ungetc)(0x00,f);
3608 }else (*i_ungetc)(c2,f);
3609 (*i_ungetc)(0x00,f);
3612 if((c2 = (*i_getc)(f)) == 0xBB){
3613 if((c2 = (*i_getc)(f)) == 0xBF){
3614 if(!input_encoding){
3615 set_iconv(TRUE, w_iconv);
3617 if (iconv == w_iconv) {
3621 (*i_ungetc)(0xBF,f);
3622 }else (*i_ungetc)(c2,f);
3623 (*i_ungetc)(0xBB,f);
3624 }else (*i_ungetc)(c2,f);
3625 (*i_ungetc)(0xEF,f);
3628 if((c2 = (*i_getc)(f)) == 0xFF){
3629 if((c2 = (*i_getc)(f)) == 0x00){
3630 if((c2 = (*i_getc)(f)) == 0x00){
3631 if(!input_encoding){
3632 set_iconv(TRUE, w_iconv32);
3634 if (iconv == w_iconv32) {
3635 input_endian = ENDIAN_3412;
3638 (*i_ungetc)(0x00,f);
3639 }else (*i_ungetc)(c2,f);
3640 (*i_ungetc)(0x00,f);
3641 }else (*i_ungetc)(c2,f);
3642 if(!input_encoding){
3643 set_iconv(TRUE, w_iconv16);
3645 if (iconv == w_iconv16) {
3646 input_endian = ENDIAN_BIG;
3650 (*i_ungetc)(0xFF,f);
3651 }else (*i_ungetc)(c2,f);
3652 (*i_ungetc)(0xFE,f);
3655 if((c2 = (*i_getc)(f)) == 0xFE){
3656 if((c2 = (*i_getc)(f)) == 0x00){
3657 if((c2 = (*i_getc)(f)) == 0x00){
3658 if(!input_encoding){
3659 set_iconv(TRUE, w_iconv32);
3661 if (iconv == w_iconv32) {
3662 input_endian = ENDIAN_LITTLE;
3666 (*i_ungetc)(0x00,f);
3667 }else (*i_ungetc)(c2,f);
3668 (*i_ungetc)(0x00,f);
3669 }else (*i_ungetc)(c2,f);
3670 if(!input_encoding){
3671 set_iconv(TRUE, w_iconv16);
3673 if (iconv == w_iconv16) {
3674 input_endian = ENDIAN_LITTLE;
3678 (*i_ungetc)(0xFE,f);
3679 }else (*i_ungetc)(c2,f);
3680 (*i_ungetc)(0xFF,f);
3689 broken_getc(FILE *f)
3693 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3694 return nkf_buf_pop(nkf_state->broken_buf);
3697 if (c=='$' && nkf_state->broken_state != ESC
3698 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3700 nkf_state->broken_state = 0;
3701 if (c1=='@'|| c1=='B') {
3702 nkf_buf_push(nkf_state->broken_buf, c1);
3703 nkf_buf_push(nkf_state->broken_buf, c);
3709 } else if (c=='(' && nkf_state->broken_state != ESC
3710 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3712 nkf_state->broken_state = 0;
3713 if (c1=='J'|| c1=='B') {
3714 nkf_buf_push(nkf_state->broken_buf, c1);
3715 nkf_buf_push(nkf_state->broken_buf, c);
3722 nkf_state->broken_state = c;
3728 broken_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3730 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3731 nkf_buf_push(nkf_state->broken_buf, c);
3736 eol_conv(nkf_char c2, nkf_char c1)
3738 if (guess_f && input_eol != EOF) {
3739 if (c2 == 0 && c1 == LF) {
3740 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3741 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3742 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3744 else if (!input_eol) input_eol = CR;
3745 else if (input_eol != CR) input_eol = EOF;
3747 if (prev_cr || (c2 == 0 && c1 == LF)) {
3749 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3750 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3752 if (c2 == 0 && c1 == CR) prev_cr = CR;
3753 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3757 put_newline(void (*func)(nkf_char))
3759 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3774 oconv_newline(void (*func)(nkf_char, nkf_char))
3776 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3791 Return value of fold_conv()
3793 LF add newline and output char
3794 CR add newline and output nothing
3797 1 (or else) normal output
3799 fold state in prev (previous character)
3801 >0x80 Japanese (X0208/X0201)
3806 This fold algorithm does not preserve leading space in a line.
3807 This is the main difference from fmt.
3810 #define char_size(c2,c1) (c2?2:1)
3813 fold_conv(nkf_char c2, nkf_char c1)
3816 nkf_char fold_state;
3818 if (c1== CR && !fold_preserve_f) {
3819 fold_state=0; /* ignore cr */
3820 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3822 fold_state=0; /* ignore cr */
3823 } else if (c1== BS) {
3824 if (f_line>0) f_line--;
3826 } else if (c2==EOF && f_line != 0) { /* close open last line */
3828 } else if ((c1==LF && !fold_preserve_f)
3829 || ((c1==CR||(c1==LF&&f_prev!=CR))
3830 && fold_preserve_f)) {
3832 if (fold_preserve_f) {
3836 } else if ((f_prev == c1 && !fold_preserve_f)
3837 || (f_prev == LF && fold_preserve_f)
3838 ) { /* duplicate newline */
3841 fold_state = LF; /* output two newline */
3847 if (f_prev&0x80) { /* Japanese? */
3849 fold_state = 0; /* ignore given single newline */
3850 } else if (f_prev==SP) {
3854 if (++f_line<=fold_len)
3858 fold_state = CR; /* fold and output nothing */
3862 } else if (c1=='\f') {
3865 fold_state = LF; /* output newline and clear */
3866 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3867 /* X0208 kankaku or ascii space */
3869 fold_state = 0; /* remove duplicate spaces */
3872 if (++f_line<=fold_len)
3873 fold_state = SP; /* output ASCII space only */
3875 f_prev = SP; f_line = 0;
3876 fold_state = CR; /* fold and output nothing */
3880 prev0 = f_prev; /* we still need this one... , but almost done */
3882 if (c2 || c2 == JIS_X_0201_1976_K)
3883 f_prev |= 0x80; /* this is Japanese */
3884 f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1);
3885 if (f_line<=fold_len) { /* normal case */
3888 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3889 f_line = char_size(c2,c1);
3890 fold_state = LF; /* We can't wait, do fold now */
3891 } else if (c2 == JIS_X_0201_1976_K) {
3892 /* simple kinsoku rules return 1 means no folding */
3893 if (c1==(0xde&0x7f)) fold_state = 1; /*
JIS X 0201 0xDE, halfwidth katakana voiced sound mark (dakuten) */
3894 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
JIS X 0201 0xDF, halfwidth katakana semi-voiced sound mark (handakuten) */
3895 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
JIS X 0201 0xA4, halfwidth ideographic comma */
3896 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
JIS X 0201 0xA3, halfwidth right corner bracket */
3897 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
JIS X 0201 0xA1, halfwidth ideographic full stop */
3898 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3899 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3901 fold_state = LF;/* add one new f_line before this character */
3904 fold_state = LF;/* add one new f_line before this character */
3907 /* kinsoku point in ASCII */
3908 if ( c1==')'|| /* { [ ( */
3919 /* just after special */
3920 } else if (!is_alnum(prev0)) {
3921 f_line = char_size(c2,c1);
3923 } else if ((prev0==SP) || /* ignored new f_line */
3924 (prev0==LF)|| /* ignored new f_line */
3925 (prev0&0x80)) { /* X0208 - ASCII */
3926 f_line = char_size(c2,c1);
3927 fold_state = LF;/* add one new f_line before this character */
3929 fold_state = 1; /* default no fold in ASCII */
3933 if (c1=='"') fold_state = 1; /*
JIS X 0208 0x2122, ideographic comma */
3934 else if (c1=='#') fold_state = 1; /*
JIS X 0208 0x2123, ideographic full stop */
3935 else if (c1=='W') fold_state = 1; /*
JIS X 0208 0x2157, right corner bracket */
3936 else if (c1=='K') fold_state = 1; /*
JIS X 0208 0x214B, fullwidth right parenthesis */
3937 else if (c1=='$') fold_state = 1; /*
JIS X 0208 0x2124, fullwidth comma */
3938 else if (c1=='%') fold_state = 1; /*
JIS X 0208 0x2125, fullwidth full stop */
3939 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3940 else if (c1=='(') fold_state = 1; /*
JIS X 0208 0x2128, fullwidth semicolon */
3941 else if (c1==')') fold_state = 1; /*
JIS X 0208 0x2129, fullwidth question mark */
3942 else if (c1=='*') fold_state = 1; /*
JIS X 0208 0x212A, fullwidth exclamation mark */
3943 else if (c1=='+') fold_state = 1; /*
JIS X 0208 0x212B, voiced sound mark (dakuten) */
3944 else if (c1==',') fold_state = 1; /*
JIS X 0208 0x212C, semi-voiced sound mark (handakuten) */
3945 /* default no fold in kinsoku */
3948 f_line = char_size(c2,c1);
3949 /* add one new f_line before this character */
3952 f_line = char_size(c2,c1);
3954 /* add one new f_line before this character */
3959 /* terminator process */
3960 switch(fold_state) {
3962 oconv_newline(o_fconv);
3968 oconv_newline(o_fconv);
3979 static nkf_char z_prev2=0,z_prev1=0;
3982 z_conv(nkf_char c2, nkf_char c1)
3985 /* if (c2) c1 &= 0x7f; assertion */
3987 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3993 if (z_prev2 == JIS_X_0201_1976_K) {
3994 if (c2 == JIS_X_0201_1976_K) {
3995 if (c1 == (0xde&0x7f)) { /*
dakuten (voiced sound mark) */
3997 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3999 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
handakuten (semi-voiced sound mark) */
4001 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4003 } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /*
handakuten (semi-voiced sound mark) */
4005 (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]);
4010 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4012 if (c2 == JIS_X_0201_1976_K) {
4013 if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) {
4014 /* wait for dakuten or handakuten */
4019 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4030 if (alpha_f&1 && c2 == 0x23) {
4031 /* JISX0208 Alphabet */
4033 } else if (c2 == 0x21) {
4034 /* JISX0208 Kigou */
4039 } else if (alpha_f&4) {
4044 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4050 if (alpha_f&8 && c2 == 0) {
4052 const char *entity = 0;
4054 case '>': entity = ">"; break;
4055 case '<': entity = "<"; break;
4056 case '\"': entity = """; break;
4057 case '&': entity = "&"; break;
4060 while (*entity) (*o_zconv)(0, *entity++);
4066 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4071 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4075 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4079 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4083 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4087 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4091 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4095 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4099 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4104 (*o_zconv)(JIS_X_0201_1976_K, c);
4107 } else if (c2 == 0x25) {
4108 /* JISX0208 Katakana */
4109 static const int fullwidth_to_halfwidth[] =
4111 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4112 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4113 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4114 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4115 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4116 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4117 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4118 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4119 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4120 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4121 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F,
4122 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000
4124 if (fullwidth_to_halfwidth[c1-0x20]){
4125 c2 = fullwidth_to_halfwidth[c1-0x20];
4126 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
4128 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
4132 } else if (c2 == 0 && nkf_char_unicode_p(c1) &&
4133 ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /*
combining dakuten / handakuten */
4134 (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099);
4142 #define rot13(c) ( \
4144 (c <= 'M') ? (c + 13): \
4145 (c <= 'Z') ? (c - 13): \
4147 (c <= 'm') ? (c + 13): \
4148 (c <= 'z') ? (c - 13): \
4152 #define rot47(c) ( \
4154 ( c <= 'O') ? (c + 47) : \
4155 ( c <= '~') ? (c - 47) : \
4160 rot_conv(nkf_char c2, nkf_char c1)
4162 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
4168 (*o_rot_conv)(c2,c1);
4172 hira_conv(nkf_char c2, nkf_char c1)
4176 if (0x20 < c1 && c1 < 0x74) {
4178 (*o_hira_conv)(c2,c1);
4180 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4182 c1 = nkf_char_unicode_new(0x3094);
4183 (*o_hira_conv)(c2,c1);
4186 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4188 (*o_hira_conv)(c2,c1);
4193 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
4196 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4198 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4202 (*o_hira_conv)(c2,c1);
4207 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4209 #define RANGE_NUM_MAX 18
4210 static const nkf_char range[RANGE_NUM_MAX][2] = {
4231 nkf_char start, end, c;
4233 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4237 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4242 for (i = 0; i < RANGE_NUM_MAX; i++) {
4243 start = range[i][0];
4246 if (c >= start && c <= end) {
4251 (*o_iso2022jp_check_conv)(c2,c1);
4255 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4257 static const unsigned char *mime_pattern[] = {
4258 (const unsigned char *)"\075?EUC-JP?B?",
4259 (const unsigned char *)"\075?SHIFT_JIS?B?",
4260 (const unsigned char *)"\075?ISO-8859-1?Q?",
4261 (const unsigned char *)"\075?ISO-8859-1?B?",
4262 (const unsigned char *)"\075?ISO-2022-JP?B?",
4263 (const unsigned char *)"\075?ISO-2022-JP?B?",
4264 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4265 #if defined(UTF8_INPUT_ENABLE)
4266 (const unsigned char *)"\075?UTF-8?B?",
4267 (const unsigned char *)"\075?UTF-8?Q?",
4269 (const unsigned char *)"\075?US-ASCII?Q?",
4274 /* Marker used to raise the priority of the corresponding input code */
4275 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4276 e_iconv, s_iconv, 0, 0, 0, 0, 0,
4277 #if defined(UTF8_INPUT_ENABLE)
4283 static const nkf_char mime_encode[] = {
4284 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
4285 #if defined(UTF8_INPUT_ENABLE)
4292 static const nkf_char mime_encode_method[] = {
4293 'B', 'B','Q', 'B', 'B', 'B', 'Q',
4294 #if defined(UTF8_INPUT_ENABLE)
4302 /* MIME preprocessor fifo */
4304 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
4305 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
4306 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
4308 unsigned char buf[MIME_BUF_SIZE];
4310 unsigned int last; /* decoded */
4311 unsigned int input; /* undecoded */
4313 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
4315 #define MAXRECOVER 20
4318 mime_input_buf_unshift(nkf_char c)
4320 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
4324 mime_ungetc(nkf_char c, ARG_UNUSED FILE *f)
4326 mime_input_buf_unshift(c);
4331 mime_ungetc_buf(nkf_char c, FILE *f)
4334 (*i_mungetc_buf)(c,f);
4336 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
4341 mime_getc_buf(FILE *f)
4343 /* we don't keep eof of mime_input_buf, because it contains ?= as
4344 a terminator. It was checked in mime_integrity. */
4345 return ((mimebuf_f)?
4346 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
4350 switch_mime_getc(void)
4352 if (i_getc!=mime_getc) {
4353 i_mgetc = i_getc; i_getc = mime_getc;
4354 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4355 if(mime_f==STRICT_MIME) {
4356 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4357 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4363 unswitch_mime_getc(void)
4365 if(mime_f==STRICT_MIME) {
4366 i_mgetc = i_mgetc_buf;
4367 i_mungetc = i_mungetc_buf;
4370 i_ungetc = i_mungetc;
4371 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4372 mime_iconv_back = NULL;
4376 mime_integrity(FILE *f, const unsigned char *p)
4380 /* In buffered mode, read until =? or NL or buffer full
4382 mime_input_state.input = mime_input_state.top;
4383 mime_input_state.last = mime_input_state.top;
4385 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
4387 q = mime_input_state.input;
4388 while((c=(*i_getc)(f))!=EOF) {
4389 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
4390 break; /* buffer full */
4392 if (c=='=' && d=='?') {
4393 /* checked. skip header, start decode */
4394 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4395 /* mime_last_input = mime_input_state.input; */
4396 mime_input_state.input = q;
4400 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4402 /* Should we check length mod 4? */
4403 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4406 /* In case of Incomplete MIME, no MIME decode */
4407 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4408 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
4409 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
4410 switch_mime_getc(); /* anyway we need buffered getc */
4415 mime_begin_strict(FILE *f)
4419 const unsigned char *p,*q;
4420 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4422 mime_decode_mode = FALSE;
4423 /* =? has been checked */
4425 p = mime_pattern[j];
4428 for(i=2;p[i]>SP;i++) { /* start at =? */
4429 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4430 /* pattern fails, try next one */
4432 while (mime_pattern[++j]) {
4433 p = mime_pattern[j];
4434 for(k=2;k<i;k++) /* assume length(p) > i */
4435 if (p[k]!=q[k]) break;
4436 if (k==i && nkf_toupper(c1)==p[k]) break;
4438 p = mime_pattern[j];
4439 if (p) continue; /* found next one, continue */
4440 /* all fails, output from recovery buffer */
4448 mime_decode_mode = p[i-2];
4450 mime_iconv_back = iconv;
4451 set_iconv(FALSE, mime_priority_func[j]);
4452 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4454 if (mime_decode_mode=='B') {
4455 mimebuf_f = unbuf_f;
4457 /* do MIME integrity check */
4458 return mime_integrity(f,mime_pattern[j]);
4472 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4473 /* re-read and convert again from mime_buffer. */
4475 /* =? has been checked */
4476 k = mime_input_state.last;
4477 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4478 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4479 /* We accept any character type even if it is broken by new lines */
4480 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4481 if (c1==LF||c1==SP||c1==CR||
4482 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4484 /* Failed. But this could be another MIME preamble */
4486 mime_input_state.last--;
4492 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4493 if (!(++i<MAXRECOVER) || c1==EOF) break;
4494 if (c1=='b'||c1=='B') {
4495 mime_decode_mode = 'B';
4496 } else if (c1=='q'||c1=='Q') {
4497 mime_decode_mode = 'Q';
4501 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4502 if (!(++i<MAXRECOVER) || c1==EOF) break;
4504 mime_decode_mode = FALSE;
4510 if (!mime_decode_mode) {
4511 /* false MIME preamble, restart from mime_buffer */
4512 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4513 /* Since we are in MIME mode until buffer becomes empty, */
4514 /* we never go into mime_begin again for a while. */
4517 /* discard mime preamble, and go to MIME mode */
4518 mime_input_state.last = k;
4519 /* do no MIME integrity check */
4520 return c1; /* used only for checking EOF */
4525 no_putc(ARG_UNUSED nkf_char c)
4531 debug(const char *str)
4534 fprintf(stderr, "%s\n", str ? str : "NULL");
4540 set_input_codename(const char *codename)
4542 if (!input_codename) {
4543 input_codename = codename;
4544 } else if (strcmp(codename, input_codename) != 0) {
4545 input_codename = "";
4550 get_guessed_code(void)
4552 if (input_codename && !*input_codename) {
4553 input_codename = "BINARY";
4555 struct input_code *p = find_inputcode_byfunc(iconv);
4556 if (!input_codename) {
4557 input_codename = "ASCII";
4558 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4559 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4560 input_codename = "CP932";
4561 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4562 if (p->score & SCORE_X0213)
4563 input_codename = "EUC-JIS-2004";
4564 else if (p->score & (SCORE_X0212))
4565 input_codename = "EUCJP-MS";
4566 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4567 input_codename = "CP51932";
4568 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4569 if (p->score & (SCORE_KANA))
4570 input_codename = "CP50221";
4571 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4572 input_codename = "CP50220";
4575 return input_codename;
4578 #if !defined(PERL_XS) && !defined(WIN32DLL)
4580 print_guessed_code(char *filename)
4582 if (filename != NULL) printf("%s: ", filename);
4583 if (input_codename && !*input_codename) {
4586 input_codename = get_guessed_code();
4588 printf("%s\n", input_codename);
4590 printf("%s%s%s%s\n",
4592 iconv != w_iconv16 && iconv != w_iconv32 ? "" :
4593 input_endian == ENDIAN_LITTLE ? " LE" :
4594 input_endian == ENDIAN_BIG ? " BE" :
4596 input_bom_f ? " (BOM)" : "",
4597 input_eol == CR ? " (CR)" :
4598 input_eol == LF ? " (LF)" :
4599 input_eol == CRLF ? " (CRLF)" :
4600 input_eol == EOF ? " (MIXED NL)" :
4610 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4612 nkf_char c1, c2, c3;
4618 if (!nkf_isxdigit(c2)){
4623 if (!nkf_isxdigit(c3)){
4628 return (hex2bin(c2) << 4) | hex2bin(c3);
4634 return hex_getc(':', f, i_cgetc, i_cungetc);
4638 cap_ungetc(nkf_char c, FILE *f)
4640 return (*i_cungetc)(c, f);
4646 return hex_getc('%', f, i_ugetc, i_uungetc);
4650 url_ungetc(nkf_char c, FILE *f)
4652 return (*i_uungetc)(c, f);
4656 #ifdef NUMCHAR_OPTION
4658 numchar_getc(FILE *f)
4660 nkf_char (*g)(FILE *) = i_ngetc;
4661 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4672 if (buf[i] == 'x' || buf[i] == 'X'){
4673 for (j = 0; j < 7; j++){
4675 if (!nkf_isxdigit(buf[i])){
4682 c |= hex2bin(buf[i]);
4685 for (j = 0; j < 8; j++){
4689 if (!nkf_isdigit(buf[i])){
4696 c += hex2bin(buf[i]);
4702 return nkf_char_unicode_new(c);
4712 numchar_ungetc(nkf_char c, FILE *f)
4714 return (*i_nungetc)(c, f);
4718 #ifdef UNICODE_NORMALIZATION
4723 nkf_char (*g)(FILE *f) = i_nfc_getc;
4724 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4725 nkf_buf_t *buf = nkf_state->nfc_buf;
4726 const unsigned char *array;
4727 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4728 nkf_char c = (*g)(f);
4730 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4732 nkf_buf_push(buf, c);
4734 while (lower <= upper) {
4735 int mid = (lower+upper) / 2;
4737 array = normalization_table[mid].nfd;
4738 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4739 if (len >= nkf_buf_length(buf)) {
4743 lower = 1, upper = 0;
4746 nkf_buf_push(buf, c);
4748 if (array[len] != nkf_buf_at(buf, len)) {
4749 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4750 else upper = mid - 1;
4757 array = normalization_table[mid].nfc;
4759 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4760 nkf_buf_push(buf, array[i]);
4764 } while (lower <= upper);
4766 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4767 c = nkf_buf_pop(buf);
4773 nfc_ungetc(nkf_char c, FILE *f)
4775 return (*i_nfc_ungetc)(c, f);
4777 #endif /* UNICODE_NORMALIZATION */
4781 base64decode(nkf_char c)
4786 i = c - 'A'; /* A..Z 0-25 */
4787 } else if (c == '_') {
4788 i = '?' /* 63 */ ; /* _ 63 */
4790 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4792 } else if (c > '/') {
4793 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4794 } else if (c == '+' || c == '-') {
4795 i = '>' /* 62 */ ; /* + and - 62 */
4797 i = '?' /* 63 */ ; /* / 63 */
4805 nkf_char c1, c2, c3, c4, cc;
4806 nkf_char t1, t2, t3, t4, mode, exit_mode;
4807 nkf_char lwsp_count;
4810 nkf_char lwsp_size = 128;
4812 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4813 return mime_input_buf(mime_input_state.top++);
4815 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4816 mime_decode_mode=FALSE;
4817 unswitch_mime_getc();
4818 return (*i_getc)(f);
4821 if (mimebuf_f == FIXED_MIME)
4822 exit_mode = mime_decode_mode;
4825 if (mime_decode_mode == 'Q') {
4826 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4828 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4829 if (c1<=SP || DEL<=c1) {
4830 mime_decode_mode = exit_mode; /* prepare for quit */
4833 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4837 mime_decode_mode = exit_mode; /* prepare for quit */
4838 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4839 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4840 /* end Q encoding */
4841 input_mode = exit_mode;
4843 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4844 while ((c1=(*i_getc)(f))!=EOF) {
4849 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4857 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4858 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4873 lwsp_buf[lwsp_count] = (unsigned char)c1;
4874 if (lwsp_count++>lwsp_size){
4876 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4877 lwsp_buf = lwsp_buf_new;
4883 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4885 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4886 i_ungetc(lwsp_buf[lwsp_count],f);
4889 nkf_xfree(lwsp_buf);
4892 if (c1=='='&&c2<SP) { /* this is soft wrap */
4893 while((c1 = (*i_mgetc)(f)) <=SP) {
4894 if (c1 == EOF) return (EOF);
4896 mime_decode_mode = 'Q'; /* still in MIME */
4897 goto restart_mime_q;
4900 mime_decode_mode = 'Q'; /* still in MIME */
4904 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4905 if (c2<=SP) return c2;
4906 mime_decode_mode = 'Q'; /* still in MIME */
4907 return ((hex2bin(c2)<<4) + hex2bin(c3));
4910 if (mime_decode_mode != 'B') {
4911 mime_decode_mode = FALSE;
4912 return (*i_mgetc)(f);
4916 /* Base64 encoding */
4918 MIME allows line break in the middle of
4919 Base64, but we are very pessimistic in decoding
4920 in unbuf mode because MIME encoded code may be broken by
4921 less or editor's control sequence (such as ESC-[-K) in unbuffered
4922 mode. Ignore incomplete MIME.
4924 mode = mime_decode_mode;
4925 mime_decode_mode = exit_mode; /* prepare for quit */
4927 while ((c1 = (*i_mgetc)(f))<=SP) {
4932 if ((c2 = (*i_mgetc)(f))<=SP) {
4935 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4936 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4939 if ((c1 == '?') && (c2 == '=')) {
4942 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4943 while ((c1=(*i_getc)(f))!=EOF) {
4948 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4956 if ((c1=(*i_getc)(f))!=EOF) {
4960 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4975 lwsp_buf[lwsp_count] = (unsigned char)c1;
4976 if (lwsp_count++>lwsp_size){
4978 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4979 lwsp_buf = lwsp_buf_new;
4985 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4987 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4988 i_ungetc(lwsp_buf[lwsp_count],f);
4991 nkf_xfree(lwsp_buf);
4995 if ((c3 = (*i_mgetc)(f))<=SP) {
4998 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4999 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5003 if ((c4 = (*i_mgetc)(f))<=SP) {
5006 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5007 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5011 mime_decode_mode = mode; /* still in MIME sigh... */
5013 /* BASE 64 decoding */
5015 t1 = 0x3f & base64decode(c1);
5016 t2 = 0x3f & base64decode(c2);
5017 t3 = 0x3f & base64decode(c3);
5018 t4 = 0x3f & base64decode(c4);
5019 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5021 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5022 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5024 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5025 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5027 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5032 return mime_input_buf(mime_input_state.top++);
5035 static const char basis_64[] =
5036 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
5038 #define MIMEOUT_BUF_LENGTH 74
5040 unsigned char buf[MIMEOUT_BUF_LENGTH+1];
5044 /*nkf_char mime_lastchar2, mime_lastchar1;*/
5047 open_mime(nkf_char mode)
5049 const unsigned char *p;
5052 p = mime_pattern[0];
5053 for(i=0;mime_pattern[i];i++) {
5054 if (mode == mime_encode[i]) {
5055 p = mime_pattern[i];
5059 mimeout_mode = mime_encode_method[i];
5061 if (base64_count>45) {
5062 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
5063 (*o_mputc)(mimeout_state.buf[i]);
5066 put_newline(o_mputc);
5069 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
5073 for (;i<mimeout_state.count;i++) {
5074 if (nkf_isspace(mimeout_state.buf[i])) {
5075 (*o_mputc)(mimeout_state.buf[i]);
5085 j = mimeout_state.count;
5086 mimeout_state.count = 0;
5088 mime_putc(mimeout_state.buf[i]);
5093 mime_prechar(nkf_char c2, nkf_char c1)
5095 if (mimeout_mode > 0){
5097 if (base64_count + mimeout_state.count/3*4> 73){
5098 (*o_base64conv)(EOF,0);
5099 oconv_newline(o_base64conv);
5100 (*o_base64conv)(0,SP);
5104 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) {
5105 (*o_base64conv)(EOF,0);
5106 oconv_newline(o_base64conv);
5107 (*o_base64conv)(0,SP);
5113 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
5114 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5115 open_mime(output_mode);
5116 (*o_base64conv)(EOF,0);
5117 oconv_newline(o_base64conv);
5118 (*o_base64conv)(0,SP);
5137 switch(mimeout_mode) {
5142 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
5148 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
5153 if (mimeout_mode > 0) {
5154 if (mimeout_f!=FIXED_MIME) {
5156 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar - encode one byte `c` according to mimeout_mode and
 * write the result through o_mputc.
 *
 * NOTE(review): the case labels and several statements are missing from
 * this listing; the comments below follow the visible expressions only.
 *
 *  - Hex branch: a byte that is not alphanumeric is written as two hex
 *    digits, high nibble first, via bin2hex().
 *  - Base64 branch: bytes are split into 6-bit groups.  The previous byte
 *    is kept in nkf_state->mimeout_state so that its leftover low bits
 *    can be combined with the high bits of the next byte.
 */
5162 mimeout_addchar(nkf_char c)
5164 switch(mimeout_mode) {
5169 } else if(!nkf_isalnum(c)) {
5171 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5172 (*o_mputc)(bin2hex((c&0xf)));
/* Step 1: remember the byte and emit its top 6 bits. */
5180 nkf_state->mimeout_state=c;
5181 (*o_mputc)(basis_64[c>>2]);
/* Step 2: low 2 bits of the saved byte + high 4 bits of c. */
5186 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5187 nkf_state->mimeout_state=c;
/* Step 3: low 4 bits of the saved byte + top 2 bits of c, then the low 6 bits of c. */
5192 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
5193 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc - MIME output stage.  Takes one output byte (or EOF) and, from
 * the visible code, either encodes it with mimeout_addchar(), writes it
 * straight through o_mputc, or stores it in mimeout_state.buf.
 *
 * NOTE(review): many lines of this body are missing from the listing; the
 * comments below describe only what the visible statements do.
 *
 * Two regimes, selected by mimeout_f:
 *  - FIXED_MIME: a newline is inserted with put_newline() once
 *    base64_count exceeds 71.
 *  - otherwise: bytes are collected in mimeout_state.buf (capacity
 *    MIMEOUT_BUF_LENGTH+1) and open_mime() is called when encoding has to
 *    start; behaviour then depends on mimeout_mode ('Q', <= 0, or the
 *    remaining values noted further down as 'B', 1, 2).
 */
5205 mime_putc(nkf_char c)
5210 if (mimeout_f == FIXED_MIME){
5211 if (mimeout_mode == 'Q'){
5212 if (base64_count > 71){
5213 if (c!=CR && c!=LF) {
5215 put_newline(o_mputc);
5220 if (base64_count > 71){
5222 put_newline(o_mputc);
5225 if (c == EOF) { /* c==EOF */
5229 if (c != EOF) { /* c!=EOF */
5235 /* mimeout_f != FIXED_MIME */
5237 if (c == EOF) { /* c==EOF */
/* At EOF: open an encoded section if one is pending, then drain the buffer. */
5238 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
5239 j = mimeout_state.count;
5240 mimeout_state.count = 0;
5242 if (mimeout_mode > 0) {
5243 if (!nkf_isblank(mimeout_state.buf[j-1])) {
5245 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
5248 mimeout_addchar(mimeout_state.buf[i]);
5252 mimeout_addchar(mimeout_state.buf[i]);
5256 mimeout_addchar(mimeout_state.buf[i]);
5262 mimeout_addchar(mimeout_state.buf[i]);
/* lastchar = most recently buffered byte, when the buffer is not empty. */
5268 if (mimeout_state.count > 0){
5269 lastchar = mimeout_state.buf[mimeout_state.count - 1];
5274 if (mimeout_mode=='Q') {
5275 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5276 if (c == CR || c == LF) {
5281 } else if (c <= SP) {
5283 if (base64_count > 70) {
5284 put_newline(o_mputc);
5287 if (!nkf_isblank(c)) {
5292 if (base64_count > 70) {
5294 put_newline(o_mputc);
5297 open_mime(output_mode);
5299 if (!nkf_noescape_mime(c)) {
/* mimeout_mode <= 0: no encoding is active. */
5312 if (mimeout_mode <= 0) {
5313 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5314 output_mode == UTF_8)) {
5315 if (nkf_isspace(c)) {
5317 if (mimeout_mode == -1) {
5320 if (c==CR || c==LF) {
5322 open_mime(output_mode);
/* Write the buffered bytes out unencoded. */
5328 for (i=0;i<mimeout_state.count;i++) {
5329 (*o_mputc)(mimeout_state.buf[i]);
5330 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
/* Restart the buffer with c as its only byte. */
5341 mimeout_state.buf[0] = (char)c;
5342 mimeout_state.count = 1;
/*
 * The buffered text would push the line past 76: look for the literal
 * boundary=" inside the buffer (len is the length of that literal) and
 * break the line either before the buffer or after position i.
 */
5344 if (base64_count > 1
5345 && base64_count + mimeout_state.count > 76
5346 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
5347 static const char *str = "boundary=\"";
5348 static int len = 10;
5351 for (; i < mimeout_state.count - len; ++i) {
5352 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) {
5358 if (i == 0 || i == mimeout_state.count - len) {
5359 put_newline(o_mputc);
5361 if (!nkf_isspace(mimeout_state.buf[0])){
/* Emit buf[0..i], break the line, then shift the remainder down to index 0. */
5368 for (j = 0; j <= i; ++j) {
5369 (*o_mputc)(mimeout_state.buf[j]);
5371 put_newline(o_mputc);
/* NOTE(review): this loop also copies buf[count] (j <= count) -- confirm that index is always in bounds. */
5373 for (; j <= mimeout_state.count; ++j) {
5374 mimeout_state.buf[j - i] = mimeout_state.buf[j];
5376 mimeout_state.count -= i;
5379 mimeout_state.buf[mimeout_state.count++] = (char)c;
/* Buffer full: start encoding. */
5380 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5381 open_mime(output_mode);
5386 if (lastchar==CR || lastchar == LF){
5387 for (i=0;i<mimeout_state.count;i++) {
5388 (*o_mputc)(mimeout_state.buf[i]);
5391 mimeout_state.count = 0;
/* Write all but the last buffered byte, then keep a single SP in the buffer. */
5394 for (i=0;i<mimeout_state.count-1;i++) {
5395 (*o_mputc)(mimeout_state.buf[i]);
5398 mimeout_state.buf[0] = SP;
5399 mimeout_state.count = 1;
5401 open_mime(output_mode);
5404 /* mimeout_mode == 'B', 1, 2 */
5405 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5406 output_mode == UTF_8)) {
5407 if (lastchar == CR || lastchar == LF){
/* After a line end: a blank sends the buffer through the encoder; the other visible path writes it raw. */
5408 if (nkf_isblank(c)) {
5409 for (i=0;i<mimeout_state.count;i++) {
5410 mimeout_addchar(mimeout_state.buf[i]);
5412 mimeout_state.count = 0;
5415 for (i=0;i<mimeout_state.count;i++) {
5416 (*o_mputc)(mimeout_state.buf[i]);
5419 mimeout_state.count = 0;
5421 mimeout_state.buf[mimeout_state.count++] = (char)c;
5424 if (nkf_isspace(c)) {
/* Scan the buffer for a printable byte (strictly between SP and DEL). */
5425 for (i=0;i<mimeout_state.count;i++) {
5426 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
5428 for (i=0;i<mimeout_state.count;i++) {
5429 (*o_mputc)(mimeout_state.buf[i]);
5432 mimeout_state.count = 0;
5435 mimeout_state.buf[mimeout_state.count++] = (char)c;
5436 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5438 for (i=0;i<mimeout_state.count;i++) {
5439 (*o_mputc)(mimeout_state.buf[i]);
5442 mimeout_state.count = 0;
5446 if (mimeout_state.count>0 && SP<c && c!='=') {
5447 mimeout_state.buf[mimeout_state.count++] = (char)c;
5448 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5449 j = mimeout_state.count;
5450 mimeout_state.count = 0;
5452 mimeout_addchar(mimeout_state.buf[i]);
/* Drain whatever is still buffered. */
5459 if (mimeout_state.count>0) {
5460 j = mimeout_state.count;
5461 mimeout_state.count = 0;
5463 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5465 mimeout_addchar(mimeout_state.buf[i]);
5471 (*o_mputc)(mimeout_state.buf[i]);
5473 open_mime(output_mode);
/*
 * base64_conv - output-chain stage installed when MIME output is enabled
 * (module_connection() assigns it to oconv).  Runs the line-length check
 * in mime_prechar() and then forwards the character unchanged to the
 * saved downstream converter o_base64conv.
 */
5480 base64_conv(nkf_char c2, nkf_char c1)
5482 mime_prechar(c2, c1);
5483 (*o_base64conv)(c2,c1);
/*
 * State for the iconv-based converter (nkf_iconv_new / nkf_iconv_convert /
 * nkf_iconv_close).  Only part of the member list is visible in this
 * listing.
 *
 * NOTE(review): nkf_iconv_convert() and nkf_iconv_close() refer to members
 * named `inbuf` and `outbuf`, while the buffers are named input_buffer /
 * output_buffer everywhere else -- the names need to be reconciled.
 */
5487 typedef struct nkf_iconv_t {
5490 size_t input_buffer_size; /* capacity of the input buffer, in bytes */
5491 char *output_buffer; /* conversion output area */
5492 size_t output_buffer_size; /* capacity of output_buffer, in bytes */
/*
 * nkf_iconv_new - set up an iconv-based converter from `fromcode` to
 * `tocode`.
 *
 * Sizes the input buffer at IOBUF_SIZE and the output buffer at twice
 * that, allocates both with nkf_xmalloc(), and opens the conversion
 * descriptor with iconv_open(tocode, fromcode).  A descriptor equal to
 * (iconv_t)-1 is treated as failure and reported with perror().
 *
 * NOTE(review): defects visible in this block:
 *  - `converter` is declared as an nkf_iconv_t object, yet every access
 *    uses `->`, which requires a pointer.
 *  - perror(fprintf("...", fromcode, tocode)) cannot work: fprintf()
 *    takes a FILE * as its first argument and returns an int, while
 *    perror() takes a string.
 */
5496 nkf_iconv_new(char *tocode, char *fromcode)
5498 nkf_iconv_t converter;
5500 converter->input_buffer_size = IOBUF_SIZE;
5501 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5502 converter->output_buffer_size = IOBUF_SIZE * 2;
5503 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5504 converter->cd = iconv_open(tocode, fromcode);
5505 if (converter->cd == (iconv_t)-1)
5509 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5512 perror("can't iconv_open");
/*
 * nkf_iconv_convert - read bytes through i_getc into the converter's input
 * buffer, run iconv() on them, write the result through o_putc, and react
 * to an iconv() failure (keep unconverted input, grow the output buffer,
 * or report the error).
 *
 * NOTE(review): several lines are missing from this listing (the loop
 * structure and the errno dispatch among them).  Defects visible in the
 * lines that are shown:
 *  - the read loop calls (*i_getc)(f), but the parameter is named `input`
 *    and no `f` is declared in the visible code;
 *  - `if (input_length < input_buffer_size) break;` looks inverted: it
 *    leaves the read loop after the first byte instead of when the buffer
 *    is full;
 *  - after iconv() returns, output_length holds the space still unused
 *    and output_buffer has been advanced, so the o_putc loop does not
 *    appear to emit the bytes that were just converted -- confirm against
 *    the iconv() specification;
 *  - realloc() is given converter->outbuf, whereas the member assigned on
 *    success is converter->output_buffer.
 */
5518 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5520 size_t invalid = (size_t)0;
5521 char *input_buffer = converter->input_buffer;
5522 size_t input_length = (size_t)0;
5523 char *output_buffer = converter->output_buffer;
5524 size_t output_length = converter->output_buffer_size;
5529 while ((c = (*i_getc)(f)) != EOF) {
5530 input_buffer[input_length++] = c;
5531 if (input_length < converter->input_buffer_size) break;
5535 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5536 while (output_length-- > 0) {
5537 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
/* (size_t)-1 is iconv()'s failure return. */
5539 if (ret == (size_t) - 1) {
/* Move the unconverted remainder back to the start of the input buffer. */
5542 if (input_buffer != converter->input_buffer)
5543 memmove(converter->input_buffer, input_buffer, input_length);
/* Double the output buffer; the old pointer is only replaced after realloc() succeeds. */
5546 converter->output_buffer_size *= 2;
5547 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5548 if (output_buffer == NULL) {
5549 perror("can't realloc");
5552 converter->output_buffer = output_buffer;
5555 perror("can't iconv");
/*
 * nkf_iconv_close - release a converter: free both buffers with
 * nkf_xfree() and close the descriptor with iconv_close().
 *
 * NOTE(review): the parameter is named `convert` but the body uses
 * `converter`, and the members freed are `inbuf` / `outbuf` rather than
 * the input_buffer / output_buffer names assigned in nkf_iconv_new().
 */
5568 nkf_iconv_close(nkf_iconv_t *convert)
5570 nkf_xfree(converter->inbuf);
5571 nkf_xfree(converter->outbuf);
5572 iconv_close(converter->cd);
5581 struct input_code *p = input_code_list;
5593 mime_f = MIME_DECODE_DEFAULT;
5594 mime_decode_f = FALSE;
5599 x0201_f = NKF_UNSPECIFIED;
5600 iso2022jp_f = FALSE;
5601 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5602 ms_ucs_map_f = UCS_MAP_ASCII;
5604 #ifdef UTF8_INPUT_ENABLE
5605 no_cp932ext_f = FALSE;
5606 no_best_fit_chars_f = FALSE;
5607 encode_fallback = NULL;
5608 unicode_subchar = '?';
5609 input_endian = ENDIAN_BIG;
5611 #ifdef UTF8_OUTPUT_ENABLE
5612 output_bom_f = FALSE;
5613 output_endian = ENDIAN_BIG;
5615 #ifdef UNICODE_NORMALIZATION
5631 #ifdef SHIFTJIS_CP932
5641 for (i = 0; i < 256; i++){
5642 prefix_table[i] = 0;
5646 mimeout_state.count = 0;
5651 fold_preserve_f = FALSE;
5654 kanji_intro = DEFAULT_J;
5655 ascii_intro = DEFAULT_R;
5656 fold_margin = FOLD_MARGIN;
5657 o_zconv = no_connection;
5658 o_fconv = no_connection;
5659 o_eol_conv = no_connection;
5660 o_rot_conv = no_connection;
5661 o_hira_conv = no_connection;
5662 o_base64conv = no_connection;
5663 o_iso2022jp_check_conv = no_connection;
5666 i_ungetc = std_ungetc;
5668 i_bungetc = std_ungetc;
5671 i_mungetc = std_ungetc;
5672 i_mgetc_buf = std_getc;
5673 i_mungetc_buf = std_ungetc;
5674 output_mode = ASCII;
5676 mime_decode_mode = FALSE;
5682 z_prev2=0,z_prev1=0;
5684 iconv_for_check = 0;
5686 input_codename = NULL;
5687 input_encoding = NULL;
5688 output_encoding = NULL;
/*
 * module_connection - wire up the conversion pipeline from the current
 * option flags.
 *
 * Output side: resolves output_encoding (falling back to
 * nkf_default_encoding(), and to ISO-2022-JP when only guessing or when
 * output is suppressed), sets oconv from it, then stacks the optional
 * stages.  Each stage is installed with the pattern
 *     o_X = oconv; oconv = X;
 * i.e. it stores the current head as its downstream and becomes the new
 * head, so the stage installed last is the first to see a character.
 *
 * Input side: i_getc / i_ungetc are wrapped the same way by the optional
 * input filters (cap, url, numchar, nfc, mime, broken).
 *
 * NOTE(review): the conditions guarding most stages and the return
 * statements are missing from this listing.  kanji_convert() treats a
 * negative return value as "no output encoding given".
 */
5696 module_connection(void)
5698 if (input_encoding) set_input_encoding(input_encoding);
5699 if (!output_encoding) {
5700 output_encoding = nkf_default_encoding();
5702 if (!output_encoding) {
5703 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5706 set_output_encoding(output_encoding);
5707 oconv = nkf_enc_to_oconv(output_encoding);
5709 if (nkf_enc_unicode_p(output_encoding))
5710 output_mode = UTF_8;
5712 if (x0201_f == NKF_UNSPECIFIED) {
5713 x0201_f = X0201_DEFAULT;
5716 /* replace continuation module, from output side */
5718 /* output redirection */
5720 if (noout_f || guess_f){
5727 if (mimeout_f == TRUE) {
5728 o_base64conv = oconv; oconv = base64_conv;
5730 /* base64_count = 0; */
5733 if (eolmode_f || guess_f) {
5734 o_eol_conv = oconv; oconv = eol_conv;
5737 o_rot_conv = oconv; oconv = rot_conv;
5740 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5743 o_hira_conv = oconv; oconv = hira_conv;
5746 o_fconv = oconv; oconv = fold_conv;
5749 if (alpha_f || x0201_f) {
5750 o_zconv = oconv; oconv = z_conv;
5754 i_ungetc = std_ungetc;
5755 /* input redirection */
5758 i_cgetc = i_getc; i_getc = cap_getc;
5759 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5762 i_ugetc = i_getc; i_getc = url_getc;
5763 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5766 #ifdef NUMCHAR_OPTION
5768 i_ngetc = i_getc; i_getc = numchar_getc;
5769 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5772 #ifdef UNICODE_NORMALIZATION
5774 i_nfc_getc = i_getc; i_getc = nfc_getc;
5775 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5778 if (mime_f && mimebuf_f==FIXED_MIME) {
5779 i_mgetc = i_getc; i_getc = mime_getc;
5780 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5783 i_bgetc = i_getc; i_getc = broken_getc;
5784 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicit input encoding fixes the input converter; otherwise e_iconv is the starting point. */
5786 if (input_encoding) {
5787 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5789 set_iconv(FALSE, e_iconv);
5793 struct input_code *p = input_code_list;
5802 Conversion main loop. Code detection only.
5805 #if !defined(PERL_XS) && !defined(WIN32DLL)
5812 module_connection();
5813 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands for the conversion loop.
 *
 * They expand to bare `continue` / `break`, so they are only meaningful
 * directly inside the loop they are meant to control.  SKIP and MORE
 * expand to TWO statements: used as the body of an unbraced `if`, only
 * the assignment would be conditional and the `continue` would always
 * run, so always use them inside braces.  (Wrapping them in
 * do { ... } while (0) is not an option: the `continue` would then apply
 * to that inner loop.)
 */
5820 #define NEXT continue /* no output, get next */
5821 #define SKIP c2=0;continue /* no output, clear c2, get next */
5822 #define MORE c2=c1;continue /* need one more byte: keep c1 as c2 */
5823 #define SEND (void)0 /* output c1 and c2, get next */
5824 #define LAST break /* end of loop, go closing */
5825 #define set_input_mode(mode) do { \
5826 input_mode = mode; \
5828 set_input_codename("ISO-2022-JP"); \
5829 debug("ISO-2022-JP"); \
/*
 * kanji_convert - main conversion loop: read the stream `f` through
 * i_getc, classify the input and hand characters to iconv / oconv.
 *
 * NOTE(review): a large part of this body is missing from the listing
 * (including some comment delimiters); this outline covers only the
 * visible statements.
 *
 *  - module_connection() is called first; a negative result is reported
 *    as "no output encoding given" outside PERL_XS / WIN32DLL builds.
 *  - If iconv is w_iconv32 or w_iconv16, input is consumed in 4-byte or
 *    2-byte units, with extra reads when the converter asks for more
 *    bytes or for a possible combining character.
 *  - Otherwise bytes are read one at a time and dispatched on the pending
 *    first byte c2, the current input_mode, escape sequences (ESC $,
 *    ESC (, ESC ., ESC N, ESC &), SI / SO, "=?" MIME starts and line ends.
 *  - After the loop (*iconv)(EOF, 0, 0) is called; if no input_codename
 *    has been set, the input_code_list entry with the lowest score is
 *    used.
 */
5833 kanji_convert(FILE *f)
5835 nkf_char c1=0, c2=0, c3=0, c4=0;
5836 int shift_mode = 0; /* 0, 1, 2, 3 */
5838 int is_8bit = FALSE;
5840 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5845 output_mode = ASCII;
5847 if (module_connection() < 0) {
5848 #if !defined(PERL_XS) && !defined(WIN32DLL)
5849 fprintf(stderr, "no output encoding given\n");
5855 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: four bytes per unit; a second unit is read when a combining character may follow. */
5856 if(iconv == w_iconv32){
5857 while ((c1 = (*i_getc)(f)) != EOF &&
5858 (c2 = (*i_getc)(f)) != EOF &&
5859 (c3 = (*i_getc)(f)) != EOF &&
5860 (c4 = (*i_getc)(f)) != EOF) {
5861 nkf_char c5, c6, c7, c8;
5862 if (nkf_iconv_utf_32(c1, c2, c3, c4) == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5863 if ((c5 = (*i_getc)(f)) != EOF &&
5864 (c6 = (*i_getc)(f)) != EOF &&
5865 (c7 = (*i_getc)(f)) != EOF &&
5866 (c8 = (*i_getc)(f)) != EOF) {
5867 if (nkf_iconv_utf_32_combine(c1, c2, c3, c4, c5, c6, c7, c8)) {
5872 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
5875 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
/* UTF-16 input: two bytes per unit; two more are read when requested or to test for a combining character. */
5881 else if (iconv == w_iconv16) {
5882 while ((c1 = (*i_getc)(f)) != EOF &&
5883 (c2 = (*i_getc)(f)) != EOF) {
5884 size_t ret = nkf_iconv_utf_16(c1, c2, 0, 0);
5885 if (ret == NKF_ICONV_NEED_TWO_MORE_BYTES &&
5886 (c3 = (*i_getc)(f)) != EOF &&
5887 (c4 = (*i_getc)(f)) != EOF) {
5888 nkf_iconv_utf_16(c1, c2, c3, c4);
5889 } else if (ret == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5890 if ((c3 = (*i_getc)(f)) != EOF &&
5891 (c4 = (*i_getc)(f)) != EOF) {
5892 if (nkf_iconv_utf_16_combine(c1, c2, c3, c4)) {
5895 nkf_iconv_utf_16_nocombine(c1, c2);
5898 nkf_iconv_utf_16_nocombine(c1, c2);
/* All other encodings: one byte at a time. */
5906 while ((c1 = (*i_getc)(f)) != EOF) {
5907 #ifdef INPUT_CODE_FIX
5908 if (!input_encoding)
5913 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
5914 /* in case of 8th bit is on */
5915 if (!estab_f&&!mime_decode_mode) {
5916 /* in case of not established yet */
5917 /* It is still ambiguous */
5918 if (h_conv(f, c2, c1)==EOF) {
5926 /* in case of already established */
5928 /* ignore bogus code */
5936 /* 2nd byte of 7 bit code or SJIS */
5940 else if (nkf_char_unicode_p(c1)) {
5946 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5949 }else if (input_codename && input_codename[0] == 'I' &&
5950 0xA1 <= c1 && c1 <= 0xDF) {
5951 /* JIS X 0201 Katakana in 8bit JIS */
5952 c2 = JIS_X_0201_1976_K;
5955 } else if (c1 > DEL) {
5957 if (!estab_f && !iso8859_f) {
5958 /* not established yet */
5960 } else { /* estab_f==TRUE */
5966 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5967 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5969 c2 = JIS_X_0201_1976_K;
5974 /* already established */
5978 } else if (SP < c1 && c1 < DEL) {
5979 /* in case of Roman characters */
5981 /* output 1 shifted byte */
5985 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5986 /* output 1 shifted byte */
5987 c2 = JIS_X_0201_1976_K;
5990 /* look like bogus code */
5993 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5994 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5995 /* in case of Kanji shifted */
5997 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5998 /* Check MIME code */
5999 if ((c1 = (*i_getc)(f)) == EOF) {
6002 } else if (c1 == '?') {
6003 /* =? is mime conversion start sequence */
6004 if(mime_f == STRICT_MIME) {
6005 /* check in real detail */
6006 if (mime_begin_strict(f) == EOF)
6009 } else if (mime_begin(f) == EOF)
6018 /* normal ASCII code */
6021 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
6024 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
6027 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
6028 if ((c1 = (*i_getc)(f)) == EOF) {
6032 else if (c1 == '&') {
6034 if ((c1 = (*i_getc)(f)) == EOF) {
6040 else if (c1 == '$') {
6042 if ((c1 = (*i_getc)(f)) == EOF) {
6043 /* don't send bogus code
6045 (*oconv)(0, '$'); */
6047 } else if (c1 == '@' || c1 == 'B') {
6049 set_input_mode(JIS_X_0208);
6051 } else if (c1 == '(') {
6053 if ((c1 = (*i_getc)(f)) == EOF) {
6054 /* don't send bogus code
6060 } else if (c1 == '@'|| c1 == 'B') {
6062 set_input_mode(JIS_X_0208);
6065 } else if (c1 == 'D'){
6066 set_input_mode(JIS_X_0212);
6068 #endif /* X0212_ENABLE */
6069 } else if (c1 == 'O' || c1 == 'Q'){
6070 set_input_mode(JIS_X_0213_1);
6072 } else if (c1 == 'P'){
6073 set_input_mode(JIS_X_0213_2);
6076 /* could be some special code */
6083 } else if (broken_f&0x2) {
6084 /* accept any ESC-(-x as broken code ... */
6085 input_mode = JIS_X_0208;
6094 } else if (c1 == '(') {
6096 if ((c1 = (*i_getc)(f)) == EOF) {
6097 /* don't send bogus code
6099 (*oconv)(0, '('); */
6102 else if (c1 == 'I') {
6103 /* JIS X 0201 Katakana */
6104 set_input_mode(JIS_X_0201_1976_K);
6108 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
6109 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
6110 set_input_mode(ASCII);
6113 else if (broken_f&0x2) {
6114 set_input_mode(ASCII);
6123 else if (c1 == '.') {
6125 if ((c1 = (*i_getc)(f)) == EOF) {
6128 else if (c1 == 'A') {
6139 else if (c1 == 'N') {
6142 if (g2 == ISO_8859_1) {
6157 } else if (c1 == ESC && iconv == s_iconv) {
6158 /* ESC in Shift_JIS */
6159 if ((c1 = (*i_getc)(f)) == EOF) {
6162 } else if (c1 == '$') {
6164 if ((c1 = (*i_getc)(f)) == EOF) {
6166 } else if (('E' <= c1 && c1 <= 'G') ||
6167 ('O' <= c1 && c1 <= 'Q')) {
/*
 * c1 is one of E F G O P Q here; c1 % 7 maps them to table indexes
 * 6, 0, 1, 2, 3, 4 respectively (index 5 is never selected by these
 * letters).  The chosen base is added to each following byte in the
 * range SP..'z'.
 */
6175 static const nkf_char jphone_emoji_first_table[7] =
6176 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
6177 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
6178 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6179 while (SP <= c1 && c1 <= 'z') {
6180 (*oconv)(0, c1 + c3);
6181 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6197 } else if (c1 == LF || c1 == CR) {
6199 input_mode = ASCII; set_iconv(FALSE, 0);
6201 } else if (mime_decode_f && !mime_decode_mode){
6203 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
6211 } else { /* if (c1 == CR)*/
6212 if ((c1=(*i_getc)(f))!=EOF) {
6216 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
6236 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
6239 if ((c3 = (*i_getc)(f)) != EOF) {
6242 if ((c4 = (*i_getc)(f)) != EOF) {
6244 (*iconv)(c2, c1, c3|c4);
6249 /* 4 bytes UTF-8 (check combining character) */
6250 if ((c3 = (*i_getc)(f)) != EOF) {
6251 if ((c4 = (*i_getc)(f)) != EOF) {
6252 if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) {
6255 w_iconv_nocombine(c2, c1, 0);
6259 w_iconv_nocombine(c2, c1, 0);
6262 w_iconv_nocombine(c2, c1, 0);
6266 /* 3 bytes EUC or UTF-8 */
6267 if ((c3 = (*i_getc)(f)) != EOF) {
6269 if ((*iconv)(c2, c1, c3) == -3) {
6270 /* 6 bytes UTF-8 (check combining character) */
6272 if ((c4 = (*i_getc)(f)) != EOF) {
6273 if ((c5 = (*i_getc)(f)) != EOF) {
6274 if ((c6 = (*i_getc)(f)) != EOF) {
6275 if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) {
6279 w_iconv_nocombine(c2, c1, c3);
6284 w_iconv_nocombine(c2, c1, c3);
6288 w_iconv_nocombine(c2, c1, c3);
6291 w_iconv_nocombine(c2, c1, c3);
6301 0x7F <= c2 && c2 <= 0x92 &&
6302 0x21 <= c1 && c1 <= 0x7E) {
6304 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
6307 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
6311 (*oconv)(PREFIX_EUCG3 | c2, c1);
6313 #endif /* X0212_ENABLE */
6315 (*oconv)(PREFIX_EUCG3 | c2, c1);
6318 (*oconv)(input_mode, c1); /* other special case */
6324 /* goto next_word */
/* End of input: signal EOF to the input converter. */
6329 (*iconv)(EOF, 0, 0);
/* No encoding name was established: pick the candidate with the lowest score. */
6330 if (!input_codename)
6333 struct input_code *p = input_code_list;
6334 struct input_code *result = p;
6336 if (p->score < result->score) result = p;
6339 set_input_codename(result->name);
6341 debug(result->name);
6349 * int options(unsigned char *cp)
/*
 * options - parse one option string `cp` and set the corresponding
 * global flags and encodings.
 *
 * NOTE(review): many lines of this body are missing from the listing
 * (including the `switch` header, most `continue` / `break` statements and
 * some comment delimiters); the notes below follow the visible code only.
 *
 *  - Everything up to and including the first '-' is skipped.
 *  - "--name" long options are looked up in long_option[]; an entry with a
 *    non-empty alias redirects parsing to that alias string (cp_back is
 *    the loop's second continuation condition), otherwise the name is
 *    matched against the strcmp() chain below.
 *  - Single-letter options are handled by the case labels that follow.
 *  - Unknown options are reported on stderr unless built as PERL_XS or
 *    WIN32DLL.
 */
6356 options(unsigned char *cp)
6360 unsigned char *cp_back = NULL;
/* Skip to just past the first '-'. */
6365 while(*cp && *cp++!='-');
6366 while (*cp || cp_back) {
6374 case '-': /* literal options */
6375 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* Match the argument against each long option name, stopping at '=' in the name. */
6379 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) {
6380 p = (unsigned char *)long_option[i].name;
6381 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
6382 if (*p == cp[j] || cp[j] == SP){
6389 #if !defined(PERL_XS) && !defined(WIN32DLL)
6390 fprintf(stderr, "unknown long option: --%s\n", cp);
/* Advance cp to the end of this argument. */
6394 while(*cp && *cp != SP && cp++);
6395 if (long_option[i].alias[0]){
6397 cp = (unsigned char *)long_option[i].alias;
6400 if (strcmp(long_option[i].name, "help") == 0){
6405 if (strcmp(long_option[i].name, "ic=") == 0){
6406 enc = nkf_enc_find((char *)p);
6408 input_encoding = enc;
6411 if (strcmp(long_option[i].name, "oc=") == 0){
6412 enc = nkf_enc_find((char *)p);
6413 /* if (enc <= 0) continue; */
6415 output_encoding = enc;
6418 if (strcmp(long_option[i].name, "guess=") == 0){
6419 if (p[0] == '0' || p[0] == '1') {
6427 if (strcmp(long_option[i].name, "overwrite") == 0){
6430 preserve_time_f = TRUE;
6433 if (strcmp(long_option[i].name, "overwrite=") == 0){
6436 preserve_time_f = TRUE;
6438 backup_suffix = (char *)p;
6441 if (strcmp(long_option[i].name, "in-place") == 0){
6444 preserve_time_f = FALSE;
6447 if (strcmp(long_option[i].name, "in-place=") == 0){
6450 preserve_time_f = FALSE;
6452 backup_suffix = (char *)p;
6457 if (strcmp(long_option[i].name, "cap-input") == 0){
6461 if (strcmp(long_option[i].name, "url-input") == 0){
6466 #ifdef NUMCHAR_OPTION
6467 if (strcmp(long_option[i].name, "numchar-input") == 0){
6473 if (strcmp(long_option[i].name, "no-output") == 0){
6477 if (strcmp(long_option[i].name, "debug") == 0){
6482 if (strcmp(long_option[i].name, "cp932") == 0){
6483 #ifdef SHIFTJIS_CP932
6487 #ifdef UTF8_OUTPUT_ENABLE
6488 ms_ucs_map_f = UCS_MAP_CP932;
6492 if (strcmp(long_option[i].name, "no-cp932") == 0){
6493 #ifdef SHIFTJIS_CP932
6497 #ifdef UTF8_OUTPUT_ENABLE
6498 ms_ucs_map_f = UCS_MAP_ASCII;
6502 #ifdef SHIFTJIS_CP932
6503 if (strcmp(long_option[i].name, "cp932inv") == 0){
6510 if (strcmp(long_option[i].name, "x0212") == 0){
6517 if (strcmp(long_option[i].name, "exec-in") == 0){
6521 if (strcmp(long_option[i].name, "exec-out") == 0){
6526 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6527 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6528 no_cp932ext_f = TRUE;
6531 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6532 no_best_fit_chars_f = TRUE;
6535 if (strcmp(long_option[i].name, "fb-skip") == 0){
6536 encode_fallback = NULL;
6539 if (strcmp(long_option[i].name, "fb-html") == 0){
6540 encode_fallback = encode_fallback_html;
6543 if (strcmp(long_option[i].name, "fb-xml") == 0){
6544 encode_fallback = encode_fallback_xml;
6547 if (strcmp(long_option[i].name, "fb-java") == 0){
6548 encode_fallback = encode_fallback_java;
6551 if (strcmp(long_option[i].name, "fb-perl") == 0){
6552 encode_fallback = encode_fallback_perl;
6555 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6556 encode_fallback = encode_fallback_subchar;
/*
 * --fb-subchar=N: N is parsed as decimal, hexadecimal ("0x" prefix) or
 * octal, then converted with w16e_conv() and packed as i<<8 | j.
 * NOTE(review): `i` (the long_option index of the enclosing lookup) is
 * reused as the digit counter and as a w16e_conv() output here -- confirm
 * this branch leaves the option chain before long_option[i] is read again.
 */
6559 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6560 encode_fallback = encode_fallback_subchar;
6561 unicode_subchar = 0;
6563 /* decimal number */
6564 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6565 unicode_subchar *= 10;
6566 unicode_subchar += hex2bin(p[i]);
6568 }else if(p[1] == 'x' || p[1] == 'X'){
6569 /* hexadecimal number */
6570 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6571 unicode_subchar <<= 4;
6572 unicode_subchar |= hex2bin(p[i]);
6576 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6577 unicode_subchar *= 8;
6578 unicode_subchar += hex2bin(p[i]);
6581 w16e_conv(unicode_subchar, &i, &j);
6582 unicode_subchar = i<<8 | j;
6586 #ifdef UTF8_OUTPUT_ENABLE
6587 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6588 ms_ucs_map_f = UCS_MAP_MS;
6592 #ifdef UNICODE_NORMALIZATION
6593 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=XYZ...: the first graphic character X becomes the prefix for each following graphic character. */
6598 if (strcmp(long_option[i].name, "prefix=") == 0){
6599 if (nkf_isgraph(p[0])){
6600 for (i = 1; nkf_isgraph(p[i]); i++){
6601 prefix_table[p[i]] = p[0];
6606 #if !defined(PERL_XS) && !defined(WIN32DLL)
6607 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6612 case 'b': /* buffered mode */
6615 case 'u': /* non buffered mode */
6618 case 't': /* transparent mode */
6623 } else if (*cp=='2') {
6627 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6635 case 'j': /* JIS output */
6637 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6639 case 'e': /* AT&T EUC output */
6640 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6642 case 's': /* SJIS output */
6643 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6645 case 'l': /* ISO8859 Latin-1 support, no conversion */
6646 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6647 input_encoding = nkf_enc_from_index(ISO_8859_1);
6649 case 'i': /* Kanji IN ESC-$-@/B */
6650 if (*cp=='@'||*cp=='B')
6651 kanji_intro = *cp++;
6653 case 'o': /* ASCII IN ESC-(-J/B/H */
6654 /* ESC ( H was used in initial JUNET messages */
6655 if (*cp=='J'||*cp=='B'||*cp=='H')
6656 ascii_intro = *cp++;
6660 bit:1 katakana->hiragana
6661 bit:2 hiragana->katakana
6663 if ('9'>= *cp && *cp>='0')
6664 hira_f |= (*cp++ -'0');
6671 #if defined(MSDOS) || defined(__OS2__)
6678 show_configuration();
6686 #ifdef UTF8_OUTPUT_ENABLE
6687 case 'w': /* UTF-{8,16,32} output */
6692 output_encoding = nkf_enc_from_index(UTF_8N);
6694 output_bom_f = TRUE;
6695 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6699 if ('1'== cp[0] && '6'==cp[1]) {
6702 } else if ('3'== cp[0] && '2'==cp[1]) {
6706 output_encoding = nkf_enc_from_index(UTF_8);
6711 output_endian = ENDIAN_LITTLE;
6712 output_bom_f = TRUE;
6713 } else if (cp[0] == 'B') {
6715 output_bom_f = TRUE;
6718 output_bom_f = FALSE;
6720 enc_idx = enc_idx == UTF_16
6721 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6722 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6724 enc_idx = enc_idx == UTF_16
6725 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6726 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6728 output_encoding = nkf_enc_from_index(enc_idx);
6732 #ifdef UTF8_INPUT_ENABLE
6733 case 'W': /* UTF input */
6736 input_encoding = nkf_enc_from_index(UTF_8);
6739 if ('1'== cp[0] && '6'==cp[1]) {
6741 input_endian = ENDIAN_BIG;
6743 } else if ('3'== cp[0] && '2'==cp[1]) {
6745 input_endian = ENDIAN_BIG;
6748 input_encoding = nkf_enc_from_index(UTF_8);
6753 input_endian = ENDIAN_LITTLE;
6754 } else if (cp[0] == 'B') {
6756 input_endian = ENDIAN_BIG;
6758 enc_idx = (enc_idx == UTF_16
6759 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6760 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6761 input_encoding = nkf_enc_from_index(enc_idx);
6765 /* Input code assumption */
6766 case 'J': /* ISO-2022-JP input */
6767 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6769 case 'E': /* EUC-JP input */
6770 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6772 case 'S': /* Shift_JIS input */
6773 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6775 case 'Z': /* Convert X0208 alphabet to ascii */
6777 bit:0 Convert JIS X 0208 Alphabet to ASCII
6778 bit:1 Convert Kankaku to one space
6779 bit:2 Convert Kankaku to two spaces
6780 bit:3 Convert HTML Entity
6781 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6783 while ('0'<= *cp && *cp <='4') {
6784 alpha_f |= 1 << (*cp++ - '0');
6788 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6789 x0201_f = FALSE; /* No X0201->X0208 conversion */
6791 ESC-(-I in JIS, EUC, MS Kanji
6792 SI/SO in JIS, EUC, MS Kanji
6793 SS2 in EUC, JIS, not in MS Kanji
6794 MS Kanji (0xa0-0xdf)
6796 ESC-(-I in JIS (0x20-0x5f)
6797 SS2 in EUC (0xa0-0xdf)
6798 0xa0-0xd in MS Kanji (0xa0-0xdf)
6801 case 'X': /* Convert X0201 kana to X0208 */
6804 case 'F': /* preserve new lines */
6805 fold_preserve_f = TRUE;
6806 case 'f': /* folding -f60 or -f */
6809 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6811 fold_len += *cp++ - '0';
6813 if (!(0<fold_len && fold_len<BUFSIZ))
6814 fold_len = DEFAULT_FOLD;
6818 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6820 fold_margin += *cp++ - '0';
6824 case 'm': /* MIME support */
6825 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6826 if (*cp=='B'||*cp=='Q') {
6827 mime_decode_mode = *cp++;
6828 mimebuf_f = FIXED_MIME;
6829 } else if (*cp=='N') {
6830 mime_f = TRUE; cp++;
6831 } else if (*cp=='S') {
6832 mime_f = STRICT_MIME; cp++;
6833 } else if (*cp=='0') {
6834 mime_decode_f = FALSE;
6835 mime_f = FALSE; cp++;
6837 mime_f = STRICT_MIME;
6840 case 'M': /* MIME output */
6843 mimeout_f = FIXED_MIME; cp++;
6844 } else if (*cp=='Q') {
6846 mimeout_f = FIXED_MIME; cp++;
6851 case 'B': /* Broken JIS support */
6853 bit:1 allow any x on ESC-(-x or ESC-$-x
6854 bit:2 reset to ascii on NL
6856 if ('9'>= *cp && *cp>='0')
6857 broken_f |= 1<<(*cp++ -'0');
6862 case 'O':/* for Output file */
6866 case 'c':/* add cr code */
6869 case 'd':/* delete cr code */
6872 case 'I': /* ISO-2022-JP output */
6875 case 'L': /* line mode */
6876 if (*cp=='u') { /* unix */
6877 eolmode_f = LF; cp++;
6878 } else if (*cp=='m') { /* mac */
6879 eolmode_f = CR; cp++;
6880 } else if (*cp=='w') { /* windows */
6881 eolmode_f = CRLF; cp++;
6882 } else if (*cp=='0') { /* no conversion */
6883 eolmode_f = 0; cp++;
6888 if ('2' <= *cp && *cp <= '9') {
6891 } else if (*cp == '0' || *cp == '1') {
6900 /* multiple options in a string are allowed for the Perl module */
6901 while(*cp && *cp++!='-');
6904 #if !defined(PERL_XS) && !defined(WIN32DLL)
6905 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6907 /* bogus option but ignored */
6915 #include "nkf32dll.c"
6916 #elif defined(PERL_XS)
6917 #else /* WIN32DLL */
/*
 * main - command-line entry point (compiled only when neither WIN32DLL
 * nor PERL_XS is defined).
 *
 * NOTE(review): a large part of this body is missing from the listing;
 * the outline below follows the visible statements only.
 *
 *  - Leading arguments that start with '-' are taken as option strings.
 *  - Several flags (debug_f, exec_f, x0212_f, x0213_f, guess_f) are saved
 *    in *_back copies and restored afterwards.
 *  - One visible path converts stdin with kanji_convert(stdin).
 *  - Otherwise each named file is opened with fopen(); when file_out_f is
 *    set, stdout is redirected to a temporary file built from the input
 *    name (or to "nkf.out"), and after conversion the temporary file
 *    receives the original's permissions, optionally its timestamps, the
 *    original is optionally renamed to a backup name, and the temporary
 *    file is renamed over the original.
 *  - If any file could not be opened, is_argument_error is set and
 *    checked near the end.
 */
6919 main(int argc, char **argv)
6924 char *outfname = NULL;
6927 #ifdef EASYWIN /*Easy Win */
6928 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6930 #ifdef DEFAULT_CODE_LOCALE
6931 setlocale(LC_CTYPE, "");
/* Consume leading option arguments (those that start with '-'). */
6935 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6936 cp = (unsigned char *)*argv;
6941 if (pipe(fds) < 0 || (pid = fork()) < 0){
6952 execvp(argv[1], &argv[1]);
/* Save flag values so they can be put back below. */
6969 int debug_f_back = debug_f;
6972 int exec_f_back = exec_f;
6975 int x0212_f_back = x0212_f;
6977 int x0213_f_back = x0213_f;
6978 int guess_f_back = guess_f;
6980 guess_f = guess_f_back;
6983 debug_f = debug_f_back;
6986 exec_f = exec_f_back;
6988 x0212_f = x0212_f_back;
6989 x0213_f = x0213_f_back;
6992 if (binmode_f == TRUE)
6993 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6994 if (freopen("","wb",stdout) == NULL)
7001 setbuf(stdout, (char *) NULL);
7003 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
7006 if (binmode_f == TRUE)
7007 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7008 if (freopen("","rb",stdin) == NULL) return (-1);
7012 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
7016 kanji_convert(stdin);
7017 if (guess_f) print_guessed_code(NULL);
7021 int is_argument_error = FALSE;
7023 input_codename = NULL;
7026 iconv_for_check = 0;
/* Open the next input file; a failure is recorded and reported via is_argument_error. */
7028 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
7030 is_argument_error = TRUE;
7038 /* reopen file for stdout */
7039 if (file_out_f == TRUE) {
/* The temporary name is built from the input path; room is reserved for the ".nkftmpXXXXXX" suffix. */
7042 outfname = nkf_xmalloc(strlen(origfname)
7043 + strlen(".nkftmpXXXXXX")
7045 strcpy(outfname, origfname);
/* Scan backwards for the last path separator ('/' or '\\'). */
7049 for (i = strlen(outfname); i; --i){
7050 if (outfname[i - 1] == '/'
7051 || outfname[i - 1] == '\\'){
7057 strcat(outfname, "ntXXXXXX");
7059 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
7060 S_IREAD | S_IWRITE);
7062 strcat(outfname, ".nkftmpXXXXXX");
7063 fd = mkstemp(outfname);
/* Keep a duplicate of the real stdout in fd_backup, then point stdout at the temporary file. */
7066 || (fd_backup = dup(fileno(stdout))) < 0
7067 || dup2(fd, fileno(stdout)) < 0
7078 outfname = "nkf.out";
7081 if(freopen(outfname, "w", stdout) == NULL) {
7085 if (binmode_f == TRUE) {
7086 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7087 if (freopen("","wb",stdout) == NULL)
7094 if (binmode_f == TRUE)
7095 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7096 if (freopen("","rb",fin) == NULL)
7101 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
7105 char *filename = NULL;
/* The file name is only passed to print_guessed_code() when more than one file is processed. */
7107 if (nfiles > 1) filename = origfname;
7108 if (guess_f) print_guessed_code(filename);
7114 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Point stdout back at the descriptor saved in fd_backup. */
7122 if (dup2(fd_backup, fileno(stdout)) < 0){
7125 if (stat(origfname, &sb)) {
7126 fprintf(stderr, "Can't stat %s\n", origfname);
7128 /* Restore the permission bits of the original file. */
7129 if (chmod(outfname, sb.st_mode)) {
7130 fprintf(stderr, "Can't set permission %s\n", outfname);
7133 /* Restore the timestamps of the original file. */
7134 if(preserve_time_f){
7135 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
7136 tb[0] = tb[1] = sb.st_mtime;
7137 if (utime(outfname, tb)) {
7138 fprintf(stderr, "Can't set timestamp %s\n", outfname);
7141 tb.actime = sb.st_atime;
7142 tb.modtime = sb.st_mtime;
7143 if (utime(outfname, &tb)) {
7144 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* Move the original aside under the backup name (any existing backup is removed first). */
7149 char *backup_filename = get_backup_filename(backup_suffix, origfname);
7151 unlink(backup_filename);
7153 if (rename(origfname, backup_filename)) {
7154 perror(backup_filename);
7155 fprintf(stderr, "Can't rename %s to %s\n",
7156 origfname, backup_filename);
7158 nkf_xfree(backup_filename);
7161 if (unlink(origfname)){
/* Put the converted file in place of the original. */
7166 if (rename(outfname, origfname)) {
7168 fprintf(stderr, "Can't rename %s to %s\n",
7169 outfname, origfname);
7171 nkf_xfree(outfname);
7176 if (is_argument_error)
7179 #ifdef EASYWIN /*Easy Win */
7180 if (file_out_f == FALSE)
7181 scanf("%d",&end_check);
7184 #else /* for Other OS */
7185 if (file_out_f == TRUE)
7187 #endif /*Easy Win */
7190 #endif /* WIN32DLL */