NKF.mod/NKF.pm

   1 # Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
   2 # Copyright (c) 1996-2018, The nkf Project.
   3 # All rights reserved.
   4 #
   5 # This software is provided 'as-is', without any express or implied
   6 # warranty. In no event will the authors be held liable for any damages
   7 # arising from the use of this software.
   8 #
   9 # Permission is granted to anyone to use this software for any purpose,
  10 # including commercial applications, and to alter it and redistribute it
  11 # freely, subject to the following restrictions:
  12 #
  13 # 1. The origin of this software must not be misrepresented; you must not
  14 # claim that you wrote the original software. If you use this software
  15 # in a product, an acknowledgment in the product documentation would be
  16 # appreciated but is not required.
  17 #
  18 # 2. Altered source versions must be plainly marked as such, and must not be
  19 # misrepresented as being the original software.
  20 #
  21 # 3. This notice may not be removed or altered from any source distribution.
  22
  23 package NKF;
  24
  25 use strict;
  26 use vars ('$VERSION', '@ISA', '@EXPORT', '@EXPORT_OK');
  27
  28 require DynaLoader;
  29 require Exporter;
  30
  31 # Module version; also handed to bootstrap() below.
  32 $VERSION = '2.15';
  33
  34 # Exporter and DynaLoader are both base classes of this package.
  35 @ISA = ('Exporter', 'DynaLoader');
  36
  37 # Names exported into the caller's namespace by default.
  38 @EXPORT = ('nkf', 'nkf_continue', 'inputcode');
  39
  40 # Class-method form of "bootstrap NKF $VERSION": the call resolves through
  41 # @ISA (DynaLoader is required above for this purpose).
  42 NKF->bootstrap($VERSION);
  43
  44 # Pure-Perl definitions, if any, belong above the final true value.
  45
  46 1;
  47 __END__
  48
  49 #
  50 # The text from =begin FUNC to =end FUNC is the documentation of Perl/NKF.
  51 # The text from =begin COMMAND to =end COMMAND is the documentation of the nkf command.
  52 #
  53
  54 =head1 NAME
  55
  56 =begin FUNC
  57
  58 NKF - Perl extension for Network Kanji Filter
  59
  60 =end FUNC
  61
  62 =begin COMMAND
  63
  64 nkf - Network Kanji Filter
  65
  66 =end COMMAND
  67
  68 =head1 SYNOPSIS
  69
  70 =begin FUNC
  71
  72   use NKF;
  73   $output = nkf("-s",$input);
  74
  75 =end FUNC
  76
  77 =begin COMMAND
  78
  79 nkf B<[-butjnesliohrTVvwWJESZxXFfmMBOcdILg]> B<[>I<file ...>B<]>
  80
  81 =end COMMAND
  82
  83 =head1 DESCRIPTION
  84
  85 =begin FUNC
  86
  87 This is a Perl Extension version of nkf (Network Kanji Filter).
  88 It converts the last argument and returns the converted result. Conversion
  89 details are specified by the flags given before the last argument.
  90
  91 =end FUNC
  92
  93 B<Nkf> is yet another kanji code converter for use among networks, hosts and terminals.
  94 It converts input kanji code to designated kanji code
  95 such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8, UTF-16 or UTF-32.
  96
  97 One of the most distinctive features of B<nkf> is that it guesses the input kanji encoding.
  98 It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8, UTF-16 and UTF-32.
  99 So users needn't set the input kanji code explicitly.
 100
 101 By default, X0201 kana is converted into X0208 kana.
 102 For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
 103 For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
 104 To accept X0201 in Shift_JIS, use B<-X>, B<-x> or B<-S>.
 105
 106 Multiple options are specified as separate strings, such as
 107
 108   print nkf('--ic=UTF8-MAC', '-w', $string), "\n";
 109
 110 where every argument except the last one is an option.
 111
 112 =head1 OPTIONS
 113
 114 =over
 115
 116 =item B<-J -S -E -W -W16 -W32 -j -s -e -w -w16 -w32>
 117
 118 Specify input and output encodings. Upper case is input.
 119 cf. --ic and --oc.
 120
 121 =over
 122
 123 =item B<-J>
 124
 125 ISO-2022-JP (JIS code).
 126
 127 =item B<-S>
 128
 129 Shift_JIS and JIS X 0201 kana.
 130 EUC-JP is recognized as X0201 kana. Without B<-x> flag,
 131 JIS X 0201 Katakana (a.k.a. halfwidth kana) is converted into JIS X 0208.
 132 If you use Windows, see Windows-31J (CP932).
 133
 134 =item B<-E>
 135
 136 EUC-JP.
 137
 138 =item B<-W>
 139
 140 UTF-8N.
 141
 142 =item B<-W16[BL][0]>
 143
 144 UTF-16.
 145 B or L selects Big Endian or Little Endian byte order.
 146 0 controls whether a BOM is put or not.
 147
 148 =item B<-W32[BL][0]>
 149
 150 UTF-32.
 151 B or L selects Big Endian or Little Endian byte order.
 152 0 controls whether a BOM is put or not.
 153
 154 =back
 155
 156 =item B<-b -u>
 157
 158 Output is buffered (DEFAULT), Output is unbuffered.
 159
 160 =item B<-t>
 161
 162 No conversion.
 163
 164 =item B<-i[@B]>
 165
 166 Specify the escape sequence for JIS X 0208.
 167
 168 =over
 169
 170 =item B<-i@>
 171
 172 Use ESC $ @. (JIS X 0208-1978)
 173
 174 =item B<-iB>
 175
 176 Use ESC $ B. (JIS X 0208-1983/1990 DEFAULT)
 177
 178 =back
 179
 180 =item B<-o[BJ]>
 181
 182 Specify the escape sequence for US-ASCII/JIS X 0201 Roman. (DEFAULT B)
 183
 184 =item B<-r>
 185
 186 {de/en}crypt ROT13/47
 187
 188 =item B<-h[123] --hiragana --katakana --katakana-hiragana>
 189
 190 =over
 191
 192 =item B<-h1 --hiragana>
 193
 194 Katakana to Hiragana conversion.
 195
 196 =item B<-h2 --katakana>
 197
 198 Hiragana to Katakana conversion.
 199
 200 =item B<-h3 --katakana-hiragana>
 201
 202 Katakana to Hiragana and Hiragana to Katakana conversion.
 203
 204 =back
 205
 206 =item B<-T>
 207
 208 Text mode output (MS-DOS)
 209
 210 =item B<-f[I<m> [- I<n>]]>
 211
 212 Fold lines at length I<m> with a margin of I<n>.
 213 When I<m> and I<n> are omitted, fold length is 60 and fold margin is 10.
 214
 215 =item B<-F>
 216
 217 New line preserving line folding.
 218
 219 =item B<-Z[0-3]>
 220
 221 Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
 222
 223 =over
 224
 225 =item B<-Z -Z0>
 226
 227 Convert X0208 alphabet to ASCII.
 228
 229 =item B<-Z1>
 230
 231 Convert X0208 kankaku (fullwidth space) to a single ASCII space.
 232
 233 =item B<-Z2>
 234
 235 Convert X0208 kankaku (fullwidth space) to two ASCII spaces.
 236
 237 =item B<-Z3>
 238
 239 Replace fullwidth >, <, ", & with '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
 240
 241 =back
 242
 243 =item B<-X -x>
 244
 245 With B<-X> or without this option, X0201 kana is converted into X0208 kana.
 246 With B<-x>, try to preserve X0201 kana and do not convert it to X0208.
 247 In JIS output, ESC-(-I is used. In EUC output, SS2 is used.
 248
 249 =item B<-B[0-2]>
 250
 251 Assume broken JIS-Kanji input, which lost ESC.
 252 Useful when your site is using old B-News Nihongo patch.
 253
 254 =over
 255
 256 =item B<-B1>
 257
 258 allows any chars after ESC-( or ESC-$.
 259
 260 =item B<-B2>
 261
 262 force ASCII after NL.
 263
 264 =back
 265
 266 =item B<-I>
 267
 268 Replace non-ISO-2022-JP characters with a geta character
 269 (the substitute character in Japanese).
 270
 271 =item B<-m[BQN0]>
 272
 273 MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
 274 To see ISO8859-1 (Latin-1), B<-l> is necessary.
 275
 276 =over
 277
 278 =item B<-mB>
 279
 280 Decode MIME base64 encoded stream. Remove header or other part before
 281 conversion.
 282
 283 =item B<-mQ>
 284
 285 Decode MIME quoted stream. '_' in quoted stream is converted to space.
 286
 287 =item B<-mN>
 288
 289 Non-strict decoding.
 290 It allows line break in the middle of the base64 encoding.
 291
 292 =item B<-m0>
 293
 294 No MIME decode.
 295
 296 =back
 297
 298 =item B<-M>
 299
 300 MIME encode. Header style. All ASCII code and control characters are intact.
 301
 302 =over
 303
 304 =item B<-MB>
 305
 306 MIME encode Base64 stream.
 307 Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
 308
 309 =item B<-MQ>
 310
 311 Perform quoted encoding.
 312
 313 =back
 314
 315 =item B<-l>
 316
 317 Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
 318 B<-s>, B<-e> and B<-x> are not compatible with this option.
 319
 320 =item B<-L[uwm] -d -c>
 321
 322 Convert line breaks.
 323
 324 =over
 325
 326 =item B<-Lu -d>
 327
 328 unix (LF)
 329
 330 =item B<-Lw -c>
 331
 332 windows (CRLF)
 333
 334 =item B<-Lm>
 335
 336 mac (CR)
 337
 338 Without any of the B<-L>, B<-d> or B<-c> options, nkf doesn't convert line breaks.
 339
 340 =back
 341
 342 =item B<--fj --unix --mac --msdos --windows>
 343
 344 Convert for these systems.
 345
 346 =item B<--jis --euc --sjis --mime --base64>
 347
 348 Convert to named code.
 349
 350 =item B<--jis-input --euc-input --sjis-input --mime-input --base64-input>
 351
 352 Assume input system
 353
 354 =item B<--ic=I<input codeset> --oc=I<output codeset>>
 355
 356 Set the input or output codeset.
 357 NKF supports the following codesets; the codeset names are case insensitive.
 358
 359 =over
 360
 361 =item ISO-2022-JP
 362
 363 a.k.a. RFC1468, 7bit JIS, JUNET
 364
 365 =item EUC-JP (eucJP-nkf)
 366
 367 a.k.a. AT&T JIS, Japanese EUC, UJIS
 368
 369 =item eucJP-ascii
 370
 371 =item eucJP-ms
 372
 373 =item CP51932
 374
 375 Microsoft Version of EUC-JP.
 376
 377 =item Shift_JIS
 378
 379 a.k.a. SJIS, MS_Kanji
 380
 381 =item Windows-31J
 382
 383 a.k.a. CP932
 384
 385 =item UTF-8
 386
 387 same as UTF-8N
 388
 389 =item UTF-8N
 390
 391 UTF-8 without BOM
 392
 393 =item UTF-8-BOM
 394
 395 UTF-8 with BOM
 396
 397 =item UTF8-MAC (input only)
 398
 399 decomposed UTF-8
 400
 401 =item UTF-16
 402
 403 same as UTF-16BE
 404
 405 =item UTF-16BE
 406
 407 UTF-16 Big Endian without BOM
 408
 409 =item UTF-16BE-BOM
 410
 411 UTF-16 Big Endian with BOM
 412
 413 =item UTF-16LE
 414
 415 UTF-16 Little Endian without BOM
 416
 417 =item UTF-16LE-BOM
 418
 419 UTF-16 Little Endian with BOM
 420
 421 =item UTF-32
 422
 423 same as UTF-32BE
 424
 425 =item UTF-32BE
 426
 427 UTF-32 Big Endian without BOM
 428
 429 =item UTF-32BE-BOM
 430
 431 UTF-32 Big Endian with BOM
 432
 433 =item UTF-32LE
 434
 435 UTF-32 Little Endian without BOM
 436
 437 =item UTF-32LE-BOM
 438
 439 UTF-32 Little Endian with BOM
 440
 441 =back
 442
 443 =item B<--fb-{skip, html, xml, perl, java, subchar}>
 444
 445 Specify the way that nkf handles unassigned characters.
 446 Without this option, --fb-skip is assumed.
 447
 448 =item B<--prefix=I<escape character>I<target character>..>
 449
 450 When nkf converts to Shift_JIS,
 451 nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
 452 1st byte of argument is the escape character and following bytes are target characters.
 453
 454 =item B<--no-cp932ext>
 455
 456 Handle the characters extended in CP932 as unassigned characters.
 457
 458 =item B<--no-best-fit-chars>
 459
 460 When converting from Unicode to an encoded byte sequence,
 461 don't convert characters which are not round-trip safe.
 462 When converting from Unicode to Unicode,
 463 nkf can be used as a UTF converter with this option and B<-x>.
 464 (In other words, without this option and B<-x>, nkf doesn't preserve some characters.)
 465
 466 When nkf converts strings related to paths, you should use this option.
 467
 468 =item B<--cap-input>
 469
 470 Decode hex encoded characters.
 471
 472 =item B<--url-input>
 473
 474 Unescape percent escaped characters.
 475
 476 =item B<--numchar-input>
 477
 478 Decode character reference, such as "&#....;".
 479
 480 =begin COMMAND
 481
 482 =item B<--in-place[=>I<SUFFIX>B<]>  B<--overwrite[=>I<SUFFIX>B<]>
 483
 484 Overwrite the B<original> listed files with the filtered result.
 485
 486 B<Note> --overwrite preserves timestamps of original files.
 487
 488 =item B<--guess=[12]>
 489
 490 Print guessed encoding and newline. (2 is default, 1 is only encoding)
 491
 492 =item B<--help>
 493
 494 Print nkf's help.
 495
 496 =item B<--version>
 497
 498 Print nkf's version.
 499
 500 =end COMMAND
 501
 502 =item B<-->
 503
 504 End of options: the rest of the arguments are not treated as -options.
 505
 506 =back
 507
 508 =head1 AUTHOR
 509
 510 Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
 511
 512 Copyright (c) 1996-2018, The nkf Project.
 513
 514 =begin FUNC
 515
 516 =head1 SEE ALSO
 517
 518 perl(1).   nkf(1)
 519
 520 =end FUNC
 521
 522 =cut