lib/Jcode.pm

   1 #
   2 # $Id: Jcode.pm,v 1.5 2007/03/11 11:46:05 takezoe Exp $
   3 #
   4
   5 package Jcode;
   6 use 5.005; # fair ?
   7 use Carp;
   8 use strict;
   9 use vars qw($RCSID $VERSION $DEBUG);
  10
  11 $RCSID = q$Id: Jcode.pm,v 1.5 2007/03/11 11:46:05 takezoe Exp $;
  12 $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  13 $DEBUG = 0;
  14
  15 # we no longer use Exporter
  16 use vars qw($USE_ENCODE);
  17 $USE_ENCODE = ($] >= 5.008001);
  18
  19 use Exporter;
  20 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  21 @ISA         = qw(Exporter);
  22 @EXPORT      = qw(jcode getcode);
  23 @EXPORT_OK   = qw($RCSID $VERSION $DEBUG);
  24 %EXPORT_TAGS = ( all       => [ @EXPORT, @EXPORT_OK ] );
  25
  26 use overload
  27     q("") => sub { $_[0]->euc },
  28     q(==) => sub { overload::StrVal($_[0]) eq overload::StrVal($_[1]) },
  29     q(.=) => sub { $_[0]->append( $_[1] ) },
  30     fallback => 1,
  31     ;
  32
  33 if ($USE_ENCODE){
  34     $DEBUG and warn "Using Encode";
  35     my $data = join("", <DATA>);
  36     eval $data;
  37     $@ and die $@;
  38 }else{
  39     $DEBUG and warn "Not Using Encode";
  40     require Jcode::_Classic;
  41     use vars qw/@ISA/;
  42     unshift @ISA, qw/Jcode::_Classic/;
  43     for my $sub (qw/jcode getcode convert load_module/){
  44         no strict 'refs';
  45         *{$sub} = \&{'Jcode::_Classic::' . $sub };
  46     }
  47     for my $enc (qw/sjis jis ucs2 utf8/){
  48         no strict 'refs';
  49         *{"euc_" . $enc} = \&{"Jcode::_Classic::" . "euc_" . $enc};
  50         *{$enc . "_euc"} = \&{"Jcode::_Classic::" . $enc . "_euc"};
  51     }
  52 }
  53
  54 1;
  55 __DATA__
  56 #
  57 # This idea was inspired by JEncode
  58 # http://www.donzoko.net/cgi/jencode/
  59 #
  60 package Jcode;
  61 use Encode;
  62 use Encode::Alias;
  63 use Encode::Guess;
  64 use Encode::JP::H2Z;
  65 use Scalar::Util; # to resolve from_to() vs. 'constant' issue.
  66
  67 my %jname2e = (
  68                sjis        => 'shiftjis',
  69                euc         => 'euc-jp',
  70                jis         => '7bit-jis',
  71                iso_2022_jp => 'iso-2022-jp',
  72                ucs2        => 'UTF-16BE',
  73               );
  74
  75 my %ename2j = reverse %jname2e;
  76
  77 our $FALLBACK = Encode::LEAVE_SRC;
  78 sub FB_PERLQQ()   { Encode::FB_PERLQQ() };
  79 sub FB_XMLCREF()  { Encode::FB_XMLCREF() };
  80 sub FB_HTMLCREF() { Encode::FB_HTMLCREF() };
  81 #for my $fb (qw/FB_PERLQQ FB_XMLCREF FB_HTMLCREF/){
  82 #    no strict 'refs';
  83 #    *{$fb} = \&{"Encode::$fb"};
  84 #}
  85
  86
  87 #######################################
  88 # Functions
  89 #######################################
  90
  91 sub jcode { return __PACKAGE__->new(@_); }
  92
  93 #
  94 # Used to be in Jcode::Constants
  95 #
  96
  97 my %_0208 = (
  98              1978 => '\e\$\@',
  99              1983 => '\e\$B',
 100              1990 => '\e&\@\e\$B',
 101             );
 102 my %RE = (
 103        ASCII     => '[\x00-\x7f]',
 104        BIN       => '[\x00-\x06\x7f\xff]',
 105        EUC_0212  => '\x8f[\xa1-\xfe][\xa1-\xfe]',
 106        EUC_C     => '[\xa1-\xfe][\xa1-\xfe]',
 107        EUC_KANA  => '\x8e[\xa1-\xdf]',
 108        JIS_0208  =>  "$_0208{1978}|$_0208{1983}|$_0208{1990}",
 109        JIS_0212  => "\e" . '\$\(D',
 110        JIS_ASC   => "\e" . '\([BJ]',
 111        JIS_KANA  => "\e" . '\(I',
 112        SJIS_C    => '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]',
 113        SJIS_KANA => '[\xa1-\xdf]',
 114        UTF8      => '[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]'
 115       );
 116
 117 sub _max {
 118     my $result = shift;
 119     for my $n (@_){
 120         $result = $n if $n > $result;
 121     }
 122     return $result;
 123 }
 124
 125 sub getcode {
 126     my $arg = shift;
 127     my $r_str = ref $arg ? $arg : \$arg;
 128     Encode::is_utf8($$r_str) and return 'utf8';
 129     my ($code, $nmatch, $sjis, $euc, $utf8) = ("", 0, 0, 0, 0);
 130     if ($$r_str =~ /$RE{BIN}/o) {       # 'binary'
 131         my $ucs2;
 132         $ucs2 += length($1)
 133             while $$r_str =~ /(\x00$RE{ASCII})+/go;
 134         if ($ucs2){      # smells like raw unicode
 135             ($code, $nmatch) = ('ucs2', $ucs2);
 136         }else{
 137             ($code, $nmatch) = ('binary', 0);
 138          }
 139     }
 140     elsif ($$r_str !~ /[\e\x80-\xff]/o) {       # not Japanese
 141         ($code, $nmatch) = ('ascii', 1);
 142     }                           # 'jis'
 143     elsif ($$r_str =~
 144            m[
 145              $RE{JIS_0208}|$RE{JIS_0212}|$RE{JIS_ASC}|$RE{JIS_KANA}
 146            ]ox)
 147     {
 148         ($code, $nmatch) = ('jis', 1);
 149     }
 150     else { # should be euc|sjis|utf8
 151         # use of (?:) by Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp>
 152         $sjis += length($1)
 153             while $$r_str =~ /((?:$RE{SJIS_C})+)/go;
 154         $euc  += length($1)
 155             while $$r_str =~ /((?:$RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})+)/go;
 156         $utf8 += length($1)
 157             while $$r_str =~ /((?:$RE{UTF8})+)/go;
 158         # $utf8 *= 1.5; # M. Takahashi's suggestion
 159         $nmatch = _max($utf8, $sjis, $euc);
 160         carp ">DEBUG:sjis = $sjis, euc = $euc, utf8 = $utf8" if $DEBUG >= 3;
 161         $code =
 162             ($euc > $sjis and $euc > $utf8) ? 'euc' :
 163                 ($sjis > $euc and $sjis > $utf8) ? 'sjis' :
 164                     ($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef;
 165     }
 166     return wantarray ? ($code, $nmatch) : $code;
 167 }
 168
 169 sub convert{
 170     my $r_str = (ref $_[0]) ? $_[0] : \$_[0];
 171     my (undef,$ocode,$icode,$opt) = @_;
 172     Encode::is_utf8($$r_str) and utf8::encode($$r_str);
 173     defined $icode or $icode = getcode($r_str) or return;
 174     $icode eq 'binary' and return $$r_str;
 175
 176     $jname2e{$icode} and $icode = $jname2e{$icode};
 177     $jname2e{$ocode} and $ocode = $jname2e{$ocode};
 178
 179     if ($opt){
 180         return $opt eq 'z'
 181             ? jcode($r_str, $icode)->h2z->$ocode
 182                 : jcode($r_str, $icode)->z2h->$ocode ;
 183
 184     }else{
 185         if (Scalar::Util::readonly($$r_str)){
 186             my $tmp = $$r_str;
 187             Encode::from_to($tmp, $icode, $ocode);
 188             return $tmp;
 189         }else{
 190             Encode::from_to($$r_str, $icode, $ocode);
 191             return $$r_str;
 192         }
 193     }
 194 }
 195
 196 #######################################
 197 # Constructors
 198 #######################################
 199
 200 sub new{
 201     my $class = shift;
 202     my $self  = {};
 203     bless $self => $class;
 204     defined $_[0] or $_[0] = '';
 205     $self->set(@_);
 206 }
 207
 208 sub set{
 209     my $self  = shift;
 210     my $str   = $_[0];
 211     my $r_str = (ref $str) ? $str : \$str;
 212     my $code  = $_[1] if(defined $_[1]);
 213     my $icode =  $code || getcode($r_str) || 'euc';
 214     $self->{icode}  = $jname2e{$icode} || $icode;
 215     # binary and flagged utf8 are stored as-is
 216     unless (Encode::is_utf8($$r_str) || $icode eq 'binary'){
 217         $$r_str = decode($self->{icode}, $$r_str);
 218     }
 219     $self->{r_str}  = $r_str;
 220     $self->{nmatch} = 0;
 221     $self->{method} = 'Encode';
 222     $self->{fallback} = $FALLBACK;
 223     $self;
 224 }
 225
 226 sub append{
 227     my $self  = shift;
 228     my $str   = $_[0];
 229     my $r_str = (ref $str) ? $str : \$str;
 230     my $code  = $_[1] if(defined $_[1]);
 231     my $icode =  $code || getcode($r_str) || 'euc';
 232     $self->{icode}  = $jname2e{$icode} || $icode;
 233     # binary and flagged utf8 are stored as-is
 234     unless (Encode::is_utf8($$r_str) || $icode eq 'binary'){
 235         $$r_str = decode($self->{icode}, $$r_str);
 236     }
 237     ${ $self->{r_str} }  .= $$r_str;
 238     $self->{nmatch} = 0;
 239     $self->{method} = 'internal';
 240     $self;
 241 }
 242
 243 #######################################
 244 # Accessors
 245 #######################################
 246
 247 for my $method (qw/r_str icode nmatch error_m error_r error_tr/){
 248     no strict 'refs';
 249     *{$method} = sub { $_[0]->{$method} };
 250 }
 251
 252 sub fallback{
 253     my $self = shift;
 254     @_ or return $self->{fallback};
 255     $self->{fallback} =  $_[0]|Encode::LEAVE_SRC;
 256     return $self;
 257 }
 258
 259 #######################################
 260 # Converters
 261 #######################################
 262
 263 sub utf8 { encode_utf8( ${$_[0]->{r_str}} ) }
 264
 265 #
 266 #  Those supported in Jcode 0.x are defined as default
 267 #
 268
 269 for my $enc (keys %jname2e){
 270     no strict 'refs';
 271     my $name = $jname2e{$enc} || $enc;
 272     my $e = find_encoding($name) or croak "$enc not supported";
 273     *{$enc} = sub {
 274         my $r_str = $_[0]->{r_str};
 275         Encode::is_utf8($$r_str) ?
 276                 $e->encode($$r_str, $_[0]->{fallback}) : $$r_str;
 277     };
 278 }
 279
 280 #
 281 # The rest is defined on the fly
 282 #
 283
 284 sub DESTROY {};
 285
 286 sub AUTOLOAD {
 287     our $AUTOLOAD;
 288     my $self = shift;
 289     my $type = ref $self
 290         or confess "$self is not an object";
 291     my $myname = $AUTOLOAD;
 292     $myname =~ s/.*:://;  # strip fully-qualified portion
 293     $myname eq 'DESTROY' and return;
 294     my $e = find_encoding($myname)
 295         or confess __PACKAGE__, ": unknown encoding: $myname";
 296     $DEBUG and warn ref($self), "->$myname defined";
 297     no strict 'refs';
 298     *{$myname} =
 299         sub {
 300             my $str = ${ $_[0]->{r_str} };
 301             Encode::is_utf8($str) ?
 302                       $e->encode($str, $_[0]->{fallback}) : $str;
 303           };
 304     $myname->($self);
 305 }
 306
 307 #######################################
 308 # Length, Translation and Fold
 309 #######################################
 310
 311 sub jlength{
 312     length(  ${$_[0]->{r_str}} );
 313 }
 314
 315 sub tr{
 316     my $self = shift;
 317     my $str  = ${$self->{r_str}};
 318     my $from = Encode::is_utf8($_[0]) ? $_[0] : decode('euc-jp', $_[0]);
 319     my $to   = Encode::is_utf8($_[1]) ? $_[1] : decode('euc-jp', $_[1]);
 320     my $opt  = $_[2] || '';
 321     $from =~ s,\\,\\\\,og; $from =~ s,/,\\/,og;
 322     $to   =~ s,\\,\\\\,og; $to   =~ s,/,\\/,og;
 323     $opt  =~ s,[^a-z],,og;
 324     my $match = eval qq{ \$str =~ tr/$from/$to/$opt };
 325     if ($@){
 326         $self->{error_tr} = $@;
 327         return $self;
 328     }
 329     $self->{r_str} = \$str;
 330     $self->{nmatch} = $match || 0;
 331     return $self;
 332 }
 333
 334 sub jfold{
 335     my $self = shift;
 336     my $r_str  = $self->{r_str};
 337     my $bpl = shift || 72;
 338     my $nl  = shift || "\n";
 339     my $kin = shift;
 340
 341     my @lines = ();
 342     my %kinsoku = ();
 343     my ($len, $i) = (0,0);
 344
 345     if( defined $kin and (ref $kin) eq 'ARRAY' ){
 346         %kinsoku = map { my $k = Encode::is_utf8($_) ?
 347                              $_ : decode('euc-jp' =>  $_);
 348                          ($k, 1) } @$kin;
 349     }
 350
 351     while($$r_str =~ m/(.)/sg){
 352         my $char = $1;
 353         # <UFF61> \xA1 |0 # HALFWIDTH IDEOGRAPHIC FULL STOP
 354         # <UFF9F> \xDF |0 # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
 355         my $ord = ord($char);
 356         my $clen =  $ord < 128 ? 1
 357             : $ord <  0xff61 ? 2
 358             : $ord <= 0xff9f ? 1 : 2;
 359         if ($len + $clen > $bpl){
 360             unless($kinsoku{$char}){
 361                 $i++;
 362                 $len = 0;
 363             }
 364         }
 365         $lines[$i] .= $char;
 366         $len += $clen;
 367     }
 368     defined($lines[$i]) or pop @lines;
 369     $$r_str = join($nl, @lines);
 370
 371     $self->{r_str} = $r_str;
 372     my $e = find_encoding($self->{icode});
 373     @lines = map {
 374         Encode::is_utf8($_) ? $e->encode($_, $self->{fallback}) : $_
 375     } @lines;
 376
 377     return wantarray ? @lines : $self;
 378 }
 379
 380 #######################################
 381 # Full and Half
 382 #######################################
 383
 384 sub h2z{
 385     my $self = shift;
 386     my $euc  = $self->euc;
 387     Encode::JP::H2Z::h2z(\$euc, @_);
 388     $self->set($euc => 'euc');
 389     $self;
 390 }
 391
 392 sub z2h{
 393     my $self = shift;
 394     my $euc =  $self->euc;
 395     Encode::JP::H2Z::z2h(\$euc, @_);
 396     $self->set($euc => 'euc');
 397     $self;
 398 }
 399
 400 #######################################
 401 # MIME-Encoding
 402 #######################################
 403
 404 sub mime_decode{
 405     my $self = shift;
 406     my $utf8  = Encode::decode('MIME-Header', $self->utf8);
 407     $self->set($utf8 =>'utf8');
 408 }
 409
 410 sub mime_encode{
 411     my $self = shift;
 412     my $str = $self->euc;
 413     my $r_str = \$str;
 414     my $lf  = shift || "\n";
 415     my $bpl = shift || 76;
 416     my ($trailing_crlf) = ($$r_str =~ /(\n|\r|\x0d\x0a)$/o);
 417     $str  = _mime_unstructured_header($$r_str, $lf, $bpl);
 418     not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o;
 419     $str;
 420 }
 421
 422 #
 423 # shamelessly stolen from
 424 # http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64
 425 #
 426
 427 sub _add_encoded_word {
 428     require MIME::Base64;
 429     my($str, $line, $bpl) = @_;
 430     my $result = '';
 431     while (length($str)) {
 432         my $target = $str;
 433         $str = '';
 434         if (length($line) + 22 +
 435             ($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl) {
 436             $line =~ s/[ \t\n\r]*$/\n/;
 437             $result .= $line;
 438             $line = ' ';
 439         }
 440         while (1) {
 441             my $iso_2022_jp = jcode($target, 'euc')->iso_2022_jp;
 442             if (my $count = ($iso_2022_jp =~ tr/\x80-\xff//d)){
 443                 $DEBUG and warn $count;
 444                 $target = jcode($iso_2022_jp, 'iso_2022_jp')->euc;
 445             }
 446             my $encoded = '=?ISO-2022-JP?B?' .
 447               MIME::Base64::encode_base64($iso_2022_jp, '')
 448                       . '?=';
 449             if (length($encoded) + length($line) > $bpl) {
 450                 $target =~
 451                     s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o;
 452                 $str = $1 . $str;
 453             } else {
 454                 $line .= $encoded;
 455                 last;
 456             }
 457         }
 458     }
 459     return $result . $line;
 460 }
 461
 462 sub _mime_unstructured_header {
 463     my ($oldheader, $lf, $bpl) = @_;
 464     my(@words, @wordstmp, $i);
 465     my $header = '';
 466     $oldheader =~ s/\s+$//;
 467     @wordstmp = split /\s+/, $oldheader;
 468     for ($i = 0; $i < $#wordstmp; $i++) {
 469         if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and
 470             $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) {
 471             $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]";
 472         } else {
 473             push(@words, $wordstmp[$i]);
 474         }
 475     }
 476     push(@words, $wordstmp[-1]);
 477     for my $word (@words) {
 478         if ($word =~ /^[\x21-\x7E]+$/) {
 479             $header =~ /(?:.*\n)*(.*)/;
 480             if (length($1) + length($word) > $bpl) {
 481                 $header .= "$lf $word";
 482             } else {
 483                 $header .= $word;
 484             }
 485         } else {
 486             $header = _add_encoded_word($word, $header, $bpl);
 487         }
 488         $header =~ /(?:.*\n)*(.*)/;
 489         if (length($1) == $bpl) {
 490             $header .= "$lf ";
 491         } else {
 492             $header .= ' ';
 493         }
 494     }
 495     $header =~ s/\n? $/\n/;
 496     $header;
 497 }
 498
 499 #######################################
 500 # Matching and Replacing
 501 #######################################
 502
 503 no warnings 'uninitialized';
 504
 505 sub m{
 506     use utf8;
 507     my $self    = shift;
 508     my $r_str   = $self->{r_str};
 509     my $pattern = Encode::is_utf8($_[0]) ? shift : decode("euc-jp" => shift);
 510     my $opt     = shift || '' ;
 511     my @match;
 512
 513     $pattern =~ s,\\,\\\\,og; $pattern =~ s,/,\\/,og;
 514     $opt     =~ s,[^a-z],,og;
 515
 516     eval qq{ \@match = (\$\$r_str =~ m/$pattern/$opt) };
 517     if ($@){
 518         $self->{error_m} = $@;
 519         return;
 520     }
 521     # print @match, "\n";
 522     wantarray ?  map {encode('euc-jp' => $_)} @match : scalar @match;
 523 }
 524
 525 sub s{
 526     use utf8;
 527     my $self    = shift;
 528     my $r_str   = $self->{r_str};
 529     my $pattern = Encode::is_utf8($_[0]) ? shift : decode("euc-jp" => shift);
 530     my $replace = Encode::is_utf8($_[0]) ? shift : decode("euc-jp" => shift);
 531     my $opt     = shift;
 532
 533     $pattern =~ s,\\,\\\\,og; $pattern =~ s,/,\\/,og;
 534     $replace =~ s,\\,\\\\,og; $replace =~ s,/,\\/,og;
 535     $opt     =~ s,[^a-z],,og;
 536
 537     eval qq{ (\$\$r_str =~ s/$pattern/$replace/$opt) };
 538     if ($@){
 539         $self->{error_s} = $@;
 540     }
 541     $self;
 542 }
 543
 544 1;
 545 __END__
 546
 547 =head1 NAME
 548
 549 Jcode - Japanese Charset Handler
 550
 551 =head1 SYNOPSIS
 552
 553  use Jcode;
 554  #
 555  # traditional
 556  Jcode::convert(\$str, $ocode, $icode, "z");
 557  # or OOP!
 558  print Jcode->new($str)->h2z->tr($from, $to)->utf8;
 559
 560 =cut
 561
 562 =head1 DESCRIPTION
 563
 564 B<<Japanese document is now available as L<Jcode::Nihongo>. >>
 565
 566 Jcode.pm supports both object and traditional approach.
 567 With object approach, you can go like;
 568
 569   $iso_2022_jp = Jcode->new($str)->h2z->jis;
 570
 571 Which is more elegant than:
 572
 573   $iso_2022_jp = $str;
 574   &jcode::convert(\$iso_2022_jp, 'jis', &jcode::getcode(\$str), "z");
 575
 576 For those unfamiliar with objects, Jcode.pm still supports C<getcode()>
 577 and C<convert().>
 578
 579 If the perl version is 5.8.1, Jcode acts as a wrapper to L<Encode>,
 580 the standard charset handler module for Perl 5.8 or later.
 581
 582 =head1 Methods
 583
 584 Methods mentioned here all return Jcode object unless otherwise mentioned.
 585
 586 =head2 Constructors
 587
 588 =over 2
 589
 590 =item $j = Jcode-E<gt>new($str [, $icode])
 591
 592 Creates Jcode object $j from $str.  Input code is automatically checked
 593 unless you explicitly set $icode. For available charset, see L<getcode>
 594 below.
 595
 596 For perl 5.8.1 or better, C<$icode> can be I<any encoding name>
 597 that L<Encode> understands.
 598
 599   $j = Jcode->new($european, 'iso-latin1');
 600
 601 When the object is stringified, it returns the EUC-converted string so
 602 you can <print $j> instead of <print $j->euc>.
 603
 604 =over 2
 605
 606 =item Passing Reference
 607
 608 Instead of scalar value, You can use reference as
 609
 610 Jcode->new(\$str);
 611
 612 This saves time a little bit.  In exchange of the value of $str being
 613 converted. (In a way, $str is now "tied" to jcode object).
 614
 615 =back
 616
 617 =item $j-E<gt>set($str [, $icode])
 618
 619 Sets $j's internal string to $str.  Handy when you use Jcode object repeatedly
 620 (saves time and memory to create object).
 621
 622  # converts mailbox to SJIS format
 623  my $jconv = new Jcode;
 624  $/ = 00;
 625  while(&lt;&gt;){
 626      print $jconv->set(\$_)->mime_decode->sjis;
 627  }
 628
 629 =item $j-E<gt>append($str [, $icode]);
 630
 631 Appends $str to $j's internal string.
 632
 633 =item $j = jcode($str [, $icode]);
 634
 635 shortcut for Jcode->new() so you can go like;
 636
 637 =back
 638
 639 =head2 Encoded Strings
 640
 641 In general, you can retrieve I<encoded> string as $j-E<gt>I<encoded>.
 642
 643 =over 2
 644
 645 =item $sjis = jcode($str)->sjis
 646
 647 =item $euc = $j-E<gt>euc
 648
 649 =item $jis = $j-E<gt>jis
 650
 651 =item $sjis = $j-E<gt>sjis
 652
 653 =item $ucs2 = $j-E<gt>ucs2
 654
 655 =item $utf8 = $j-E<gt>utf8
 656
 657 What you code is what you get :)
 658
 659 =item $iso_2022_jp = $j-E<gt>iso_2022_jp
 660
 661 Same as C<< $j->h2z->jis >>.
 662 Hankaku Kanas are forcibly converted to Zenkaku.
 663
 664 For perl 5.8.1 and better, you can also use any encoding names and
 665 aliases that Encode supports.  For example:
 666
 667   $european = $j->iso_latin1; # replace '-' with '_' for names.
 668
 669 B<FYI>: L<Encode::Encoder> uses similar trick.
 670
 671 =over 2
 672
 673 =item $j-E<gt>fallback($fallback)
 674
 675 For perl is 5.8.1 or better, Jcode stores the internal string in
 676 UTF-8.  Any character that does not map to I<< -E<gt>encoding >> are
 677 replaced with a '?', which is L<Encode> standard.
 678
 679   my $unistr = "\x{262f}"; # YIN YANG
 680   my $j = jcode($unistr);  # $j->euc is '?'
 681
 682 You can change this behavior by specifying fallback like L<Encode>.
 683 Values are the same as L<Encode>.  C<Jcode::FB_PERLQQ>,
 684 C<Jcode::FB_XMLCREF>, C<Jcode::FB_HTMLCREF> are aliased to those
 685 of L<Encode> for convenice.
 686
 687   print $j->fallback(Jcode::FB_PERLQQ)->euc;   # '\x{262f}'
 688   print $j->fallback(Jcode::FB_XMLCREF)->euc;  # '&#x262f;'
 689   print $j->fallback(Jcode::FB_HTMLCREF)->euc; # '&#9775;'
 690
 691 The global variable C<$Jcode::FALLBACK> stores the default fallback so you can override that by assigning the value.
 692
 693   $Jcode::FALLBACK = Jcode::FB_PERLQQ; # set default fallback scheme
 694
 695 =back
 696
 697 =item [@lines =] $jcode-E<gt>jfold([$width, $newline_str, $kref])
 698
 699 folds lines in jcode string every $width (default: 72) where $width is
 700 the number of "halfwidth" character.  Fullwidth Characters are counted
 701 as two.
 702
 703 with a newline string spefied by $newline_str (default: "\n").
 704
 705 Rudimentary kinsoku suppport is now available for Perl 5.8.1 and better.
 706
 707 =item $length = $jcode-E<gt>jlength();
 708
 709 returns character length properly, rather than byte length.
 710
 711 =back
 712
 713 =head2 Methods that use MIME::Base64
 714
 715 To use methods below, you need L<MIME::Base64>.  To install, simply
 716
 717    perl -MCPAN -e 'CPAN::Shell->install("MIME::Base64")'
 718
 719 If your perl is 5.6 or better, there is no need since L<MIME::Base64>
 720 is bundled.
 721
 722 =over 2
 723
 724 =item $mime_header = $j-E<gt>mime_encode([$lf, $bpl])
 725
 726 Converts $str to MIME-Header documented in RFC1522.
 727 When $lf is specified, it uses $lf to fold line (default: \n).
 728 When $bpl is specified, it uses $bpl for the number of bytes (default: 76;
 729 this number must be smaller than 76).
 730
 731 For Perl 5.8.1 or better, you can also encode MIME Header as:
 732
 733   $mime_header = $j->MIME_Header;
 734
 735 In which case the resulting C<$mime_header> is MIME-B-encoded UTF-8
 736 whereas C<< $j->mime_encode() >> returnes MIME-B-encoded ISO-2022-JP.
 737 Most modern MUAs support both.
 738
 739 =item $j-E<gt>mime_decode;
 740
 741 Decodes MIME-Header in Jcode object.  For perl 5.8.1 or better, you
 742 can also do the same as:
 743
 744   Jcode->new($str, 'MIME-Header')
 745
 746 =back
 747
 748 =head2 Hankaku vs. Zenkaku
 749
 750 =over 2
 751
 752 =item $j-E<gt>h2z([$keep_dakuten])
 753
 754 Converts X201 kana (Hankaku) to X208 kana (Zenkaku).
 755 When $keep_dakuten is set, it leaves dakuten as is
 756 (That is, "ka + dakuten" is left as is instead of
 757 being converted to "ga")
 758
 759 You can retrieve the number of matches via $j->nmatch;
 760
 761 =item $j-E<gt>z2h
 762
 763 Converts X208 kana (Zenkaku) to X201 kana (Hankaku).
 764
 765 You can retrieve the number of matches via $j->nmatch;
 766
 767 =back
 768
 769 =head2 Regexp emulators
 770
 771 To use C<< -E<gt>m() >> and C<< -E<gt>s() >>, you need perl 5.8.1 or
 772 better.
 773
 774 =over 2
 775
 776 =item $j-E<gt>tr($from, $to, $opt);
 777
 778 Applies C<tr/$from/$to/> on Jcode object where $from and $to are
 779 EUC-JP strings.  On perl 5.8.1 or better, $from and $to can
 780 also be flagged UTF-8 strings.
 781
 782 If C<$opt> is set, C<tr/$from/$to/$opt> is applied.  C<$opt> must
 783 be 'c', 'd' or the combination thereof.
 784
 785 You can retrieve the number of matches via $j->nmatch;
 786
 787 The following methods are available only for perl 5.8.1 or better.
 788
 789 =item $j-E<gt>s($patter, $replace, $opt);
 790
 791 Applies C<s/$pattern/$replace/$opt>. C<$pattern> and C<replace> must
 792 be in EUC-JP or flagged UTF-8. C<$opt> are the same as regexp options.
 793 See L<perlre> for regexp options.
 794
 795 Like C<< $j->tr() >>, C<< $j->s() >> returns the object itself so
 796 you can nest the operation as follows;
 797
 798   $j->tr("a-z", "A-Z")->s("foo", "bar");
 799
 800 =item  [@match = ] $j-E<gt>m($pattern, $opt);
 801
 802 Applies C<m/$patter/$opt>.  Note that this method DOES NOT RETURN
 803 AN OBJECT so you can't chain the method like  C<< $j->s() >>.
 804
 805 =back
 806
 807 =head2 Instance Variables
 808
 809 If you need to access instance variables of Jcode object, use access
 810 methods below instead of directly accessing them (That's what OOP
 811 is all about)
 812
 813 FYI, Jcode uses a ref to array instead of ref to hash (common way) to
 814 optimize speed (Actually you don't have to know as long as you use
 815 access methods instead;  Once again, that's OOP)
 816
 817 =over 2
 818
 819 =item $j-E<gt>r_str
 820
 821 Reference to the EUC-coded String.
 822
 823 =item $j-E<gt>icode
 824
 825 Input charcode in recent operation.
 826
 827 =item $j-E<gt>nmatch
 828
 829 Number of matches (Used in $j->tr, etc.)
 830
 831 =back
 832
 833 =cut
 834
 835 =head1 Subroutines
 836
 837 =over 2
 838
 839 =item ($code, [$nmatch]) = getcode($str)
 840
 841 Returns char code of $str. Return codes are as follows
 842
 843  ascii   Ascii (Contains no Japanese Code)
 844  binary  Binary (Not Text File)
 845  euc     EUC-JP
 846  sjis    SHIFT_JIS
 847  jis     JIS (ISO-2022-JP)
 848  ucs2    UCS2 (Raw Unicode)
 849  utf8    UTF8
 850
 851 When array context is used instead of scaler, it also returns how many
 852 character codes are found.  As mentioned above, $str can be \$str
 853 instead.
 854
 855 B<jcode.pl Users:>  This function is 100% upper-conpatible with
 856 jcode::getcode() -- well, almost;
 857
 858  * When its return value is an array, the order is the opposite;
 859    jcode::getcode() returns $nmatch first.
 860
 861  * jcode::getcode() returns 'undef' when the number of EUC characters
 862    is equal to that of SJIS.  Jcode::getcode() returns EUC.  for
 863    Jcode.pm there is no in-betweens.
 864
 865 =item Jcode::convert($str, [$ocode, $icode, $opt])
 866
 867 Converts $str to char code specified by $ocode.  When $icode is specified
 868 also, it assumes $icode for input string instead of the one checked by
 869 getcode(). As mentioned above, $str can be \$str instead.
 870
 871 B<jcode.pl Users:>  This function is 100% upper-conpatible with
 872 jcode::convert() !
 873
 874 =back
 875
 876 =head1 BUGS
 877
 878 For perl is 5.8.1 or later, Jcode acts as a wrapper to L<Encode>.
 879 Meaning Jcode is subject to bugs therein.
 880
 881 =head1 ACKNOWLEDGEMENTS
 882
 883 This package owes a lot in motivation, design, and code, to the jcode.pl
 884 for Perl4 by Kazumasa Utashiro <utashiro@iij.ad.jp>.
 885
 886 Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp> has helped me polish regexp from the
 887 very first stage of development.
 888
 889 JEncode by makamaka@donzoko.net has inspired me to integrate Encode to
 890 Jcode.  He has also contributed Japanese POD.
 891
 892 And folks at Jcode Mailing list <jcode5@ring.gr.jp>.  Without them, I
 893 couldn't have coded this far.
 894
 895 =head1 SEE ALSO
 896
 897 L<Encode>
 898
 899 L<Jcode::Nihongo>
 900
 901 L<http://www.iana.org/assignments/character-sets>
 902
 903 =head1 COPYRIGHT
 904
 905 Copyright 1999-2005 Dan Kogai <dankogai@dan.co.jp>
 906
 907 This library is free software; you can redistribute it
 908 and/or modify it under the same terms as Perl itself.
 909
 910 =cut