Encode/Unicode/UTF8.pm

=head1 NAME

Encode::Unicode::UTF8 --- Encode/decode of UTF-8 related encodings

=head1 ENCODINGS

=over 4

=cut

# This module needs perl 5.7.3 or later.
require v5.7.3;
package Encode::Unicode::UTF8;
use strict;
use vars qw($VERSION);
# Derive $VERSION from the RCS "Revision" keyword: the first number is used as
# is and every following number is formatted with two digits ("1.2" -> "1.02").
$VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

# CESU-8 codec class.  It inherits from Encode::Encoding and calls Define
# with the names "CESU-8", "cesu8" and "csCESU-8".
package Encode::Unicode::UTF8::CESU8;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/CESU-8 cesu8 csCESU-8/);

=item CESU-8

Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8),
defined in UTR #26.  (Alias: csCESU-8 (IANA), cesu8)

=cut

# Cache: supplementary character => its two-surrogate replacement string.
my %_U2C;

# encode ($obj, $str [, $chk]) -- convert the character string $str to CESU-8.
#
# Every character in U+10000..U+10FFFF is replaced by its UTF-16 surrogate
# pair, then the whole string is serialised as (lax) UTF-8 octets, so each
# surrogate becomes a three-octet sequence.
#
#   $obj  the encoding object (unused)
#   $str  character string to encode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the encoded octet string.
sub encode ($$;$) {
  use integer;
  my ($obj, $str, $chk) = @_;
  $_[1] = '' if $chk;
  $str =~ s{([\x{010000}-\x{10FFFF}])}{
    my $char = $1;
    my $offset = ord ($char) - 0x10000;
    # High surrogate carries the upper ten bits, low surrogate the lower ten.
    $_U2C{$char} ||= chr ($offset / 0x400 + 0xD800)
                   . chr ($offset % 0x400 + 0xDC00);
  }ge;
  # utf8::encode (rather than merely clearing the UTF8 flag) also handles
  # strings perl still stores as Latin-1: without it U+0080..U+00FF would
  # be emitted as single raw octets instead of two-octet UTF-8 sequences.
  utf8::encode ($str);
  $str;
}

# Cache: surrogate pair (two characters) => the supplementary character.
my %_C2U;

# decode ($obj, $str [, $chk]) -- convert CESU-8 octets to a character string.
#
# The octets are flagged as UTF-8 in place (no validation is performed here),
# then every high surrogate immediately followed by a low surrogate is
# replaced by the supplementary character the pair stands for.
#
#   $obj  the encoding object (unused)
#   $str  octet string to decode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the decoded character string.
sub decode ($$;$) {
  no warnings;
  my ($obj, $str, $chk) = @_;
  $_[1] = '' if $chk;
  Encode::_utf8_on ($str);
  $str =~ s{([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])}{
    my ($high, $low) = (ord ($1), ord ($2));
    $_C2U{chr ($high) . chr ($low)}
      ||= chr (0x10000 + (($high - 0xD800) << 10) + ($low - 0xDC00));
  }ge;
  return $str;
}

# UTF-8-Mod codec class.  It inherits from Encode::Encoding and calls Define
# with the names "utf-8-mod" and "utf8-mod".
package Encode::Unicode::UTF8::UTF8Mod;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/utf-8-mod utf8-mod/);

=item utf-8-mod

Modified UTF-8 for UTF-EBCDIC, defined in UTR #16.
(Alias: utf8-mod)

=cut

# Cache: character => its UTF-8-Mod representation.
my %_4to8m;

# encode ($obj, $str [, $chk]) -- convert the character string $str to
# UTF-8-Mod.
#
# Characters up to U+009F are copied unchanged; every other character is
# converted by _ucs4_to_utf8m.
#
#   $obj  the encoding object (unused)
#   $str  character string to encode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the encoded string.
sub encode ($$;$) {
  my ($obj, $str, $chk) = @_;
  $_[1] = '' if $chk;
  my $encoded = '';
  for my $char (split //, $str) {
    $_4to8m{$char} ||= ord ($char) <= 0x9F ? $char
                                           : _ucs4_to_utf8m (ord $char);
    $encoded .= $_4to8m{$char};
  }
  return $encoded;
}

# Cache: UTF-8-Mod multi-octet sequence => decoded character.
my %_8mto4;

# decode ($obj, $str [, $chk]) -- convert UTF-8-Mod octets to a character
# string.
#
# Octets below 0xC0 that are not part of a multi-octet sequence are left
# unchanged.  A lead octet followed by the number of trail octets it
# announces is replaced by the character it encodes:
#
#   lead 0xC0-0xDF  2 octets  5 payload bits in the lead octet
#   lead 0xE0-0xEF  3 octets  4 payload bits
#   lead 0xF0-0xF7  4 octets  3 payload bits
#   lead 0xF8-0xFB  5 octets  2 payload bits
#   lead 0xFC-0xFD  6 octets  1 payload bit
#   lead 0xFE-0xFF  7 octets  1 payload bit
#
# Every trail octet contributes its low five bits.
#
#   $obj  the encoding object (unused)
#   $str  octet string to decode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the decoded character string.
#
# NOTE(review): trail octets are matched as 0xA0-0xFF although the encoder in
# this file only ever emits 0xA0-0xBF; kept lenient to preserve the existing
# behaviour on malformed input -- confirm whether it should be tightened.
sub decode ($$;$) {
  my ($obj, $str, $chk) = @_;
  # Payload mask of the lead octet, indexed by total sequence length.
  # The three-octet entry must be 0x0F: a lead octet of 0xE0-0xEF carries
  # four payload bits, so masking with 0x03 loses U+1000 and above.
  my @lead_mask = (0, 0, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x01);
  $str =~ s{
    ( [\xC0-\xDF] [\xA0-\xFF]
    | [\xE0-\xEF] [\xA0-\xFF]{2}
    | [\xF0-\xF7] [\xA0-\xFF]{3}
    | [\xF8-\xFB] [\xA0-\xFF]{4}
    | [\xFC\xFD]  [\xA0-\xFF]{5}
    | [\xFE\xFF]  [\xA0-\xFF]{6}
    )
  }{
    my $seq = $1;
    unless (defined $_8mto4{$seq}) {
      my ($lead, @trail) = map { ord } split //, $seq;
      my $ucs = $lead & $lead_mask[length $seq];
      for my $octet (@trail) {
        $ucs = ($ucs << 5) | ($octet & 0x1F);
      }
      $_8mto4{$seq} = chr $ucs;
    }
    $_8mto4{$seq};
  }gex;
  $_[1] = '' if $chk;
  return $str;
}

# _ucs4_to_utf8m ($U) -- return the UTF-8-Mod octet string for code point $U.
#
# The lead octet announces the sequence length and holds the topmost payload
# bits; every trail octet is 0xA0 plus five payload bits:
#
#   up to 0x9F        1 octet   (the value itself)
#   up to 0x3FF       2 octets  lead 0xC0 | 5 bits
#   up to 0x3FFF      3 octets  lead 0xE0 | 4 bits
#   up to 0x3FFFF     4 octets  lead 0xF0 | 3 bits
#   up to 0x3FFFFF    5 octets  lead 0xF8 | 2 bits
#   up to 0x3FFFFFF   6 octets  lead 0xFC | 1 bit
#   anything larger   7 octets  lead 0xFE | 1 bit (bit 30 of $U)
sub _ucs4_to_utf8m ($) {
  my $U = shift;
  return pack 'C', $U if $U <= 0x009F;

  # Trail octet holding the five payload bits that start at bit $shift.
  # The mask is always 0x1F; the three-octet form used to mask its last
  # octet with 0x4F, which dropped bit 4 and leaked bit 6 of the code point.
  my $trail = sub {
    my $shift = shift;
    return 0xA0 | (($U >> $shift) & 0x1F);
  };

  if ($U <= 0x03FF) {
    return pack 'C2', (0xC0 | ($U >> 5)), $trail->(0);
  } elsif ($U <= 0x3FFF) {
    return pack 'C3', (0xE0 | ($U >> 10)), map $trail->($_), 5, 0;
  } elsif ($U <= 0x0003FFFF) {
    return pack 'C4', (0xF0 | ($U >> 15)), map $trail->($_), 10, 5, 0;
  } elsif ($U <= 0x003FFFFF) {
    return pack 'C5', (0xF8 | ($U >> 20)), map $trail->($_), 15, 10, 5, 0;
  } elsif ($U <= 0x03FFFFFF) {
    return pack 'C6', (0xFC | ($U >> 25)), map $trail->($_), 20, 15, 10, 5, 0;
  } else {#if ($U <= 0x7FFFFFFF) {
    return pack 'C7', (0xFE | (($U >> 30) & 0x01)),
                      map $trail->($_), 25, 20, 15, 10, 5, 0;
  }
}

# UTF-EBCDIC (without BOM) codec class.  It inherits from Encode::Encoding and
# calls Define with the names "utf-ebcdic", "ef-utf" and
# "utf-ebcdic-without-bom".
package Encode::Unicode::UTF8::UTFEBCDIC;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/utf-ebcdic ef-utf utf-ebcdic-without-bom/);

=item utf-ebcdic

UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
defined in UTR #16, without BOM. (Alias: ef-utf, utf-ebcdic-without-bom)

=cut

# tr/// character lists used to map between UTF-8-Mod octets and UTF-EBCDIC
# octets.  $_tbl_u8m lists the 256 UTF-8-Mod octet values in ascending order;
# $_tbl_ue lists, position by position, the octet each of them is mapped to.
my $_tbl_u8m = q(\x00-\xFF);
my $_tbl_ue = q(\x00-\x03\x37\x2D\x2E\x2F\x16\x05\x15\x0B-\x0F\x10-\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C-\x1F\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61\xF0-\xF9\x7A\x5E\x4C\x7E\x6E\x6F\x7C\xC1-\xC9\xD1-\xD9\xE2-\xE9\xAD\xE0\xBD\x5F\x6D\x79\x81-\x89\x91-\x99\xA2-\xA9\xC0\x4F\xD0\xA1\x07\x20-\x25\x06\x17\x28-\x2C\x09\x0A\x1B\x30\x31\x1A\x33-\x36\x08\x38-\x3B\x04\x14\x3E\xFF\x41-\x49\x4A\x51-\x59\x62-\x6A\x70-\x78\x80\x8A-\x90\x9A-\xA0\xAA-\xAC\xAE-\xBC\xBE\xBF\xCA-\xCF\xDA-\xDF\xE1\xEA-\xEF\xFA-\xFE);

# encode ($obj, $str [, $chk]) -- convert the character string $str to
# UTF-EBCDIC (no BOM): encode as "utf-8-mod", then map every octet through
# the table above.
#
#   $obj  the encoding object (unused)
#   $str  character string to encode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the encoded string; dies if the generated tr/// fails to compile.
sub encode ($$;$) {
  my ($obj, $str, $chk) = @_;
  $str = Encode::encode ('utf-8-mod', $str);
  # tr/// returns the number of characters it translated, which is 0 for an
  # empty string; the trailing "1" makes the eval value reflect compilation
  # success only, so empty input no longer dies.
  eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/; 1} or die $@;
  $_[1] = '' if $chk;
  return $str;
}

# decode ($obj, $str [, $chk]) -- convert UTF-EBCDIC octets (no BOM handling)
# to a character string: map every octet back to its UTF-8-Mod value, then
# decode the result as "utf-8-mod".
#
#   $obj  the encoding object (unused)
#   $str  octet string to decode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the decoded string; dies if the generated tr/// fails to compile.
sub decode ($$;$) {
  my ($obj, $str, $chk) = @_;
  # tr/// returns the number of characters it translated, which is 0 for an
  # empty string; the trailing "1" makes the eval value reflect compilation
  # success only, so empty input no longer dies.
  eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/; 1} or die $@;
  $_[1] = '' if $chk;
  return Encode::decode ('utf-8-mod', $str);
}

# UTF-EBCDIC (with BOM) codec class.  It inherits from Encode::Encoding and
# calls Define with the name "utf-ebcdic-with-bom".
package Encode::Unicode::UTF8::UTFEBCDICwBOM;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/utf-ebcdic-with-bom/);

=item utf-ebcdic-with-bom

UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
defined in UTR #16, with BOM

=cut

# encode ($obj, $str [, $chk]) -- convert the character string $str to
# UTF-EBCDIC with a leading BOM: U+FEFF is prepended, the result is encoded
# as "utf-8-mod", and every octet is mapped through the translation tables.
#
#   $obj  the encoding object (unused)
#   $str  character string to encode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the encoded string; dies if the generated tr/// fails to compile.
sub encode ($$;$) {
  my ($obj, $str, $chk) = @_;
  $str = Encode::encode ('utf-8-mod', "\x{FEFF}".$str);
  # tr/// returns the number of characters it translated; the trailing "1"
  # makes the eval value reflect compilation success only, so the check can
  # never be tripped by a zero count.
  eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/; 1} or die $@;
  $_[1] = '' if $chk;
  return $str;
}

# decode ($obj, $str [, $chk]) -- convert UTF-EBCDIC octets to a character
# string: map every octet back to its UTF-8-Mod value, decode the result as
# "utf-8-mod", and remove one leading U+FEFF (BOM) if present.
#
#   $obj  the encoding object (unused)
#   $str  octet string to decode
#   $chk  when true, the caller's source argument ($_[1]) is emptied
#
# Returns the decoded string; dies if the generated tr/// fails to compile.
sub decode ($$;$) {
  my ($obj, $str, $chk) = @_;
  # tr/// returns the number of characters it translated, which is 0 for an
  # empty string; the trailing "1" makes the eval value reflect compilation
  # success only, so empty input no longer dies.
  eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/; 1} or die $@;
  $_[1] = '' if $chk;
  # Reuse $str instead of declaring a second "my $str" in the same scope.
  $str = Encode::decode ('utf-8-mod', $str);
  $str =~ s/^\x{FEFF}//;
  return $str;
}

1;

=back

Note that UTF-8-Mod and UTF-EBCDIC are supported by perl
for EBCDIC platforms.  If we can use that code (written in C),
conversion of those encodings will become faster.

Note also that UTF-8 -> CESU-8 could be implemented as
utf8_off(decode_ucs2(encode_utf16(utf8))) and CESU-8 -> UTF-8
could be implemented as decode_utf16(encode_ucs2(cesu8)),
if Encode::Unicode did not check for malformed UTF-8 sequences.
It might make conversion faster when XS is used.

=head1 SEE ALSO

"UTF-EBCDIC", Unicode Technical Report #16,
<http://www.unicode.org/unicode/reports/tr16/>.

"Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)",
Unicode Technical Report #26, <http://www.unicode.org/unicode/reports/tr26/>.

=head1 LICENSE

Copyright 2002 Nanashi-san

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.

=cut

## $Date: 2002/09/15 04:15:51 $
### UTF8.pm ends here
1	wakaba	1.1	=head1 NAME
2
3			Encode::Unicode::UTF8 --- Encode/decode of UTF-8 related encodings
4
5			=head1 ENCODINGS
6
7			=over 4
8
9			=cut
10
11			require v5.7.3;
12			package Encode::Unicode::UTF8;
13			use strict;
14			use vars qw($VERSION);
15			$VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
16
17			package Encode::Unicode::UTF8::CESU8;
18			use base qw(Encode::Encoding);
19			__PACKAGE__->Define (qw/CESU-8 cesu8 csCESU-8/);
20
21			=item CESU-8
22
23			Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8),
24			defined in UTR #26. (Alias: csCESU-8 (IANA), cesu8)
25
26			=cut
27
28			my %_U2C;
29			sub encode ($$;$) {
30			use integer;
31			my ($obj, $str, $chk) = @_;
32			$_[1] = '' if $chk;
33			$str =~ s{([\x{010000}-\x{10FFFF}])}{
34			my $u = $1;
35			unless ($_U2C{$u}) {
36			$_U2C{$u} = chr ((ord ($u) - 0x10000) / 0x400 + 0xD800).
37			chr ((ord ($u) - 0x10000) % 0x400 + 0xDC00);
38			}
39			$_U2C{$u};
40			}ge;
41			Encode::_utf8_off ($str);
42			$str;
43			}
44
45			my %_C2U;
46			sub decode ($$;$) {
47			no warnings;
48			my ($obj, $str, $chk) = @_;
49			$_[1] = '' if $chk;
50			Encode::_utf8_on ($str);
51			$str =~ s{([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])}{
52			my ($u1,$u2) = ($1,$2);
53			unless ($_C2U{$u1.$u2}) {
54			$_C2U{$u1.$u2} = chr (0x10000+(ord($u1)-0xD800)*0x400+(ord($u2)-0xDC00));
55			}
56			$_C2U{$u1.$u2};
57			}ge;
58			return $str;
59			}
60
61			package Encode::Unicode::UTF8::UTF8Mod;
62			use base qw(Encode::Encoding);
63			__PACKAGE__->Define (qw/utf-8-mod utf8-mod/);
64
65			=item utf-8-mod
66
67			Modified UTF-8 for UTF-EBCDIC, defined in UTR #16.
68			(Alias: utf8-mod)
69
70			=cut
71
72			my %_4to8m;
73			sub encode ($$;$) {
74			my ($obj, $str, $chk) = @_;
75			my $r = '';
76			for (split //, $str) {
77			unless ($_4to8m{$_}) {
78			my $U = ord $_;
79			if ($U <= 0x9F) {
80			$_4to8m{$_} = $_;
81			} else {
82			$_4to8m{$_} = _ucs4_to_utf8m ($U);
83			}
84			}
85			$r .= $_4to8m{$_};
86			}
87			$_[1] = '' if $chk;
88			return $r;
89			}
90
91			my %_8mto4;
92			sub decode ($$;$) {
93			my ($obj, $str, $chk) = @_;
94			$str =~ s{
95			([\xC0-\xDF][\xA0-\xFF])
96			\|([\xE0-\xEF][\xA0-\xFF][\xA0-\xFF])
97			\|([\xF0-\xF7][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
98			\|([\xF8-\xFB][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
99			\| ([\xFC\xFD][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
100			\| ([\xFE\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
101			}{
102			my ($o2,$o3,$o4,$o5,$o6,$o7) = ($1,$2,$3,$4,$5,$6);
103			unless ($_8mto4{$o2.$o3.$o4.$o5.$o6.$o7}) {
104			if ($o2) {
105			my @o = split //, $o2;
106			$_8mto4{$o2} =
107			chr (((ord ($o[0]) & 0x1F) << 5) + (ord ($o[1]) & 0x1F));
108			} elsif ($o3) {
109			my @o = split //, $o3;
110			$_8mto4{$o3} =
111			chr (((ord ($o[0]) & 0x03) << 10) + ((ord ($o[1]) & 0x1F) << 5)
112			+ (ord ($o[2]) & 0x1F));
113			} elsif ($o4) {
114			my @o = split //, $o4;
115			$_8mto4{$o4} =
116			chr (((ord ($o[0]) & 0x07) << 15) + ((ord ($o[1]) & 0x1F) << 10)
117			+ ((ord ($o[2]) & 0x1F) << 5) + (ord ($o[3]) & 0x1F));
118			} elsif ($o5) {
119			my @o = split //, $o5;
120			$_8mto4{$o5} =
121			chr (((ord ($o[0]) & 0x03) << 20) + ((ord ($o[1]) & 0x1F) << 15)
122			+ ((ord ($o[2]) & 0x1F) << 10) + ((ord ($o[3]) & 0x1F) << 5)
123			+ (ord ($o[4]) & 0x1F));
124			} elsif ($o6) {
125			my @o = split //, $o6;
126			$_8mto4{$o6} =
127			chr (((ord ($o[0]) & 0x01) << 25) + ((ord ($o[1]) & 0x1F) << 20)
128			+ ((ord ($o[2]) & 0x1F) << 15) + ((ord ($o[3]) & 0x1F) << 10)
129			+ ((ord ($o[4]) & 0x1F) << 5) + (ord ($o[5]) & 0x1F));
130			} else {
131			my @o = split //, $o7;
132			$_8mto4{$o7} =
133			chr (((ord ($o[0]) & 0x01) << 30) + ((ord ($o[1]) & 0x1F) << 25)
134			+ ((ord ($o[2]) & 0x1F) << 20) + ((ord ($o[3]) & 0x1F) << 15)
135			+ ((ord ($o[4]) & 0x1F) << 10) + ((ord ($o[5]) & 0x1F) << 5)
136			+ (ord ($o[6]) & 0x1F));
137			}
138			}
139			$_8mto4{$o2.$o3.$o4.$o5.$o6.$o7};
140			}goex;
141			$_[1] = '' if $chk;
142			return $str;
143			}
144
145			sub _ucs4_to_utf8m ($) {
146			my $U = shift;
147			if ($U <= 0x009F) {
148			return pack 'C', $U;
149			} elsif ($U <= 0x03FF) {
150			return pack 'C2', (0xC0 \| ($U >> 5)), (0xA0 \| ($U & 0x1F));
151			} elsif ($U <= 0x3FFF) {
152			return pack 'C3', (0xE0 \| ($U >> 10)), (0xA0 \| (($U >> 5) & 0x1F)),
153			(0xA0 \| ($U & 0x4F));
154			} elsif ($U <= 0x0003FFFF) {
155			return pack 'C4', (0xF0 \| ($U >> 15)), (0xA0 \| (($U >> 10) & 0x1F)),
156			(0xA0 \| (($U >> 5) & 0x1F)), (0xA0 \| ($U & 0x1F));
157			} elsif ($U <= 0x003FFFFF) {
158			return pack 'C5', (0xF8 \| ($U >> 20)),
159			(0xA0 \| (($U >> 15) & 0x1F)), (0xA0 \| (($U >> 10) & 0x1F)),
160			(0xA0 \| (($U >> 5) & 0x1F)), (0xA0 \| ($U & 0x1F));
161			} elsif ($U <= 0x03FFFFFF) {
162			return pack 'C6', (0xFC \| ($U >> 25)), (0xA0 \| (($U >> 20) & 0x1F)),
163			(0xA0 \| (($U >> 15) & 0x1F)), (0xA0 \| (($U >> 10) & 0x1F)),
164			(0xA0 \| (($U >> 5) & 0x1F)), (0xA0 \| ($U & 0x1F));
165			} else {#if ($U <= 0x7FFFFFFF) {
166			return pack 'C7', (0xFE \| (($U >> 30) & 0x01)), (0xA0 \| (($U >> 25) & 0x1F)),
167			(0xA0 \| (($U >> 20) & 0x1F)), (0xA0 \| (($U >> 15) & 0x1F)),
168			(0xA0 \| (($U >> 10) & 0x1F)), (0xA0 \| (($U >> 5) & 0x1F)),
169			(0xA0 \| ($U & 0x1F));
170			}
171			}
172
173			package Encode::Unicode::UTF8::UTFEBCDIC;
174			use base qw(Encode::Encoding);
175			__PACKAGE__->Define (qw/utf-ebcdic ef-utf utf-ebcdic-without-bom/);
176
177			=item utf-ebcdic
178
179			UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
180			defined in UTR #16, without BOM. (Alias: ef-utf, utf-ebcdic-without-bom)
181
182			=cut
183
184			my $_tbl_u8m = q(\x00-\xFF);
185			my $_tbl_ue = q(\x00-\x03\x37\x2D\x2E\x2F\x16\x05\x15\x0B-\x0F\x10-\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C-\x1F\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61\xF0-\xF9\x7A\x5E\x4C\x7E\x6E\x6F\x7C\xC1-\xC9\xD1-\xD9\xE2-\xE9\xAD\xE0\xBD\x5F\x6D\x79\x81-\x89\x91-\x99\xA2-\xA9\xC0\x4F\xD0\xA1\x07\x20-\x25\x06\x17\x28-\x2C\x09\x0A\x1B\x30\x31\x1A\x33-\x36\x08\x38-\x3B\x04\x14\x3E\xFF\x41-\x49\x4A\x51-\x59\x62-\x6A\x70-\x78\x80\x8A-\x90\x9A-\xA0\xAA-\xAC\xAE-\xBC\xBE\xBF\xCA-\xCF\xDA-\xDF\xE1\xEA-\xEF\xFA-\xFE);
186			sub encode ($$;$) {
187			my ($obj, $str, $chk) = @_;
188			$str = Encode::encode ('utf-8-mod', $str);
189			eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
190			$_[1] = '' if $chk;
191			return $str;
192			}
193
194			sub decode ($$;$) {
195			my ($obj, $str, $chk) = @_;
196			eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
197			$_[1] = '' if $chk;
198			return Encode::decode ('utf-8-mod', $str);
199			}
200
201			package Encode::Unicode::UTF8::UTFEBCDICwBOM;
202			use base qw(Encode::Encoding);
203			__PACKAGE__->Define (qw/utf-ebcdic-with-bom/);
204
205			=item utf-ebcdic-with-bom
206
207			UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
208			defined in UTR #16, with BOM
209
210			=cut
211
212			sub encode ($$;$) {
213			my ($obj, $str, $chk) = @_;
214			$str = Encode::encode ('utf-8-mod', "\x{FEFF}".$str);
215			eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
216			$_[1] = '' if $chk;
217			$str;
218			}
219
220			sub decode ($$;$) {
221			my ($obj, $str, $chk) = @_;
222			eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
223			$_[1] = '' if $chk;
224			my $str = Encode::decode ('utf-8-mod', $str);
225			$str =~ s/^\x{FEFF}//;
226			$str;
227			}
228
229			1;
230
231			=back
232
233			Note that UTF-8-Mod and UTF-EBCDIC are supported by perl
234			for EBCDIC platforms. If we can use that code (written in C),
235			convertion of those encodings will become faster.
236
237			Note also that UTF-8 -> CESU-8 could be implemented as
238			utf8_off(decode_ucs2(encode_utf16(utf8))) and CESU-8 -> UTF-8
239			could be implemented as decode_utf16(encode_ucs2(cesu8)),
240			if Encode::Unicode did not check malformed UTF-8 sequences.
241			It might make convertion faster when XS is used.
242
243			=head1 SEE ALSO
244
245			"UTF-EBCDIC", Unicode Technical Report #16,
246			<http://www.unicode.org/unicode/reports/tr16/>.
247
248			"Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)",
249			Unicode Technical Report #26, <http://www.unicode.org/unicode/reports/tr26/>.
250
251			=head1 LICENSE
252
253			Copyright 2002 Nanashi-san
254
255			This program is free software; you can redistribute it and/or modify
256			it under the terms of the GNU General Public License as published by
257			the Free Software Foundation; either version 2 of the License, or
258			(at your option) any later version.
259
260			This program is distributed in the hope that it will be useful,
261			but WITHOUT ANY WARRANTY; without even the implied warranty of
262			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
263			GNU General Public License for more details.
264
265			You should have received a copy of the GNU General Public License
266			along with this program; see the file COPYING. If not, write to
267			the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
268			Boston, MA 02111-1307, USA.
269
270			=cut
271
272			## $Date: 2002/09/15 04:15:51 $
273			### UTF8.pm ends here