Encode/Unicode/UTF8.pm

=head1 NAME

Encode::Unicode::UTF8 --- Encode/decode of UTF-8 related encodings

=head1 ENCODINGS

=over 4

=cut

# Refuse to load on perls older than 5.7.3.
require v5.7.3;
package Encode::Unicode::UTF8;
# Note: "use strict" is lexically scoped, so it stays in effect for every
# package declared further down in this file.
use strict;
use vars qw($VERSION);
# Derive $VERSION from the RCS "Revision" keyword: the first number is kept
# as-is and each following number is appended as two digits (1.2 -> "1.02").
$VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

# CESU-8 codec class.  It inherits from Encode::Encoding and passes the
# names "CESU-8", "cesu8" and "csCESU-8" to Define() so that Encode can
# look this class up by any of them.
package Encode::Unicode::UTF8::CESU8;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/CESU-8 cesu8 csCESU-8/);

=item CESU-8

Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8),
defined in UTR #26.  (Alias: csCESU-8 (IANA), cesu8)

=cut

# Cache: supplementary character => its two-character surrogate pair.
my %_U2C;

# encode ($obj, $str [, $chk])
#
# Convert a Perl character string to CESU-8 octets.  Every character in
# U+10000..U+10FFFF is first replaced by its UTF-16 surrogate pair; the
# whole string is then serialised with perl's internal (lax) UTF-8 encoder,
# which yields two 3-octet sequences for each pair.
#
# If $chk is true the caller's source variable ($_[1]) is emptied, marking
# the whole input as consumed.  Returns the octet string.
sub encode ($$;$) {
  use integer;
  my ($obj, $str, $chk) = @_;
  $_[1] = '' if $chk;
  $str =~ s{([\x{010000}-\x{10FFFF}])}{
    my $u = $1;
    unless (exists $_U2C{$u}) {
      my $offset = ord ($u) - 0x10000;
      $_U2C{$u} = chr ($offset / 0x400 + 0xD800)
                . chr ($offset % 0x400 + 0xDC00);
    }
    $_U2C{$u};
  }ge;
  # A string holding only characters below U+0100 may still be stored as
  # raw Latin-1 octets; dropping the UTF-8 flag would then be a no-op and
  # U+0080..U+00FF would leak out as single bytes.  Force the internal
  # UTF-8 representation before exposing it as octets.
  utf8::upgrade ($str);
  Encode::_utf8_off ($str);
  $str;
}

# Cache: surrogate pair (two characters) => the supplementary character.
my %_C2U;

# decode ($obj, $str [, $chk])
#
# Convert CESU-8 octets to a Perl character string.  The octets are
# flagged as UTF-8 without validation, then every high/low surrogate pair
# is merged into the supplementary character it stands for.
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the character string.
sub decode ($$;$) {
  no warnings;
  my ($obj, $str, $chk) = @_;
  $_[1] = '' if $chk;
  Encode::_utf8_on ($str);
  $str =~ s{([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])}{
    my ($hi, $lo) = ($1, $2);
    my $pair = $hi.$lo;
    # The merged character is never a false value, so ||= caches reliably.
    $_C2U{$pair} ||= chr (0x10000+(ord($hi)-0xD800)*0x400+(ord($lo)-0xDC00));
  }ge;
  $str;
}

# UTF-8-Mod codec class.  It inherits from Encode::Encoding and passes the
# names "utf-8-mod" and "utf8-mod" to Define() so that Encode can look this
# class up by either of them.
package Encode::Unicode::UTF8::UTF8Mod;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/utf-8-mod utf8-mod/);

=item utf-8-mod

Modified UTF-8 for UTF-EBCDIC, defined in UTR #16.
(Alias: utf8-mod)

=cut

# Cache: character => its UTF-8-Mod octet sequence.
my %_4to8m;

# encode ($obj, $str [, $chk])
#
# Convert a Perl character string to UTF-8-Mod.  Characters up to U+009F
# are copied through unchanged; everything else is converted with
# _ucs4_to_utf8m().
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the encoded string.
sub encode ($$;$) {
  my ($obj, $str, $chk) = @_;
  my $r = '';
  for my $char (split //, $str) {
    # Test with exists, not truthiness: the cached value for the character
    # "0" is the false string "0" and would otherwise be rebuilt each time.
    unless (exists $_4to8m{$char}) {
      my $U = ord $char;
      $_4to8m{$char} = $U <= 0x9F ? $char : _ucs4_to_utf8m ($U);
    }
    $r .= $_4to8m{$char};
  }
  $_[1] = '' if $chk;
  return $r;
}

# Cache: UTF-8-Mod multi-octet sequence => decoded character.
my %_8mto4;

# decode ($obj, $str [, $chk])
#
# Convert a UTF-8-Mod string to a Perl character string.  Each multi-octet
# sequence (a lead octet in 0xC0..0xFF followed by one to six trail octets)
# is replaced by the character it encodes; everything else is left as-is.
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the decoded string.
sub decode ($$;$) {
  my ($obj, $str, $chk) = @_;
  # Payload bits carried by the lead octet, indexed by sequence length.
  # The 3-octet lead (1110zzzz) carries four bits, hence 0x0F.
  my @lead_mask = (0, 0, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x01);
  # NOTE(review): the trail class is kept as \xA0-\xFF, as before; UTR #16
  # trail octets appear to be 0xA0..0xBF only -- confirm before tightening.
  $str =~ s{
    (  [\xC0-\xDF][\xA0-\xFF]
     | [\xE0-\xEF][\xA0-\xFF]{2}
     | [\xF0-\xF7][\xA0-\xFF]{3}
     | [\xF8-\xFB][\xA0-\xFF]{4}
     | [\xFC\xFD][\xA0-\xFF]{5}
     | [\xFE\xFF][\xA0-\xFF]{6}
    )
  }{
    my $seq = $1;
    unless (exists $_8mto4{$seq}) {
      my ($lead, @trail) = map { ord } split //, $seq;
      my $ucs = $lead & $lead_mask[length $seq];
      # Every trail octet contributes its low five bits.
      $ucs = ($ucs << 5) | ($_ & 0x1F) for @trail;
      $_8mto4{$seq} = chr $ucs;
    }
    $_8mto4{$seq};
  }gex;
  $_[1] = '' if $chk;
  return $str;
}

# _ucs4_to_utf8m ($U)
#
# Return the UTF-8-Mod octet sequence for the code point $U.
# Values up to 0x9F are a single octet.  Larger values use a lead octet
# followed by trail octets of the form 101xxxxx (0xA0 | five payload bits):
#
#   up to 0x3FF       2 octets, lead 0xC0
#   up to 0x3FFF      3 octets, lead 0xE0
#   up to 0x3FFFF     4 octets, lead 0xF0
#   up to 0x3FFFFF    5 octets, lead 0xF8
#   up to 0x3FFFFFF   6 octets, lead 0xFC
#   anything larger   7 octets, lead 0xFE
sub _ucs4_to_utf8m ($) {
  my $U = shift;
  return pack 'C', $U if $U <= 0x009F;
  my ($len, $lead) = $U <= 0x000003FF ? (2, 0xC0)
                   : $U <= 0x00003FFF ? (3, 0xE0)
                   : $U <= 0x0003FFFF ? (4, 0xF0)
                   : $U <= 0x003FFFFF ? (5, 0xF8)
                   : $U <= 0x03FFFFFF ? (6, 0xFC)
                   :                    (7, 0xFE);
  # Peel off five payload bits per trail octet, least significant first.
  my @trail;
  for (2 .. $len) {
    unshift @trail, 0xA0 | ($U & 0x1F);
    $U >>= 5;
  }
  # The 7-octet lead carries a single payload bit; drop anything above it.
  $U &= 0x01 if $len == 7;
  return pack 'C*', $lead | $U, @trail;
}

# UTF-EBCDIC (no BOM) codec class.  It inherits from Encode::Encoding and
# passes the names "utf-ebcdic", "ef-utf" and "utf-ebcdic-without-bom" to
# Define() so that Encode can look this class up by any of them.
package Encode::Unicode::UTF8::UTFEBCDIC;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/utf-ebcdic ef-utf utf-ebcdic-without-bom/);

=item utf-ebcdic

UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
defined in UTR #16, without BOM. (Alias: ef-utf, utf-ebcdic-without-bom)

=cut

# tr/// operand lists, kept as source text and interpolated into a
# string-eval'd tr/// by the encode/decode subs below.  These are
# file-scoped lexicals, so the with-BOM codec further down uses them too.
# $_tbl_u8m is the range of all 256 octet values in UTF-8-Mod order.
# $_tbl_ue lists, position by position, the octet each of those maps to.
# NOTE(review): presumably the I8 -> EBCDIC table from UTR #16; the values
# have not been checked against the report.
my $_tbl_u8m = q(\x00-\xFF);
my $_tbl_ue = q(\x00-\x03\x37\x2D\x2E\x2F\x16\x05\x15\x0B-\x0F\x10-\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C-\x1F\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61\xF0-\xF9\x7A\x5E\x4C\x7E\x6E\x6F\x7C\xC1-\xC9\xD1-\xD9\xE2-\xE9\xAD\xE0\xBD\x5F\x6D\x79\x81-\x89\x91-\x99\xA2-\xA9\xC0\x4F\xD0\xA1\x07\x20-\x25\x06\x17\x28-\x2C\x09\x0A\x1B\x30\x31\x1A\x33-\x36\x08\x38-\x3B\x04\x14\x3E\xFF\x41-\x49\x4A\x51-\x59\x62-\x6A\x70-\x78\x80\x8A-\x90\x9A-\xA0\xAA-\xAC\xAE-\xBC\xBE\xBF\xCA-\xCF\xDA-\xDF\xE1\xEA-\xEF\xFA-\xFE);
# encode ($obj, $str [, $chk])
#
# Convert a Perl character string to UTF-EBCDIC without a BOM: encode to
# UTF-8-Mod, then transliterate every octet through the EBCDIC table.
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the encoded string; dies if the generated tr/// fails to compile.
sub encode ($$;$) {
  my ($obj, $str, $chk) = @_;
  $str = Encode::encode ('utf-8-mod', $str);
  # tr/// returns the number of characters it translated, which is 0 for
  # an empty string.  The trailing "; 1" makes the eval's value signal
  # success, so empty input no longer dies with an empty $@.
  eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/; 1} or die $@;
  $_[1] = '' if $chk;
  return $str;
}

# decode ($obj, $str [, $chk])
#
# Convert UTF-EBCDIC (no BOM handling) to a Perl character string:
# transliterate every octet back to its UTF-8-Mod value, then decode the
# result as UTF-8-Mod.
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the decoded string; dies if the generated tr/// fails to compile.
sub decode ($$;$) {
  my ($obj, $str, $chk) = @_;
  # tr/// returns the number of characters it translated, which is 0 for
  # an empty string.  The trailing "; 1" makes the eval's value signal
  # success, so empty input no longer dies with an empty $@.
  eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/; 1} or die $@;
  $_[1] = '' if $chk;
  return Encode::decode ('utf-8-mod', $str);
}

# UTF-EBCDIC (with BOM) codec class.  It inherits from Encode::Encoding and
# passes the name "utf-ebcdic-with-bom" to Define() so that Encode can look
# this class up by it.
package Encode::Unicode::UTF8::UTFEBCDICwBOM;
use base qw(Encode::Encoding);
__PACKAGE__->Define (qw/utf-ebcdic-with-bom/);

=item utf-ebcdic-with-bom

UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
defined in UTR #16, with BOM

=cut

# encode ($obj, $str [, $chk])
#
# Convert a Perl character string to UTF-EBCDIC with a leading BOM:
# prepend U+FEFF, encode to UTF-8-Mod, then transliterate every octet
# through the EBCDIC table.
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the encoded string; dies if the generated tr/// fails to compile.
sub encode ($$;$) {
  my ($obj, $str, $chk) = @_;
  $str = Encode::encode ('utf-8-mod', "\x{FEFF}".$str);
  # Let the eval's value signal success explicitly instead of relying on
  # the (here always non-zero) character count returned by tr///.
  eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/; 1} or die $@;
  $_[1] = '' if $chk;
  return $str;
}

# decode ($obj, $str [, $chk])
#
# Convert UTF-EBCDIC to a Perl character string: transliterate every octet
# back to its UTF-8-Mod value, decode the result as UTF-8-Mod, then strip
# one leading U+FEFF (BOM) if present.
#
# If $chk is true the caller's source variable ($_[1]) is emptied.
# Returns the decoded string; dies if the generated tr/// fails to compile.
sub decode ($$;$) {
  my ($obj, $str, $chk) = @_;
  # tr/// returns the number of characters it translated, which is 0 for
  # an empty string.  The trailing "; 1" makes the eval's value signal
  # success, so empty input no longer dies with an empty $@.
  eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/; 1} or die $@;
  $_[1] = '' if $chk;
  # Reuse $str rather than redeclaring it with a second "my" in this scope.
  $str = Encode::decode ('utf-8-mod', $str);
  $str =~ s/^\x{FEFF}//;
  return $str;
}

1;

=back

Note that UTF-8-Mod and UTF-EBCDIC are supported by perl
for EBCDIC platforms.  If we can use that code (written in C),
conversion of those encodings will become faster.

Note also that UTF-8 -> CESU-8 could be implemented as
utf8_off(decode_ucs2(encode_utf16(utf8))) and CESU-8 -> UTF-8
could be implemented as decode_utf16(encode_ucs2(cesu8)),
if Encode::Unicode did not check malformed UTF-8 sequences.
It might make conversion faster when XS is used.

=head1 SEE ALSO

"UTF-EBCDIC", Unicode Technical Report #16,
<http://www.unicode.org/unicode/reports/tr16/>.

"Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)",
Unicode Technical Report #26, <http://www.unicode.org/unicode/reports/tr26/>.

=head1 LICENSE

Copyright 2002 Nanashi-san

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.

=cut

## $Date: 2002/09/15 04:15:51 $
### UTF8.pm ends here
1	=head1 NAME
2
3	Encode::Unicode::UTF8 --- Encode/decode of UTF-8 related encodings
4
5	=head1 ENCODINGS
6
7	=over 4
8
9	=cut
10
11	require v5.7.3;
12	package Encode::Unicode::UTF8;
13	use strict;
14	use vars qw($VERSION);
15	$VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
16
17	package Encode::Unicode::UTF8::CESU8;
18	use base qw(Encode::Encoding);
19	__PACKAGE__->Define (qw/CESU-8 cesu8 csCESU-8/);
20
21	=item CESU-8
22
23	Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8),
24	defined in UTR #26. (Alias: csCESU-8 (IANA), cesu8)
25
26	=cut
27
28	my %_U2C;
29	sub encode ($$;$) {
30	use integer;
31	my ($obj, $str, $chk) = @_;
32	$_[1] = '' if $chk;
33	$str =~ s{([\x{010000}-\x{10FFFF}])}{
34	my $u = $1;
35	unless ($_U2C{$u}) {
36	$_U2C{$u} = chr ((ord ($u) - 0x10000) / 0x400 + 0xD800).
37	chr ((ord ($u) - 0x10000) % 0x400 + 0xDC00);
38	}
39	$_U2C{$u};
40	}ge;
41	Encode::_utf8_off ($str);
42	$str;
43	}
44
45	my %_C2U;
46	sub decode ($$;$) {
47	no warnings;
48	my ($obj, $str, $chk) = @_;
49	$_[1] = '' if $chk;
50	Encode::_utf8_on ($str);
51	$str =~ s{([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])}{
52	my ($u1,$u2) = ($1,$2);
53	unless ($_C2U{$u1.$u2}) {
54	$_C2U{$u1.$u2} = chr (0x10000+(ord($u1)-0xD800)*0x400+(ord($u2)-0xDC00));
55	}
56	$_C2U{$u1.$u2};
57	}ge;
58	return $str;
59	}
60
61	package Encode::Unicode::UTF8::UTF8Mod;
62	use base qw(Encode::Encoding);
63	__PACKAGE__->Define (qw/utf-8-mod utf8-mod/);
64
65	=item utf-8-mod
66
67	Modified UTF-8 for UTF-EBCDIC, defined in UTR #16.
68	(Alias: utf8-mod)
69
70	=cut
71
72	my %_4to8m;
73	sub encode ($$;$) {
74	my ($obj, $str, $chk) = @_;
75	my $r = '';
76	for (split //, $str) {
77	unless ($_4to8m{$_}) {
78	my $U = ord $_;
79	if ($U <= 0x9F) {
80	$_4to8m{$_} = $_;
81	} else {
82	$_4to8m{$_} = _ucs4_to_utf8m ($U);
83	}
84	}
85	$r .= $_4to8m{$_};
86	}
87	$_[1] = '' if $chk;
88	return $r;
89	}
90
91	my %_8mto4;
92	sub decode ($$;$) {
93	my ($obj, $str, $chk) = @_;
94	$str =~ s{
95	([\xC0-\xDF][\xA0-\xFF])
96	\|([\xE0-\xEF][\xA0-\xFF][\xA0-\xFF])
97	\|([\xF0-\xF7][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
98	\|([\xF8-\xFB][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
99	\| ([\xFC\xFD][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
100	\| ([\xFE\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
101	}{
102	my ($o2,$o3,$o4,$o5,$o6,$o7) = ($1,$2,$3,$4,$5,$6);
103	unless ($_8mto4{$o2.$o3.$o4.$o5.$o6.$o7}) {
104	if ($o2) {
105	my @o = split //, $o2;
106	$_8mto4{$o2} =
107	chr (((ord ($o[0]) & 0x1F) << 5) + (ord ($o[1]) & 0x1F));
108	} elsif ($o3) {
109	my @o = split //, $o3;
110	$_8mto4{$o3} =
111	chr (((ord ($o[0]) & 0x03) << 10) + ((ord ($o[1]) & 0x1F) << 5)
112	+ (ord ($o[2]) & 0x1F));
113	} elsif ($o4) {
114	my @o = split //, $o4;
115	$_8mto4{$o4} =
116	chr (((ord ($o[0]) & 0x07) << 15) + ((ord ($o[1]) & 0x1F) << 10)
117	+ ((ord ($o[2]) & 0x1F) << 5) + (ord ($o[3]) & 0x1F));
118	} elsif ($o5) {
119	my @o = split //, $o5;
120	$_8mto4{$o5} =
121	chr (((ord ($o[0]) & 0x03) << 20) + ((ord ($o[1]) & 0x1F) << 15)
122	+ ((ord ($o[2]) & 0x1F) << 10) + ((ord ($o[3]) & 0x1F) << 5)
123	+ (ord ($o[4]) & 0x1F));
124	} elsif ($o6) {
125	my @o = split //, $o6;
126	$_8mto4{$o6} =
127	chr (((ord ($o[0]) & 0x01) << 25) + ((ord ($o[1]) & 0x1F) << 20)
128	+ ((ord ($o[2]) & 0x1F) << 15) + ((ord ($o[3]) & 0x1F) << 10)
129	+ ((ord ($o[4]) & 0x1F) << 5) + (ord ($o[5]) & 0x1F));
130	} else {
131	my @o = split //, $o7;
132	$_8mto4{$o7} =
133	chr (((ord ($o[0]) & 0x01) << 30) + ((ord ($o[1]) & 0x1F) << 25)
134	+ ((ord ($o[2]) & 0x1F) << 20) + ((ord ($o[3]) & 0x1F) << 15)
135	+ ((ord ($o[4]) & 0x1F) << 10) + ((ord ($o[5]) & 0x1F) << 5)
136	+ (ord ($o[6]) & 0x1F));
137	}
138	}
139	$_8mto4{$o2.$o3.$o4.$o5.$o6.$o7};
140	}goex;
141	$_[1] = '' if $chk;
142	return $str;
143	}
144
145	sub _ucs4_to_utf8m ($) {
146	my $U = shift;
147	if ($U <= 0x009F) {
148	return pack 'C', $U;
149	} elsif ($U <= 0x03FF) {
150	return pack 'C2', (0xC0 \| ($U >> 5)), (0xA0 \| ($U & 0x1F));
151	} elsif ($U <= 0x3FFF) {
152	return pack 'C3', (0xE0 \| ($U >> 10)), (0xA0 \| (($U >> 5) & 0x1F)),
153	(0xA0 \| ($U & 0x4F));
154	} elsif ($U <= 0x0003FFFF) {
155	return pack 'C4', (0xF0 \| ($U >> 15)), (0xA0 \| (($U >> 10) & 0x1F)),
156	(0xA0 \| (($U >> 5) & 0x1F)), (0xA0 \| ($U & 0x1F));
157	} elsif ($U <= 0x003FFFFF) {
158	return pack 'C5', (0xF8 \| ($U >> 20)),
159	(0xA0 \| (($U >> 15) & 0x1F)), (0xA0 \| (($U >> 10) & 0x1F)),
160	(0xA0 \| (($U >> 5) & 0x1F)), (0xA0 \| ($U & 0x1F));
161	} elsif ($U <= 0x03FFFFFF) {
162	return pack 'C6', (0xFC \| ($U >> 25)), (0xA0 \| (($U >> 20) & 0x1F)),
163	(0xA0 \| (($U >> 15) & 0x1F)), (0xA0 \| (($U >> 10) & 0x1F)),
164	(0xA0 \| (($U >> 5) & 0x1F)), (0xA0 \| ($U & 0x1F));
165	} else {#if ($U <= 0x7FFFFFFF) {
166	return pack 'C7', (0xFE \| (($U >> 30) & 0x01)), (0xA0 \| (($U >> 25) & 0x1F)),
167	(0xA0 \| (($U >> 20) & 0x1F)), (0xA0 \| (($U >> 15) & 0x1F)),
168	(0xA0 \| (($U >> 10) & 0x1F)), (0xA0 \| (($U >> 5) & 0x1F)),
169	(0xA0 \| ($U & 0x1F));
170	}
171	}
172
173	package Encode::Unicode::UTF8::UTFEBCDIC;
174	use base qw(Encode::Encoding);
175	__PACKAGE__->Define (qw/utf-ebcdic ef-utf utf-ebcdic-without-bom/);
176
177	=item utf-ebcdic
178
179	UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
180	defined in UTR #16, without BOM. (Alias: ef-utf, utf-ebcdic-without-bom)
181
182	=cut
183
184	my $_tbl_u8m = q(\x00-\xFF);
185	my $_tbl_ue = q(\x00-\x03\x37\x2D\x2E\x2F\x16\x05\x15\x0B-\x0F\x10-\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C-\x1F\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61\xF0-\xF9\x7A\x5E\x4C\x7E\x6E\x6F\x7C\xC1-\xC9\xD1-\xD9\xE2-\xE9\xAD\xE0\xBD\x5F\x6D\x79\x81-\x89\x91-\x99\xA2-\xA9\xC0\x4F\xD0\xA1\x07\x20-\x25\x06\x17\x28-\x2C\x09\x0A\x1B\x30\x31\x1A\x33-\x36\x08\x38-\x3B\x04\x14\x3E\xFF\x41-\x49\x4A\x51-\x59\x62-\x6A\x70-\x78\x80\x8A-\x90\x9A-\xA0\xAA-\xAC\xAE-\xBC\xBE\xBF\xCA-\xCF\xDA-\xDF\xE1\xEA-\xEF\xFA-\xFE);
186	sub encode ($$;$) {
187	my ($obj, $str, $chk) = @_;
188	$str = Encode::encode ('utf-8-mod', $str);
189	eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
190	$_[1] = '' if $chk;
191	return $str;
192	}
193
194	sub decode ($$;$) {
195	my ($obj, $str, $chk) = @_;
196	eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
197	$_[1] = '' if $chk;
198	return Encode::decode ('utf-8-mod', $str);
199	}
200
201	package Encode::Unicode::UTF8::UTFEBCDICwBOM;
202	use base qw(Encode::Encoding);
203	__PACKAGE__->Define (qw/utf-ebcdic-with-bom/);
204
205	=item utf-ebcdic-with-bom
206
207	UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
208	defined in UTR #16, with BOM
209
210	=cut
211
212	sub encode ($$;$) {
213	my ($obj, $str, $chk) = @_;
214	$str = Encode::encode ('utf-8-mod', "\x{FEFF}".$str);
215	eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
216	$_[1] = '' if $chk;
217	$str;
218	}
219
220	sub decode ($$;$) {
221	my ($obj, $str, $chk) = @_;
222	eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
223	$_[1] = '' if $chk;
224	my $str = Encode::decode ('utf-8-mod', $str);
225	$str =~ s/^\x{FEFF}//;
226	$str;
227	}
228
229	1;
230
231	=back
232
233	Note that UTF-8-Mod and UTF-EBCDIC are supported by perl
234	for EBCDIC platforms. If we can use that code (written in C),
235	convertion of those encodings will become faster.
236
237	Note also that UTF-8 -> CESU-8 could be implemented as
238	utf8_off(decode_ucs2(encode_utf16(utf8))) and CESU-8 -> UTF-8
239	could be implemented as decode_utf16(encode_ucs2(cesu8)),
240	if Encode::Unicode did not check malformed UTF-8 sequences.
241	It might make convertion faster when XS is used.
242
243	=head1 SEE ALSO
244
245	"UTF-EBCDIC", Unicode Technical Report #16,
246	<http://www.unicode.org/unicode/reports/tr16/>.
247
248	"Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)",
249	Unicode Technical Report #26, <http://www.unicode.org/unicode/reports/tr26/>.
250
251	=head1 LICENSE
252
253	Copyright 2002 Nanashi-san
254
255	This program is free software; you can redistribute it and/or modify
256	it under the terms of the GNU General Public License as published by
257	the Free Software Foundation; either version 2 of the License, or
258	(at your option) any later version.
259
260	This program is distributed in the hope that it will be useful,
261	but WITHOUT ANY WARRANTY; without even the implied warranty of
262	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
263	GNU General Public License for more details.
264
265	You should have received a copy of the GNU General Public License
266	along with this program; see the file COPYING. If not, write to
267	the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
268	Boston, MA 02111-1307, USA.
269
270	=cut
271
272	## $Date: 2002/09/15 04:15:51 $
273	### UTF8.pm ends here