/[suikacvs]/perl/lib/Encode/Unicode/UTF8.pm
Suika

Contents of /perl/lib/Encode/Unicode/UTF8.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Mon Sep 23 08:28:39 2002 UTC (22 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
2002-09-23  Nanashi-san

	* UTF8.pm, UTF9.pm: New modules.  (Committed by
	Wakaba <w@suika.fam.cx>.)

1 =head1 NAME
2
3 Encode::Unicode::UTF8 --- Encode/decode of UTF-8 related encodings
4
5 =head1 ENCODINGS
6
7 =over 4
8
9 =cut
10
11 require v5.7.3;
12 package Encode::Unicode::UTF8;
13 use strict;
14 use vars qw($VERSION);
15 $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
16
17 package Encode::Unicode::UTF8::CESU8;
18 use base qw(Encode::Encoding);
19 __PACKAGE__->Define (qw/CESU-8 cesu8 csCESU-8/);
20
21 =item CESU-8
22
23 Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8),
24 defined in UTR #26. (Alias: csCESU-8 (IANA), cesu8)
25
26 =cut
27
28 my %_U2C;
29 sub encode ($$;$) {
30 use integer;
31 my ($obj, $str, $chk) = @_;
32 $_[1] = '' if $chk;
33 $str =~ s{([\x{010000}-\x{10FFFF}])}{
34 my $u = $1;
35 unless ($_U2C{$u}) {
36 $_U2C{$u} = chr ((ord ($u) - 0x10000) / 0x400 + 0xD800).
37 chr ((ord ($u) - 0x10000) % 0x400 + 0xDC00);
38 }
39 $_U2C{$u};
40 }ge;
41 Encode::_utf8_off ($str);
42 $str;
43 }
44
45 my %_C2U;
46 sub decode ($$;$) {
47 no warnings;
48 my ($obj, $str, $chk) = @_;
49 $_[1] = '' if $chk;
50 Encode::_utf8_on ($str);
51 $str =~ s{([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])}{
52 my ($u1,$u2) = ($1,$2);
53 unless ($_C2U{$u1.$u2}) {
54 $_C2U{$u1.$u2} = chr (0x10000+(ord($u1)-0xD800)*0x400+(ord($u2)-0xDC00));
55 }
56 $_C2U{$u1.$u2};
57 }ge;
58 return $str;
59 }
60
61 package Encode::Unicode::UTF8::UTF8Mod;
62 use base qw(Encode::Encoding);
63 __PACKAGE__->Define (qw/utf-8-mod utf8-mod/);
64
65 =item utf-8-mod
66
67 Modified UTF-8 for UTF-EBCDIC, defined in UTR #16.
68 (Alias: utf8-mod)
69
70 =cut
71
72 my %_4to8m;
73 sub encode ($$;$) {
74 my ($obj, $str, $chk) = @_;
75 my $r = '';
76 for (split //, $str) {
77 unless ($_4to8m{$_}) {
78 my $U = ord $_;
79 if ($U <= 0x9F) {
80 $_4to8m{$_} = $_;
81 } else {
82 $_4to8m{$_} = _ucs4_to_utf8m ($U);
83 }
84 }
85 $r .= $_4to8m{$_};
86 }
87 $_[1] = '' if $chk;
88 return $r;
89 }
90
91 my %_8mto4;
92 sub decode ($$;$) {
93 my ($obj, $str, $chk) = @_;
94 $str =~ s{
95 ([\xC0-\xDF][\xA0-\xFF])
96 |([\xE0-\xEF][\xA0-\xFF][\xA0-\xFF])
97 |([\xF0-\xF7][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
98 |([\xF8-\xFB][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
99 | ([\xFC\xFD][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
100 | ([\xFE\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
101 }{
102 my ($o2,$o3,$o4,$o5,$o6,$o7) = ($1,$2,$3,$4,$5,$6);
103 unless ($_8mto4{$o2.$o3.$o4.$o5.$o6.$o7}) {
104 if ($o2) {
105 my @o = split //, $o2;
106 $_8mto4{$o2} =
107 chr (((ord ($o[0]) & 0x1F) << 5) + (ord ($o[1]) & 0x1F));
108 } elsif ($o3) {
109 my @o = split //, $o3;
110 $_8mto4{$o3} =
111 chr (((ord ($o[0]) & 0x03) << 10) + ((ord ($o[1]) & 0x1F) << 5)
112 + (ord ($o[2]) & 0x1F));
113 } elsif ($o4) {
114 my @o = split //, $o4;
115 $_8mto4{$o4} =
116 chr (((ord ($o[0]) & 0x07) << 15) + ((ord ($o[1]) & 0x1F) << 10)
117 + ((ord ($o[2]) & 0x1F) << 5) + (ord ($o[3]) & 0x1F));
118 } elsif ($o5) {
119 my @o = split //, $o5;
120 $_8mto4{$o5} =
121 chr (((ord ($o[0]) & 0x03) << 20) + ((ord ($o[1]) & 0x1F) << 15)
122 + ((ord ($o[2]) & 0x1F) << 10) + ((ord ($o[3]) & 0x1F) << 5)
123 + (ord ($o[4]) & 0x1F));
124 } elsif ($o6) {
125 my @o = split //, $o6;
126 $_8mto4{$o6} =
127 chr (((ord ($o[0]) & 0x01) << 25) + ((ord ($o[1]) & 0x1F) << 20)
128 + ((ord ($o[2]) & 0x1F) << 15) + ((ord ($o[3]) & 0x1F) << 10)
129 + ((ord ($o[4]) & 0x1F) << 5) + (ord ($o[5]) & 0x1F));
130 } else {
131 my @o = split //, $o7;
132 $_8mto4{$o7} =
133 chr (((ord ($o[0]) & 0x01) << 30) + ((ord ($o[1]) & 0x1F) << 25)
134 + ((ord ($o[2]) & 0x1F) << 20) + ((ord ($o[3]) & 0x1F) << 15)
135 + ((ord ($o[4]) & 0x1F) << 10) + ((ord ($o[5]) & 0x1F) << 5)
136 + (ord ($o[6]) & 0x1F));
137 }
138 }
139 $_8mto4{$o2.$o3.$o4.$o5.$o6.$o7};
140 }goex;
141 $_[1] = '' if $chk;
142 return $str;
143 }
144
145 sub _ucs4_to_utf8m ($) {
146 my $U = shift;
147 if ($U <= 0x009F) {
148 return pack 'C', $U;
149 } elsif ($U <= 0x03FF) {
150 return pack 'C2', (0xC0 | ($U >> 5)), (0xA0 | ($U & 0x1F));
151 } elsif ($U <= 0x3FFF) {
152 return pack 'C3', (0xE0 | ($U >> 10)), (0xA0 | (($U >> 5) & 0x1F)),
153 (0xA0 | ($U & 0x4F));
154 } elsif ($U <= 0x0003FFFF) {
155 return pack 'C4', (0xF0 | ($U >> 15)), (0xA0 | (($U >> 10) & 0x1F)),
156 (0xA0 | (($U >> 5) & 0x1F)), (0xA0 | ($U & 0x1F));
157 } elsif ($U <= 0x003FFFFF) {
158 return pack 'C5', (0xF8 | ($U >> 20)),
159 (0xA0 | (($U >> 15) & 0x1F)), (0xA0 | (($U >> 10) & 0x1F)),
160 (0xA0 | (($U >> 5) & 0x1F)), (0xA0 | ($U & 0x1F));
161 } elsif ($U <= 0x03FFFFFF) {
162 return pack 'C6', (0xFC | ($U >> 25)), (0xA0 | (($U >> 20) & 0x1F)),
163 (0xA0 | (($U >> 15) & 0x1F)), (0xA0 | (($U >> 10) & 0x1F)),
164 (0xA0 | (($U >> 5) & 0x1F)), (0xA0 | ($U & 0x1F));
165 } else {#if ($U <= 0x7FFFFFFF) {
166 return pack 'C7', (0xFE | (($U >> 30) & 0x01)), (0xA0 | (($U >> 25) & 0x1F)),
167 (0xA0 | (($U >> 20) & 0x1F)), (0xA0 | (($U >> 15) & 0x1F)),
168 (0xA0 | (($U >> 10) & 0x1F)), (0xA0 | (($U >> 5) & 0x1F)),
169 (0xA0 | ($U & 0x1F));
170 }
171 }
172
173 package Encode::Unicode::UTF8::UTFEBCDIC;
174 use base qw(Encode::Encoding);
175 __PACKAGE__->Define (qw/utf-ebcdic ef-utf utf-ebcdic-without-bom/);
176
177 =item utf-ebcdic
178
179 UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
180 defined in UTR #16, without BOM. (Alias: ef-utf, utf-ebcdic-without-bom)
181
182 =cut
183
184 my $_tbl_u8m = q(\x00-\xFF);
185 my $_tbl_ue = q(\x00-\x03\x37\x2D\x2E\x2F\x16\x05\x15\x0B-\x0F\x10-\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C-\x1F\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61\xF0-\xF9\x7A\x5E\x4C\x7E\x6E\x6F\x7C\xC1-\xC9\xD1-\xD9\xE2-\xE9\xAD\xE0\xBD\x5F\x6D\x79\x81-\x89\x91-\x99\xA2-\xA9\xC0\x4F\xD0\xA1\x07\x20-\x25\x06\x17\x28-\x2C\x09\x0A\x1B\x30\x31\x1A\x33-\x36\x08\x38-\x3B\x04\x14\x3E\xFF\x41-\x49\x4A\x51-\x59\x62-\x6A\x70-\x78\x80\x8A-\x90\x9A-\xA0\xAA-\xAC\xAE-\xBC\xBE\xBF\xCA-\xCF\xDA-\xDF\xE1\xEA-\xEF\xFA-\xFE);
186 sub encode ($$;$) {
187 my ($obj, $str, $chk) = @_;
188 $str = Encode::encode ('utf-8-mod', $str);
189 eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
190 $_[1] = '' if $chk;
191 return $str;
192 }
193
194 sub decode ($$;$) {
195 my ($obj, $str, $chk) = @_;
196 eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
197 $_[1] = '' if $chk;
198 return Encode::decode ('utf-8-mod', $str);
199 }
200
201 package Encode::Unicode::UTF8::UTFEBCDICwBOM;
202 use base qw(Encode::Encoding);
203 __PACKAGE__->Define (qw/utf-ebcdic-with-bom/);
204
205 =item utf-ebcdic-with-bom
206
207 UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
208 defined in UTR #16, with BOM
209
210 =cut
211
212 sub encode ($$;$) {
213 my ($obj, $str, $chk) = @_;
214 $str = Encode::encode ('utf-8-mod', "\x{FEFF}".$str);
215 eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
216 $_[1] = '' if $chk;
217 $str;
218 }
219
220 sub decode ($$;$) {
221 my ($obj, $str, $chk) = @_;
222 eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
223 $_[1] = '' if $chk;
224 my $str = Encode::decode ('utf-8-mod', $str);
225 $str =~ s/^\x{FEFF}//;
226 $str;
227 }
228
229 1;
230
231 =back
232
233 Note that UTF-8-Mod and UTF-EBCDIC are supported by perl
234 for EBCDIC platforms. If we can use that code (written in C),
235 convertion of those encodings will become faster.
236
237 Note also that UTF-8 -> CESU-8 could be implemented as
238 utf8_off(decode_ucs2(encode_utf16(utf8))) and CESU-8 -> UTF-8
239 could be implemented as decode_utf16(encode_ucs2(cesu8)),
240 if Encode::Unicode did not check malformed UTF-8 sequences.
241 It might make convertion faster when XS is used.
242
243 =head1 SEE ALSO
244
245 "UTF-EBCDIC", Unicode Technical Report #16,
246 <http://www.unicode.org/unicode/reports/tr16/>.
247
248 "Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)",
249 Unicode Technical Report #26, <http://www.unicode.org/unicode/reports/tr26/>.
250
251 =head1 LICENSE
252
253 Copyright 2002 Nanashi-san
254
255 This program is free software; you can redistribute it and/or modify
256 it under the terms of the GNU General Public License as published by
257 the Free Software Foundation; either version 2 of the License, or
258 (at your option) any later version.
259
260 This program is distributed in the hope that it will be useful,
261 but WITHOUT ANY WARRANTY; without even the implied warranty of
262 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
263 GNU General Public License for more details.
264
265 You should have received a copy of the GNU General Public License
266 along with this program; see the file COPYING. If not, write to
267 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
268 Boston, MA 02111-1307, USA.
269
270 =cut
271
272 ## $Date: 2002/09/15 04:15:51 $
273 ### UTF8.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24