/[suikacvs]/perl/lib/Encode/Unicode/UTF8.pm
Suika

Contents of /perl/lib/Encode/Unicode/UTF8.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download)
Mon Sep 23 08:28:39 2002 UTC (22 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
2002-09-23  Nanashi-san

	* UTF8.pm, UTF9.pm: New modules.  (Committed by
	Wakaba <w@suika.fam.cx>.)

1 wakaba 1.1 =head1 NAME
2    
3     Encode::Unicode::UTF8 --- Encode/decode of UTF-8 related encodings
4    
5     =head1 ENCODINGS
6    
7     =over 4
8    
9     =cut
10    
11     require v5.7.3;
12     package Encode::Unicode::UTF8;
13     use strict;
14     use vars qw($VERSION);
15     $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
16    
17     package Encode::Unicode::UTF8::CESU8;
18     use base qw(Encode::Encoding);
19     __PACKAGE__->Define (qw/CESU-8 cesu8 csCESU-8/);
20    
21     =item CESU-8
22    
23     Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8),
24     defined in UTR #26. (Alias: csCESU-8 (IANA), cesu8)
25    
26     =cut
27    
28     my %_U2C;
29     sub encode ($$;$) {
30     use integer;
31     my ($obj, $str, $chk) = @_;
32     $_[1] = '' if $chk;
33     $str =~ s{([\x{010000}-\x{10FFFF}])}{
34     my $u = $1;
35     unless ($_U2C{$u}) {
36     $_U2C{$u} = chr ((ord ($u) - 0x10000) / 0x400 + 0xD800).
37     chr ((ord ($u) - 0x10000) % 0x400 + 0xDC00);
38     }
39     $_U2C{$u};
40     }ge;
41     Encode::_utf8_off ($str);
42     $str;
43     }
44    
45     my %_C2U;
46     sub decode ($$;$) {
47     no warnings;
48     my ($obj, $str, $chk) = @_;
49     $_[1] = '' if $chk;
50     Encode::_utf8_on ($str);
51     $str =~ s{([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])}{
52     my ($u1,$u2) = ($1,$2);
53     unless ($_C2U{$u1.$u2}) {
54     $_C2U{$u1.$u2} = chr (0x10000+(ord($u1)-0xD800)*0x400+(ord($u2)-0xDC00));
55     }
56     $_C2U{$u1.$u2};
57     }ge;
58     return $str;
59     }
60    
61     package Encode::Unicode::UTF8::UTF8Mod;
62     use base qw(Encode::Encoding);
63     __PACKAGE__->Define (qw/utf-8-mod utf8-mod/);
64    
65     =item utf-8-mod
66    
67     Modified UTF-8 for UTF-EBCDIC, defined in UTR #16.
68     (Alias: utf8-mod)
69    
70     =cut
71    
72     my %_4to8m;
73     sub encode ($$;$) {
74     my ($obj, $str, $chk) = @_;
75     my $r = '';
76     for (split //, $str) {
77     unless ($_4to8m{$_}) {
78     my $U = ord $_;
79     if ($U <= 0x9F) {
80     $_4to8m{$_} = $_;
81     } else {
82     $_4to8m{$_} = _ucs4_to_utf8m ($U);
83     }
84     }
85     $r .= $_4to8m{$_};
86     }
87     $_[1] = '' if $chk;
88     return $r;
89     }
90    
91     my %_8mto4;
92     sub decode ($$;$) {
93     my ($obj, $str, $chk) = @_;
94     $str =~ s{
95     ([\xC0-\xDF][\xA0-\xFF])
96     |([\xE0-\xEF][\xA0-\xFF][\xA0-\xFF])
97     |([\xF0-\xF7][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
98     |([\xF8-\xFB][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
99     | ([\xFC\xFD][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
100     | ([\xFE\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF][\xA0-\xFF])
101     }{
102     my ($o2,$o3,$o4,$o5,$o6,$o7) = ($1,$2,$3,$4,$5,$6);
103     unless ($_8mto4{$o2.$o3.$o4.$o5.$o6.$o7}) {
104     if ($o2) {
105     my @o = split //, $o2;
106     $_8mto4{$o2} =
107     chr (((ord ($o[0]) & 0x1F) << 5) + (ord ($o[1]) & 0x1F));
108     } elsif ($o3) {
109     my @o = split //, $o3;
110     $_8mto4{$o3} =
111     chr (((ord ($o[0]) & 0x03) << 10) + ((ord ($o[1]) & 0x1F) << 5)
112     + (ord ($o[2]) & 0x1F));
113     } elsif ($o4) {
114     my @o = split //, $o4;
115     $_8mto4{$o4} =
116     chr (((ord ($o[0]) & 0x07) << 15) + ((ord ($o[1]) & 0x1F) << 10)
117     + ((ord ($o[2]) & 0x1F) << 5) + (ord ($o[3]) & 0x1F));
118     } elsif ($o5) {
119     my @o = split //, $o5;
120     $_8mto4{$o5} =
121     chr (((ord ($o[0]) & 0x03) << 20) + ((ord ($o[1]) & 0x1F) << 15)
122     + ((ord ($o[2]) & 0x1F) << 10) + ((ord ($o[3]) & 0x1F) << 5)
123     + (ord ($o[4]) & 0x1F));
124     } elsif ($o6) {
125     my @o = split //, $o6;
126     $_8mto4{$o6} =
127     chr (((ord ($o[0]) & 0x01) << 25) + ((ord ($o[1]) & 0x1F) << 20)
128     + ((ord ($o[2]) & 0x1F) << 15) + ((ord ($o[3]) & 0x1F) << 10)
129     + ((ord ($o[4]) & 0x1F) << 5) + (ord ($o[5]) & 0x1F));
130     } else {
131     my @o = split //, $o7;
132     $_8mto4{$o7} =
133     chr (((ord ($o[0]) & 0x01) << 30) + ((ord ($o[1]) & 0x1F) << 25)
134     + ((ord ($o[2]) & 0x1F) << 20) + ((ord ($o[3]) & 0x1F) << 15)
135     + ((ord ($o[4]) & 0x1F) << 10) + ((ord ($o[5]) & 0x1F) << 5)
136     + (ord ($o[6]) & 0x1F));
137     }
138     }
139     $_8mto4{$o2.$o3.$o4.$o5.$o6.$o7};
140     }goex;
141     $_[1] = '' if $chk;
142     return $str;
143     }
144    
145     sub _ucs4_to_utf8m ($) {
146     my $U = shift;
147     if ($U <= 0x009F) {
148     return pack 'C', $U;
149     } elsif ($U <= 0x03FF) {
150     return pack 'C2', (0xC0 | ($U >> 5)), (0xA0 | ($U & 0x1F));
151     } elsif ($U <= 0x3FFF) {
152     return pack 'C3', (0xE0 | ($U >> 10)), (0xA0 | (($U >> 5) & 0x1F)),
153     (0xA0 | ($U & 0x4F));
154     } elsif ($U <= 0x0003FFFF) {
155     return pack 'C4', (0xF0 | ($U >> 15)), (0xA0 | (($U >> 10) & 0x1F)),
156     (0xA0 | (($U >> 5) & 0x1F)), (0xA0 | ($U & 0x1F));
157     } elsif ($U <= 0x003FFFFF) {
158     return pack 'C5', (0xF8 | ($U >> 20)),
159     (0xA0 | (($U >> 15) & 0x1F)), (0xA0 | (($U >> 10) & 0x1F)),
160     (0xA0 | (($U >> 5) & 0x1F)), (0xA0 | ($U & 0x1F));
161     } elsif ($U <= 0x03FFFFFF) {
162     return pack 'C6', (0xFC | ($U >> 25)), (0xA0 | (($U >> 20) & 0x1F)),
163     (0xA0 | (($U >> 15) & 0x1F)), (0xA0 | (($U >> 10) & 0x1F)),
164     (0xA0 | (($U >> 5) & 0x1F)), (0xA0 | ($U & 0x1F));
165     } else {#if ($U <= 0x7FFFFFFF) {
166     return pack 'C7', (0xFE | (($U >> 30) & 0x01)), (0xA0 | (($U >> 25) & 0x1F)),
167     (0xA0 | (($U >> 20) & 0x1F)), (0xA0 | (($U >> 15) & 0x1F)),
168     (0xA0 | (($U >> 10) & 0x1F)), (0xA0 | (($U >> 5) & 0x1F)),
169     (0xA0 | ($U & 0x1F));
170     }
171     }
172    
173     package Encode::Unicode::UTF8::UTFEBCDIC;
174     use base qw(Encode::Encoding);
175     __PACKAGE__->Define (qw/utf-ebcdic ef-utf utf-ebcdic-without-bom/);
176    
177     =item utf-ebcdic
178    
179     UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
180     defined in UTR #16, without BOM. (Alias: ef-utf, utf-ebcdic-without-bom)
181    
182     =cut
183    
184     my $_tbl_u8m = q(\x00-\xFF);
185     my $_tbl_ue = q(\x00-\x03\x37\x2D\x2E\x2F\x16\x05\x15\x0B-\x0F\x10-\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C-\x1F\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61\xF0-\xF9\x7A\x5E\x4C\x7E\x6E\x6F\x7C\xC1-\xC9\xD1-\xD9\xE2-\xE9\xAD\xE0\xBD\x5F\x6D\x79\x81-\x89\x91-\x99\xA2-\xA9\xC0\x4F\xD0\xA1\x07\x20-\x25\x06\x17\x28-\x2C\x09\x0A\x1B\x30\x31\x1A\x33-\x36\x08\x38-\x3B\x04\x14\x3E\xFF\x41-\x49\x4A\x51-\x59\x62-\x6A\x70-\x78\x80\x8A-\x90\x9A-\xA0\xAA-\xAC\xAE-\xBC\xBE\xBF\xCA-\xCF\xDA-\xDF\xE1\xEA-\xEF\xFA-\xFE);
186     sub encode ($$;$) {
187     my ($obj, $str, $chk) = @_;
188     $str = Encode::encode ('utf-8-mod', $str);
189     eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
190     $_[1] = '' if $chk;
191     return $str;
192     }
193    
194     sub decode ($$;$) {
195     my ($obj, $str, $chk) = @_;
196     eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
197     $_[1] = '' if $chk;
198     return Encode::decode ('utf-8-mod', $str);
199     }
200    
201     package Encode::Unicode::UTF8::UTFEBCDICwBOM;
202     use base qw(Encode::Encoding);
203     __PACKAGE__->Define (qw/utf-ebcdic-with-bom/);
204    
205     =item utf-ebcdic-with-bom
206    
207     UTF-EBCDIC, EBCDIC-friendly Unicode (or UCS) Transformation Format,
208     defined in UTR #16, with BOM
209    
210     =cut
211    
212     sub encode ($$;$) {
213     my ($obj, $str, $chk) = @_;
214     $str = Encode::encode ('utf-8-mod', "\x{FEFF}".$str);
215     eval qq{\$str =~ tr/$_tbl_u8m/$_tbl_ue/} or die $@;
216     $_[1] = '' if $chk;
217     $str;
218     }
219    
220     sub decode ($$;$) {
221     my ($obj, $str, $chk) = @_;
222     eval qq{\$str =~ tr/$_tbl_ue/$_tbl_u8m/} or die $@;
223     $_[1] = '' if $chk;
224     my $str = Encode::decode ('utf-8-mod', $str);
225     $str =~ s/^\x{FEFF}//;
226     $str;
227     }
228    
229     1;
230    
231     =back
232    
233     Note that UTF-8-Mod and UTF-EBCDIC are supported by perl
234     for EBCDIC platforms. If we can use that code (written in C),
235     convertion of those encodings will become faster.
236    
237     Note also that UTF-8 -> CESU-8 could be implemented as
238     utf8_off(decode_ucs2(encode_utf16(utf8))) and CESU-8 -> UTF-8
239     could be implemented as decode_utf16(encode_ucs2(cesu8)),
240     if Encode::Unicode did not check malformed UTF-8 sequences.
241     It might make convertion faster when XS is used.
242    
243     =head1 SEE ALSO
244    
245     "UTF-EBCDIC", Unicode Technical Report #16,
246     <http://www.unicode.org/unicode/reports/tr16/>.
247    
248     "Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)",
249     Unicode Technical Report #26, <http://www.unicode.org/unicode/reports/tr26/>.
250    
251     =head1 LICENSE
252    
253     Copyright 2002 Nanashi-san
254    
255     This program is free software; you can redistribute it and/or modify
256     it under the terms of the GNU General Public License as published by
257     the Free Software Foundation; either version 2 of the License, or
258     (at your option) any later version.
259    
260     This program is distributed in the hope that it will be useful,
261     but WITHOUT ANY WARRANTY; without even the implied warranty of
262     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
263     GNU General Public License for more details.
264    
265     You should have received a copy of the GNU General Public License
266     along with this program; see the file COPYING. If not, write to
267     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
268     Boston, MA 02111-1307, USA.
269    
270     =cut
271    
272     ## $Date: 2002/09/15 04:15:51 $
273     ### UTF8.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24