/[suikacvs]/markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Thu Sep 11 09:55:56 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +10 -1 lines
++ whatpm/Whatpm/Charset/ChangeLog	11 Sep 2008 09:55:54 -0000
	* UnicodeChecker.pm, DecodeHandle.pm: Tentative support
	for |read| method.

2008-09-11  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::UnicodeChecker;
2     use strict;
3    
4     ## NOTE: For more information (including rationals of checks performed
5     ## in this module), see
6     ## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7    
8     ## NOTE: Unicode's definition for character string conformance is
9     ## very, very vague so that it is difficult to determine what error
10     ## level is appropriate for each error. The Unicode Standard abuses
11     ## conformance-creteria-like terms such as "deprecated", "discouraged",
12     ## "preferred", "better", "not encouraged", "should", and so on with no
13     ## clear explanation of their difference (if any) or relationship to
14     ## the conformance. In fact, that specification does not define the
15     ## conformance class for character strings.
16    
17     sub new_handle ($$) {
18     my $self = bless {
19     queue => [],
20 wakaba 1.2 new_queue => [],
21 wakaba 1.1 onerror => sub {},
22     level => {
23     unicode_should => 'w',
24     unicode_deprecated => 'w', # = unicode_should
25     unicode_discouraged => 'w',
26     unicode_preferred => 'w',
27     ## NOTE: We do some "unification" of levels - for example,
28     ## "not encouraged" is unified with "discouraged" and
29     ## "better" is unified with "preferred".
30     },
31     }, shift;
32     $self->{handle} = shift; # char stream
33     return $self;
34     } # new_handle
35    
36     ## TODO: We need to do some perf optimization
37    
38     sub getc ($) {
39     my $self = $_[0];
40     return shift @{$self->{queue}} if @{$self->{queue}};
41 wakaba 1.2
42     my $char;
43     unless (@{$self->{new_queue}}) {
44     my $s = '';
45     $self->{handle}->read ($s, 256) or return undef;
46     push @{$self->{new_queue}}, split //, $s;
47     }
48     $char = shift @{$self->{new_queue}};
49    
50 wakaba 1.1 my $char_code = ord $char;
51    
52     if ({
53     0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
54     0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
55     0x206E => 1, 0x206F => 1, 0xE0001 => 1,
56     }->{$char_code} or
57     (0xE0020 <= $char_code and $char_code <= 0xE007F)) {
58     ## NOTE: From Unicode 5.1.0 |PropList.txt| (Deprecated).
59     $self->{onerror}->(type => 'unicode deprecated',
60     text => (sprintf 'U+%04X', $char_code),
61     layer => 'charset',
62     level => $self->{level}->{unicode_deprecated});
63     } elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
64     {
65     0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
66     0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
67     0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
68     0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
69     0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
70     0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
71     0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
72     0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
73     0x10FFFE => 1, 0x10FFFF => 1,
74     }->{$char_code}) {
75     ## NOTE: From Unicode 5.1.0 |PropList.txt| (Noncharacter_Code_Point).
76     $self->{onerror}->(type => 'nonchar',
77     text => (sprintf 'U+%04X', $char_code),
78     layer => 'charset',
79     level => $self->{level}->{unicode_should});
80     } elsif ({
81     0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
82     0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
83     0x20A4 => 1, # LIRA SIGN
84    
85     0x2126 => 1, # OHM SIGN # also, discouraged
86     0x212A => 1, # KELVIN SIGN
87     0x212B => 1, # ANGSTROM SIGN
88     }->{$char_code} or
89     (0xFB50 <= $char_code and $char_code <= 0xFDFB) or
90     (0xFE70 <= $char_code and $char_code <= 0xFEFE) or
91     (0xFA30 <= $char_code and $char_code <= 0xFA6A) or
92     (0xFA70 <= $char_code and $char_code <= 0xFAD9) or
93     (0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
94     (0x239B <= $char_code and $char_code <= 0x23B3)) {
95     ## NOTE: This case must come AFTER noncharacter checking, due to
96     ## their range overwrap.
97     if ({
98     ## In the Arabic Presentation Forms-A block, but no character is
99     ## assigned in Unicode 5.1.
100     0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
101     0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
102     0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
103     0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
104     0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
105     0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
106     0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
107     0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
108     0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
109     0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
110     0xFD4F => 1,
111     0xFD90 => 1, 0xFD91 => 1,
112     0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
113     0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
114     # 0xFDD0-0xFDEF noncharacters
115    
116     ## In Arabic Presentation Forms-A block, but explicitly
117     ## allowed
118     0xFD3E => 1, 0xFD3F => 1,
119    
120     ## In Arabic Presentation Forms-B block, unassigned
121     0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
122     }->{$char_code}) {
123     #
124     } else {
125     $self->{onerror}->(type => 'unicode should',
126     text => (sprintf 'U+%04X', $char_code),
127     layer => 'charset',
128     level => $self->{level}->{unicode_should});
129     }
130     } elsif ({
131     ## Styled overlines/underlines in CJK Compatibility Forms
132     0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
133     0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
134    
135     0x037E => 1, 0x0387 => 1, # greek punctuations
136    
137     #0x17A3 => 1, # also, deprecated
138     0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,
139    
140     0x2121 => 1, # tel
141     0x213B => 1, # fax
142     #0x2120 => 1, # SM (superscript)
143     #0x2122 => 1, # TM (superscript)
144    
145     0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
146     }->{$char_code} or
147     (0x2153 <= $char_code and $char_code <= 0x217F)) {
148     $self->{onerror}->(type => 'unicode discouraged',
149     text => (sprintf 'U+%04X', $char_code),
150     layer => 'charset',
151     level => $self->{level}->{unicode_discouraged});
152     } elsif ({
153     0x055A => 1, 0x0559 =>1, # greek punctuations
154    
155     0x2103 => 1, 0x2109 => 1, # degree signs
156    
157     0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
158     }->{$char_code}) {
159     $self->{onerror}->(type => 'unicode not preferred',
160     text => (sprintf 'U+%04X', $char_code),
161     layer => 'charset',
162     level => $self->{level}->{unicode_preferred});
163     }
164    
165     ## TODO: "khanda ta" should be represented by U+09CE
166     ## rather than <U+09A4, U+09CD, U+200D>.
167    
168     ## TODO: IDS syntax
169    
170     ## TODO: langtag syntax
171    
172     return $char;
173     } # getc
174    
175     sub ungetc ($$) {
176     unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
177     } # ungetc
178    
179     sub close ($) {
180     shift->{handle}->close;
181     } # close
182    
183     sub charset ($) {
184     shift->{handle}->charset;
185     } # charset
186    
187     sub has_bom ($) {
188     shift->{handle}->has_bom;
189     } # has_bom
190    
191     sub input_encoding ($) {
192     shift->{handle}->input_encoding;
193     } # input_encoding
194    
195     sub onerror ($;$) {
196     if (@_ > 1) {
197     if (defined $_[1]) {
198     $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
199     } else {
200     $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
201     }
202     }
203    
204     return $_[0]->{onerror};
205     } # onerror
206    
207     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24