1 |
wakaba |
1.1 |
package Whatpm::Charset::UnicodeChecker; |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
## NOTE: For more information (including rationals of checks performed |
5 |
|
|
## in this module), see |
6 |
|
|
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>. |
7 |
|
|
|
8 |
|
|
## NOTE: Unicode's definition for character string conformance is |
9 |
|
|
## very, very vague so that it is difficult to determine what error |
10 |
|
|
## level is appropriate for each error. The Unicode Standard abuses |
11 |
|
|
## conformance-creteria-like terms such as "deprecated", "discouraged", |
12 |
|
|
## "preferred", "better", "not encouraged", "should", and so on with no |
13 |
|
|
## clear explanation of their difference (if any) or relationship to |
14 |
|
|
## the conformance. In fact, that specification does not define the |
15 |
|
|
## conformance class for character strings. |
16 |
|
|
|
17 |
|
|
sub new_handle ($$) { |
18 |
|
|
my $self = bless { |
19 |
|
|
queue => [], |
20 |
wakaba |
1.2 |
new_queue => [], |
21 |
wakaba |
1.1 |
onerror => sub {}, |
22 |
|
|
level => { |
23 |
|
|
unicode_should => 'w', |
24 |
|
|
unicode_deprecated => 'w', # = unicode_should |
25 |
|
|
unicode_discouraged => 'w', |
26 |
|
|
unicode_preferred => 'w', |
27 |
|
|
## NOTE: We do some "unification" of levels - for example, |
28 |
|
|
## "not encouraged" is unified with "discouraged" and |
29 |
|
|
## "better" is unified with "preferred". |
30 |
|
|
}, |
31 |
|
|
}, shift; |
32 |
|
|
$self->{handle} = shift; # char stream |
33 |
|
|
return $self; |
34 |
|
|
} # new_handle |
35 |
|
|
|
36 |
|
|
## TODO: We need to do some perf optimization |
37 |
|
|
|
38 |
|
|
sub getc ($) { |
39 |
|
|
my $self = $_[0]; |
40 |
|
|
return shift @{$self->{queue}} if @{$self->{queue}}; |
41 |
wakaba |
1.2 |
|
42 |
|
|
my $char; |
43 |
|
|
unless (@{$self->{new_queue}}) { |
44 |
|
|
my $s = ''; |
45 |
|
|
$self->{handle}->read ($s, 256) or return undef; |
46 |
|
|
push @{$self->{new_queue}}, split //, $s; |
47 |
|
|
} |
48 |
|
|
$char = shift @{$self->{new_queue}}; |
49 |
|
|
|
50 |
wakaba |
1.1 |
my $char_code = ord $char; |
51 |
|
|
|
52 |
|
|
if ({ |
53 |
|
|
0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1, |
54 |
|
|
0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1, |
55 |
|
|
0x206E => 1, 0x206F => 1, 0xE0001 => 1, |
56 |
|
|
}->{$char_code} or |
57 |
|
|
(0xE0020 <= $char_code and $char_code <= 0xE007F)) { |
58 |
|
|
## NOTE: From Unicode 5.1.0 |PropList.txt| (Deprecated). |
59 |
|
|
$self->{onerror}->(type => 'unicode deprecated', |
60 |
|
|
text => (sprintf 'U+%04X', $char_code), |
61 |
|
|
layer => 'charset', |
62 |
|
|
level => $self->{level}->{unicode_deprecated}); |
63 |
|
|
} elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or |
64 |
|
|
{ |
65 |
|
|
0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1, |
66 |
|
|
0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1, |
67 |
|
|
0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1, |
68 |
|
|
0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1, |
69 |
|
|
0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1, |
70 |
|
|
0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1, |
71 |
|
|
0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1, |
72 |
|
|
0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1, |
73 |
|
|
0x10FFFE => 1, 0x10FFFF => 1, |
74 |
|
|
}->{$char_code}) { |
75 |
|
|
## NOTE: From Unicode 5.1.0 |PropList.txt| (Noncharacter_Code_Point). |
76 |
|
|
$self->{onerror}->(type => 'nonchar', |
77 |
|
|
text => (sprintf 'U+%04X', $char_code), |
78 |
|
|
layer => 'charset', |
79 |
|
|
level => $self->{level}->{unicode_should}); |
80 |
|
|
} elsif ({ |
81 |
|
|
0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS |
82 |
|
|
0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ... |
83 |
|
|
0x20A4 => 1, # LIRA SIGN |
84 |
|
|
|
85 |
|
|
0x2126 => 1, # OHM SIGN # also, discouraged |
86 |
|
|
0x212A => 1, # KELVIN SIGN |
87 |
|
|
0x212B => 1, # ANGSTROM SIGN |
88 |
|
|
}->{$char_code} or |
89 |
|
|
(0xFB50 <= $char_code and $char_code <= 0xFDFB) or |
90 |
|
|
(0xFE70 <= $char_code and $char_code <= 0xFEFE) or |
91 |
|
|
(0xFA30 <= $char_code and $char_code <= 0xFA6A) or |
92 |
|
|
(0xFA70 <= $char_code and $char_code <= 0xFAD9) or |
93 |
|
|
(0x2F800 <= $char_code and $char_code <= 0x2FA1D) or |
94 |
|
|
(0x239B <= $char_code and $char_code <= 0x23B3)) { |
95 |
|
|
## NOTE: This case must come AFTER noncharacter checking, due to |
96 |
|
|
## their range overwrap. |
97 |
|
|
if ({ |
98 |
|
|
## In the Arabic Presentation Forms-A block, but no character is |
99 |
|
|
## assigned in Unicode 5.1. |
100 |
|
|
0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1, |
101 |
|
|
0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1, |
102 |
|
|
0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1, |
103 |
|
|
0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1, |
104 |
|
|
0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1, |
105 |
|
|
0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1, |
106 |
|
|
0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1, |
107 |
|
|
0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1, |
108 |
|
|
0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1, |
109 |
|
|
0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1, |
110 |
|
|
0xFD4F => 1, |
111 |
|
|
0xFD90 => 1, 0xFD91 => 1, |
112 |
|
|
0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1, |
113 |
|
|
0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1, |
114 |
|
|
# 0xFDD0-0xFDEF noncharacters |
115 |
|
|
|
116 |
|
|
## In Arabic Presentation Forms-A block, but explicitly |
117 |
|
|
## allowed |
118 |
|
|
0xFD3E => 1, 0xFD3F => 1, |
119 |
|
|
|
120 |
|
|
## In Arabic Presentation Forms-B block, unassigned |
121 |
|
|
0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1, |
122 |
|
|
}->{$char_code}) { |
123 |
|
|
# |
124 |
|
|
} else { |
125 |
|
|
$self->{onerror}->(type => 'unicode should', |
126 |
|
|
text => (sprintf 'U+%04X', $char_code), |
127 |
|
|
layer => 'charset', |
128 |
|
|
level => $self->{level}->{unicode_should}); |
129 |
|
|
} |
130 |
|
|
} elsif ({ |
131 |
|
|
## Styled overlines/underlines in CJK Compatibility Forms |
132 |
|
|
0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1, |
133 |
|
|
0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1, |
134 |
|
|
|
135 |
|
|
0x037E => 1, 0x0387 => 1, # greek punctuations |
136 |
|
|
|
137 |
|
|
#0x17A3 => 1, # also, deprecated |
138 |
|
|
0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1, |
139 |
|
|
|
140 |
|
|
0x2121 => 1, # tel |
141 |
|
|
0x213B => 1, # fax |
142 |
|
|
#0x2120 => 1, # SM (superscript) |
143 |
|
|
#0x2122 => 1, # TM (superscript) |
144 |
|
|
|
145 |
|
|
0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations |
146 |
|
|
}->{$char_code} or |
147 |
|
|
(0x2153 <= $char_code and $char_code <= 0x217F)) { |
148 |
|
|
$self->{onerror}->(type => 'unicode discouraged', |
149 |
|
|
text => (sprintf 'U+%04X', $char_code), |
150 |
|
|
layer => 'charset', |
151 |
|
|
level => $self->{level}->{unicode_discouraged}); |
152 |
|
|
} elsif ({ |
153 |
|
|
0x055A => 1, 0x0559 =>1, # greek punctuations |
154 |
|
|
|
155 |
|
|
0x2103 => 1, 0x2109 => 1, # degree signs |
156 |
|
|
|
157 |
|
|
0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER |
158 |
|
|
}->{$char_code}) { |
159 |
|
|
$self->{onerror}->(type => 'unicode not preferred', |
160 |
|
|
text => (sprintf 'U+%04X', $char_code), |
161 |
|
|
layer => 'charset', |
162 |
|
|
level => $self->{level}->{unicode_preferred}); |
163 |
|
|
} |
164 |
|
|
|
165 |
|
|
## TODO: "khanda ta" should be represented by U+09CE |
166 |
|
|
## rather than <U+09A4, U+09CD, U+200D>. |
167 |
|
|
|
168 |
|
|
## TODO: IDS syntax |
169 |
|
|
|
170 |
|
|
## TODO: langtag syntax |
171 |
|
|
|
172 |
|
|
return $char; |
173 |
|
|
} # getc |
174 |
|
|
|
175 |
|
|
sub ungetc ($$) { |
176 |
|
|
unshift @{$_[0]->{queue}}, chr int ($_[1] or 0); |
177 |
|
|
} # ungetc |
178 |
|
|
|
179 |
|
|
sub close ($) { |
180 |
|
|
shift->{handle}->close; |
181 |
|
|
} # close |
182 |
|
|
|
183 |
|
|
sub charset ($) { |
184 |
|
|
shift->{handle}->charset; |
185 |
|
|
} # charset |
186 |
|
|
|
187 |
|
|
sub has_bom ($) { |
188 |
|
|
shift->{handle}->has_bom; |
189 |
|
|
} # has_bom |
190 |
|
|
|
191 |
|
|
sub input_encoding ($) { |
192 |
|
|
shift->{handle}->input_encoding; |
193 |
|
|
} # input_encoding |
194 |
|
|
|
195 |
|
|
sub onerror ($;$) { |
196 |
|
|
if (@_ > 1) { |
197 |
|
|
if (defined $_[1]) { |
198 |
|
|
$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]); |
199 |
|
|
} else { |
200 |
|
|
$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {}); |
201 |
|
|
} |
202 |
|
|
} |
203 |
|
|
|
204 |
|
|
return $_[0]->{onerror}; |
205 |
|
|
} # onerror |
206 |
|
|
|
207 |
|
|
1; |