1 |
wakaba |
1.1 |
package Whatpm::Charset::UnicodeChecker; |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
## NOTE: For more information (including rationals of checks performed |
5 |
|
|
## in this module), see |
6 |
|
|
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>. |
7 |
|
|
|
8 |
|
|
## NOTE: Unicode's definition for character string conformance is |
9 |
|
|
## very, very vague so that it is difficult to determine what error |
10 |
|
|
## level is appropriate for each error. The Unicode Standard abuses |
11 |
|
|
## conformance-creteria-like terms such as "deprecated", "discouraged", |
12 |
|
|
## "preferred", "better", "not encouraged", "should", and so on with no |
13 |
|
|
## clear explanation of their difference (if any) or relationship to |
14 |
|
|
## the conformance. In fact, that specification does not define the |
15 |
|
|
## conformance class for character strings. |
16 |
|
|
|
17 |
wakaba |
1.8 |
sub new_handle ($$;$) { |
18 |
wakaba |
1.1 |
my $self = bless { |
19 |
|
|
queue => [], |
20 |
wakaba |
1.2 |
new_queue => [], |
21 |
wakaba |
1.1 |
onerror => sub {}, |
22 |
wakaba |
1.8 |
#onerror_set |
23 |
wakaba |
1.1 |
level => { |
24 |
|
|
unicode_should => 'w', |
25 |
|
|
unicode_deprecated => 'w', # = unicode_should |
26 |
|
|
unicode_discouraged => 'w', |
27 |
|
|
unicode_preferred => 'w', |
28 |
|
|
## NOTE: We do some "unification" of levels - for example, |
29 |
|
|
## "not encouraged" is unified with "discouraged" and |
30 |
|
|
## "better" is unified with "preferred". |
31 |
wakaba |
1.8 |
|
32 |
|
|
must => 'm', |
33 |
|
|
warn => 'w', |
34 |
wakaba |
1.1 |
}, |
35 |
wakaba |
1.8 |
}, $_[0]; |
36 |
|
|
$self->{handle} = $_[1]; # char stream |
37 |
|
|
my $mode = $_[2] || 'default'; # or 'html5' |
38 |
|
|
$self->{level_map} = { |
39 |
|
|
## Unicode errors |
40 |
|
|
'unicode deprecated' => 'unicode_deprecated', |
41 |
|
|
'nonchar' => $mode eq 'html5' ? 'must' : 'unicode_should', |
42 |
|
|
## NOTE: HTML5 parse error. |
43 |
|
|
'unicode should' => 'unicode_should', |
44 |
|
|
'unicode discouraged' => 'unicode_discouraged', |
45 |
|
|
'unicode not preferred' => 'unicode_preferred', |
46 |
|
|
|
47 |
|
|
## HTML5 errors (See "text" definition of the spec) |
48 |
|
|
'control char' => $mode eq 'html5' ? 'must' : 'warn', |
49 |
|
|
## NOTE: HTML5 parse error. |
50 |
|
|
'non unicode' => $mode eq 'html5' ? 'must' : 'warn', |
51 |
|
|
## NOTE: In HTML5, replaced by U+FFFD (not a parse error). |
52 |
|
|
}; |
53 |
|
|
$self->{replace_non_unicode} = ($mode eq 'html5'); |
54 |
wakaba |
1.1 |
return $self; |
55 |
|
|
} # new_handle |
56 |
|
|
|
57 |
wakaba |
1.7 |
my $etypes = { |
58 |
|
|
0x0340 => 'unicode deprecated', |
59 |
|
|
0x0341 => 'unicode deprecated', |
60 |
|
|
0x17A3 => 'unicode deprecated', |
61 |
|
|
0x17D3 => 'unicode deprecated', |
62 |
|
|
0x206A => 'unicode deprecated', |
63 |
|
|
0x206B => 'unicode deprecated', |
64 |
|
|
0x206C => 'unicode deprecated', |
65 |
|
|
0x206D => 'unicode deprecated', |
66 |
|
|
0x206E => 'unicode deprecated', |
67 |
|
|
0x206F => 'unicode deprecated', |
68 |
|
|
0xE0001 => 'unicode deprecated', |
69 |
|
|
|
70 |
|
|
0xFFFE => 'nonchar', |
71 |
|
|
0xFFFF => 'nonchar', |
72 |
|
|
0x1FFFE => 'nonchar', |
73 |
|
|
0x1FFFF => 'nonchar', |
74 |
|
|
0x2FFFE => 'nonchar', |
75 |
|
|
0x2FFFF => 'nonchar', |
76 |
|
|
0x3FFFE => 'nonchar', |
77 |
|
|
0x3FFFF => 'nonchar', |
78 |
|
|
0x4FFFE => 'nonchar', |
79 |
|
|
0x4FFFF => 'nonchar', |
80 |
|
|
0x5FFFE => 'nonchar', |
81 |
|
|
0x5FFFF => 'nonchar', |
82 |
|
|
0x6FFFE => 'nonchar', |
83 |
|
|
0x6FFFF => 'nonchar', |
84 |
|
|
0x7FFFE => 'nonchar', |
85 |
|
|
0x7FFFF => 'nonchar', |
86 |
|
|
0x8FFFE => 'nonchar', |
87 |
|
|
0x8FFFF => 'nonchar', |
88 |
|
|
0x9FFFE => 'nonchar', |
89 |
|
|
0x9FFFF => 'nonchar', |
90 |
|
|
0xAFFFE => 'nonchar', |
91 |
|
|
0xAFFFF => 'nonchar', |
92 |
|
|
0xBFFFE => 'nonchar', |
93 |
|
|
0xBFFFF => 'nonchar', |
94 |
|
|
0xCFFFE => 'nonchar', |
95 |
|
|
0xCFFFF => 'nonchar', |
96 |
|
|
0xDFFFE => 'nonchar', |
97 |
|
|
0xDFFFF => 'nonchar', |
98 |
|
|
0xEFFFE => 'nonchar', |
99 |
|
|
0xEFFFF => 'nonchar', |
100 |
|
|
0xFFFFE => 'nonchar', |
101 |
|
|
0xFFFFF => 'nonchar', |
102 |
|
|
0x10FFFE => 'nonchar', |
103 |
|
|
0x10FFFF => 'nonchar', |
104 |
|
|
|
105 |
|
|
0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS |
106 |
|
|
0x03D3 => 'unicode should', # GREEK UPSILON WITH ... |
107 |
|
|
0x03D4 => 'unicode should', # GREEK UPSILON WITH ... |
108 |
|
|
0x20A4 => 'unicode should', # LIRA SIGN |
109 |
|
|
|
110 |
|
|
0x2126 => 'unicode should', # OHM SIGN # also, discouraged |
111 |
|
|
0x212A => 'unicode should', # KELVIN SIGN |
112 |
|
|
0x212B => 'unicode should', # ANGSTROM SIGN |
113 |
|
|
|
114 |
|
|
## Styled overlines/underlines in CJK Compatibility Forms |
115 |
|
|
0xFE49 => 'unicode discouraged', |
116 |
|
|
0xFE4A => 'unicode discouraged', |
117 |
|
|
0xFE4B => 'unicode discouraged', |
118 |
|
|
0xFE4C => 'unicode discouraged', |
119 |
|
|
0xFE4D => 'unicode discouraged', |
120 |
|
|
0xFE4E => 'unicode discouraged', |
121 |
|
|
0xFE4F => 'unicode discouraged', |
122 |
|
|
|
123 |
|
|
0x037E => 'unicode discouraged', # greek punctuations |
124 |
|
|
0x0387 => 'unicode discouraged', # greek punctuations |
125 |
|
|
|
126 |
|
|
#0x17A3 => 'unicode discouraged', # also, deprecated |
127 |
|
|
0x17A4 => 'unicode discouraged', |
128 |
|
|
0x17B4 => 'unicode discouraged', |
129 |
|
|
0x17B5 => 'unicode discouraged', |
130 |
|
|
0x17D8 => 'unicode discouraged', |
131 |
|
|
|
132 |
|
|
0x2121 => 'unicode discouraged', # tel |
133 |
|
|
0x213B => 'unicode discouraged', # fax |
134 |
|
|
#0x2120 => 'unicode discouraged', # SM (superscript) |
135 |
|
|
#0x2122 => 'unicode discouraged', # TM (superscript) |
136 |
|
|
|
137 |
|
|
## inline annotations |
138 |
|
|
0xFFF9 => 'unicode discouraged', |
139 |
|
|
0xFFFA => 'unicode discouraged', |
140 |
|
|
0xFFFB => 'unicode discouraged', |
141 |
|
|
|
142 |
|
|
## greek punctuations |
143 |
|
|
0x055A => 'unicode not preferred', |
144 |
|
|
0x0559 => 'unicode not preferred', |
145 |
|
|
|
146 |
|
|
## degree signs |
147 |
|
|
0x2103 => 'unicode not preferred', |
148 |
|
|
0x2109 => 'unicode not preferred', |
149 |
|
|
|
150 |
|
|
## strongly preferrs U+2060 WORD JOINTER |
151 |
|
|
0xFEFE => 'unicode not preferred', |
152 |
|
|
}; |
153 |
|
|
|
154 |
|
|
$etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F; |
155 |
|
|
$etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF; |
156 |
wakaba |
1.8 |
## ISSUE: U+FDE0-U+FDEF are not excluded in HTML5. |
157 |
wakaba |
1.7 |
$etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9; |
158 |
|
|
$etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3; |
159 |
|
|
$etypes->{$_} = 'unicode should' |
160 |
|
|
for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F, |
161 |
|
|
0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74, |
162 |
|
|
0xFE76 .. 0xFEFC; |
163 |
|
|
## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where |
164 |
|
|
## no character is assigned, noncharacter code points, and |
165 |
|
|
## U+FD3E and U+FD3F, which are explicitly allowed. |
166 |
|
|
$etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F; |
167 |
wakaba |
1.8 |
$etypes->{$_} = 'control char' |
168 |
wakaba |
1.9 |
for 0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F .. 0x009F; |
169 |
wakaba |
1.8 |
$etypes->{$_} = 'control char' for 0xD800 .. 0xDFFF; |
170 |
wakaba |
1.7 |
|
171 |
wakaba |
1.6 |
my $check_char = sub ($$) { |
172 |
|
|
my ($self, $char_code) = @_; |
173 |
wakaba |
1.1 |
|
174 |
wakaba |
1.8 |
## NOTE: Negative $char_code is not supported. |
175 |
|
|
|
176 |
wakaba |
1.6 |
if ($char_code == 0x000D) { |
177 |
|
|
$self->{line_diff}++; |
178 |
|
|
$self->{column_diff} = 0; |
179 |
|
|
$self->{set_column} = 1; |
180 |
|
|
$self->{has_cr} = 1; |
181 |
|
|
return; |
182 |
|
|
} elsif ($char_code == 0x000A) { |
183 |
|
|
if ($self->{has_cr}) { |
184 |
|
|
delete $self->{has_cr}; |
185 |
|
|
} else { |
186 |
|
|
$self->{line_diff}++; |
187 |
|
|
$self->{column_diff} = 0; |
188 |
|
|
$self->{set_column} = 1; |
189 |
|
|
} |
190 |
|
|
return; |
191 |
|
|
} else { |
192 |
|
|
$self->{column_diff}++; |
193 |
|
|
delete $self->{has_cr}; |
194 |
|
|
} |
195 |
wakaba |
1.7 |
|
196 |
wakaba |
1.8 |
if ($char_code > 0x10FFFF) { |
197 |
|
|
$self->{onerror}->(type => 'non unicode', |
198 |
|
|
text => (sprintf 'U-%08X', $char_code), |
199 |
|
|
layer => 'charset', |
200 |
|
|
level => $self->{level}->{$self->{level_map}->{'non unicode'}}, |
201 |
|
|
line_diff => $self->{line_diff}, |
202 |
|
|
column_diff => $self->{column_diff}, |
203 |
|
|
($self->{set_column} ? (column => 1) : ())); |
204 |
|
|
if ($self->{replace_non_unicode}) { |
205 |
|
|
return "\x{FFFD}"; |
206 |
|
|
} else { |
207 |
|
|
return; |
208 |
|
|
} |
209 |
|
|
} |
210 |
wakaba |
1.6 |
|
211 |
wakaba |
1.7 |
my $etype = $etypes->{$char_code}; |
212 |
|
|
if (defined $etype) { |
213 |
|
|
$self->{onerror}->(type => $etype, |
214 |
wakaba |
1.1 |
text => (sprintf 'U+%04X', $char_code), |
215 |
|
|
layer => 'charset', |
216 |
wakaba |
1.8 |
level => $self->{level}->{$self->{level_map}->{$etype}}, |
217 |
wakaba |
1.6 |
line_diff => $self->{line_diff}, |
218 |
|
|
column_diff => $self->{column_diff}, |
219 |
|
|
($self->{set_column} ? (column => 1) : ())); |
220 |
wakaba |
1.1 |
} |
221 |
|
|
|
222 |
|
|
## TODO: "khanda ta" should be represented by U+09CE |
223 |
|
|
## rather than <U+09A4, U+09CD, U+200D>. |
224 |
|
|
|
225 |
|
|
## TODO: IDS syntax |
226 |
|
|
|
227 |
|
|
## TODO: langtag syntax |
228 |
wakaba |
1.8 |
|
229 |
|
|
return; |
230 |
wakaba |
1.6 |
}; # $check_char |
231 |
wakaba |
1.1 |
|
232 |
wakaba |
1.6 |
sub read ($$$;$) { |
233 |
|
|
my $self = shift; |
234 |
|
|
my $offset = $_[2] || 0; |
235 |
|
|
my $count = $self->{handle}->read (@_); |
236 |
|
|
$self->{line_diff} = 0; |
237 |
|
|
$self->{column_diff} = -1; |
238 |
|
|
delete $self->{set_column}; |
239 |
|
|
delete $self->{has_cr}; |
240 |
|
|
for ($offset .. ($offset + $count - 1)) { |
241 |
wakaba |
1.8 |
my $c = $check_char->($self, ord substr $_[0], $_, 1); |
242 |
|
|
if (defined $c) { |
243 |
|
|
substr ($_[0], $_, 1) = $c; |
244 |
|
|
} |
245 |
wakaba |
1.6 |
} |
246 |
|
|
return $count; |
247 |
|
|
} # read |
248 |
wakaba |
1.4 |
|
249 |
wakaba |
1.5 |
sub manakai_read_until ($$$;$) { |
250 |
|
|
#my ($self, $scalar, $pattern, $offset) = @_; |
251 |
|
|
my $self = shift; |
252 |
wakaba |
1.6 |
my $offset = $_[2] || 0; |
253 |
|
|
my $count = $self->{handle}->manakai_read_until (@_); |
254 |
|
|
$self->{line_diff} = 0; |
255 |
|
|
$self->{column_diff} = -1; |
256 |
|
|
delete $self->{set_column}; |
257 |
|
|
delete $self->{has_cr}; |
258 |
|
|
for ($offset .. ($offset + $count - 1)) { |
259 |
wakaba |
1.8 |
my $c = $check_char->($self, ord substr $_[0], $_, 1); |
260 |
|
|
if (defined $c) { |
261 |
|
|
substr ($_[0], $_, 1) = $c; |
262 |
|
|
} |
263 |
wakaba |
1.4 |
} |
264 |
wakaba |
1.6 |
return $count; |
265 |
wakaba |
1.5 |
} # manakai_read_until |
266 |
wakaba |
1.1 |
|
267 |
|
|
sub ungetc ($$) { |
268 |
|
|
unshift @{$_[0]->{queue}}, chr int ($_[1] or 0); |
269 |
|
|
} # ungetc |
270 |
|
|
|
271 |
|
|
sub close ($) { |
272 |
|
|
shift->{handle}->close; |
273 |
|
|
} # close |
274 |
|
|
|
275 |
|
|
sub charset ($) { |
276 |
|
|
shift->{handle}->charset; |
277 |
|
|
} # charset |
278 |
|
|
|
279 |
|
|
sub has_bom ($) { |
280 |
|
|
shift->{handle}->has_bom; |
281 |
|
|
} # has_bom |
282 |
|
|
|
283 |
|
|
sub input_encoding ($) { |
284 |
|
|
shift->{handle}->input_encoding; |
285 |
|
|
} # input_encoding |
286 |
|
|
|
287 |
|
|
sub onerror ($;$) { |
288 |
|
|
if (@_ > 1) { |
289 |
|
|
if (defined $_[1]) { |
290 |
|
|
$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]); |
291 |
wakaba |
1.8 |
$_[0]->{onerror_set} = 1; |
292 |
wakaba |
1.1 |
} else { |
293 |
|
|
$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {}); |
294 |
wakaba |
1.8 |
delete $_[0]->{onerror_set}; |
295 |
wakaba |
1.1 |
} |
296 |
|
|
} |
297 |
|
|
|
298 |
wakaba |
1.8 |
return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef; |
299 |
wakaba |
1.1 |
} # onerror |
300 |
|
|
|
301 |
|
|
1; |