/[suikacvs]/markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations) (download)
Mon Sep 22 06:04:29 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +1 -1 lines
++ whatpm/t/ChangeLog	22 Sep 2008 05:59:48 -0000
	* tokenizer-test-1.test: Test data on invalid character references
	are added (cf. HTML5 revision 2138).

	* tokenizer-test-2.dat: Test data on U+000B are updated (HTML5
	revision 2138).

2008-09-22  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	22 Sep 2008 06:02:01 -0000
2008-09-22  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: Character references for non-space C0 characters,
	including U+000B VT, DEL character, noncharacter code points, are
	now converted to the U+FFFD character (cf. HTML5 revision 2138).

1 package Whatpm::Charset::UnicodeChecker;
2 use strict;
3
4 ## NOTE: For more information (including rationals of checks performed
5 ## in this module), see
6 ## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7
8 ## NOTE: Unicode's definition for character string conformance is
9 ## very, very vague so that it is difficult to determine what error
10 ## level is appropriate for each error. The Unicode Standard abuses
11 ## conformance-creteria-like terms such as "deprecated", "discouraged",
12 ## "preferred", "better", "not encouraged", "should", and so on with no
13 ## clear explanation of their difference (if any) or relationship to
14 ## the conformance. In fact, that specification does not define the
15 ## conformance class for character strings.
16
17 sub new_handle ($$;$) {
18 my $self = bless {
19 queue => [],
20 new_queue => [],
21 onerror => sub {},
22 #onerror_set
23 level => {
24 unicode_should => 'w',
25 unicode_deprecated => 'w', # = unicode_should
26 unicode_discouraged => 'w',
27 unicode_preferred => 'w',
28 ## NOTE: We do some "unification" of levels - for example,
29 ## "not encouraged" is unified with "discouraged" and
30 ## "better" is unified with "preferred".
31
32 must => 'm',
33 warn => 'w',
34 },
35 }, $_[0];
36 $self->{handle} = $_[1]; # char stream
37 my $mode = $_[2] || 'default'; # or 'html5'
38 $self->{level_map} = {
39 ## Unicode errors
40 'unicode deprecated' => 'unicode_deprecated',
41 'nonchar' => $mode eq 'html5' ? 'must' : 'unicode_should',
42 ## NOTE: HTML5 parse error.
43 'unicode should' => 'unicode_should',
44 'unicode discouraged' => 'unicode_discouraged',
45 'unicode not preferred' => 'unicode_preferred',
46
47 ## HTML5 errors (See "text" definition of the spec)
48 'control char' => $mode eq 'html5' ? 'must' : 'warn',
49 ## NOTE: HTML5 parse error.
50 'non unicode' => $mode eq 'html5' ? 'must' : 'warn',
51 ## NOTE: In HTML5, replaced by U+FFFD (not a parse error).
52 };
53 $self->{replace_non_unicode} = ($mode eq 'html5');
54 return $self;
55 } # new_handle
56
57 my $etypes = {
58 0x0340 => 'unicode deprecated',
59 0x0341 => 'unicode deprecated',
60 0x17A3 => 'unicode deprecated',
61 0x17D3 => 'unicode deprecated',
62 0x206A => 'unicode deprecated',
63 0x206B => 'unicode deprecated',
64 0x206C => 'unicode deprecated',
65 0x206D => 'unicode deprecated',
66 0x206E => 'unicode deprecated',
67 0x206F => 'unicode deprecated',
68 0xE0001 => 'unicode deprecated',
69
70 0xFFFE => 'nonchar',
71 0xFFFF => 'nonchar',
72 0x1FFFE => 'nonchar',
73 0x1FFFF => 'nonchar',
74 0x2FFFE => 'nonchar',
75 0x2FFFF => 'nonchar',
76 0x3FFFE => 'nonchar',
77 0x3FFFF => 'nonchar',
78 0x4FFFE => 'nonchar',
79 0x4FFFF => 'nonchar',
80 0x5FFFE => 'nonchar',
81 0x5FFFF => 'nonchar',
82 0x6FFFE => 'nonchar',
83 0x6FFFF => 'nonchar',
84 0x7FFFE => 'nonchar',
85 0x7FFFF => 'nonchar',
86 0x8FFFE => 'nonchar',
87 0x8FFFF => 'nonchar',
88 0x9FFFE => 'nonchar',
89 0x9FFFF => 'nonchar',
90 0xAFFFE => 'nonchar',
91 0xAFFFF => 'nonchar',
92 0xBFFFE => 'nonchar',
93 0xBFFFF => 'nonchar',
94 0xCFFFE => 'nonchar',
95 0xCFFFF => 'nonchar',
96 0xDFFFE => 'nonchar',
97 0xDFFFF => 'nonchar',
98 0xEFFFE => 'nonchar',
99 0xEFFFF => 'nonchar',
100 0xFFFFE => 'nonchar',
101 0xFFFFF => 'nonchar',
102 0x10FFFE => 'nonchar',
103 0x10FFFF => 'nonchar',
104
105 0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
106 0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
107 0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
108 0x20A4 => 'unicode should', # LIRA SIGN
109
110 0x2126 => 'unicode should', # OHM SIGN # also, discouraged
111 0x212A => 'unicode should', # KELVIN SIGN
112 0x212B => 'unicode should', # ANGSTROM SIGN
113
114 ## Styled overlines/underlines in CJK Compatibility Forms
115 0xFE49 => 'unicode discouraged',
116 0xFE4A => 'unicode discouraged',
117 0xFE4B => 'unicode discouraged',
118 0xFE4C => 'unicode discouraged',
119 0xFE4D => 'unicode discouraged',
120 0xFE4E => 'unicode discouraged',
121 0xFE4F => 'unicode discouraged',
122
123 0x037E => 'unicode discouraged', # greek punctuations
124 0x0387 => 'unicode discouraged', # greek punctuations
125
126 #0x17A3 => 'unicode discouraged', # also, deprecated
127 0x17A4 => 'unicode discouraged',
128 0x17B4 => 'unicode discouraged',
129 0x17B5 => 'unicode discouraged',
130 0x17D8 => 'unicode discouraged',
131
132 0x2121 => 'unicode discouraged', # tel
133 0x213B => 'unicode discouraged', # fax
134 #0x2120 => 'unicode discouraged', # SM (superscript)
135 #0x2122 => 'unicode discouraged', # TM (superscript)
136
137 ## inline annotations
138 0xFFF9 => 'unicode discouraged',
139 0xFFFA => 'unicode discouraged',
140 0xFFFB => 'unicode discouraged',
141
142 ## greek punctuations
143 0x055A => 'unicode not preferred',
144 0x0559 => 'unicode not preferred',
145
146 ## degree signs
147 0x2103 => 'unicode not preferred',
148 0x2109 => 'unicode not preferred',
149
150 ## strongly preferrs U+2060 WORD JOINTER
151 0xFEFE => 'unicode not preferred',
152 };
153
154 $etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
155 $etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
156 ## ISSUE: U+FDE0-U+FDEF are not excluded in HTML5.
157 $etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
158 $etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
159 $etypes->{$_} = 'unicode should'
160 for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
161 0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
162 0xFE76 .. 0xFEFC;
163 ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
164 ## no character is assigned, noncharacter code points, and
165 ## U+FD3E and U+FD3F, which are explicitly allowed.
166 $etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;
167 $etypes->{$_} = 'control char'
168 for 0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F .. 0x009F;
169 $etypes->{$_} = 'control char' for 0xD800 .. 0xDFFF;
170
171 my $check_char = sub ($$) {
172 my ($self, $char_code) = @_;
173
174 ## NOTE: Negative $char_code is not supported.
175
176 if ($char_code == 0x000D) {
177 $self->{line_diff}++;
178 $self->{column_diff} = 0;
179 $self->{set_column} = 1;
180 $self->{has_cr} = 1;
181 return;
182 } elsif ($char_code == 0x000A) {
183 if ($self->{has_cr}) {
184 delete $self->{has_cr};
185 } else {
186 $self->{line_diff}++;
187 $self->{column_diff} = 0;
188 $self->{set_column} = 1;
189 }
190 return;
191 } else {
192 $self->{column_diff}++;
193 delete $self->{has_cr};
194 }
195
196 if ($char_code > 0x10FFFF) {
197 $self->{onerror}->(type => 'non unicode',
198 text => (sprintf 'U-%08X', $char_code),
199 layer => 'charset',
200 level => $self->{level}->{$self->{level_map}->{'non unicode'}},
201 line_diff => $self->{line_diff},
202 column_diff => $self->{column_diff},
203 ($self->{set_column} ? (column => 1) : ()));
204 if ($self->{replace_non_unicode}) {
205 return "\x{FFFD}";
206 } else {
207 return;
208 }
209 }
210
211 my $etype = $etypes->{$char_code};
212 if (defined $etype) {
213 $self->{onerror}->(type => $etype,
214 text => (sprintf 'U+%04X', $char_code),
215 layer => 'charset',
216 level => $self->{level}->{$self->{level_map}->{$etype}},
217 line_diff => $self->{line_diff},
218 column_diff => $self->{column_diff},
219 ($self->{set_column} ? (column => 1) : ()));
220 }
221
222 ## TODO: "khanda ta" should be represented by U+09CE
223 ## rather than <U+09A4, U+09CD, U+200D>.
224
225 ## TODO: IDS syntax
226
227 ## TODO: langtag syntax
228
229 return;
230 }; # $check_char
231
232 sub read ($$$;$) {
233 my $self = shift;
234 my $offset = $_[2] || 0;
235 my $count = $self->{handle}->read (@_);
236 $self->{line_diff} = 0;
237 $self->{column_diff} = -1;
238 delete $self->{set_column};
239 delete $self->{has_cr};
240 for ($offset .. ($offset + $count - 1)) {
241 my $c = $check_char->($self, ord substr $_[0], $_, 1);
242 if (defined $c) {
243 substr ($_[0], $_, 1) = $c;
244 }
245 }
246 return $count;
247 } # read
248
249 sub manakai_read_until ($$$;$) {
250 #my ($self, $scalar, $pattern, $offset) = @_;
251 my $self = shift;
252 my $offset = $_[2] || 0;
253 my $count = $self->{handle}->manakai_read_until (@_);
254 $self->{line_diff} = 0;
255 $self->{column_diff} = -1;
256 delete $self->{set_column};
257 delete $self->{has_cr};
258 for ($offset .. ($offset + $count - 1)) {
259 my $c = $check_char->($self, ord substr $_[0], $_, 1);
260 if (defined $c) {
261 substr ($_[0], $_, 1) = $c;
262 }
263 }
264 return $count;
265 } # manakai_read_until
266
267 sub ungetc ($$) {
268 unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
269 } # ungetc
270
271 sub close ($) {
272 shift->{handle}->close;
273 } # close
274
275 sub charset ($) {
276 shift->{handle}->charset;
277 } # charset
278
279 sub has_bom ($) {
280 shift->{handle}->has_bom;
281 } # has_bom
282
283 sub input_encoding ($) {
284 shift->{handle}->input_encoding;
285 } # input_encoding
286
287 sub onerror ($;$) {
288 if (@_ > 1) {
289 if (defined $_[1]) {
290 $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
291 $_[0]->{onerror_set} = 1;
292 } else {
293 $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
294 delete $_[0]->{onerror_set};
295 }
296 }
297
298 return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef;
299 } # onerror
300
301 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24