/[suikacvs]/markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download)
Mon Sep 22 06:04:29 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +1 -1 lines
++ whatpm/t/ChangeLog	22 Sep 2008 05:59:48 -0000
	* tokenizer-test-1.test: Test data on invalid character references
	are added (cf. HTML5 revision 2138).

	* tokenizer-test-2.dat: Test data on U+000B are updated (HTML5
	revision 2138).

2008-09-22  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	22 Sep 2008 06:02:01 -0000
2008-09-22  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: Character references for non-space C0 characters,
	including U+000B VT, DEL character, noncharacter code points, are
	now converted to the U+FFFD character (cf. HTML5 revision 2138).

1 wakaba 1.1 package Whatpm::Charset::UnicodeChecker;
2     use strict;
3    
4     ## NOTE: For more information (including rationals of checks performed
5     ## in this module), see
6     ## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7    
8     ## NOTE: Unicode's definition for character string conformance is
9     ## very, very vague so that it is difficult to determine what error
10     ## level is appropriate for each error. The Unicode Standard abuses
11     ## conformance-creteria-like terms such as "deprecated", "discouraged",
12     ## "preferred", "better", "not encouraged", "should", and so on with no
13     ## clear explanation of their difference (if any) or relationship to
14     ## the conformance. In fact, that specification does not define the
15     ## conformance class for character strings.
16    
17 wakaba 1.8 sub new_handle ($$;$) {
18 wakaba 1.1 my $self = bless {
19     queue => [],
20 wakaba 1.2 new_queue => [],
21 wakaba 1.1 onerror => sub {},
22 wakaba 1.8 #onerror_set
23 wakaba 1.1 level => {
24     unicode_should => 'w',
25     unicode_deprecated => 'w', # = unicode_should
26     unicode_discouraged => 'w',
27     unicode_preferred => 'w',
28     ## NOTE: We do some "unification" of levels - for example,
29     ## "not encouraged" is unified with "discouraged" and
30     ## "better" is unified with "preferred".
31 wakaba 1.8
32     must => 'm',
33     warn => 'w',
34 wakaba 1.1 },
35 wakaba 1.8 }, $_[0];
36     $self->{handle} = $_[1]; # char stream
37     my $mode = $_[2] || 'default'; # or 'html5'
38     $self->{level_map} = {
39     ## Unicode errors
40     'unicode deprecated' => 'unicode_deprecated',
41     'nonchar' => $mode eq 'html5' ? 'must' : 'unicode_should',
42     ## NOTE: HTML5 parse error.
43     'unicode should' => 'unicode_should',
44     'unicode discouraged' => 'unicode_discouraged',
45     'unicode not preferred' => 'unicode_preferred',
46    
47     ## HTML5 errors (See "text" definition of the spec)
48     'control char' => $mode eq 'html5' ? 'must' : 'warn',
49     ## NOTE: HTML5 parse error.
50     'non unicode' => $mode eq 'html5' ? 'must' : 'warn',
51     ## NOTE: In HTML5, replaced by U+FFFD (not a parse error).
52     };
53     $self->{replace_non_unicode} = ($mode eq 'html5');
54 wakaba 1.1 return $self;
55     } # new_handle
56    
57 wakaba 1.7 my $etypes = {
58     0x0340 => 'unicode deprecated',
59     0x0341 => 'unicode deprecated',
60     0x17A3 => 'unicode deprecated',
61     0x17D3 => 'unicode deprecated',
62     0x206A => 'unicode deprecated',
63     0x206B => 'unicode deprecated',
64     0x206C => 'unicode deprecated',
65     0x206D => 'unicode deprecated',
66     0x206E => 'unicode deprecated',
67     0x206F => 'unicode deprecated',
68     0xE0001 => 'unicode deprecated',
69    
70     0xFFFE => 'nonchar',
71     0xFFFF => 'nonchar',
72     0x1FFFE => 'nonchar',
73     0x1FFFF => 'nonchar',
74     0x2FFFE => 'nonchar',
75     0x2FFFF => 'nonchar',
76     0x3FFFE => 'nonchar',
77     0x3FFFF => 'nonchar',
78     0x4FFFE => 'nonchar',
79     0x4FFFF => 'nonchar',
80     0x5FFFE => 'nonchar',
81     0x5FFFF => 'nonchar',
82     0x6FFFE => 'nonchar',
83     0x6FFFF => 'nonchar',
84     0x7FFFE => 'nonchar',
85     0x7FFFF => 'nonchar',
86     0x8FFFE => 'nonchar',
87     0x8FFFF => 'nonchar',
88     0x9FFFE => 'nonchar',
89     0x9FFFF => 'nonchar',
90     0xAFFFE => 'nonchar',
91     0xAFFFF => 'nonchar',
92     0xBFFFE => 'nonchar',
93     0xBFFFF => 'nonchar',
94     0xCFFFE => 'nonchar',
95     0xCFFFF => 'nonchar',
96     0xDFFFE => 'nonchar',
97     0xDFFFF => 'nonchar',
98     0xEFFFE => 'nonchar',
99     0xEFFFF => 'nonchar',
100     0xFFFFE => 'nonchar',
101     0xFFFFF => 'nonchar',
102     0x10FFFE => 'nonchar',
103     0x10FFFF => 'nonchar',
104    
105     0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
106     0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
107     0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
108     0x20A4 => 'unicode should', # LIRA SIGN
109    
110     0x2126 => 'unicode should', # OHM SIGN # also, discouraged
111     0x212A => 'unicode should', # KELVIN SIGN
112     0x212B => 'unicode should', # ANGSTROM SIGN
113    
114     ## Styled overlines/underlines in CJK Compatibility Forms
115     0xFE49 => 'unicode discouraged',
116     0xFE4A => 'unicode discouraged',
117     0xFE4B => 'unicode discouraged',
118     0xFE4C => 'unicode discouraged',
119     0xFE4D => 'unicode discouraged',
120     0xFE4E => 'unicode discouraged',
121     0xFE4F => 'unicode discouraged',
122    
123     0x037E => 'unicode discouraged', # greek punctuations
124     0x0387 => 'unicode discouraged', # greek punctuations
125    
126     #0x17A3 => 'unicode discouraged', # also, deprecated
127     0x17A4 => 'unicode discouraged',
128     0x17B4 => 'unicode discouraged',
129     0x17B5 => 'unicode discouraged',
130     0x17D8 => 'unicode discouraged',
131    
132     0x2121 => 'unicode discouraged', # tel
133     0x213B => 'unicode discouraged', # fax
134     #0x2120 => 'unicode discouraged', # SM (superscript)
135     #0x2122 => 'unicode discouraged', # TM (superscript)
136    
137     ## inline annotations
138     0xFFF9 => 'unicode discouraged',
139     0xFFFA => 'unicode discouraged',
140     0xFFFB => 'unicode discouraged',
141    
142     ## greek punctuations
143     0x055A => 'unicode not preferred',
144     0x0559 => 'unicode not preferred',
145    
146     ## degree signs
147     0x2103 => 'unicode not preferred',
148     0x2109 => 'unicode not preferred',
149    
150     ## strongly preferrs U+2060 WORD JOINTER
151     0xFEFE => 'unicode not preferred',
152     };
153    
154     $etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
155     $etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
156 wakaba 1.8 ## ISSUE: U+FDE0-U+FDEF are not excluded in HTML5.
157 wakaba 1.7 $etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
158     $etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
159     $etypes->{$_} = 'unicode should'
160     for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
161     0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
162     0xFE76 .. 0xFEFC;
163     ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
164     ## no character is assigned, noncharacter code points, and
165     ## U+FD3E and U+FD3F, which are explicitly allowed.
166     $etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;
167 wakaba 1.8 $etypes->{$_} = 'control char'
168 wakaba 1.9 for 0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F .. 0x009F;
169 wakaba 1.8 $etypes->{$_} = 'control char' for 0xD800 .. 0xDFFF;
170 wakaba 1.7
171 wakaba 1.6 my $check_char = sub ($$) {
172     my ($self, $char_code) = @_;
173 wakaba 1.1
174 wakaba 1.8 ## NOTE: Negative $char_code is not supported.
175    
176 wakaba 1.6 if ($char_code == 0x000D) {
177     $self->{line_diff}++;
178     $self->{column_diff} = 0;
179     $self->{set_column} = 1;
180     $self->{has_cr} = 1;
181     return;
182     } elsif ($char_code == 0x000A) {
183     if ($self->{has_cr}) {
184     delete $self->{has_cr};
185     } else {
186     $self->{line_diff}++;
187     $self->{column_diff} = 0;
188     $self->{set_column} = 1;
189     }
190     return;
191     } else {
192     $self->{column_diff}++;
193     delete $self->{has_cr};
194     }
195 wakaba 1.7
196 wakaba 1.8 if ($char_code > 0x10FFFF) {
197     $self->{onerror}->(type => 'non unicode',
198     text => (sprintf 'U-%08X', $char_code),
199     layer => 'charset',
200     level => $self->{level}->{$self->{level_map}->{'non unicode'}},
201     line_diff => $self->{line_diff},
202     column_diff => $self->{column_diff},
203     ($self->{set_column} ? (column => 1) : ()));
204     if ($self->{replace_non_unicode}) {
205     return "\x{FFFD}";
206     } else {
207     return;
208     }
209     }
210 wakaba 1.6
211 wakaba 1.7 my $etype = $etypes->{$char_code};
212     if (defined $etype) {
213     $self->{onerror}->(type => $etype,
214 wakaba 1.1 text => (sprintf 'U+%04X', $char_code),
215     layer => 'charset',
216 wakaba 1.8 level => $self->{level}->{$self->{level_map}->{$etype}},
217 wakaba 1.6 line_diff => $self->{line_diff},
218     column_diff => $self->{column_diff},
219     ($self->{set_column} ? (column => 1) : ()));
220 wakaba 1.1 }
221    
222     ## TODO: "khanda ta" should be represented by U+09CE
223     ## rather than <U+09A4, U+09CD, U+200D>.
224    
225     ## TODO: IDS syntax
226    
227     ## TODO: langtag syntax
228 wakaba 1.8
229     return;
230 wakaba 1.6 }; # $check_char
231 wakaba 1.1
232 wakaba 1.6 sub read ($$$;$) {
233     my $self = shift;
234     my $offset = $_[2] || 0;
235     my $count = $self->{handle}->read (@_);
236     $self->{line_diff} = 0;
237     $self->{column_diff} = -1;
238     delete $self->{set_column};
239     delete $self->{has_cr};
240     for ($offset .. ($offset + $count - 1)) {
241 wakaba 1.8 my $c = $check_char->($self, ord substr $_[0], $_, 1);
242     if (defined $c) {
243     substr ($_[0], $_, 1) = $c;
244     }
245 wakaba 1.6 }
246     return $count;
247     } # read
248 wakaba 1.4
249 wakaba 1.5 sub manakai_read_until ($$$;$) {
250     #my ($self, $scalar, $pattern, $offset) = @_;
251     my $self = shift;
252 wakaba 1.6 my $offset = $_[2] || 0;
253     my $count = $self->{handle}->manakai_read_until (@_);
254     $self->{line_diff} = 0;
255     $self->{column_diff} = -1;
256     delete $self->{set_column};
257     delete $self->{has_cr};
258     for ($offset .. ($offset + $count - 1)) {
259 wakaba 1.8 my $c = $check_char->($self, ord substr $_[0], $_, 1);
260     if (defined $c) {
261     substr ($_[0], $_, 1) = $c;
262     }
263 wakaba 1.4 }
264 wakaba 1.6 return $count;
265 wakaba 1.5 } # manakai_read_until
266 wakaba 1.1
267     sub ungetc ($$) {
268     unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
269     } # ungetc
270    
271     sub close ($) {
272     shift->{handle}->close;
273     } # close
274    
275     sub charset ($) {
276     shift->{handle}->charset;
277     } # charset
278    
279     sub has_bom ($) {
280     shift->{handle}->has_bom;
281     } # has_bom
282    
283     sub input_encoding ($) {
284     shift->{handle}->input_encoding;
285     } # input_encoding
286    
287     sub onerror ($;$) {
288     if (@_ > 1) {
289     if (defined $_[1]) {
290     $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
291 wakaba 1.8 $_[0]->{onerror_set} = 1;
292 wakaba 1.1 } else {
293     $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
294 wakaba 1.8 delete $_[0]->{onerror_set};
295 wakaba 1.1 }
296     }
297    
298 wakaba 1.8 return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef;
299 wakaba 1.1 } # onerror
300    
301     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24