/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.49 - (show annotations) (download)
Sat Sep 29 04:45:09 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.48: +3 -3 lines
++ whatpm/t/ChangeLog	29 Sep 2007 04:36:22 -0000
2007-09-29  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: New tests for invalid
	attribute specifications are added.

++ whatpm/Whatpm/ChangeLog	29 Sep 2007 04:38:17 -0000
	* ContentChecker.pm: Raise specific error for invalid
	root element.

	* SelectorsParser.pm: Pass an empty string as a prefix
	for lookup namespace prefix callback, for loose compatibility
	with the |NSResolver| interface.

2007-09-24  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ContentChecker/ChangeLog	29 Sep 2007 04:38:46 -0000
	* Atom.pm (atom:link@title): Definition was missing.

2007-09-24  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::ContentChecker;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.48 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Whatpm::URIChecker;
6
7 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8 ## be applied to an in-memory representation (i.e. DOM)?
9
10 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
11 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
12 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
13
14 my $Namespace = {
15 q<http://www.w3.org/2005/Atom> => {module => 'Whatpm::ContentChecker::Atom'},
16 $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
17 $XML_NS => {loaded => 1},
18 $XMLNS_NS => {loaded => 1},
19 };
20
21 our $AttrChecker = {
22 $XML_NS => {
23 space => sub {
24 my ($self, $attr) = @_;
25 my $value = $attr->value;
26 if ($value eq 'default' or $value eq 'preserve') {
27 #
28 } else {
29 ## NOTE: An XML "error"
30 $self->{onerror}->(node => $attr, level => 'error',
31 type => 'invalid attribute value');
32 }
33 },
34 lang => sub {
35 my ($self, $attr) = @_;
36 my $value = $attr->value;
37 if ($value eq '') {
38 #
39 } else {
40 require Whatpm::LangTag;
41 Whatpm::LangTag->check_rfc3066_language_tag ($value, sub {
42 my %opt = @_;
43 my $type = 'LangTag:'.$opt{type};
44 $type .= ':' . $opt{subtag} if defined $opt{subtag};
45 $self->{onerror}->(node => $attr, type => $type,
46 value => $opt{value}, level => $opt{level});
47 });
48 }
49
50 ## NOTE: "The values of the attribute are language identifiers
51 ## as defined by [IETF RFC 3066], Tags for the Identification
52 ## of Languages, or its successor; in addition, the empty string
53 ## may be specified." ("may" in lower case)
54 ## NOTE: Is an RFC 3066-valid (but RFC 4647-invalid) language tag
55 ## allowed today?
56
57 ## TODO: test data
58
59 if ($attr->owner_document->manakai_is_html) { # MUST NOT
60 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang');
61 ## TODO: Test data...
62 }
63 },
64 base => sub {
65 my ($self, $attr) = @_;
66 my $value = $attr->value;
67 if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
68 $self->{onerror}->(node => $attr,
69 type => 'invalid attribute value');
70 }
71 ## NOTE: Conformance to URI standard is not checked since there is
72 ## no author requirement on conformance in the XML Base specification.
73 },
74 id => sub {
75 my ($self, $attr) = @_;
76 my $value = $attr->value;
77 $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
78 $value =~ s/^\x20//;
79 $value =~ s/\x20$//;
80 ## TODO: NCName in XML 1.0 or 1.1
81 ## TODO: declared type is ID?
82 if ($self->{id}->{$value}) { ## NOTE: An xml:id error
83 $self->{onerror}->(node => $attr, level => 'error',
84 type => 'duplicate ID');
85 push @{$self->{id}->{$value}}, $attr;
86 } else {
87 $self->{id}->{$value} = [$attr];
88 }
89 },
90 },
91 $XMLNS_NS => {
92 '' => sub {
93 my ($self, $attr) = @_;
94 my $ln = $attr->manakai_local_name;
95 my $value = $attr->value;
96 if ($value eq $XML_NS and $ln ne 'xml') {
97 $self->{onerror}
98 ->(node => $attr, level => 'NC',
99 type => 'Reserved Prefixes and Namespace Names:=xml');
100 } elsif ($value eq $XMLNS_NS) {
101 $self->{onerror}
102 ->(node => $attr, level => 'NC',
103 type => 'Reserved Prefixes and Namespace Names:=xmlns');
104 }
105 if ($ln eq 'xml' and $value ne $XML_NS) {
106 $self->{onerror}
107 ->(node => $attr, level => 'NC',
108 type => 'Reserved Prefixes and Namespace Names:xmlns:xml=');
109 } elsif ($ln eq 'xmlns') {
110 $self->{onerror}
111 ->(node => $attr, level => 'NC',
112 type => 'Reserved Prefixes and Namespace Names:xmlns:xmlns=');
113 }
114 ## TODO: If XML 1.0 and empty
115 },
116 xmlns => sub {
117 my ($self, $attr) = @_;
118 ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
119 ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
120 ## TODO: relative references are deprecated
121 my $value = $attr->value;
122 if ($value eq $XML_NS) {
123 $self->{onerror}
124 ->(node => $attr, level => 'NC',
125 type => 'Reserved Prefixes and Namespace Names:=xml');
126 } elsif ($value eq $XMLNS_NS) {
127 $self->{onerror}
128 ->(node => $attr, level => 'NC',
129 type => 'Reserved Prefixes and Namespace Names:=xmlns');
130 }
131 },
132 },
133 };
134
135 ## ISSUE: Should we really allow these attributes?
136 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
137 $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
138 $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
139 $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
140
141 ## ANY
142 our $AnyChecker = sub {
143 my ($self, $todo) = @_;
144 my $el = $todo->{node};
145 my $new_todos = [];
146 my @nodes = (@{$el->child_nodes});
147 while (@nodes) {
148 my $node = shift @nodes;
149 $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
150
151 my $nt = $node->node_type;
152 if ($nt == 1) {
153 my $node_ns = $node->namespace_uri;
154 $node_ns = '' unless defined $node_ns;
155 my $node_ln = $node->manakai_local_name;
156 if ($self->{minuses}->{$node_ns}->{$node_ln}) {
157 $self->{onerror}->(node => $node, type => 'element not allowed');
158 }
159 push @$new_todos, {type => 'element', node => $node};
160 } elsif ($nt == 5) {
161 unshift @nodes, @{$node->child_nodes};
162 }
163 }
164 return ($new_todos);
165 }; # $AnyChecker
166
167 our $ElementDefault = {
168 checker => sub {
169 my ($self, $todo) = @_;
170 $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
171 type => 'element');
172 return $AnyChecker->($self, $todo);
173 },
174 attrs_checker => sub {
175 my ($self, $todo) = @_;
176 for my $attr (@{$todo->{node}->attributes}) {
177 my $attr_ns = $attr->namespace_uri;
178 $attr_ns = '' unless defined $attr_ns;
179 my $attr_ln = $attr->manakai_local_name;
180 my $checker = $AttrChecker->{$attr_ns}->{$attr_ln}
181 || $AttrChecker->{$attr_ns}->{''};
182 if ($checker) {
183 $checker->($self, $attr);
184 } else {
185 $self->{onerror}->(node => $attr, level => 'unsupported',
186 type => 'attribute');
187 }
188 }
189 },
190 };
191
192 my $HTMLTransparentElements = {
193 $HTML_NS => {qw/ins 1 font 1 noscript 1/},
194 ## NOTE: |html:noscript| is transparent if scripting is disabled
195 ## and not in |head|.
196 };
197
198 our $Element = {};
199
200 sub check_document ($$$) {
201 my ($self, $doc, $onerror) = @_;
202 $self = bless {}, $self unless ref $self;
203 $self->{onerror} = $onerror;
204
205 $self->{must_level} = 'm';
206 $self->{fact_level} = 'f';
207 $self->{should_level} = 's';
208 $self->{good_level} = 'g';
209
210 my $docel = $doc->document_element;
211 unless (defined $docel) {
212 ## ISSUE: Should we check content of Document node?
213 $onerror->(node => $doc, type => 'no document element');
214 ## ISSUE: Is this non-conforming (to what spec)? Or just a warning?
215 return {
216 class => {},
217 id => {}, table => [], term => {},
218 };
219 }
220
221 ## ISSUE: Unexpanded entity references and HTML5 conformance
222
223 my $docel_nsuri = $docel->namespace_uri;
224 $docel_nsuri = '' unless defined $docel_nsuri;
225 unless ($Namespace->{$docel_nsuri}->{loaded}) {
226 if ($Namespace->{$docel_nsuri}->{module}) {
227 eval qq{ require $Namespace->{$docel_nsuri}->{module} } or die $@;
228 } else {
229 $Namespace->{$docel_nsuri}->{loaded} = 1;
230 }
231 }
232 my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
233 $Element->{$docel_nsuri}->{''} ||
234 $ElementDefault;
235 if ($docel_def->{is_root}) {
236 #
237 } else {
238 $onerror->(node => $docel, type => 'element not allowed:root');
239 }
240
241 ## TODO: Check for other items other than document element
242 ## (second (errorous) element, text nodes, PI nodes, doctype nodes)
243
244 return $self->check_element ($docel, $onerror);
245 } # check_document
246
247 sub check_element ($$$) {
248 my ($self, $el, $onerror) = @_;
249 $self = bless {}, $self unless ref $self;
250 $self->{onerror} = $onerror;
251
252 $self->{must_level} = 'm';
253 $self->{fact_level} = 'f';
254 $self->{should_level} = 's';
255 $self->{good_level} = 'g';
256
257 $self->{minuses} = {};
258 $self->{id} = {};
259 $self->{term} = {};
260 $self->{usemap} = [];
261 $self->{contextmenu} = [];
262 $self->{map} = {};
263 $self->{menu} = {};
264 $self->{has_link_type} = {};
265 #$self->{has_uri_attr};
266 #$self->{has_hyperlink_element};
267 $self->{return} = {
268 class => {},
269 id => $self->{id}, table => [], term => $self->{term},
270 };
271
272 my @todo = ({type => 'element', node => $el});
273 while (@todo) {
274 my $todo = shift @todo;
275 if ($todo->{type} eq 'element') {
276 my $prefix = $todo->{node}->prefix;
277 if (defined $prefix and $prefix eq 'xmlns') {
278 $self->{onerror}
279 ->(node => $todo->{node}, level => 'NC',
280 type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
281 }
282 my $nsuri = $todo->{node}->namespace_uri;
283 $nsuri = '' unless defined $nsuri;
284 unless ($Namespace->{$nsuri}->{loaded}) {
285 if ($Namespace->{$nsuri}->{module}) {
286 eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
287 } else {
288 $Namespace->{$nsuri}->{loaded} = 1;
289 }
290 }
291 my $ln = $todo->{node}->manakai_local_name;
292 my $eldef = $Element->{$nsuri}->{$ln} ||
293 $Element->{$nsuri}->{''} ||
294 $ElementDefault;
295 $eldef->{attrs_checker}->($self, $todo);
296 my ($new_todos) = $eldef->{checker}->($self, $todo);
297 unshift @todo, @$new_todos;
298 } elsif ($todo->{type} eq 'element-attributes') {
299 my $prefix = $todo->{node}->prefix;
300 if (defined $prefix and $prefix eq 'xmlns') {
301 $self->{onerror}
302 ->(node => $todo->{node}, level => 'NC',
303 type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
304 }
305 my $nsuri = $todo->{node}->namespace_uri;
306 $nsuri = '' unless defined $nsuri;
307 unless ($Namespace->{$nsuri}->{loaded}) {
308 if ($Namespace->{$nsuri}->{module}) {
309 eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
310 } else {
311 $Namespace->{$nsuri}->{loaded} = 1;
312 }
313 }
314 my $ln = $todo->{node}->manakai_local_name;
315 my $eldef = $Element->{$nsuri}->{$ln} ||
316 $Element->{$nsuri}->{''} ||
317 $ElementDefault;
318 $eldef->{attrs_checker}->($self, $todo);
319 } elsif ($todo->{type} eq 'plus') {
320 $self->_remove_minuses ($todo);
321 } elsif ($todo->{type} eq 'code') {
322 $todo->{code}->();
323 } else {
324 die "$0: Internal error: Unsupported checking action type |$todo->{type}|";
325 }
326 }
327
328 for (@{$self->{usemap}}) {
329 unless ($self->{map}->{$_->[0]}) {
330 $self->{onerror}->(node => $_->[1], type => 'no referenced map');
331 }
332 }
333
334 for (@{$self->{contextmenu}}) {
335 unless ($self->{menu}->{$_->[0]}) {
336 $self->{onerror}->(node => $_->[1], type => 'no referenced menu');
337 }
338 }
339
340 delete $self->{minuses};
341 delete $self->{onerror};
342 delete $self->{id};
343 delete $self->{usemap};
344 delete $self->{map};
345 return $self->{return};
346 } # check_element
347
348 sub _add_minuses ($@) {
349 my $self = shift;
350 my $r = {};
351 for my $list (@_) {
352 for my $ns (keys %$list) {
353 for my $ln (keys %{$list->{$ns}}) {
354 unless ($self->{minuses}->{$ns}->{$ln}) {
355 $self->{minuses}->{$ns}->{$ln} = 1;
356 $r->{$ns}->{$ln} = 1;
357 }
358 }
359 }
360 }
361 return {type => 'plus', list => $r};
362 } # _add_minuses
363
364 sub _remove_minuses ($$) {
365 my ($self, $todo) = @_;
366 for my $ns (keys %{$todo->{list}}) {
367 for my $ln (keys %{$todo->{list}->{$ns}}) {
368 delete $self->{minuses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
369 }
370 }
371 1;
372 } # _remove_minuses
373
374 sub _check_get_children ($$$) {
375 my ($self, $node, $parent_todo) = @_;
376 my $new_todos = [];
377 my $sib = [];
378 TP: {
379 my $node_ns = $node->namespace_uri;
380 $node_ns = '' unless defined $node_ns;
381 my $node_ln = $node->manakai_local_name;
382 if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
383 if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
384 if ($parent_todo->{flag}->{in_head}) {
385 #
386 } else {
387 my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
388 push @$sib, $end;
389
390 unshift @$sib, @{$node->child_nodes};
391 push @$new_todos, {type => 'element-attributes', node => $node};
392 last TP;
393 }
394 } else {
395 unshift @$sib, @{$node->child_nodes};
396 push @$new_todos, {type => 'element-attributes', node => $node};
397 last TP;
398 }
399 }
400 if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
401 if ($node->has_attribute_ns (undef, 'src')) {
402 unshift @$sib, @{$node->child_nodes};
403 push @$new_todos, {type => 'element-attributes', node => $node};
404 last TP;
405 } else {
406 my @cn = @{$node->child_nodes};
407 CN: while (@cn) {
408 my $cn = shift @cn;
409 my $cnt = $cn->node_type;
410 if ($cnt == 1) {
411 my $cn_nsuri = $cn->namespace_uri;
412 $cn_nsuri = '' unless defined $cn_nsuri;
413 if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'source') {
414 #
415 } else {
416 last CN;
417 }
418 } elsif ($cnt == 3 or $cnt == 4) {
419 if ($cn->data =~ /[^\x09-\x0D\x20]/) {
420 last CN;
421 }
422 }
423 } # CN
424 unshift @$sib, @cn;
425 }
426 }
427 push @$new_todos, {type => 'element', node => $node};
428 } # TP
429
430 for my $new_todo (@$new_todos) {
431 $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
432 }
433
434 return ($sib, $new_todos);
435 } # _check_get_children
436
437 =head1 LICENSE
438
439 Copyright 2007 Wakaba <w@suika.fam.cx>
440
441 This library is free software; you can redistribute it
442 and/or modify it under the same terms as Perl itself.
443
444 =cut
445
446 1;
447 # $Date: 2007/09/24 04:23:44 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24