/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.53 - (show annotations) (download)
Sun Nov 25 03:46:07 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.52: +12 -2 lines
++ whatpm/Whatpm/ChangeLog	25 Nov 2007 03:45:37 -0000
2007-11-25  Wakaba  <wakaba@suika.fam.cx>

	* ContentChecker.pm (check_element): New todo item type |descendant|.

++ whatpm/Whatpm/ContentChecker/ChangeLog	25 Nov 2007 03:46:01 -0000
2007-11-25  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm (header): Descendant constraint is redefined by
	new todo item type |descendant|.

1 package Whatpm::ContentChecker;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.52 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Whatpm::URIChecker;
6
7 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8 ## be applied to an in-memory representation (i.e. DOM)?
9
10 ## TODO: Conformance of an HTML document with non-html root element.
11
12 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
13 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
14 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
15
16 my $Namespace = {
17 q<http://www.w3.org/2005/Atom> => {module => 'Whatpm::ContentChecker::Atom'},
18 $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
19 $XML_NS => {loaded => 1},
20 $XMLNS_NS => {loaded => 1},
21 };
22
23 our $AttrChecker = {
24 $XML_NS => {
25 space => sub {
26 my ($self, $attr) = @_;
27 my $value = $attr->value;
28 if ($value eq 'default' or $value eq 'preserve') {
29 #
30 } else {
31 ## NOTE: An XML "error"
32 $self->{onerror}->(node => $attr, level => 'error',
33 type => 'invalid attribute value');
34 }
35 },
36 lang => sub {
37 my ($self, $attr) = @_;
38 my $value = $attr->value;
39 if ($value eq '') {
40 #
41 } else {
42 require Whatpm::LangTag;
43 Whatpm::LangTag->check_rfc3066_language_tag ($value, sub {
44 my %opt = @_;
45 my $type = 'LangTag:'.$opt{type};
46 $type .= ':' . $opt{subtag} if defined $opt{subtag};
47 $self->{onerror}->(node => $attr, type => $type,
48 value => $opt{value}, level => $opt{level});
49 });
50 }
51
52 ## NOTE: "The values of the attribute are language identifiers
53 ## as defined by [IETF RFC 3066], Tags for the Identification
54 ## of Languages, or its successor; in addition, the empty string
55 ## may be specified." ("may" in lower case)
56 ## NOTE: Is an RFC 3066-valid (but RFC 4647-invalid) language tag
57 ## allowed today?
58
59 ## TODO: test data
60
61 if ($attr->owner_document->manakai_is_html) { # MUST NOT
62 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang');
63 ## TODO: Test data...
64 }
65 },
66 base => sub {
67 my ($self, $attr) = @_;
68 my $value = $attr->value;
69 if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
70 $self->{onerror}->(node => $attr,
71 type => 'invalid attribute value');
72 }
73 ## NOTE: Conformance to URI standard is not checked since there is
74 ## no author requirement on conformance in the XML Base specification.
75 },
76 id => sub {
77 my ($self, $attr) = @_;
78 my $value = $attr->value;
79 $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
80 $value =~ s/^\x20//;
81 $value =~ s/\x20$//;
82 ## TODO: NCName in XML 1.0 or 1.1
83 ## TODO: declared type is ID?
84 if ($self->{id}->{$value}) { ## NOTE: An xml:id error
85 $self->{onerror}->(node => $attr, level => 'error',
86 type => 'duplicate ID');
87 push @{$self->{id}->{$value}}, $attr;
88 } else {
89 $self->{id}->{$value} = [$attr];
90 }
91 },
92 },
93 $XMLNS_NS => {
94 '' => sub {
95 my ($self, $attr) = @_;
96 my $ln = $attr->manakai_local_name;
97 my $value = $attr->value;
98 if ($value eq $XML_NS and $ln ne 'xml') {
99 $self->{onerror}
100 ->(node => $attr, level => 'NC',
101 type => 'Reserved Prefixes and Namespace Names:=xml');
102 } elsif ($value eq $XMLNS_NS) {
103 $self->{onerror}
104 ->(node => $attr, level => 'NC',
105 type => 'Reserved Prefixes and Namespace Names:=xmlns');
106 }
107 if ($ln eq 'xml' and $value ne $XML_NS) {
108 $self->{onerror}
109 ->(node => $attr, level => 'NC',
110 type => 'Reserved Prefixes and Namespace Names:xmlns:xml=');
111 } elsif ($ln eq 'xmlns') {
112 $self->{onerror}
113 ->(node => $attr, level => 'NC',
114 type => 'Reserved Prefixes and Namespace Names:xmlns:xmlns=');
115 }
116 ## TODO: If XML 1.0 and empty
117 },
118 xmlns => sub {
119 my ($self, $attr) = @_;
120 ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
121 ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
122 ## TODO: relative references are deprecated
123 my $value = $attr->value;
124 if ($value eq $XML_NS) {
125 $self->{onerror}
126 ->(node => $attr, level => 'NC',
127 type => 'Reserved Prefixes and Namespace Names:=xml');
128 } elsif ($value eq $XMLNS_NS) {
129 $self->{onerror}
130 ->(node => $attr, level => 'NC',
131 type => 'Reserved Prefixes and Namespace Names:=xmlns');
132 }
133 },
134 },
135 };
136
137 ## ISSUE: Should we really allow these attributes?
138 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
139 $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
140 $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
141 $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
142
143 ## ANY
144 our $AnyChecker = sub {
145 my ($self, $todo) = @_;
146 my $el = $todo->{node};
147 my $new_todos = [];
148 my @nodes = (@{$el->child_nodes});
149 while (@nodes) {
150 my $node = shift @nodes;
151 $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
152
153 my $nt = $node->node_type;
154 if ($nt == 1) {
155 my $node_ns = $node->namespace_uri;
156 $node_ns = '' unless defined $node_ns;
157 my $node_ln = $node->manakai_local_name;
158 if ($self->{minuses}->{$node_ns}->{$node_ln}) {
159 $self->{onerror}->(node => $node, type => 'element not allowed');
160 }
161 push @$new_todos, {type => 'element', node => $node};
162 } elsif ($nt == 5) {
163 unshift @nodes, @{$node->child_nodes};
164 }
165 }
166 return ($new_todos);
167 }; # $AnyChecker
168
169 our $ElementDefault = {
170 checker => sub {
171 my ($self, $todo) = @_;
172 $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
173 type => 'element');
174 return $AnyChecker->($self, $todo);
175 },
176 attrs_checker => sub {
177 my ($self, $todo) = @_;
178 for my $attr (@{$todo->{node}->attributes}) {
179 my $attr_ns = $attr->namespace_uri;
180 $attr_ns = '' unless defined $attr_ns;
181 my $attr_ln = $attr->manakai_local_name;
182 my $checker = $AttrChecker->{$attr_ns}->{$attr_ln}
183 || $AttrChecker->{$attr_ns}->{''};
184 if ($checker) {
185 $checker->($self, $attr);
186 } else {
187 $self->{onerror}->(node => $attr, level => 'unsupported',
188 type => 'attribute');
189 }
190 }
191 },
192 };
193
194 my $HTMLTransparentElements = {
195 $HTML_NS => {qw/ins 1 font 1 noscript 1/},
196 ## NOTE: |html:noscript| is transparent if scripting is disabled
197 ## and not in |head|.
198 };
199
200 our $Element = {};
201
202 sub check_document ($$$) {
203 my ($self, $doc, $onerror) = @_;
204 $self = bless {}, $self unless ref $self;
205 $self->{onerror} = $onerror;
206
207 $self->{must_level} = 'm';
208 $self->{fact_level} = 'f';
209 $self->{should_level} = 's';
210 $self->{good_level} = 'w';
211
212 my $docel = $doc->document_element;
213 unless (defined $docel) {
214 ## ISSUE: Should we check content of Document node?
215 $onerror->(node => $doc, type => 'no document element');
216 ## ISSUE: Is this non-conforming (to what spec)? Or just a warning?
217 return {
218 class => {},
219 id => {}, table => [], term => {},
220 };
221 }
222
223 ## ISSUE: Unexpanded entity references and HTML5 conformance
224
225 my $docel_nsuri = $docel->namespace_uri;
226 $docel_nsuri = '' unless defined $docel_nsuri;
227 unless ($Namespace->{$docel_nsuri}->{loaded}) {
228 if ($Namespace->{$docel_nsuri}->{module}) {
229 eval qq{ require $Namespace->{$docel_nsuri}->{module} } or die $@;
230 } else {
231 $Namespace->{$docel_nsuri}->{loaded} = 1;
232 }
233 }
234 my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
235 $Element->{$docel_nsuri}->{''} ||
236 $ElementDefault;
237 if ($docel_def->{is_root}) {
238 #
239 } elsif ($docel_def->{is_xml_root}) {
240 unless ($doc->manakai_is_html) {
241 #
242 } else {
243 $onerror->(node => $docel, type => 'element not allowed:root:xml');
244 }
245 } else {
246 $onerror->(node => $docel, type => 'element not allowed:root');
247 }
248
249 ## TODO: Check for other items other than document element
250 ## (second (errorous) element, text nodes, PI nodes, doctype nodes)
251
252 my $return = $self->check_element ($docel, $onerror);
253
254 ## TODO: Test for these checks are necessary.
255 my $charset_name = $doc->input_encoding;
256 if (defined $charset_name) {
257 require Message::Charset::Info;
258 my $charset = $Message::Charset::Info::IANACharset->{$charset_name};
259
260 if ($doc->manakai_is_html and
261 not $doc->manakai_has_bom and
262 not defined $doc->manakai_charset) {
263 unless ($charset->{is_html_ascii_superset}) {
264 $onerror->(node => $doc, level => $self->{must_level},
265 type => 'non ascii superset:'.$charset_name);
266 }
267
268 if (not $self->{has_charset} and
269 not $charset->{iana_names}->{'us-ascii'}) {
270 $onerror->(node => $doc, level => $self->{must_level},
271 type => 'no character encoding declaration:'.$charset_name);
272 }
273 }
274
275 if ($charset->{iana_names}->{'utf-8'}) {
276 #
277 } elsif ($charset->{iana_names}->{'jis_x0212-1990'} or
278 $charset->{iana_names}->{'x-jis0208'} or
279 $charset->{iana_names}->{'utf-32'} or ## ISSUE: UTF-32BE? UTF-32LE?
280 $charset->{is_ebcdic_based}) {
281 $onerror->(node => $doc,
282 type => 'character encoding:'.$charset_name,
283 level => $self->{should_level});
284 } elsif ($charset->{iana_names}->{'cesu-8'} or
285 $charset->{iana_names}->{'utf-8'} or ## ISSUE: UNICODE-1-1-UTF-7?
286 $charset->{iana_names}->{'bocu-1'} or
287 $charset->{iana_names}->{'scsu'}) {
288 $onerror->(node => $doc,
289 type => 'character encoding:'.$charset_name,
290 level => $self->{must_level});
291 } else {
292 $onerror->(node => $doc,
293 type => 'character encoding:'.$charset_name,
294 level => $self->{good_level});
295 }
296 } elsif ($doc->manakai_is_html) {
297 ## NOTE: MUST and SHOULD requirements above cannot be tested,
298 ## since the document has no input charset encoding information.
299 $onerror->(node => $doc,
300 type => 'character encoding:',
301 level => 'unsupported');
302 }
303
304 return $return;
305 } # check_document
306
307 sub check_element ($$$) {
308 my ($self, $el, $onerror) = @_;
309 $self = bless {}, $self unless ref $self;
310 $self->{onerror} = $onerror;
311
312 $self->{must_level} = 'm';
313 $self->{fact_level} = 'f';
314 $self->{should_level} = 's';
315 $self->{good_level} = 'w';
316
317 $self->{pluses} = {};
318 $self->{minuses} = {};
319 $self->{id} = {};
320 $self->{term} = {};
321 $self->{usemap} = [];
322 $self->{contextmenu} = [];
323 $self->{map} = {};
324 $self->{menu} = {};
325 $self->{has_link_type} = {};
326 #$self->{has_uri_attr};
327 #$self->{has_hyperlink_element};
328 #$self->{has_charset};
329 $self->{return} = {
330 class => {},
331 id => $self->{id}, table => [], term => $self->{term},
332 };
333
334 my @todo = ({type => 'element', node => $el});
335 while (@todo) {
336 my $todo = shift @todo;
337 if ($todo->{type} eq 'element') {
338 my $prefix = $todo->{node}->prefix;
339 if (defined $prefix and $prefix eq 'xmlns') {
340 $self->{onerror}
341 ->(node => $todo->{node}, level => 'NC',
342 type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
343 }
344 my $nsuri = $todo->{node}->namespace_uri;
345 $nsuri = '' unless defined $nsuri;
346 unless ($Namespace->{$nsuri}->{loaded}) {
347 if ($Namespace->{$nsuri}->{module}) {
348 eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
349 } else {
350 $Namespace->{$nsuri}->{loaded} = 1;
351 }
352 }
353 my $ln = $todo->{node}->manakai_local_name;
354 my $eldef = $Element->{$nsuri}->{$ln} ||
355 $Element->{$nsuri}->{''} ||
356 $ElementDefault;
357 $eldef->{attrs_checker}->($self, $todo);
358 my ($new_todos) = $eldef->{checker}->($self, $todo);
359 unshift @todo, @$new_todos;
360 } elsif ($todo->{type} eq 'element-attributes') {
361 my $prefix = $todo->{node}->prefix;
362 if (defined $prefix and $prefix eq 'xmlns') {
363 $self->{onerror}
364 ->(node => $todo->{node}, level => 'NC',
365 type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
366 }
367 my $nsuri = $todo->{node}->namespace_uri;
368 $nsuri = '' unless defined $nsuri;
369 unless ($Namespace->{$nsuri}->{loaded}) {
370 if ($Namespace->{$nsuri}->{module}) {
371 eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
372 } else {
373 $Namespace->{$nsuri}->{loaded} = 1;
374 }
375 }
376 my $ln = $todo->{node}->manakai_local_name;
377 my $eldef = $Element->{$nsuri}->{$ln} ||
378 $Element->{$nsuri}->{''} ||
379 $ElementDefault;
380 $eldef->{attrs_checker}->($self, $todo);
381 } elsif ($todo->{type} eq 'descendant') {
382 for my $key (keys %{$todo->{errors}}) {
383 unless ($todo->{flag}->{has_descendant}->{$key}) {
384 $todo->{errors}->{$key}->($self, $todo);
385 }
386 for my $key (keys %{$todo->{old_values}}) {
387 $todo->{flag}->{has_descendant}->{$key}
388 ||= $todo->{old_values}->{$key};
389 }
390 }
391 } elsif ($todo->{type} eq 'plus' or $todo->{type} eq 'minus') {
392 $self->_remove_minuses ($todo);
393 } elsif ($todo->{type} eq 'code') {
394 $todo->{code}->();
395 } else {
396 die "$0: Internal error: Unsupported checking action type |$todo->{type}|";
397 }
398 }
399
400 for (@{$self->{usemap}}) {
401 unless ($self->{map}->{$_->[0]}) {
402 $self->{onerror}->(node => $_->[1], type => 'no referenced map');
403 }
404 }
405
406 for (@{$self->{contextmenu}}) {
407 unless ($self->{menu}->{$_->[0]}) {
408 $self->{onerror}->(node => $_->[1], type => 'no referenced menu');
409 }
410 }
411
412 delete $self->{pluses};
413 delete $self->{minuses};
414 delete $self->{onerror};
415 delete $self->{id};
416 delete $self->{usemap};
417 delete $self->{map};
418 return $self->{return};
419 } # check_element
420
421 sub _add_minuses ($@) {
422 my $self = shift;
423 my $r = {};
424 for my $list (@_) {
425 for my $ns (keys %$list) {
426 for my $ln (keys %{$list->{$ns}}) {
427 unless ($self->{minuses}->{$ns}->{$ln}) {
428 $self->{minuses}->{$ns}->{$ln} = 1;
429 $r->{$ns}->{$ln} = 1;
430 }
431 }
432 }
433 }
434 return {type => 'plus', list => $r};
435 } # _add_minuses
436
437 sub _add_pluses ($@) {
438 my $self = shift;
439 my $r = {};
440 for my $list (@_) {
441 for my $ns (keys %$list) {
442 for my $ln (keys %{$list->{$ns}}) {
443 unless ($self->{pluses}->{$ns}->{$ln}) {
444 $self->{pluses}->{$ns}->{$ln} = 1;
445 $r->{$ns}->{$ln} = 1;
446 }
447 }
448 }
449 }
450 return {type => 'minus', list => $r};
451 } # _add_pluses
452
453 sub _remove_minuses ($$) {
454 my ($self, $todo) = @_;
455 if ($todo->{type} eq 'minus') {
456 for my $ns (keys %{$todo->{list}}) {
457 for my $ln (keys %{$todo->{list}->{$ns}}) {
458 delete $self->{pluses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
459 }
460 }
461 } elsif ($todo->{type} eq 'plus') {
462 for my $ns (keys %{$todo->{list}}) {
463 for my $ln (keys %{$todo->{list}->{$ns}}) {
464 delete $self->{minuses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
465 }
466 }
467 } else {
468 die "$0: Unknown +- type: $todo->{type}";
469 }
470 1;
471 } # _remove_minuses
472
473 ## NOTE: Priority for "minuses" and "pluses" are currently left
474 ## undefined and implemented inconsistently; it is not a problem for
475 ## now, since no element belongs to both lists.
476
477 sub _check_get_children ($$$) {
478 my ($self, $node, $parent_todo) = @_;
479 my $new_todos = [];
480 my $sib = [];
481 TP: {
482 my $node_ns = $node->namespace_uri;
483 $node_ns = '' unless defined $node_ns;
484 my $node_ln = $node->manakai_local_name;
485 if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
486 if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
487 if ($parent_todo->{flag}->{in_head}) {
488 #
489 } else {
490 my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
491 push @$sib, $end;
492
493 unshift @$sib, @{$node->child_nodes};
494 push @$new_todos, {type => 'element-attributes', node => $node};
495 last TP;
496 }
497 } else {
498 unshift @$sib, @{$node->child_nodes};
499 push @$new_todos, {type => 'element-attributes', node => $node};
500 last TP;
501 }
502 }
503 if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
504 if ($node->has_attribute_ns (undef, 'src')) {
505 unshift @$sib, @{$node->child_nodes};
506 push @$new_todos, {type => 'element-attributes', node => $node};
507 last TP;
508 } else {
509 my @cn = @{$node->child_nodes};
510 CN: while (@cn) {
511 my $cn = shift @cn;
512 my $cnt = $cn->node_type;
513 if ($cnt == 1) {
514 my $cn_nsuri = $cn->namespace_uri;
515 $cn_nsuri = '' unless defined $cn_nsuri;
516 if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'source') {
517 #
518 } else {
519 last CN;
520 }
521 } elsif ($cnt == 3 or $cnt == 4) {
522 if ($cn->data =~ /[^\x09-\x0D\x20]/) {
523 last CN;
524 }
525 }
526 } # CN
527 unshift @$sib, @cn;
528 }
529 }
530 push @$new_todos, {type => 'element', node => $node};
531 } # TP
532
533 for my $new_todo (@$new_todos) {
534 $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
535 }
536
537 return ($sib, $new_todos);
538 } # _check_get_children
539
540 =head1 LICENSE
541
542 Copyright 2007 Wakaba <w@suika.fam.cx>
543
544 This library is free software; you can redistribute it
545 and/or modify it under the same terms as Perl itself.
546
547 =cut
548
549 1;
550 # $Date: 2007/11/23 05:39:43 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24