/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.111 - (show annotations) (download)
Sun Aug 30 03:40:50 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.110: +6 -6 lines
++ whatpm/t/dom-conformance/ChangeLog	30 Aug 2009 03:35:02 -0000
2009-08-30  Wakaba  <wakaba@suika.fam.cx>

	* xml-global.dat: Added a test of |xml:lang| attribute on a
	foreign element in an HTML document (HTML5 revision 3697).

++ whatpm/Whatpm/ChangeLog	30 Aug 2009 03:40:20 -0000
2009-08-30  Wakaba  <wakaba@suika.fam.cx>

	* ContentChecker.pm: Allow |xml:lang| in HTML document if the
	owner element is not an HTML element (HTML5 revision 3697).

1 package Whatpm::ContentChecker;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.110 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Whatpm::URIChecker;
6
7 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8 ## be applied to an in-memory representation (i.e. DOM)?
9
10 ## TODO: Conformance of an HTML document with non-html root element.
11
12 ## Stability
13 sub FEATURE_STATUS_REC () { 0b1 } ## Interoperable standard
14 sub FEATURE_STATUS_CR () { 0b10 } ## Call for implementation
15 sub FEATURE_STATUS_LC () { 0b100 } ## Last call for comments
16 sub FEATURE_STATUS_WD () { 0b1000 } ## Working or editor's draft
17
18 ## Deprecated
19 sub FEATURE_DEPRECATED_SHOULD () { 0b100000 } ## SHOULD-level
20 sub FEATURE_DEPRECATED_INFO () { 0b1000000 } ## Does not affect conformance
21
22 ## Conformance
23 sub FEATURE_ALLOWED () { 0b10000 }
24
25 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
26 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
27 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
28
29 my $Namespace = {
30 '' => {loaded => 1},
31 q<http://www.w3.org/2005/Atom> => {module => 'Whatpm::ContentChecker::Atom'},
32 q<http://purl.org/syndication/history/1.0>
33 => {module => 'Whatpm::ContentChecker::Atom'},
34 q<http://purl.org/syndication/threading/1.0>
35 => {module => 'Whatpm::ContentChecker::Atom'},
36 $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
37 $XML_NS => {loaded => 1},
38 $XMLNS_NS => {loaded => 1},
39 q<http://www.w3.org/1999/02/22-rdf-syntax-ns#> => {loaded => 1},
40 };
41
42 sub load_ns_module ($) {
43 my $nsuri = shift; # namespace URI or ''
44 unless ($Namespace->{$nsuri}->{loaded}) {
45 if ($Namespace->{$nsuri}->{module}) {
46 eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
47 } else {
48 $Namespace->{$nsuri}->{loaded} = 1;
49 }
50 }
51 } # load_ns_module
52
53 our $AttrChecker = {
54 $XML_NS => {
55 space => sub {
56 my ($self, $attr) = @_;
57 my $value = $attr->value;
58 if ($value eq 'default' or $value eq 'preserve') {
59 #
60 } else {
61 ## NOTE: An XML "error"
62 $self->{onerror}->(node => $attr, level => $self->{level}->{xml_error},
63 type => 'invalid attribute value');
64 }
65 },
66 lang => sub {
67 my ($self, $attr) = @_;
68 my $value = $attr->value;
69 if ($value eq '') {
70 #
71 } else {
72 require Whatpm::LangTag;
73 Whatpm::LangTag->check_rfc3066_language_tag ($value, sub {
74 $self->{onerror}->(@_, node => $attr);
75 }, $self->{level});
76 }
77
78 ## NOTE: "The values of the attribute are language identifiers
79 ## as defined by [IETF RFC 3066], Tags for the Identification
80 ## of Languages, or its successor; in addition, the empty string
81 ## may be specified." ("may" in lower case)
82 ## NOTE: Is an RFC 3066-valid (but RFC 4646-invalid) language tag
83 ## allowed today?
84
85 ## TODO: test data
86
87 my $nsuri = $attr->owner_element->namespace_uri;
88 if (defined $nsuri and $nsuri eq $HTML_NS) {
89 my $lang_attr = $attr->owner_element->get_attribute_node_ns
90 (undef, 'lang');
91 if ($lang_attr) {
92 my $lang_attr_value = $lang_attr->value;
93 $lang_attr_value =~ tr/A-Z/a-z/; ## ASCII case-insensitive
94 my $value = $value;
95 $value =~ tr/A-Z/a-z/; ## ASCII case-insensitive
96 if ($lang_attr_value ne $value) {
97 ## NOTE: HTML5 Section "The |lang| and |xml:lang| attributes"
98 $self->{onerror}->(node => $attr,
99 type => 'xml:lang ne lang',
100 level => $self->{level}->{must});
101 }
102 }
103
104 if ($attr->owner_document->manakai_is_html) { # MUST NOT
105 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang',
106 level => $self->{level}->{must});
107 ## TODO: Test data...
108 }
109 }
110 },
111 base => sub {
112 my ($self, $attr) = @_;
113 my $value = $attr->value;
114 if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
115 $self->{onerror}->(node => $attr,
116 type => 'invalid attribute value',
117 level => $self->{level}->{fact}, ## TODO: correct?
118 );
119 }
120 ## NOTE: Conformance to URI standard is not checked since there is
121 ## no author requirement on conformance in the XML Base specification.
122 },
123 id => sub {
124 my ($self, $attr, $item, $element_state) = @_;
125 my $value = $attr->value;
126 $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
127 $value =~ s/^\x20//;
128 $value =~ s/\x20$//;
129 ## TODO: NCName in XML 1.0 or 1.1
130 ## TODO: declared type is ID?
131 if ($self->{id}->{$value}) {
132 $self->{onerror}->(node => $attr,
133 type => 'duplicate ID',
134 level => $self->{level}->{xml_id_error});
135 push @{$self->{id}->{$value}}, $attr;
136 } else {
137 $self->{id}->{$value} = [$attr];
138 $self->{id_type}->{$value} = $element_state->{id_type} || '';
139 }
140 },
141 },
142 $XMLNS_NS => {
143 '' => sub {
144 my ($self, $attr) = @_;
145 my $ln = $attr->manakai_local_name;
146 my $value = $attr->value;
147 if ($value eq $XML_NS and $ln ne 'xml') {
148 $self->{onerror}
149 ->(node => $attr,
150 type => 'Reserved Prefixes and Namespace Names:Name',
151 text => $value,
152 level => $self->{level}->{nc});
153 } elsif ($value eq $XMLNS_NS) {
154 $self->{onerror}
155 ->(node => $attr,
156 type => 'Reserved Prefixes and Namespace Names:Name',
157 text => $value,
158 level => $self->{level}->{nc});
159 }
160 if ($ln eq 'xml' and $value ne $XML_NS) {
161 $self->{onerror}
162 ->(node => $attr,
163 type => 'Reserved Prefixes and Namespace Names:Prefix',
164 text => $ln,
165 level => $self->{level}->{nc});
166 } elsif ($ln eq 'xmlns') {
167 $self->{onerror}
168 ->(node => $attr,
169 type => 'Reserved Prefixes and Namespace Names:Prefix',
170 text => $ln,
171 level => $self->{level}->{nc});
172 }
173 ## TODO: If XML 1.0 and empty
174 },
175 xmlns => sub {
176 my ($self, $attr) = @_;
177 ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
178 ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
179 ## TODO: relative references are deprecated
180 my $value = $attr->value;
181 if ($value eq $XML_NS) {
182 $self->{onerror}
183 ->(node => $attr,
184 type => 'Reserved Prefixes and Namespace Names:Name',
185 text => $value,
186 level => $self->{level}->{nc});
187 } elsif ($value eq $XMLNS_NS) {
188 $self->{onerror}
189 ->(node => $attr,
190 type => 'Reserved Prefixes and Namespace Names:Name',
191 text => $value,
192 level => $self->{level}->{nc});
193 }
194 },
195 },
196 };
197
198 ## ISSUE: Should we really allow these attributes?
199 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
200 $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
201 ## NOTE: Checker for (null, "xml:lang") attribute is shadowed for
202 ## HTML elements in Whatpm::ContentChecker::HTML.
203 $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
204 $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
205
206 our $AttrStatus;
207
208 for (qw/space lang base id/) {
209 $AttrStatus->{$XML_NS}->{$_} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
210 $AttrStatus->{''}->{"xml:$_"} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
211 ## XML 1.0: FEATURE_STATUS_CR
212 ## XML 1.1: FEATURE_STATUS_REC
213 ## XML Namespaces 1.0: FEATURE_STATUS_CR
214 ## XML Namespaces 1.1: FEATURE_STATUS_REC
215 ## XML Base: FEATURE_STATUS_REC
216 ## xml:id: FEATURE_STATUS_REC
217 }
218
219 $AttrStatus->{$XMLNS_NS}->{''} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
220
221 ## TODO: xsi:schemaLocation for XHTML2 support (very, very low priority)
222
223 our %AnyChecker = (
224 ## NOTE: |check_start| is invoked before anything on the element's
225 ## attributes and contents is checked.
226 check_start => sub { },
227 ## NOTE: |check_attrs| and |check_attrs2| are invoked after
228 ## |check_start| and before anything on the element's contents is
229 ## checked. |check_attrs| is invoked immediately before
230 ## |check_attrs2|.
231 check_attrs => sub {
232 my ($self, $item, $element_state) = @_;
233 for my $attr (@{$item->{node}->attributes}) {
234 my $attr_ns = $attr->namespace_uri;
235 if (defined $attr_ns) {
236 load_ns_module ($attr_ns);
237 } else {
238 $attr_ns = '';
239 }
240 my $attr_ln = $attr->manakai_local_name;
241
242 my $checker = $AttrChecker->{$attr_ns}->{$attr_ln}
243 || $AttrChecker->{$attr_ns}->{''};
244 my $status = $AttrStatus->{$attr_ns}->{$attr_ln}
245 || $AttrStatus->{$attr_ns}->{''};
246 if (not defined $status) {
247 $status = FEATURE_ALLOWED;
248 ## NOTE: FEATURE_ALLOWED for all attributes, since the element
249 ## is not supported and therefore "attribute not defined" error
250 ## should not raised (too verbose) and global attributes should be
251 ## allowed anyway (if a global attribute has its specified creteria
252 ## for where it may be specified, then it should be checked in it's
253 ## checker function).
254 }
255 if ($checker) {
256 $checker->($self, $attr);
257 } else {
258 $self->{onerror}->(node => $attr,
259 type => 'unknown attribute',
260 level => $self->{level}->{uncertain});
261 }
262 $self->_attr_status_info ($attr, $status);
263 }
264 },
265 check_attrs2 => sub { },
266 ## NOTE: |check_child_element| is invoked for each occurence of
267 ## child elements. It is invoked after |check_attrs| and before
268 ## |check_end|. |check_child_element| and |check_child_text| are
269 ## invoked for each child elements and text nodes in tree order.
270 check_child_element => sub {
271 my ($self, $item, $child_el, $child_nsuri, $child_ln,
272 $child_is_transparent, $element_state) = @_;
273 if ($self->{minus_elements}->{$child_nsuri}->{$child_ln}) {
274 $self->{onerror}->(node => $child_el,
275 type => 'element not allowed:minus',
276 level => $self->{level}->{must});
277 } elsif ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
278 #
279 } else {
280 #
281 }
282 },
283 ## NOTE: |check_child_text| is invoked for each occurence of child
284 ## text nodes. It is invoked after |check_attrs| and before
285 ## |check_end|. |check_child_element| and |check_child_text| are
286 ## invoked for each child elements and text nodes in tree order.
287 check_child_text => sub { },
288 ## NOTE: |check_end| is invoked after everything on the element's
289 ## attributes and contents are checked.
290 check_end => sub {
291 my ($self, $item, $element_state) = @_;
292 ## NOTE: There is a modified copy of the code below for |html:ruby|.
293 if ($element_state->{has_significant}) {
294 $item->{real_parent_state}->{has_significant} = 1;
295 }
296 },
297 );
298
299 our $ElementDefault = {
300 %AnyChecker,
301 status => FEATURE_ALLOWED,
302 ## NOTE: No "element not defined" error - it is not supported anyway.
303 check_start => sub {
304 my ($self, $item, $element_state) = @_;
305 $self->{onerror}->(node => $item->{node},
306 type => 'unknown element',
307 level => $self->{level}->{uncertain});
308 },
309 };
310
311 our $HTMLEmbeddedContent = {
312 ## NOTE: All embedded content is also phrasing content.
313 $HTML_NS => {
314 img => 1, iframe => 1, embed => 1, object => 1, video => 1, audio => 1,
315 canvas => 1,
316 },
317 q<http://www.w3.org/1998/Math/MathML> => {math => 1},
318 q<http://www.w3.org/2000/svg> => {svg => 1},
319 ## NOTE: Foreign elements with content (but no metadata) are
320 ## embedded content.
321 };
322
323 our $IsInHTMLInteractiveContent = sub {
324 my ($el, $nsuri, $ln) = @_;
325
326 ## NOTE: This CODE returns whether an element that is conditionally
327 ## categorizzed as an interactive content is currently in that
328 ## condition or not. See $HTMLInteractiveContent list defined in
329 ## Whatpm::ContentChecler::HTML for the list of all (conditionally
330 ## or permanently) interactive content.
331
332 ## The variable name is not good, since this method also returns
333 ## true for non-interactive content as long as the element cannot be
334 ## interactive content.
335
336 if ($nsuri eq $HTML_NS and $ln eq 'input') {
337 my $value = $el->get_attribute_ns (undef, 'type');
338 $value =~ tr/A-Z/a-z/; ## ASCII case-insensitive.
339 return ($value ne 'hidden');
340 } elsif ($nsuri eq $HTML_NS and ($ln eq 'img' or $ln eq 'object')) {
341 return $el->has_attribute_ns (undef, 'usemap');
342 } elsif ($nsuri eq $HTML_NS and ($ln eq 'video' or $ln eq 'audio')) {
343 return $el->has_attribute_ns (undef, 'controls');
344 } elsif ($nsuri eq $HTML_NS and $ln eq 'menu') {
345 my $value = $el->get_attribute_ns (undef, 'type');
346 $value =~ tr/A-Z/a-z/; ## ASCII case-insensitive.
347 return ($value eq 'toolbar');
348 } else {
349 return 1;
350 }
351 }; # $IsInHTMLInteractiveContent
352
353 my $HTMLTransparentElements = {
354 $HTML_NS => {
355 ins => 1, del => 1,
356 font => 1, ## dropped from the spec
357 noscript => 1,
358 ## NOTE: |html:noscript| is transparent if scripting is disabled
359 ## and not in |head|.
360 canvas => 1,
361 a => 1,
362 map => 1,
363 },
364 }; # $HTMLTransparentElements
365
366 ## NOTE: Now that the term "semi-transparent content model" is dropped
367 ## from the spec, but the concept is not.
368 my $HTMLSemiTransparentElements = {
369 $HTML_NS => {object => 1, video => 1, audio => 1},
370 }; # $HTMLSemiTransparentElements
371
372 our $Element = {};
373
374 $Element->{q<http://www.w3.org/1999/02/22-rdf-syntax-ns#>}->{RDF} = {
375 %AnyChecker,
376 status => FEATURE_STATUS_REC | FEATURE_ALLOWED,
377 is_root => 1, ## ISSUE: Not explicitly allowed for non application/rdf+xml
378 check_start => sub {
379 my ($self, $item, $element_state) = @_;
380 my $triple = [];
381 push @{$self->{return}->{rdf}}, [$item->{node}, $triple];
382 require Whatpm::RDFXML;
383 my $rdf = Whatpm::RDFXML->new;
384 ## TODO: Should we make bnodeid unique in a document?
385 $rdf->{onerror} = $self->{onerror};
386 $rdf->{level} = $self->{level};
387 $rdf->{ontriple} = sub {
388 my %opt = @_;
389 push @$triple,
390 [$opt{node}, $opt{subject}, $opt{predicate}, $opt{object}];
391 if (defined $opt{id}) {
392 push @$triple,
393 [$opt{node},
394 $opt{id},
395 {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#subject>},
396 $opt{subject}];
397 push @$triple,
398 [$opt{node},
399 $opt{id},
400 {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate>},
401 $opt{predicate}];
402 push @$triple,
403 [$opt{node},
404 $opt{id},
405 {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#object>},
406 $opt{object}];
407 push @$triple,
408 [$opt{node},
409 $opt{id},
410 {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>},
411 {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement>}];
412 }
413 };
414 $rdf->convert_rdf_element ($item->{node});
415 },
416 };
417
418 my $default_error_level = {
419 must => 'm',
420 should => 's',
421 warn => 'w',
422 good => 'w',
423 undefined => 'w',
424 info => 'i',
425
426 uncertain => 'u',
427
428 html4_fact => 'm',
429 html5_no_may => 'm',
430 html5_fact => 'm',
431
432 xml_error => 'm', ## TODO: correct?
433 xml_id_error => 'm', ## TODO: ?
434 nc => 'm', ## XML Namespace Constraints ## TODO: correct?
435
436 ## |Whatpm::URIChecker|
437 uri_syntax => 'm',
438 uri_fact => 'm',
439 uri_lc_must => 'm',
440 uri_lc_should => 'w',
441
442 ## |Whatpm::IMTChecker|
443 mime_must => 'm', # lowercase "must"
444 mime_fact => 'm',
445 mime_strongly_discouraged => 'w',
446 mime_discouraged => 'w',
447
448 ## |Whatpm::LangTag|
449 langtag_fact => 'm',
450
451 ## |Whatpm::RDFXML|
452 rdf_fact => 'm',
453 rdf_grammer => 'm',
454 rdf_lc_must => 'm',
455
456 ## |Message::Charset::Info| and |Whatpm::Charset::DecodeHandle|
457 charset_variant => 'm',
458 ## An error caused by use of a variant charset that is not conforming
459 ## to the original charset (e.g. use of 0x80 in an ISO-8859-1 document
460 ## which is interpreted as a Windows-1252 document instead).
461 charset_fact => 'm',
462 iso_shall => 'm',
463 };
464
465 sub check_document ($$$;$) {
466 my ($self, $doc, $onerror, $onsubdoc) = @_;
467 $self = bless {}, $self unless ref $self;
468 $self->{onerror} = $onerror;
469 $self->{onsubdoc} = $onsubdoc || sub {
470 warn "A subdocument is not conformance-checked";
471 };
472
473 $self->{level} ||= $default_error_level;
474
475 ## TODO: If application/rdf+xml, RDF/XML mode should be invoked.
476
477 my $docel = $doc->document_element;
478 unless (defined $docel) {
479 ## ISSUE: Should we check content of Document node?
480 $onerror->(node => $doc, type => 'no document element',
481 level => $self->{level}->{must});
482 ## ISSUE: Is this non-conforming (to what spec)? Or just a warning?
483 return {
484 class => {},
485 id => {}, table => [], term => {},
486 };
487 }
488
489 ## ISSUE: Unexpanded entity references and HTML5 conformance
490
491 my $docel_nsuri = $docel->namespace_uri;
492 if (defined $docel_nsuri) {
493 load_ns_module ($docel_nsuri);
494 } else {
495 $docel_nsuri = '';
496 }
497 my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
498 $Element->{$docel_nsuri}->{''} ||
499 $ElementDefault;
500 if ($docel_def->{is_root}) {
501 #
502 } elsif ($docel_def->{is_xml_root}) {
503 unless ($doc->manakai_is_html) {
504 #
505 } else {
506 $onerror->(node => $docel, type => 'element not allowed:root:xml',
507 level => $self->{level}->{must});
508 }
509 } else {
510 $onerror->(node => $docel, type => 'element not allowed:root',
511 level => $self->{level}->{must});
512 }
513
514 ## TODO: Check for other items other than document element
515 ## (second (errorous) element, text nodes, PI nodes, doctype nodes)
516
517 my $return = $self->check_element ($docel, $onerror, $onsubdoc);
518
519 ## TODO: Test for these checks are necessary.
520 my $charset_name = $doc->input_encoding;
521 if (defined $charset_name) {
522 require Message::Charset::Info;
523 my $charset = $Message::Charset::Info::IANACharset->{$charset_name};
524
525 if ($doc->manakai_is_html) {
526 if (not $doc->manakai_has_bom and
527 not defined $doc->manakai_charset) {
528 unless ($charset->{category}
529 & Message::Charset::Info::CHARSET_CATEGORY_ASCII_COMPAT ()) {
530 $onerror->(node => $doc,
531 level => $self->{level}->{must},
532 type => 'non ascii superset',
533 text => $charset_name);
534 }
535
536 if (not $self->{has_charset} and ## TODO: This does not work now.
537 not $charset->{iana_names}->{'us-ascii'}) {
538 $onerror->(node => $doc,
539 level => $self->{level}->{must},
540 type => 'no character encoding declaration',
541 text => $charset_name);
542 }
543 }
544
545 if ($charset->{iana_names}->{'utf-8'}) {
546 #
547 } elsif ($charset->{iana_names}->{'jis_x0212-1990'} or
548 $charset->{iana_names}->{'x-jis0208'} or
549 $charset->{iana_names}->{'utf-32'} or ## ISSUE: UTF-32BE? UTF-32LE?
550 ($charset->{category} & Message::Charset::Info::CHARSET_CATEGORY_EBCDIC ())) {
551 $onerror->(node => $doc,
552 type => 'bad character encoding',
553 text => $charset_name,
554 level => $self->{level}->{should},
555 layer => 'encode');
556 } elsif ($charset->{iana_names}->{'cesu-8'} or
557 $charset->{iana_names}->{'utf-7'} or ## ISSUE: UNICODE-1-1-UTF-7?
558 $charset->{iana_names}->{'bocu-1'} or
559 $charset->{iana_names}->{'scsu'}) {
560 $onerror->(node => $doc,
561 type => 'disallowed character encoding',
562 text => $charset_name,
563 level => $self->{level}->{must},
564 layer => 'encode');
565 } else {
566 $onerror->(node => $doc,
567 type => 'non-utf-8 character encoding',
568 text => $charset_name,
569 level => $self->{level}->{good},
570 layer => 'encode');
571 }
572 }
573 } elsif ($doc->manakai_is_html) {
574 ## NOTE: MUST and SHOULD requirements above cannot be tested,
575 ## since the document has no input charset encoding information.
576 $onerror->(node => $doc,
577 type => 'character encoding unchecked',
578 level => $self->{level}->{info},
579 layer => 'encode');
580 }
581
582 return $return;
583 } # check_document
584
585 ## Check an element. The element is checked as if it is an orphan node (i.e.
586 ## an element without a parent node).
587 sub check_element ($$$;$) {
588 my ($self, $el, $onerror, $onsubdoc) = @_;
589 $self = bless {}, $self unless ref $self;
590 $self->{onerror} = $onerror;
591 $self->{onsubdoc} = $onsubdoc || sub {
592 warn "A subdocument is not conformance-checked";
593 };
594
595 $self->{level} ||= $default_error_level;
596
597 $self->{plus_elements} = {};
598 $self->{minus_elements} = {};
599 $self->{id} = {};
600 $self->{id_type} = {}; # 'form' / 'labelable' / 'menu'
601 $self->{form} = {}; # form/@name
602 #$self->{has_autofocus};
603 $self->{idref} = []; # @form, @for, @contextmenu
604 $self->{term} = {};
605 $self->{usemap} = [];
606 $self->{ref} = []; # datetemplate data references
607 $self->{template} = []; # datatemplate template references
608 $self->{map} = {};
609 $self->{has_link_type} = {};
610 $self->{flag} = {};
611 #$self->{has_uri_attr};
612 #$self->{has_hyperlink_element};
613 #$self->{has_charset};
614 #$self->{has_base};
615 $self->{return} = {
616 class => {},
617 id => $self->{id},
618 table => [], # table objects returned by Whatpm::HTMLTable
619 term => $self->{term},
620 uri => {}, # URIs other than those in RDF triples
621 ## TODO: xmlns="", SYSTEM "", atom:* src="", xml:base=""
622 rdf => [],
623 };
624
625 my @item = ({type => 'element', node => $el, parent_state => {}});
626 $item[-1]->{real_parent_state} = $item[-1]->{parent_state};
627 while (@item) {
628 my $item = shift @item;
629 if (ref $item eq 'ARRAY') {
630 my $code = shift @$item;
631 next unless $code;## TODO: temp.
632 $code->(@$item);
633 } elsif ($item->{type} eq 'element') {
634 my $el_nsuri = $item->{node}->namespace_uri;
635 if (defined $el_nsuri) {
636 load_ns_module ($el_nsuri);
637 } else {
638 $el_nsuri = '';
639 }
640 my $el_ln = $item->{node}->manakai_local_name;
641
642 my $element_state = {};
643 my $eldef = $Element->{$el_nsuri}->{$el_ln} ||
644 $Element->{$el_nsuri}->{''} ||
645 $ElementDefault;
646 my $content_def = $item->{transparent}
647 ? $item->{parent_def} || $eldef : $eldef;
648 my $content_state = $item->{transparent}
649 ? $item->{parent_def}
650 ? $item->{parent_state} || $element_state : $element_state
651 : $element_state;
652
653 unless ($eldef->{status} & FEATURE_STATUS_REC) {
654 my $status = $eldef->{status} & FEATURE_STATUS_CR ? 'cr' :
655 $eldef->{status} & FEATURE_STATUS_LC ? 'lc' :
656 $eldef->{status} & FEATURE_STATUS_WD ? 'wd' : 'non-standard';
657 $self->{onerror}->(node => $item->{node},
658 type => 'status:'.$status.':element',
659 level => $self->{level}->{info});
660 }
661 if (not ($eldef->{status} & FEATURE_ALLOWED)) {
662 $self->{onerror}->(node => $item->{node},
663 type => 'element not defined',
664 level => $self->{level}->{must});
665 } elsif ($eldef->{status} & FEATURE_DEPRECATED_SHOULD) {
666 $self->{onerror}->(node => $item->{node},
667 type => 'deprecated:element',
668 level => $self->{level}->{should});
669 } elsif ($eldef->{status} & FEATURE_DEPRECATED_INFO) {
670 $self->{onerror}->(node => $item->{node},
671 type => 'deprecated:element',
672 level => $self->{level}->{info});
673 }
674
675 my @new_item;
676 push @new_item, [$eldef->{check_start}, $self, $item, $element_state];
677 push @new_item, [$eldef->{check_attrs}, $self, $item, $element_state];
678 push @new_item, [$eldef->{check_attrs2}, $self, $item, $element_state];
679
680 my @child = @{$item->{node}->child_nodes};
681 while (@child) {
682 my $child = shift @child;
683 my $child_nt = $child->node_type;
684 if ($child_nt == 1) { # ELEMENT_NODE
685 my $child_nsuri = $child->namespace_uri;
686 $child_nsuri = '' unless defined $child_nsuri;
687 my $child_ln = $child->manakai_local_name;
688 if ($HTMLTransparentElements->{$child_nsuri}->{$child_ln} and
689 not (($self->{flag}->{in_head} or
690 ($el_nsuri eq $HTML_NS and $el_ln eq 'head')) and
691 $child_nsuri eq $HTML_NS and $child_ln eq 'noscript')) {
692 push @new_item, [$content_def->{check_child_element},
693 $self, $item, $child,
694 $child_nsuri, $child_ln, 1,
695 $content_state, $element_state];
696 push @new_item, {type => 'element', node => $child,
697 parent_state => $content_state,
698 parent_def => $content_def,
699 real_parent_state => $element_state,
700 transparent => 1};
701 } else {
702 if ($item->{parent_def} and # has parent
703 $el_nsuri eq $HTML_NS) { ## $HTMLSemiTransparentElements
704 if ($el_ln eq 'object') {
705 if ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
706 #
707 } elsif ($child_nsuri eq $HTML_NS and $child_ln eq 'param') {
708 #
709 } else {
710 $content_def = $item->{parent_def} || $content_def;
711 $content_state = $item->{parent_state} || $content_state;
712 }
713 } elsif ($el_ln eq 'video' or $el_ln eq 'audio') {
714 if ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
715 #
716 } elsif ($child_nsuri eq $HTML_NS and $child_ln eq 'source') {
717 $element_state->{has_source} = 1;
718 } else {
719 $content_def = $item->{parent_def} || $content_def;
720 $content_state = $item->{parent_state} || $content_state;
721 }
722 }
723 }
724
725 push @new_item, [$content_def->{check_child_element},
726 $self, $item, $child,
727 $child_nsuri, $child_ln,
728 $HTMLSemiTransparentElements
729 ->{$child_nsuri}->{$child_ln},
730 $content_state, $element_state];
731 push @new_item, {type => 'element', node => $child,
732 parent_def => $content_def,
733 real_parent_state => $element_state,
734 parent_state => $content_state};
735 }
736
737 if ($HTMLEmbeddedContent->{$child_nsuri}->{$child_ln}) {
738 $element_state->{has_significant} = 1;
739 }
740 } elsif ($child_nt == 3 or # TEXT_NODE
741 $child_nt == 4) { # CDATA_SECTION_NODE
742 my $has_significant = ($child->data =~ /[^\x09\x0A\x0C\x0D\x20]/);
743 push @new_item, [$content_def->{check_child_text},
744 $self, $item, $child, $has_significant,
745 $content_state, $element_state];
746 $element_state->{has_significant} ||= $has_significant;
747 if ($has_significant and
748 $HTMLSemiTransparentElements->{$el_nsuri}->{$el_ln}) {
749 $content_def = $item->{parent_def} || $content_def;
750 }
751 } elsif ($child_nt == 5) { # ENTITY_REFERENCE_NODE
752 push @child, @{$child->child_nodes};
753 }
754 ## TODO: PI_NODE
755 ## TODO: Unknown node type
756 }
757
758 push @new_item, [$eldef->{check_end}, $self, $item, $element_state];
759
760 unshift @item, @new_item;
761 } else {
762 die "$0: Internal error: Unsupported checking action type |$item->{type}|";
763 }
764 }
765
766 for (@{$self->{template}}) {
767 ## TODO: If the document is an XML document, ...
768 ## NOTE: If the document is an HTML document:
769 ## ISSUE: We need to percent-decode?
770 F: {
771 if ($self->{id}->{$_->[0]}) {
772 my $el = $self->{id}->{$_->[0]}->[0]->owner_element;
773 if ($el->node_type == 1 and # ELEMENT_NODE
774 $el->manakai_local_name eq 'datatemplate') {
775 my $nsuri = $el->namespace_uri;
776 if (defined $nsuri and $nsuri eq $HTML_NS) {
777 if ($el eq $_->[1]->owner_element) {
778 $self->{onerror}->(node => $_->[1],
779 type => 'fragment points itself',
780 level => $self->{level}->{must});
781 }
782
783 last F;
784 }
785 }
786 }
787 ## TODO: Should we raise a "fragment points nothing" error instead
788 ## if the fragment identifier identifies no element?
789
790 $self->{onerror}->(node => $_->[1], type => 'template:not template',
791 level => $self->{level}->{must});
792 } # F
793 }
794
795 for (@{$self->{ref}}) {
796 ## TOOD: If XML
797 ## NOTE: If it is an HTML document:
798 if ($_->[0] eq '') {
799 ## NOTE: It points the top of the document.
800 } elsif ($self->{id}->{$_->[0]}) {
801 if ($self->{id}->{$_->[0]}->[0]->owner_element
802 eq $_->[1]->owner_element) {
803 $self->{onerror}->(node => $_->[1], type => 'fragment points itself',
804 level => $self->{level}->{must});
805 }
806 } else {
807 $self->{onerror}->(node => $_->[1], type => 'fragment points nothing',
808 level => $self->{level}->{must});
809 }
810 }
811
812 ## TODO: Maybe we should have $document->manakai_get_by_fragment or something
813
814 for (@{$self->{usemap}}) {
815 unless ($self->{map}->{$_->[0]}) {
816 $self->{onerror}->(node => $_->[1], type => 'no referenced map',
817 level => $self->{level}->{must});
818 }
819 }
820
821 for (@{$self->{idref}}) {
822 if ($self->{id}->{$_->[1]} and $self->{id_type}->{$_->[1]} eq $_->[0]) {
823 #
824 } elsif ($_->[0] eq 'any' and $self->{id}->{$_->[1]}) {
825 #
826 } else {
827 $self->{onerror}->(node => $_->[2],
828 type => {
829 any => 'no referenced element', ## TODOC: type
830 form => 'no referenced form',
831 labelable => 'no referenced control',
832 menu => 'no referenced menu',
833 datalist => 'no referenced datalist', ## TODOC: type
834 }->{$_->[0]},
835 value => $_->[1],
836 level => $self->{level}->{must});
837 }
838 }
839
840 delete $self->{plus_elements};
841 delete $self->{minus_elements};
842 delete $self->{onerror};
843 delete $self->{id};
844 delete $self->{id_type};
845 delete $self->{form};
846 delete $self->{has_autofocus};
847 delete $self->{idref};
848 delete $self->{usemap};
849 delete $self->{ref};
850 delete $self->{template};
851 delete $self->{map};
852 return $self->{return};
853 } # check_element
854
855 sub _add_minus_elements ($$@) {
856 my $self = shift;
857 my $element_state = shift;
858 for my $elements (@_) {
859 for my $nsuri (keys %$elements) {
860 for my $ln (keys %{$elements->{$nsuri}}) {
861 unless ($self->{minus_elements}->{$nsuri}->{$ln}) {
862 $element_state->{minus_elements_original}->{$nsuri}->{$ln} = 0;
863 $self->{minus_elements}->{$nsuri}->{$ln} = 1;
864 }
865 }
866 }
867 }
868 } # _add_minus_elements
869
870 sub _remove_minus_elements ($$) {
871 my $self = shift;
872 my $element_state = shift;
873 for my $nsuri (keys %{$element_state->{minus_elements_original}}) {
874 for my $ln (keys %{$element_state->{minus_elements_original}->{$nsuri}}) {
875 delete $self->{minus_elements}->{$nsuri}->{$ln};
876 }
877 }
878 } # _remove_minus_elements
879
880 sub _add_plus_elements ($$@) {
881 my $self = shift;
882 my $element_state = shift;
883 for my $elements (@_) {
884 for my $nsuri (keys %$elements) {
885 for my $ln (keys %{$elements->{$nsuri}}) {
886 unless ($self->{plus_elements}->{$nsuri}->{$ln}) {
887 $element_state->{plus_elements_original}->{$nsuri}->{$ln} = 0;
888 $self->{plus_elements}->{$nsuri}->{$ln} = 1;
889 }
890 }
891 }
892 }
893 } # _add_plus_elements
894
895 sub _remove_plus_elements ($$) {
896 my $self = shift;
897 my $element_state = shift;
898 for my $nsuri (keys %{$element_state->{plus_elements_original}}) {
899 for my $ln (keys %{$element_state->{plus_elements_original}->{$nsuri}}) {
900 delete $self->{plus_elements}->{$nsuri}->{$ln};
901 }
902 }
903 } # _remove_plus_elements
904
905 sub _attr_status_info ($$$) {
906 my ($self, $attr, $status_code) = @_;
907
908 if (not ($status_code & FEATURE_ALLOWED)) {
909 $self->{onerror}->(node => $attr,
910 type => 'attribute not defined',
911 level => $self->{level}->{must});
912 } elsif ($status_code & FEATURE_DEPRECATED_SHOULD) {
913 $self->{onerror}->(node => $attr,
914 type => 'deprecated:attr',
915 level => $self->{level}->{should});
916 } elsif ($status_code & FEATURE_DEPRECATED_INFO) {
917 $self->{onerror}->(node => $attr,
918 type => 'deprecated:attr',
919 level => $self->{level}->{info});
920 }
921
922 my $status;
923 if ($status_code & FEATURE_STATUS_REC) {
924 return;
925 } elsif ($status_code & FEATURE_STATUS_CR) {
926 $status = 'cr';
927 } elsif ($status_code & FEATURE_STATUS_LC) {
928 $status = 'lc';
929 } elsif ($status_code & FEATURE_STATUS_WD) {
930 $status = 'wd';
931 } else {
932 $status = 'non-standard';
933 }
934 $self->{onerror}->(node => $attr,
935 type => 'status:'.$status.':attr',
936 level => $self->{level}->{info});
937 } # _attr_status_info
938
939 sub _add_minuses ($@) {
940 my $self = shift;
941 my $r = {};
942 for my $list (@_) {
943 for my $ns (keys %$list) {
944 for my $ln (keys %{$list->{$ns}}) {
945 unless ($self->{minuses}->{$ns}->{$ln}) {
946 $self->{minuses}->{$ns}->{$ln} = 1;
947 $r->{$ns}->{$ln} = 1;
948 }
949 }
950 }
951 }
952 return {type => 'plus', list => $r};
953 } # _add_minuses
954
955 sub _add_pluses ($@) {
956 my $self = shift;
957 my $r = {};
958 for my $list (@_) {
959 for my $ns (keys %$list) {
960 for my $ln (keys %{$list->{$ns}}) {
961 unless ($self->{pluses}->{$ns}->{$ln}) {
962 $self->{pluses}->{$ns}->{$ln} = 1;
963 $r->{$ns}->{$ln} = 1;
964 }
965 }
966 }
967 }
968 return {type => 'minus', list => $r};
969 } # _add_pluses
970
971 sub _remove_minuses ($$) {
972 my ($self, $todo) = @_;
973 if ($todo->{type} eq 'minus') {
974 for my $ns (keys %{$todo->{list}}) {
975 for my $ln (keys %{$todo->{list}->{$ns}}) {
976 delete $self->{pluses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
977 }
978 }
979 } elsif ($todo->{type} eq 'plus') {
980 for my $ns (keys %{$todo->{list}}) {
981 for my $ln (keys %{$todo->{list}->{$ns}}) {
982 delete $self->{minuses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
983 }
984 }
985 } else {
986 die "$0: Unknown +- type: $todo->{type}";
987 }
988 1;
989 } # _remove_minuses
990
991 ## NOTE: Priority for "minuses" and "pluses" are currently left
992 ## undefined and implemented inconsistently; it is not a problem for
993 ## now, since no element belongs to both lists.
994
995 sub _check_get_children ($$$) {
996 my ($self, $node, $parent_todo) = @_;
997 my $new_todos = [];
998 my $sib = [];
999 TP: {
1000 my $node_ns = $node->namespace_uri;
1001 $node_ns = '' unless defined $node_ns;
1002 my $node_ln = $node->manakai_local_name;
1003 if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
1004 if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
1005 if ($parent_todo->{flag}->{in_head}) {
1006 #
1007 } else {
1008 my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
1009 push @$sib, $end;
1010
1011 unshift @$sib, @{$node->child_nodes};
1012 push @$new_todos, {type => 'element-attributes', node => $node};
1013 last TP;
1014 }
1015 } elsif ($node_ns eq $HTML_NS and $node_ln eq 'del') {
1016 my $sig_flag = $parent_todo->{flag}->{has_descendant}->{significant};
1017 unshift @$sib, @{$node->child_nodes};
1018 push @$new_todos, {type => 'element-attributes', node => $node};
1019 push @$new_todos,
1020 {type => 'code',
1021 code => sub {
1022 $parent_todo->{flag}->{has_descendant}->{significant} = 0
1023 if not $sig_flag;
1024 }};
1025 last TP;
1026 } else {
1027 unshift @$sib, @{$node->child_nodes};
1028 push @$new_todos, {type => 'element-attributes', node => $node};
1029 last TP;
1030 }
1031 }
1032 if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
1033 if ($node->has_attribute_ns (undef, 'src')) {
1034 unshift @$sib, @{$node->child_nodes};
1035 push @$new_todos, {type => 'element-attributes', node => $node};
1036 last TP;
1037 } else {
1038 my @cn = @{$node->child_nodes};
1039 CN: while (@cn) {
1040 my $cn = shift @cn;
1041 my $cnt = $cn->node_type;
1042 if ($cnt == 1) {
1043 my $cn_nsuri = $cn->namespace_uri;
1044 $cn_nsuri = '' unless defined $cn_nsuri;
1045 if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'source') {
1046 #
1047 } else {
1048 last CN;
1049 }
1050 } elsif ($cnt == 3 or $cnt == 4) {
1051 if ($cn->data =~ /[^\x09\x0A\x0C\x0D\x20]/) {
1052 last CN;
1053 }
1054 }
1055 } # CN
1056 unshift @$sib, @cn;
1057 }
1058 } elsif ($node_ns eq $HTML_NS and $node_ln eq 'object') {
1059 my @cn = @{$node->child_nodes};
1060 CN: while (@cn) {
1061 my $cn = shift @cn;
1062 my $cnt = $cn->node_type;
1063 if ($cnt == 1) {
1064 my $cn_nsuri = $cn->namespace_uri;
1065 $cn_nsuri = '' unless defined $cn_nsuri;
1066 if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'param') {
1067 #
1068 } else {
1069 last CN;
1070 }
1071 } elsif ($cnt == 3 or $cnt == 4) {
1072 if ($cn->data =~ /[^\x09\x0A\x0C\x0D\x20]/) {
1073 last CN;
1074 }
1075 }
1076 } # CN
1077 unshift @$sib, @cn;
1078 }
1079 push @$new_todos, {type => 'element', node => $node};
1080 } # TP
1081
1082 for my $new_todo (@$new_todos) {
1083 $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
1084 }
1085
1086 return ($sib, $new_todos);
1087 } # _check_get_children
1088
1089 =head1 LICENSE
1090
1091 Copyright 2007-2008 Wakaba <w@suika.fam.cx>
1092
1093 This library is free software; you can redistribute it
1094 and/or modify it under the same terms as Perl itself.
1095
1096 =cut
1097
1098 1;
1099 # $Date: 2009/08/23 02:35:33 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24