/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker/HTML.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker/HTML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Wed Aug 8 12:39:00 2007 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +6 -2 lines
++ whatpm/t/ChangeLog	8 Aug 2007 12:37:42 -0000
2007-08-08  Wakaba  <wakaba@suika.fam.cx>

	* content-model-2.dat: Some error messages are changed.

	* ContentType.t: New tests for |<script| sniffer are added.

++ whatpm/Whatpm/ChangeLog	8 Aug 2007 12:11:06 -0000
2007-08-08  Wakaba  <wakaba@suika.fam.cx>

	* ContentType.pm: Sniffing for |<script| (HTML5 revision
	983) is implemented.

++ whatpm/Whatpm/ContentChecker/ChangeLog	8 Aug 2007 12:38:37 -0000
2007-08-08  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm ($HTMLURIsAttrChecker): Syntax of error |type|
	value are changed for the ease of error message catalog
	lookup.

1 wakaba 1.1 package Whatpm::ContentChecker;
2     use strict;
3     require Whatpm::ContentChecker;
4    
5     my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
6    
7     my $HTMLMetadataElements = {
8     $HTML_NS => {
9     qw/link 1 meta 1 style 1 script 1 event-source 1 command 1 base 1 title 1
10     noscript 1
11     /,
12     },
13     };
14    
15     my $HTMLSectioningElements = {
16     $HTML_NS => {qw/body 1 section 1 nav 1 article 1 blockquote 1 aside 1/},
17     };
18    
19     my $HTMLBlockLevelElements = {
20     $HTML_NS => {
21     qw/
22     section 1 nav 1 article 1 blockquote 1 aside 1
23     h1 1 h2 1 h3 1 h4 1 h5 1 h6 1 header 1 footer 1
24     address 1 p 1 hr 1 dialog 1 pre 1 ol 1 ul 1 dl 1
25     ins 1 del 1 figure 1 map 1 table 1 script 1 noscript 1
26     event-source 1 details 1 datagrid 1 menu 1 div 1 font 1
27     /,
28     },
29     };
30    
31     my $HTMLStrictlyInlineLevelElements = {
32     $HTML_NS => {
33     qw/
34     br 1 a 1 q 1 cite 1 em 1 strong 1 small 1 m 1 dfn 1 abbr 1
35     time 1 meter 1 progress 1 code 1 var 1 samp 1 kbd 1
36     sub 1 sup 1 span 1 i 1 b 1 bdo 1 ins 1 del 1 img 1
37     iframe 1 embed 1 object 1 video 1 audio 1 canvas 1 area 1
38     script 1 noscript 1 event-source 1 command 1 font 1
39     /,
40     },
41     };
42    
43     my $HTMLStructuredInlineLevelElements = {
44     $HTML_NS => {qw/blockquote 1 pre 1 ol 1 ul 1 dl 1 table 1 menu 1/},
45     };
46    
47     my $HTMLInteractiveElements = {
48     $HTML_NS => {a => 1, details => 1, datagrid => 1},
49     };
50     ## NOTE: |html:a| and |html:datagrid| are not allowed as a descendant
51     ## of interactive elements
52    
53     # my $HTMLTransparentElements : in |Whatpm/ContentChecker.pm|.
54    
55     #my $HTMLSemiTransparentElements = {
56     # $HTML_NS => {qw/video 1 audio 1/},
57     #};
58    
59     my $HTMLEmbededElements = {
60     $HTML_NS => {qw/img 1 iframe 1 embed 1 object 1 video 1 audio 1 canvas 1/},
61     };
62    
63     ## Empty
64     my $HTMLEmptyChecker = sub {
65     my ($self, $todo) = @_;
66     my $el = $todo->{node};
67     my $new_todos = [];
68     my @nodes = (@{$el->child_nodes});
69    
70     while (@nodes) {
71     my $node = shift @nodes;
72     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
73    
74     my $nt = $node->node_type;
75     if ($nt == 1) {
76     ## NOTE: |minuses| list is not checked since redundant
77     $self->{onerror}->(node => $node, type => 'element not allowed');
78     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
79     unshift @nodes, @$sib;
80     push @$new_todos, @$ch;
81     } elsif ($nt == 3 or $nt == 4) {
82     if ($node->data =~ /[^\x09-\x0D\x20]/) {
83     $self->{onerror}->(node => $node, type => 'character not allowed');
84     }
85     } elsif ($nt == 5) {
86     unshift @nodes, @{$node->child_nodes};
87     }
88     }
89     return ($new_todos);
90     };
91    
92     ## Text
93     my $HTMLTextChecker = sub {
94     my ($self, $todo) = @_;
95     my $el = $todo->{node};
96     my $new_todos = [];
97     my @nodes = (@{$el->child_nodes});
98    
99     while (@nodes) {
100     my $node = shift @nodes;
101     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
102    
103     my $nt = $node->node_type;
104     if ($nt == 1) {
105     ## NOTE: |minuses| list is not checked since redundant
106     $self->{onerror}->(node => $node, type => 'element not allowed');
107     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
108     unshift @nodes, @$sib;
109     push @$new_todos, @$ch;
110     } elsif ($nt == 5) {
111     unshift @nodes, @{$node->child_nodes};
112     }
113     }
114     return ($new_todos);
115     };
116    
117     ## Zero or more |html:style| elements,
118     ## followed by zero or more block-level elements
119     my $HTMLStylableBlockChecker = sub {
120     my ($self, $todo) = @_;
121     my $el = $todo->{node};
122     my $new_todos = [];
123     my @nodes = (@{$el->child_nodes});
124    
125     my $has_non_style;
126     while (@nodes) {
127     my $node = shift @nodes;
128     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
129    
130     my $nt = $node->node_type;
131     if ($nt == 1) {
132     my $node_ns = $node->namespace_uri;
133     $node_ns = '' unless defined $node_ns;
134     my $node_ln = $node->manakai_local_name;
135     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
136     if ($node_ns eq $HTML_NS and $node_ln eq 'style') {
137     $not_allowed = 1 if $has_non_style or
138     not $node->has_attribute_ns (undef, 'scoped');
139     } elsif ($HTMLBlockLevelElements->{$node_ns}->{$node_ln}) {
140     $has_non_style = 1;
141     } else {
142     $has_non_style = 1;
143     $not_allowed = 1;
144     }
145     $self->{onerror}->(node => $node, type => 'element not allowed')
146     if $not_allowed;
147     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
148     unshift @nodes, @$sib;
149     push @$new_todos, @$ch;
150     } elsif ($nt == 3 or $nt == 4) {
151     if ($node->data =~ /[^\x09-\x0D\x20]/) {
152     $self->{onerror}->(node => $node, type => 'character not allowed');
153     }
154     } elsif ($nt == 5) {
155     unshift @nodes, @{$node->child_nodes};
156     }
157     }
158     return ($new_todos);
159     }; # $HTMLStylableBlockChecker
160    
161     ## Zero or more block-level elements
162     my $HTMLBlockChecker = sub {
163     my ($self, $todo) = @_;
164     my $el = $todo->{node};
165     my $new_todos = [];
166     my @nodes = (@{$el->child_nodes});
167    
168     while (@nodes) {
169     my $node = shift @nodes;
170     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
171    
172     my $nt = $node->node_type;
173     if ($nt == 1) {
174     my $node_ns = $node->namespace_uri;
175     $node_ns = '' unless defined $node_ns;
176     my $node_ln = $node->manakai_local_name;
177     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
178     $not_allowed = 1
179     unless $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
180     $self->{onerror}->(node => $node, type => 'element not allowed')
181     if $not_allowed;
182     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
183     unshift @nodes, @$sib;
184     push @$new_todos, @$ch;
185     } elsif ($nt == 3 or $nt == 4) {
186     if ($node->data =~ /[^\x09-\x0D\x20]/) {
187     $self->{onerror}->(node => $node, type => 'character not allowed');
188     }
189     } elsif ($nt == 5) {
190     unshift @nodes, @{$node->child_nodes};
191     }
192     }
193     return ($new_todos);
194     }; # $HTMLBlockChecker
195    
196     ## Inline-level content
197     my $HTMLInlineChecker = sub {
198     my ($self, $todo) = @_;
199     my $el = $todo->{node};
200     my $new_todos = [];
201     my @nodes = (@{$el->child_nodes});
202    
203     while (@nodes) {
204     my $node = shift @nodes;
205     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
206    
207     my $nt = $node->node_type;
208     if ($nt == 1) {
209     my $node_ns = $node->namespace_uri;
210     $node_ns = '' unless defined $node_ns;
211     my $node_ln = $node->manakai_local_name;
212     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
213     $not_allowed = 1
214     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} or
215     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
216     $self->{onerror}->(node => $node, type => 'element not allowed')
217     if $not_allowed;
218     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
219     unshift @nodes, @$sib;
220     push @$new_todos, @$ch;
221     } elsif ($nt == 5) {
222     unshift @nodes, @{$node->child_nodes};
223     }
224     }
225    
226     for (@$new_todos) {
227     $_->{inline} = 1;
228     }
229     return ($new_todos);
230     }; # $HTMLInlineChecker
231    
232     my $HTMLSignificantInlineChecker = $HTMLInlineChecker;
233     ## TODO: check significant content
234    
235     ## Strictly inline-level content
236     my $HTMLStrictlyInlineChecker = sub {
237     my ($self, $todo) = @_;
238     my $el = $todo->{node};
239     my $new_todos = [];
240     my @nodes = (@{$el->child_nodes});
241    
242     while (@nodes) {
243     my $node = shift @nodes;
244     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
245    
246     my $nt = $node->node_type;
247     if ($nt == 1) {
248     my $node_ns = $node->namespace_uri;
249     $node_ns = '' unless defined $node_ns;
250     my $node_ln = $node->manakai_local_name;
251     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
252     $not_allowed = 1
253     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln};
254     $self->{onerror}->(node => $node, type => 'element not allowed')
255     if $not_allowed;
256     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
257     unshift @nodes, @$sib;
258     push @$new_todos, @$ch;
259     } elsif ($nt == 5) {
260     unshift @nodes, @{$node->child_nodes};
261     }
262     }
263    
264     for (@$new_todos) {
265     $_->{inline} = 1;
266     $_->{strictly_inline} = 1;
267     }
268     return ($new_todos);
269     }; # $HTMLStrictlyInlineChecker
270    
271     my $HTMLSignificantStrictlyInlineChecker = $HTMLStrictlyInlineChecker;
272     ## TODO: check significant content
273    
274     ## Inline-level or strictly inline-kevek content
275     my $HTMLInlineOrStrictlyInlineChecker = sub {
276     my ($self, $todo) = @_;
277     my $el = $todo->{node};
278     my $new_todos = [];
279     my @nodes = (@{$el->child_nodes});
280    
281     while (@nodes) {
282     my $node = shift @nodes;
283     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
284    
285     my $nt = $node->node_type;
286     if ($nt == 1) {
287     my $node_ns = $node->namespace_uri;
288     $node_ns = '' unless defined $node_ns;
289     my $node_ln = $node->manakai_local_name;
290     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
291     if ($todo->{strictly_inline}) {
292     $not_allowed = 1
293     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln};
294     } else {
295     $not_allowed = 1
296     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} or
297     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
298     }
299     $self->{onerror}->(node => $node, type => 'element not allowed')
300     if $not_allowed;
301     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
302     unshift @nodes, @$sib;
303     push @$new_todos, @$ch;
304     } elsif ($nt == 5) {
305     unshift @nodes, @{$node->child_nodes};
306     }
307     }
308    
309     for (@$new_todos) {
310     $_->{inline} = 1;
311     $_->{strictly_inline} = 1;
312     }
313     return ($new_todos);
314     }; # $HTMLInlineOrStrictlyInlineChecker
315    
316     my $HTMLSignificantInlineOrStrictlyInlineChecker
317     = $HTMLInlineOrStrictlyInlineChecker;
318     ## TODO: check significant content
319    
320     my $HTMLBlockOrInlineChecker = sub {
321     my ($self, $todo) = @_;
322     my $el = $todo->{node};
323     my $new_todos = [];
324     my @nodes = (@{$el->child_nodes});
325    
326     my $content = 'block-or-inline'; # or 'block' or 'inline'
327     my @block_not_inline;
328     while (@nodes) {
329     my $node = shift @nodes;
330     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
331    
332     my $nt = $node->node_type;
333     if ($nt == 1) {
334     my $node_ns = $node->namespace_uri;
335     $node_ns = '' unless defined $node_ns;
336     my $node_ln = $node->manakai_local_name;
337     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
338     if ($content eq 'block') {
339     $not_allowed = 1
340     unless $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
341     } elsif ($content eq 'inline') {
342     $not_allowed = 1
343     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} or
344     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
345     } else {
346     my $is_block = $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
347     my $is_inline
348     = $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} ||
349     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
350    
351     push @block_not_inline, $node
352     if $is_block and not $is_inline and not $not_allowed;
353     unless ($is_block) {
354     $content = 'inline';
355     for (@block_not_inline) {
356     $self->{onerror}->(node => $_, type => 'element not allowed');
357     }
358     $not_allowed = 1 unless $is_inline;
359     }
360     }
361     $self->{onerror}->(node => $node, type => 'element not allowed')
362     if $not_allowed;
363     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
364     unshift @nodes, @$sib;
365     push @$new_todos, @$ch;
366     } elsif ($nt == 3 or $nt == 4) {
367     if ($node->data =~ /[^\x09-\x0D\x20]/) {
368     if ($content eq 'block') {
369     $self->{onerror}->(node => $node, type => 'character not allowed');
370     } else {
371     $content = 'inline';
372     for (@block_not_inline) {
373     $self->{onerror}->(node => $_, type => 'element not allowed');
374     }
375     }
376     }
377     } elsif ($nt == 5) {
378     unshift @nodes, @{$node->child_nodes};
379     }
380     }
381    
382     if ($content eq 'inline') {
383     for (@$new_todos) {
384     $_->{inline} = 1;
385     }
386     }
387     return ($new_todos);
388     };
389    
390     ## Zero or more XXX element, then either block-level or inline-level
391     my $GetHTMLZeroOrMoreThenBlockOrInlineChecker = sub ($$) {
392     my ($elnsuri, $ellname) = @_;
393     return sub {
394     my ($self, $todo) = @_;
395     my $el = $todo->{node};
396     my $new_todos = [];
397     my @nodes = (@{$el->child_nodes});
398    
399     my $has_non_style;
400     my $content = 'block-or-inline'; # or 'block' or 'inline'
401     my @block_not_inline;
402     while (@nodes) {
403     my $node = shift @nodes;
404     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
405    
406     my $nt = $node->node_type;
407     if ($nt == 1) {
408     my $node_ns = $node->namespace_uri;
409     $node_ns = '' unless defined $node_ns;
410     my $node_ln = $node->manakai_local_name;
411     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
412     if ($node_ns eq $elnsuri and $node_ln eq $ellname) {
413     $not_allowed = 1 if $has_non_style;
414     if ($ellname eq 'style' and
415     not $node->has_attribute_ns (undef, 'scoped')) {
416     $not_allowed = 1;
417     }
418     } elsif ($content eq 'block') {
419     $has_non_style = 1;
420     $not_allowed = 1
421     unless $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
422     } elsif ($content eq 'inline') {
423     $has_non_style = 1;
424     $not_allowed = 1
425     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} or
426     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
427     } else {
428     $has_non_style = 1;
429     my $is_block = $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
430     my $is_inline
431     = $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} ||
432     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
433    
434     push @block_not_inline, $node
435     if $is_block and not $is_inline and not $not_allowed;
436     unless ($is_block) {
437     $content = 'inline';
438     for (@block_not_inline) {
439     $self->{onerror}->(node => $_, type => 'element not allowed');
440     }
441     $not_allowed = 1 unless $is_inline;
442     }
443     }
444     $self->{onerror}->(node => $node, type => 'element not allowed')
445     if $not_allowed;
446     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
447     unshift @nodes, @$sib;
448     push @$new_todos, @$ch;
449     } elsif ($nt == 3 or $nt == 4) {
450     if ($node->data =~ /[^\x09-\x0D\x20]/) {
451     $has_non_style = 1;
452     if ($content eq 'block') {
453     $self->{onerror}->(node => $node, type => 'character not allowed');
454     } else {
455     $content = 'inline';
456     for (@block_not_inline) {
457     $self->{onerror}->(node => $_, type => 'element not allowed');
458     }
459     }
460     }
461     } elsif ($nt == 5) {
462     unshift @nodes, @{$node->child_nodes};
463     }
464     }
465    
466     if ($content eq 'inline') {
467     for (@$new_todos) {
468     $_->{inline} = 1;
469     }
470     }
471     return ($new_todos);
472     };
473     }; # $GetHTMLZeroOrMoreThenBlockOrInlineChecker
474    
475     my $HTMLTransparentChecker = $HTMLBlockOrInlineChecker;
476    
477     our $AttrChecker;
478    
479     my $GetHTMLEnumeratedAttrChecker = sub {
480     my $states = shift; # {value => conforming ? 1 : -1}
481     return sub {
482     my ($self, $attr) = @_;
483     my $value = lc $attr->value; ## TODO: ASCII case insensitibility?
484     if ($states->{$value} > 0) {
485     #
486     } elsif ($states->{$value}) {
487     $self->{onerror}->(node => $attr, type => 'enumerated:non-conforming');
488     } else {
489     $self->{onerror}->(node => $attr, type => 'enumerated:invalid');
490     }
491     };
492     }; # $GetHTMLEnumeratedAttrChecker
493    
494     my $GetHTMLBooleanAttrChecker = sub {
495     my $local_name = shift;
496     return sub {
497     my ($self, $attr) = @_;
498     my $value = $attr->value;
499     unless ($value eq $local_name or $value eq '') {
500     $self->{onerror}->(node => $attr, type => 'boolean:invalid');
501     }
502     };
503     }; # $GetHTMLBooleanAttrChecker
504    
505     ## |rel| attribute (unordered set of space separated tokens,
506     ## whose allowed values are defined by the section on link types)
507     my $HTMLLinkTypesAttrChecker = sub {
508     my ($a_or_area, $self, $attr) = @_;
509     my %word;
510     for my $word (grep {length $_} split /[\x09-\x0D\x20]/, $attr->value) {
511     unless ($word{$word}) {
512     $word{$word} = 1;
513     } else {
514     $self->{onerror}->(node => $attr, type => 'duplicate token:'.$word);
515     }
516     }
517     ## NOTE: Case sensitive match (since HTML5 spec does not say link
518     ## types are case-insensitive and it says "The value should not
519     ## be confusingly similar to any other defined value (e.g.
520     ## differing only in case).").
521     ## NOTE: Though there is no explicit "MUST NOT" for undefined values,
522     ## "MAY"s and "only ... MAY" restrict non-standard non-registered
523     ## values to be used conformingly.
524     require Whatpm::_LinkTypeList;
525     our $LinkType;
526     for my $word (keys %word) {
527     my $def = $LinkType->{$word};
528     if (defined $def) {
529     if ($def->{status} eq 'accepted') {
530     if (defined $def->{effect}->[$a_or_area]) {
531     #
532     } else {
533     $self->{onerror}->(node => $attr,
534     type => 'link type:bad context:'.$word);
535     }
536     } elsif ($def->{status} eq 'proposal') {
537     $self->{onerror}->(node => $attr, level => 's',
538     type => 'link type:proposed:'.$word);
539     } else { # rejected or synonym
540     $self->{onerror}->(node => $attr,
541     type => 'link type:non-conforming:'.$word);
542     }
543     if ($def->{unique}) {
544     unless ($self->{has_link_type}->{$word}) {
545     $self->{has_link_type}->{$word} = 1;
546     } else {
547     $self->{onerror}->(node => $attr,
548     type => 'link type:duplicate:'.$word);
549     }
550     }
551     } else {
552     $self->{onerror}->(node => $attr, level => 'unsupported',
553     type => 'link type:'.$word);
554     }
555     }
556     ## TODO: The Pingback 1.0 specification, which is referenced by HTML5,
557     ## says that using both X-Pingback: header field and HTML
558     ## <link rel=pingback> is deprecated and if both appears they
559     ## SHOULD contain exactly the same value.
560     ## ISSUE: Pingback 1.0 specification defines the exact representation
561     ## of its link element, which cannot be tested by the current arch.
562     ## ISSUE: Pingback 1.0 specification says that the document MUST NOT
563     ## include any string that matches to the pattern for the rel=pingback link,
564     ## which again inpossible to test.
565     ## ISSUE: rel=pingback href MUST NOT include entities other than predefined 4.
566     }; # $HTMLLinkTypesAttrChecker
567    
568     ## URI (or IRI)
569     my $HTMLURIAttrChecker = sub {
570     my ($self, $attr) = @_;
571     ## ISSUE: Relative references are allowed? (RFC 3987 "IRI" is an absolute reference with optional fragment identifier.)
572     my $value = $attr->value;
573     Whatpm::URIChecker->check_iri_reference ($value, sub {
574     my %opt = @_;
575     $self->{onerror}->(node => $attr, level => $opt{level},
576     type => 'URI::'.$opt{type}.
577     (defined $opt{position} ? ':'.$opt{position} : ''));
578     });
579     }; # $HTMLURIAttrChecker
580    
581     ## A space separated list of one or more URIs (or IRIs)
582     my $HTMLSpaceURIsAttrChecker = sub {
583     my ($self, $attr) = @_;
584     my $i = 0;
585     for my $value (split /[\x09-\x0D\x20]+/, $attr->value) {
586     Whatpm::URIChecker->check_iri_reference ($value, sub {
587     my %opt = @_;
588     $self->{onerror}->(node => $attr, level => $opt{level},
589 wakaba 1.2 type => 'URIs:'.':'.
590     $opt{type}.':'.$i.
591 wakaba 1.1 (defined $opt{position} ? ':'.$opt{position} : ''));
592     });
593     $i++;
594     }
595     ## ISSUE: Relative references?
596     ## ISSUE: Leading or trailing white spaces are conformant?
597     ## ISSUE: A sequence of white space characters are conformant?
598     ## ISSUE: A zero-length string is conformant? (It does contain a relative reference, i.e. same as base URI.)
599     ## NOTE: Duplication seems not an error.
600     }; # $HTMLSpaceURIsAttrChecker
601    
602     my $HTMLDatetimeAttrChecker = sub {
603     my ($self, $attr) = @_;
604     my $value = $attr->value;
605     ## ISSUE: "space", not "space character" (in parsing algorihtm, "space character")
606     if ($value =~ /\A([0-9]{4})-([0-9]{2})-([0-9]{2})(?>[\x09-\x0D\x20]+(?>T[\x09-\x0D\x20]*)?|T[\x09-\x0D\x20]*)([0-9]{2}):([0-9]{2})(?>:([0-9]{2}))?(?>\.([0-9]+))?[\x09-\x0D\x20]*(?>Z|[+-]([0-9]{2}):([0-9]{2}))\z/) {
607     my ($y, $M, $d, $h, $m, $s, $f, $zh, $zm)
608     = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
609     if (0 < $M and $M < 13) { ## ISSUE: This is not explicitly specified (though in parsing algorithm)
610     $self->{onerror}->(node => $attr, type => 'datetime:bad day')
611     if $d < 1 or
612     $d > [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]->[$M];
613     $self->{onerror}->(node => $attr, type => 'datetime:bad day')
614     if $M == 2 and $d == 29 and
615     not ($y % 400 == 0 or ($y % 4 == 0 and $y % 100 != 0));
616     } else {
617     $self->{onerror}->(node => $attr, type => 'datetime:bad month');
618     }
619     $self->{onerror}->(node => $attr, type => 'datetime:bad hour') if $h > 23;
620     $self->{onerror}->(node => $attr, type => 'datetime:bad minute') if $m > 59;
621     $self->{onerror}->(node => $attr, type => 'datetime:bad second')
622     if defined $s and $s > 59;
623     $self->{onerror}->(node => $attr, type => 'datetime:bad timezone hour')
624     if $zh > 23;
625     $self->{onerror}->(node => $attr, type => 'datetime:bad timezone minute')
626     if $zm > 59;
627     ## ISSUE: Maybe timezone -00:00 should have same semantics as in RFC 3339.
628     } else {
629     $self->{onerror}->(node => $attr, type => 'datetime:syntax error');
630     }
631     }; # $HTMLDatetimeAttrChecker
632    
633     my $HTMLIntegerAttrChecker = sub {
634     my ($self, $attr) = @_;
635     my $value = $attr->value;
636     unless ($value =~ /\A-?[0-9]+\z/) {
637     $self->{onerror}->(node => $attr, type => 'integer:syntax error');
638     }
639     }; # $HTMLIntegerAttrChecker
640    
641     my $GetHTMLNonNegativeIntegerAttrChecker = sub {
642     my $range_check = shift;
643     return sub {
644     my ($self, $attr) = @_;
645     my $value = $attr->value;
646     if ($value =~ /\A[0-9]+\z/) {
647     unless ($range_check->($value + 0)) {
648     $self->{onerror}->(node => $attr, type => 'nninteger:out of range');
649     }
650     } else {
651     $self->{onerror}->(node => $attr,
652     type => 'nninteger:syntax error');
653     }
654     };
655     }; # $GetHTMLNonNegativeIntegerAttrChecker
656    
657     my $GetHTMLFloatingPointNumberAttrChecker = sub {
658     my $range_check = shift;
659     return sub {
660     my ($self, $attr) = @_;
661     my $value = $attr->value;
662     if ($value =~ /\A-?[0-9.]+\z/ and $value =~ /[0-9]/) {
663     unless ($range_check->($value + 0)) {
664     $self->{onerror}->(node => $attr, type => 'float:out of range');
665     }
666     } else {
667     $self->{onerror}->(node => $attr,
668     type => 'float:syntax error');
669     }
670     };
671     }; # $GetHTMLFloatingPointNumberAttrChecker
672    
673     ## "A valid MIME type, optionally with parameters. [RFC 2046]"
674     ## ISSUE: RFC 2046 does not define syntax of media types.
675     ## ISSUE: The definition of "a valid MIME type" is unknown.
676     ## Syntactical correctness?
677     my $HTMLIMTAttrChecker = sub {
678     my ($self, $attr) = @_;
679     my $value = $attr->value;
680     ## ISSUE: RFC 2045 Content-Type header field allows insertion
681     ## of LWS/comments between tokens. Is it allowed in HTML? Maybe no.
682     ## ISSUE: RFC 2231 extension? Maybe no.
683     my $lws0 = qr/(?>(?>\x0D\x0A)?[\x09\x20])*/;
684     my $token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
685     my $qs = qr/"(?>[\x00-\x0C\x0E-\x21\x23-\x5B\x5D-\x7E]|\x0D\x0A[\x09\x20]|\x5C[\x00-\x7F])*"/;
686     if ($value =~ m#\A$lws0($token)$lws0/$lws0($token)$lws0((?>;$lws0$token$lws0=$lws0(?>$token|$qs)$lws0)*)\z#) {
687     my @type = ($1, $2);
688     my $param = $3;
689     while ($param =~ s/^;$lws0($token)$lws0=$lws0(?>($token)|($qs))$lws0//) {
690     if (defined $2) {
691     push @type, $1 => $2;
692     } else {
693     my $n = $1;
694     my $v = $2;
695     $v =~ s/\\(.)/$1/gs;
696     push @type, $n => $v;
697     }
698     }
699     require Whatpm::IMTChecker;
700     Whatpm::IMTChecker->check_imt (sub {
701     my %opt = @_;
702     $self->{onerror}->(node => $attr, level => $opt{level},
703     type => 'IMT:'.$opt{type});
704     }, @type);
705     } else {
706     $self->{onerror}->(node => $attr, type => 'IMT:syntax error');
707     }
708     }; # $HTMLIMTAttrChecker
709    
710     my $HTMLLanguageTagAttrChecker = sub {
711     my ($self, $attr) = @_;
712     $self->{onerror}->(node => $attr, level => 'unsupported',
713     type => 'language tag');
714     if ($attr->value eq '') {
715     $self->{onerror}->(node => $attr, type => 'language tag:syntax error');
716     }
717     ## TODO: RFC 3066 test
718     ## ISSUE: RFC 4646 (3066bis)?
719     }; # $HTMLLanguageTagAttrChecker
720    
721     ## "A valid media query [MQ]"
722     my $HTMLMQAttrChecker = sub {
723     my ($self, $attr) = @_;
724     $self->{onerror}->(node => $attr, level => 'unsupported',
725     type => 'media query');
726     ## ISSUE: What is "a valid media query"?
727     }; # $HTMLMQAttrChecker
728    
729     my $HTMLEventHandlerAttrChecker = sub {
730     my ($self, $attr) = @_;
731     $self->{onerror}->(node => $attr, level => 'unsupported',
732     type => 'event handler');
733     ## TODO: MUST contain valid ECMAScript code matching the
734     ## ECMAScript |FunctionBody| production. [ECMA262]
735     ## ISSUE: MUST be ES3? E4X? ES4? JS1.x?
736     ## ISSUE: Automatic semicolon insertion does not apply?
737     ## ISSUE: Other script languages?
738     }; # $HTMLEventHandlerAttrChecker
739    
740     my $HTMLUsemapAttrChecker = sub {
741     my ($self, $attr) = @_;
742     ## MUST be a valid hashed ID reference to a |map| element
743     my $value = $attr->value;
744     if ($value =~ s/^#//) {
745     ## ISSUE: Is |usemap="#"| conformant? (c.f. |id=""| is non-conformant.)
746     push @{$self->{usemap}}, [$value => $attr];
747     } else {
748     $self->{onerror}->(node => $attr, type => '#idref:syntax error');
749     }
750     ## NOTE: Space characters in hashed ID references are conforming.
751     ## ISSUE: UA algorithm for matching is case-insensitive; IDs only different in cases should be reported
752     }; # $HTMLUsemapAttrChecker
753    
754     my $HTMLTargetAttrChecker = sub {
755     my ($self, $attr) = @_;
756     my $value = $attr->value;
757     if ($value =~ /^_/) {
758     $value = lc $value; ## ISSUE: ASCII case-insentitive?
759     unless ({
760     _self => 1, _parent => 1, _top => 1,
761     }->{$value}) {
762     $self->{onerror}->(node => $attr,
763     type => 'reserved browsing context name');
764     }
765     } else {
766     #$ ISSUE: An empty string is conforming?
767     }
768     }; # $HTMLTargetAttrChecker
769    
770     my $HTMLAttrChecker = {
771     id => sub {
772     ## NOTE: |map| has its own variant of |id=""| checker
773     my ($self, $attr) = @_;
774     my $value = $attr->value;
775     if (length $value > 0) {
776     if ($self->{id}->{$value}) {
777     $self->{onerror}->(node => $attr, type => 'duplicate ID');
778     push @{$self->{id}->{$value}}, $attr;
779     } else {
780     $self->{id}->{$value} = [$attr];
781     }
782     if ($value =~ /[\x09-\x0D\x20]/) {
783     $self->{onerror}->(node => $attr, type => 'space in ID');
784     }
785     } else {
786     ## NOTE: MUST contain at least one character
787     $self->{onerror}->(node => $attr, type => 'empty attribute value');
788     }
789     },
790     title => sub {}, ## NOTE: No conformance creteria
791     lang => sub {
792     my ($self, $attr) = @_;
793     $self->{onerror}->(node => $attr, level => 'unsupported',
794     type => 'language tag');
795     ## TODO: RFC 3066 or empty test
796     ## ISSUE: RFC 4646 (3066bis)?
797     unless ($attr->owner_document->manakai_is_html) {
798     $self->{onerror}->(node => $attr, type => 'in XML:lang');
799     }
800     },
801     dir => $GetHTMLEnumeratedAttrChecker->({ltr => 1, rtl => 1}),
802     class => sub {
803     my ($self, $attr) = @_;
804     my %word;
805     for my $word (grep {length $_} split /[\x09-\x0D\x20]/, $attr->value) {
806     unless ($word{$word}) {
807     $word{$word} = 1;
808     push @{$self->{return}->{class}->{$word}||=[]}, $attr;
809     } else {
810     $self->{onerror}->(node => $attr, type => 'duplicate token:'.$word);
811     }
812     }
813     },
814     contextmenu => sub {
815     my ($self, $attr) = @_;
816     my $value = $attr->value;
817     push @{$self->{contextmenu}}, [$value => $attr];
818     ## ISSUE: "The value must be the ID of a menu element in the DOM."
819     ## What is "in the DOM"? A menu Element node that is not part
820     ## of the Document tree is in the DOM? A menu Element node that
821     ## belong to another Document tree is in the DOM?
822     },
823     irrelevant => $GetHTMLBooleanAttrChecker->('irrelevant'),
824     tabindex => $HTMLIntegerAttrChecker,
825     };
826    
827     for (qw/
828     onabort onbeforeunload onblur onchange onclick oncontextmenu
829     ondblclick ondrag ondragend ondragenter ondragleave ondragover
830     ondragstart ondrop onerror onfocus onkeydown onkeypress
831     onkeyup onload onmessage onmousedown onmousemove onmouseout
832     onmouseover onmouseup onmousewheel onresize onscroll onselect
833     onsubmit onunload
834     /) {
835     $HTMLAttrChecker->{$_} = $HTMLEventHandlerAttrChecker;
836     }
837    
838     my $GetHTMLAttrsChecker = sub {
839     my $element_specific_checker = shift;
840     return sub {
841     my ($self, $todo) = @_;
842     for my $attr (@{$todo->{node}->attributes}) {
843     my $attr_ns = $attr->namespace_uri;
844     $attr_ns = '' unless defined $attr_ns;
845     my $attr_ln = $attr->manakai_local_name;
846     my $checker;
847     if ($attr_ns eq '') {
848     $checker = $element_specific_checker->{$attr_ln}
849     || $HTMLAttrChecker->{$attr_ln};
850     }
851     $checker ||= $AttrChecker->{$attr_ns}->{$attr_ln}
852     || $AttrChecker->{$attr_ns}->{''};
853     if ($checker) {
854     $checker->($self, $attr, $todo);
855     } else {
856     $self->{onerror}->(node => $attr, level => 'unsupported',
857     type => 'attribute');
858     ## ISSUE: No comformance createria for unknown attributes in the spec
859     }
860     }
861     };
862     }; # $GetHTMLAttrsChecker
863    
864     our $Element;
865     our $ElementDefault;
866     our $AnyChecker;
867    
868     $Element->{$HTML_NS}->{''} = {
869     attrs_checker => $GetHTMLAttrsChecker->({}),
870     checker => $ElementDefault->{checker},
871     };
872    
873     $Element->{$HTML_NS}->{html} = {
874     is_root => 1,
875     attrs_checker => $GetHTMLAttrsChecker->({
876     xmlns => sub {
877     my ($self, $attr) = @_;
878     my $value = $attr->value;
879     unless ($value eq $HTML_NS) {
880     $self->{onerror}->(node => $attr, type => 'invalid attribute value');
881     }
882     unless ($attr->owner_document->manakai_is_html) {
883     $self->{onerror}->(node => $attr, type => 'in XML:xmlns');
884     ## TODO: Test
885     }
886     },
887     }),
888     checker => sub {
889     my ($self, $todo) = @_;
890     my $el = $todo->{node};
891     my $new_todos = [];
892     my @nodes = (@{$el->child_nodes});
893    
894     my $phase = 'before head';
895     while (@nodes) {
896     my $node = shift @nodes;
897     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
898    
899     my $nt = $node->node_type;
900     if ($nt == 1) {
901     my $node_ns = $node->namespace_uri;
902     $node_ns = '' unless defined $node_ns;
903     my $node_ln = $node->manakai_local_name;
904     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
905     if ($phase eq 'before head') {
906     if ($node_ns eq $HTML_NS and $node_ln eq 'head') {
907     $phase = 'after head';
908     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'body') {
909     $self->{onerror}->(node => $node, type => 'ps element missing:head');
910     $phase = 'after body';
911     } else {
912     $not_allowed = 1;
913     # before head
914     }
915     } elsif ($phase eq 'after head') {
916     if ($node_ns eq $HTML_NS and $node_ln eq 'body') {
917     $phase = 'after body';
918     } else {
919     $not_allowed = 1;
920     # after head
921     }
922     } else { #elsif ($phase eq 'after body') {
923     $not_allowed = 1;
924     # after body
925     }
926     $self->{onerror}->(node => $node, type => 'element not allowed')
927     if $not_allowed;
928     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
929     unshift @nodes, @$sib;
930     push @$new_todos, @$ch;
931     } elsif ($nt == 3 or $nt == 4) {
932     if ($node->data =~ /[^\x09-\x0D\x20]/) {
933     $self->{onerror}->(node => $node, type => 'character not allowed');
934     }
935     } elsif ($nt == 5) {
936     unshift @nodes, @{$node->child_nodes};
937     }
938     }
939    
940     if ($phase eq 'before head') {
941     $self->{onerror}->(node => $el, type => 'child element missing:head');
942     $self->{onerror}->(node => $el, type => 'child element missing:body');
943     } elsif ($phase eq 'after head') {
944     $self->{onerror}->(node => $el, type => 'child element missing:body');
945     }
946    
947     return ($new_todos);
948     },
949     };
950    
951     $Element->{$HTML_NS}->{head} = {
952     attrs_checker => $GetHTMLAttrsChecker->({}),
953     checker => sub {
954     my ($self, $todo) = @_;
955     my $el = $todo->{node};
956     my $new_todos = [];
957     my @nodes = (@{$el->child_nodes});
958    
959     my $has_title;
960     my $phase = 'initial'; # 'after charset', 'after base'
961     while (@nodes) {
962     my $node = shift @nodes;
963     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
964    
965     my $nt = $node->node_type;
966     if ($nt == 1) {
967     my $node_ns = $node->namespace_uri;
968     $node_ns = '' unless defined $node_ns;
969     my $node_ln = $node->manakai_local_name;
970     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
971     if ($node_ns eq $HTML_NS and $node_ln eq 'title') {
972     $phase = 'after base';
973     unless ($has_title) {
974     $has_title = 1;
975     } else {
976     $not_allowed = 1;
977     }
978     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'meta') {
979     if ($node->has_attribute_ns (undef, 'charset')) {
980     if ($phase eq 'initial') {
981     $phase = 'after charset';
982     } else {
983     $not_allowed = 1;
984     ## NOTE: See also |base|'s "contexts" field in the spec
985     }
986     } else {
987     $phase = 'after base';
988     }
989     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'base') {
990     if ($phase eq 'initial' or $phase eq 'after charset') {
991     $phase = 'after base';
992     } else {
993     $not_allowed = 1;
994     }
995     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'style') {
996     $phase = 'after base';
997     if ($node->has_attribute_ns (undef, 'scoped')) {
998     $not_allowed = 1;
999     }
1000     } elsif ($HTMLMetadataElements->{$node_ns}->{$node_ln}) {
1001     $phase = 'after base';
1002     } else {
1003     $not_allowed = 1;
1004     }
1005     $self->{onerror}->(node => $node, type => 'element not allowed')
1006     if $not_allowed;
1007     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
1008     unshift @nodes, @$sib;
1009     push @$new_todos, @$ch;
1010     } elsif ($nt == 3 or $nt == 4) {
1011     if ($node->data =~ /[^\x09-\x0D\x20]/) {
1012     $self->{onerror}->(node => $node, type => 'character not allowed');
1013     }
1014     } elsif ($nt == 5) {
1015     unshift @nodes, @{$node->child_nodes};
1016     }
1017     }
1018     unless ($has_title) {
1019     $self->{onerror}->(node => $el, type => 'child element missing:title');
1020     }
1021     return ($new_todos);
1022     },
1023     };
1024    
1025     $Element->{$HTML_NS}->{title} = {
1026     attrs_checker => $GetHTMLAttrsChecker->({}),
1027     checker => $HTMLTextChecker,
1028     };
1029    
1030 wakaba 1.2 ## TODO: |base| with |href| MUST come before elements with URI attributes.
1031     ## For example: <title xml:base=""/><base href=""/> is non-conformant.
1032     ## TODO: |base| with |target| MUST come before any hyperlink.
1033 wakaba 1.1 $Element->{$HTML_NS}->{base} = {
1034     attrs_checker => $GetHTMLAttrsChecker->({
1035     href => $HTMLURIAttrChecker,
1036     target => $HTMLTargetAttrChecker,
1037     }),
1038     checker => $HTMLEmptyChecker,
1039     };
1040    
1041     $Element->{$HTML_NS}->{link} = {
1042     attrs_checker => sub {
1043     my ($self, $todo) = @_;
1044     $GetHTMLAttrsChecker->({
1045     href => $HTMLURIAttrChecker,
1046     rel => sub { $HTMLLinkTypesAttrChecker->(0, @_) },
1047     media => $HTMLMQAttrChecker,
1048     hreflang => $HTMLLanguageTagAttrChecker,
1049     type => $HTMLIMTAttrChecker,
1050     ## NOTE: Though |title| has special semantics,
1051     ## syntactically same as the |title| as global attribute.
1052     })->($self, $todo);
1053     unless ($todo->{node}->has_attribute_ns (undef, 'href')) {
1054     $self->{onerror}->(node => $todo->{node},
1055     type => 'attribute missing:href');
1056     }
1057     unless ($todo->{node}->has_attribute_ns (undef, 'rel')) {
1058     $self->{onerror}->(node => $todo->{node},
1059     type => 'attribute missing:rel');
1060     }
1061     },
1062     checker => $HTMLEmptyChecker,
1063     };
1064    
1065     $Element->{$HTML_NS}->{meta} = {
1066     attrs_checker => sub {
1067     my ($self, $todo) = @_;
1068     my $name_attr;
1069     my $http_equiv_attr;
1070     my $charset_attr;
1071     my $content_attr;
1072     for my $attr (@{$todo->{node}->attributes}) {
1073     my $attr_ns = $attr->namespace_uri;
1074     $attr_ns = '' unless defined $attr_ns;
1075     my $attr_ln = $attr->manakai_local_name;
1076     my $checker;
1077     if ($attr_ns eq '') {
1078     if ($attr_ln eq 'content') {
1079     $content_attr = $attr;
1080     $checker = 1;
1081     } elsif ($attr_ln eq 'name') {
1082     $name_attr = $attr;
1083     $checker = 1;
1084     } elsif ($attr_ln eq 'http-equiv') {
1085     $http_equiv_attr = $attr;
1086     $checker = 1;
1087     } elsif ($attr_ln eq 'charset') {
1088     $charset_attr = $attr;
1089     $checker = 1;
1090     } else {
1091     $checker = $HTMLAttrChecker->{$attr_ln}
1092     || $AttrChecker->{$attr_ns}->{$attr_ln}
1093     || $AttrChecker->{$attr_ns}->{''};
1094     }
1095     } else {
1096     $checker ||= $AttrChecker->{$attr_ns}->{$attr_ln}
1097     || $AttrChecker->{$attr_ns}->{''};
1098     }
1099     if ($checker) {
1100     $checker->($self, $attr) if ref $checker;
1101     } else {
1102     $self->{onerror}->(node => $attr, level => 'unsupported',
1103     type => 'attribute');
1104     ## ISSUE: No comformance createria for unknown attributes in the spec
1105     }
1106     }
1107    
1108     if (defined $name_attr) {
1109     if (defined $http_equiv_attr) {
1110     $self->{onerror}->(node => $http_equiv_attr,
1111     type => 'attribute not allowed');
1112     } elsif (defined $charset_attr) {
1113     $self->{onerror}->(node => $charset_attr,
1114     type => 'attribute not allowed');
1115     }
1116     my $metadata_name = $name_attr->value;
1117     my $metadata_value;
1118     if (defined $content_attr) {
1119     $metadata_value = $content_attr->value;
1120     } else {
1121     $self->{onerror}->(node => $todo->{node},
1122     type => 'attribute missing:content');
1123     $metadata_value = '';
1124     }
1125     } elsif (defined $http_equiv_attr) {
1126     if (defined $charset_attr) {
1127     $self->{onerror}->(node => $charset_attr,
1128     type => 'attribute not allowed');
1129     }
1130     unless (defined $content_attr) {
1131     $self->{onerror}->(node => $todo->{node},
1132     type => 'attribute missing:content');
1133     }
1134     } elsif (defined $charset_attr) {
1135     if (defined $content_attr) {
1136     $self->{onerror}->(node => $content_attr,
1137     type => 'attribute not allowed');
1138     }
1139     } else {
1140     if (defined $content_attr) {
1141     $self->{onerror}->(node => $content_attr,
1142     type => 'attribute not allowed');
1143     $self->{onerror}->(node => $todo->{node},
1144     type => 'attribute missing:name|http-equiv');
1145     } else {
1146     $self->{onerror}->(node => $todo->{node},
1147     type => 'attribute missing:name|http-equiv|charset');
1148     }
1149     }
1150    
1151     ## TODO: metadata conformance
1152    
1153     ## TODO: pragma conformance
1154     if (defined $http_equiv_attr) { ## An enumerated attribute
1155     my $keyword = lc $http_equiv_attr->value; ## TODO: ascii case?
1156     if ({
1157     'refresh' => 1,
1158     'default-style' => 1,
1159     }->{$keyword}) {
1160     #
1161     } else {
1162     $self->{onerror}->(node => $http_equiv_attr,
1163     type => 'enumerated:invalid');
1164     }
1165     }
1166    
1167     if (defined $charset_attr) {
1168     unless ($todo->{node}->owner_document->manakai_is_html) {
1169     $self->{onerror}->(node => $charset_attr,
1170     type => 'in XML:charset');
1171     }
1172     ## TODO: charset
1173     }
1174     },
1175     checker => $HTMLEmptyChecker,
1176     };
1177    
1178     $Element->{$HTML_NS}->{style} = {
1179     attrs_checker => $GetHTMLAttrsChecker->({
1180     type => $HTMLIMTAttrChecker, ## TODO: MUST be a styling language
1181     media => $HTMLMQAttrChecker,
1182     scoped => $GetHTMLBooleanAttrChecker->('scoped'),
1183     ## NOTE: |title| has special semantics for |style|s, but is syntactically
1184     ## not different
1185     }),
1186     checker => sub {
1187     ## NOTE: |html:style| has no conformance creteria on content model
1188     my ($self, $todo) = @_;
1189     my $type = $todo->{node}->get_attribute_ns (undef, 'type');
1190     $type = 'text/css' unless defined $type;
1191     $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
1192     type => 'style:'.$type); ## TODO: $type normalization
1193     return $AnyChecker->($self, $todo);
1194     },
1195     };
1196    
1197     $Element->{$HTML_NS}->{body} = {
1198     attrs_checker => $GetHTMLAttrsChecker->({}),
1199     checker => $HTMLBlockChecker,
1200     };
1201    
1202     $Element->{$HTML_NS}->{section} = {
1203     attrs_checker => $GetHTMLAttrsChecker->({}),
1204     checker => $HTMLStylableBlockChecker,
1205     };
1206    
1207     $Element->{$HTML_NS}->{nav} = {
1208     attrs_checker => $GetHTMLAttrsChecker->({}),
1209     checker => $HTMLBlockOrInlineChecker,
1210     };
1211    
1212     $Element->{$HTML_NS}->{article} = {
1213     attrs_checker => $GetHTMLAttrsChecker->({}),
1214     checker => $HTMLStylableBlockChecker,
1215     };
1216    
1217     $Element->{$HTML_NS}->{blockquote} = {
1218     attrs_checker => $GetHTMLAttrsChecker->({
1219     cite => $HTMLURIAttrChecker,
1220     }),
1221     checker => $HTMLBlockChecker,
1222     };
1223    
1224     $Element->{$HTML_NS}->{aside} = {
1225     attrs_checker => $GetHTMLAttrsChecker->({}),
1226     checker => $GetHTMLZeroOrMoreThenBlockOrInlineChecker->($HTML_NS, 'style'),
1227     };
1228    
1229     $Element->{$HTML_NS}->{h1} = {
1230     attrs_checker => $GetHTMLAttrsChecker->({}),
1231     checker => sub {
1232     my ($self, $todo) = @_;
1233     $todo->{flag}->{has_heading}->[0] = 1;
1234     return $HTMLSignificantStrictlyInlineChecker->($self, $todo);
1235     },
1236     };
1237    
1238     $Element->{$HTML_NS}->{h2} = {
1239     attrs_checker => $GetHTMLAttrsChecker->({}),
1240     checker => $Element->{$HTML_NS}->{h1}->{checker},
1241     };
1242    
1243     $Element->{$HTML_NS}->{h3} = {
1244     attrs_checker => $GetHTMLAttrsChecker->({}),
1245     checker => $Element->{$HTML_NS}->{h1}->{checker},
1246     };
1247    
1248     $Element->{$HTML_NS}->{h4} = {
1249     attrs_checker => $GetHTMLAttrsChecker->({}),
1250     checker => $Element->{$HTML_NS}->{h1}->{checker},
1251     };
1252    
1253     $Element->{$HTML_NS}->{h5} = {
1254     attrs_checker => $GetHTMLAttrsChecker->({}),
1255     checker => $Element->{$HTML_NS}->{h1}->{checker},
1256     };
1257    
1258     $Element->{$HTML_NS}->{h6} = {
1259     attrs_checker => $GetHTMLAttrsChecker->({}),
1260     checker => $Element->{$HTML_NS}->{h1}->{checker},
1261     };
1262    
1263     $Element->{$HTML_NS}->{header} = {
1264     attrs_checker => $GetHTMLAttrsChecker->({}),
1265     checker => sub {
1266     my ($self, $todo) = @_;
1267     my $old_flag = $todo->{flag}->{has_heading} || [];
1268     my $new_flag = [];
1269     local $todo->{flag}->{has_heading} = $new_flag;
1270     my $node = $todo->{node};
1271    
1272     my $end = $self->_add_minuses
1273     ({$HTML_NS => {qw/header 1 footer 1/}},
1274     $HTMLSectioningElements);
1275     my ($new_todos, $ch) = $HTMLBlockChecker->($self, $todo);
1276     push @$new_todos, $end,
1277     {type => 'code', code => sub {
1278     if ($new_flag->[0]) {
1279     $old_flag->[0] = 1;
1280     } else {
1281     $self->{onerror}->(node => $node, type => 'element missing:hn');
1282     }
1283     }};
1284     return ($new_todos, $ch);
1285     },
1286     };
1287    
1288     $Element->{$HTML_NS}->{footer} = {
1289     attrs_checker => $GetHTMLAttrsChecker->({}),
1290     checker => sub { ## block -hn -header -footer -sectioning or inline
1291     my ($self, $todo) = @_;
1292     my $el = $todo->{node};
1293     my $new_todos = [];
1294     my @nodes = (@{$el->child_nodes});
1295    
1296     my $content = 'block-or-inline'; # or 'block' or 'inline'
1297     my @block_not_inline;
1298     while (@nodes) {
1299     my $node = shift @nodes;
1300     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
1301    
1302     my $nt = $node->node_type;
1303     if ($nt == 1) {
1304     my $node_ns = $node->namespace_uri;
1305     $node_ns = '' unless defined $node_ns;
1306     my $node_ln = $node->manakai_local_name;
1307     my $not_allowed;
1308     if ($self->{minuses}->{$node_ns}->{$node_ln}) {
1309     $not_allowed = 1;
1310     } elsif ($node_ns eq $HTML_NS and
1311     {
1312     qw/h1 1 h2 1 h3 1 h4 1 h5 1 h6 1 header 1 footer 1/
1313     }->{$node_ln}) {
1314     $not_allowed = 1;
1315     } elsif ($HTMLSectioningElements->{$node_ns}->{$node_ln}) {
1316     $not_allowed = 1;
1317     }
1318     if ($content eq 'block') {
1319     $not_allowed = 1
1320     unless $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
1321     } elsif ($content eq 'inline') {
1322     $not_allowed = 1
1323     unless $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} or
1324     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
1325     } else {
1326     my $is_block = $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
1327     my $is_inline
1328     = $HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} ||
1329     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln};
1330    
1331     push @block_not_inline, $node
1332     if $is_block and not $is_inline and not $not_allowed;
1333     unless ($is_block) {
1334     $content = 'inline';
1335     for (@block_not_inline) {
1336     $self->{onerror}->(node => $_, type => 'element not allowed');
1337     }
1338     $not_allowed = 1 unless $is_inline;
1339     }
1340     }
1341     $self->{onerror}->(node => $node, type => 'element not allowed')
1342     if $not_allowed;
1343     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
1344     unshift @nodes, @$sib;
1345     push @$new_todos, @$ch;
1346     } elsif ($nt == 3 or $nt == 4) {
1347     if ($node->data =~ /[^\x09-\x0D\x20]/) {
1348     if ($content eq 'block') {
1349     $self->{onerror}->(node => $node, type => 'character not allowed');
1350     } else {
1351     $content = 'inline';
1352     for (@block_not_inline) {
1353     $self->{onerror}->(node => $_, type => 'element not allowed');
1354     }
1355     }
1356     }
1357     } elsif ($nt == 5) {
1358     unshift @nodes, @{$node->child_nodes};
1359     }
1360     }
1361    
1362     my $end = $self->_add_minuses
1363     ({$HTML_NS => {qw/h1 1 h2 1 h3 1 h4 1 h5 1 h6 1/}},
1364     $HTMLSectioningElements);
1365     push @$new_todos, $end;
1366    
1367     if ($content eq 'inline') {
1368     for (@$new_todos) {
1369     $_->{inline} = 1;
1370     }
1371     }
1372    
1373     return ($new_todos);
1374     },
1375     };
1376    
1377     $Element->{$HTML_NS}->{address} = {
1378     attrs_checker => $GetHTMLAttrsChecker->({}),
1379     checker => $HTMLInlineChecker,
1380     };
1381    
1382     $Element->{$HTML_NS}->{p} = {
1383     attrs_checker => $GetHTMLAttrsChecker->({}),
1384     checker => $HTMLSignificantInlineChecker,
1385     };
1386    
1387     $Element->{$HTML_NS}->{hr} = {
1388     attrs_checker => $GetHTMLAttrsChecker->({}),
1389     checker => $HTMLEmptyChecker,
1390     };
1391    
1392     $Element->{$HTML_NS}->{br} = {
1393     attrs_checker => $GetHTMLAttrsChecker->({}),
1394     checker => $HTMLEmptyChecker,
1395     };
1396    
1397     $Element->{$HTML_NS}->{dialog} = {
1398     attrs_checker => $GetHTMLAttrsChecker->({}),
1399     checker => sub {
1400     my ($self, $todo) = @_;
1401     my $el = $todo->{node};
1402     my $new_todos = [];
1403     my @nodes = (@{$el->child_nodes});
1404    
1405     my $phase = 'before dt';
1406     while (@nodes) {
1407     my $node = shift @nodes;
1408     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
1409    
1410     my $nt = $node->node_type;
1411     if ($nt == 1) {
1412     my $node_ns = $node->namespace_uri;
1413     $node_ns = '' unless defined $node_ns;
1414     my $node_ln = $node->manakai_local_name;
1415     ## NOTE: |minuses| list is not checked since redundant
1416     if ($phase eq 'before dt') {
1417     if ($node_ns eq $HTML_NS and $node_ln eq 'dt') {
1418     $phase = 'before dd';
1419     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'dd') {
1420     $self->{onerror}
1421     ->(node => $node, type => 'ps element missing:dt');
1422     $phase = 'before dt';
1423     } else {
1424     $self->{onerror}->(node => $node, type => 'element not allowed');
1425     }
1426     } else { # before dd
1427     if ($node_ns eq $HTML_NS and $node_ln eq 'dd') {
1428     $phase = 'before dt';
1429     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'dt') {
1430     $self->{onerror}
1431     ->(node => $node, type => 'ps element missing:dd');
1432     $phase = 'before dd';
1433     } else {
1434     $self->{onerror}->(node => $node, type => 'element not allowed');
1435     }
1436     }
1437     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
1438     unshift @nodes, @$sib;
1439     push @$new_todos, @$ch;
1440     } elsif ($nt == 3 or $nt == 4) {
1441     if ($node->data =~ /[^\x09-\x0D\x20]/) {
1442     $self->{onerror}->(node => $node, type => 'character not allowed');
1443     }
1444     } elsif ($nt == 5) {
1445     unshift @nodes, @{$node->child_nodes};
1446     }
1447     }
1448     if ($phase eq 'before dd') {
1449     $self->{onerror}->(node => $el, type => 'ps element missing:dd');
1450     }
1451     return ($new_todos);
1452     },
1453     };
1454    
1455     $Element->{$HTML_NS}->{pre} = {
1456     attrs_checker => $GetHTMLAttrsChecker->({}),
1457     checker => $HTMLStrictlyInlineChecker,
1458     };
1459    
1460     $Element->{$HTML_NS}->{ol} = {
1461     attrs_checker => $GetHTMLAttrsChecker->({
1462     start => $HTMLIntegerAttrChecker,
1463     }),
1464     checker => sub {
1465     my ($self, $todo) = @_;
1466     my $el = $todo->{node};
1467     my $new_todos = [];
1468     my @nodes = (@{$el->child_nodes});
1469    
1470     while (@nodes) {
1471     my $node = shift @nodes;
1472     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
1473    
1474     my $nt = $node->node_type;
1475     if ($nt == 1) {
1476     my $node_ns = $node->namespace_uri;
1477     $node_ns = '' unless defined $node_ns;
1478     my $node_ln = $node->manakai_local_name;
1479     ## NOTE: |minuses| list is not checked since redundant
1480     unless ($node_ns eq $HTML_NS and $node_ln eq 'li') {
1481     $self->{onerror}->(node => $node, type => 'element not allowed');
1482     }
1483     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
1484     unshift @nodes, @$sib;
1485     push @$new_todos, @$ch;
1486     } elsif ($nt == 3 or $nt == 4) {
1487     if ($node->data =~ /[^\x09-\x0D\x20]/) {
1488     $self->{onerror}->(node => $node, type => 'character not allowed');
1489     }
1490     } elsif ($nt == 5) {
1491     unshift @nodes, @{$node->child_nodes};
1492     }
1493     }
1494    
1495     if ($todo->{inline}) {
1496     for (@$new_todos) {
1497     $_->{inline} = 1;
1498     }
1499     }
1500     return ($new_todos);
1501     },
1502     };
1503    
1504     $Element->{$HTML_NS}->{ul} = {
1505     attrs_checker => $GetHTMLAttrsChecker->({}),
1506     checker => $Element->{$HTML_NS}->{ol}->{checker},
1507     };
1508    
1509    
1510     $Element->{$HTML_NS}->{li} = {
1511     attrs_checker => $GetHTMLAttrsChecker->({
1512     start => sub {
1513     my ($self, $attr) = @_;
1514     my $parent = $attr->owner_element->manakai_parent_element;
1515     if (defined $parent) {
1516     my $parent_ns = $parent->namespace_uri;
1517     $parent_ns = '' unless defined $parent_ns;
1518     my $parent_ln = $parent->manakai_local_name;
1519     unless ($parent_ns eq $HTML_NS and $parent_ln eq 'ol') {
1520     $self->{onerror}->(node => $attr, level => 'unsupported',
1521     type => 'attribute');
1522     }
1523     }
1524     $HTMLIntegerAttrChecker->($self, $attr);
1525     },
1526     }),
1527     checker => sub {
1528     my ($self, $todo) = @_;
1529     if ($todo->{inline}) {
1530     return $HTMLInlineChecker->($self, $todo);
1531     } else {
1532     return $HTMLBlockOrInlineChecker->($self, $todo);
1533     }
1534     },
1535     };
1536    
1537     $Element->{$HTML_NS}->{dl} = {
1538     attrs_checker => $GetHTMLAttrsChecker->({}),
1539     checker => sub {
1540     my ($self, $todo) = @_;
1541     my $el = $todo->{node};
1542     my $new_todos = [];
1543     my @nodes = (@{$el->child_nodes});
1544    
1545     my $phase = 'before dt';
1546     while (@nodes) {
1547     my $node = shift @nodes;
1548     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
1549    
1550     my $nt = $node->node_type;
1551     if ($nt == 1) {
1552     my $node_ns = $node->namespace_uri;
1553     $node_ns = '' unless defined $node_ns;
1554     my $node_ln = $node->manakai_local_name;
1555     ## NOTE: |minuses| list is not checked since redundant
1556     if ($phase eq 'in dds') {
1557     if ($node_ns eq $HTML_NS and $node_ln eq 'dd') {
1558     #$phase = 'in dds';
1559     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'dt') {
1560     $phase = 'in dts';
1561     } else {
1562     $self->{onerror}->(node => $node, type => 'element not allowed');
1563     }
1564     } elsif ($phase eq 'in dts') {
1565     if ($node_ns eq $HTML_NS and $node_ln eq 'dt') {
1566     #$phase = 'in dts';
1567     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'dd') {
1568     $phase = 'in dds';
1569     } else {
1570     $self->{onerror}->(node => $node, type => 'element not allowed');
1571     }
1572     } else { # before dt
1573     if ($node_ns eq $HTML_NS and $node_ln eq 'dt') {
1574     $phase = 'in dts';
1575     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'dd') {
1576     $self->{onerror}
1577     ->(node => $node, type => 'ps element missing:dt');
1578     $phase = 'in dds';
1579     } else {
1580     $self->{onerror}->(node => $node, type => 'element not allowed');
1581     }
1582     }
1583     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
1584     unshift @nodes, @$sib;
1585     push @$new_todos, @$ch;
1586     } elsif ($nt == 3 or $nt == 4) {
1587     if ($node->data =~ /[^\x09-\x0D\x20]/) {
1588     $self->{onerror}->(node => $node, type => 'character not allowed');
1589     }
1590     } elsif ($nt == 5) {
1591     unshift @nodes, @{$node->child_nodes};
1592     }
1593     }
1594     if ($phase eq 'in dts') {
1595     $self->{onerror}->(node => $el, type => 'ps element missing:dd');
1596     }
1597    
1598     if ($todo->{inline}) {
1599     for (@$new_todos) {
1600     $_->{inline} = 1;
1601     }
1602     }
1603     return ($new_todos);
1604     },
1605     };
1606    
1607     $Element->{$HTML_NS}->{dt} = {
1608     attrs_checker => $GetHTMLAttrsChecker->({}),
1609     checker => $HTMLStrictlyInlineChecker,
1610     };
1611    
1612     $Element->{$HTML_NS}->{dd} = {
1613     attrs_checker => $GetHTMLAttrsChecker->({}),
1614     checker => $Element->{$HTML_NS}->{li}->{checker},
1615     };
1616    
1617     $Element->{$HTML_NS}->{a} = {
1618     attrs_checker => sub {
1619     my ($self, $todo) = @_;
1620     my %attr;
1621     for my $attr (@{$todo->{node}->attributes}) {
1622     my $attr_ns = $attr->namespace_uri;
1623     $attr_ns = '' unless defined $attr_ns;
1624     my $attr_ln = $attr->manakai_local_name;
1625     my $checker;
1626     if ($attr_ns eq '') {
1627     $checker = {
1628     target => $HTMLTargetAttrChecker,
1629     href => $HTMLURIAttrChecker,
1630     ping => $HTMLSpaceURIsAttrChecker,
1631     rel => sub { $HTMLLinkTypesAttrChecker->(1, @_) },
1632     media => $HTMLMQAttrChecker,
1633     hreflang => $HTMLLanguageTagAttrChecker,
1634     type => $HTMLIMTAttrChecker,
1635     }->{$attr_ln};
1636     if ($checker) {
1637     $attr{$attr_ln} = $attr;
1638     } else {
1639     $checker = $HTMLAttrChecker->{$attr_ln};
1640     }
1641     }
1642     $checker ||= $AttrChecker->{$attr_ns}->{$attr_ln}
1643     || $AttrChecker->{$attr_ns}->{''};
1644     if ($checker) {
1645     $checker->($self, $attr) if ref $checker;
1646     } else {
1647     $self->{onerror}->(node => $attr, level => 'unsupported',
1648     type => 'attribute');
1649     ## ISSUE: No comformance createria for unknown attributes in the spec
1650     }
1651     }
1652    
1653     unless (defined $attr{href}) {
1654     for (qw/target ping rel media hreflang type/) {
1655     if (defined $attr{$_}) {
1656     $self->{onerror}->(node => $attr{$_},
1657     type => 'attribute not allowed');
1658     }
1659     }
1660     }
1661     },
1662     checker => sub {
1663     my ($self, $todo) = @_;
1664    
1665     my $end = $self->_add_minuses ($HTMLInteractiveElements);
1666     my ($new_todos, $ch)
1667     = $HTMLSignificantInlineOrStrictlyInlineChecker->($self, $todo);
1668     push @$new_todos, $end;
1669    
1670     $_->{flag}->{has_a} = 1 for @$new_todos;
1671    
1672     return ($new_todos, $ch);
1673     },
1674     };
1675    
1676     $Element->{$HTML_NS}->{q} = {
1677     attrs_checker => $GetHTMLAttrsChecker->({
1678     cite => $HTMLURIAttrChecker,
1679     }),
1680     checker => $HTMLInlineOrStrictlyInlineChecker,
1681     };
1682    
1683     $Element->{$HTML_NS}->{cite} = {
1684     attrs_checker => $GetHTMLAttrsChecker->({}),
1685     checker => $HTMLStrictlyInlineChecker,
1686     };
1687    
1688     $Element->{$HTML_NS}->{em} = {
1689     attrs_checker => $GetHTMLAttrsChecker->({}),
1690     checker => $HTMLInlineOrStrictlyInlineChecker,
1691     };
1692    
1693     $Element->{$HTML_NS}->{strong} = {
1694     attrs_checker => $GetHTMLAttrsChecker->({}),
1695     checker => $HTMLInlineOrStrictlyInlineChecker,
1696     };
1697    
1698     $Element->{$HTML_NS}->{small} = {
1699     attrs_checker => $GetHTMLAttrsChecker->({}),
1700     checker => $HTMLInlineOrStrictlyInlineChecker,
1701     };
1702    
1703     $Element->{$HTML_NS}->{m} = {
1704     attrs_checker => $GetHTMLAttrsChecker->({}),
1705     checker => $HTMLInlineOrStrictlyInlineChecker,
1706     };
1707    
1708     $Element->{$HTML_NS}->{dfn} = {
1709     attrs_checker => $GetHTMLAttrsChecker->({}),
1710     checker => sub {
1711     my ($self, $todo) = @_;
1712    
1713     my $end = $self->_add_minuses ({$HTML_NS => {dfn => 1}});
1714     my ($sib, $ch) = $HTMLStrictlyInlineChecker->($self, $todo);
1715     push @$sib, $end;
1716    
1717     my $node = $todo->{node};
1718     my $term = $node->get_attribute_ns (undef, 'title');
1719     unless (defined $term) {
1720     for my $child (@{$node->child_nodes}) {
1721     if ($child->node_type == 1) { # ELEMENT_NODE
1722     if (defined $term) {
1723     undef $term;
1724     last;
1725     } elsif ($child->manakai_local_name eq 'abbr') {
1726     my $nsuri = $child->namespace_uri;
1727     if (defined $nsuri and $nsuri eq $HTML_NS) {
1728     my $attr = $child->get_attribute_node_ns (undef, 'title');
1729     if ($attr) {
1730     $term = $attr->value;
1731     }
1732     }
1733     }
1734     } elsif ($child->node_type == 3 or $child->node_type == 4) {
1735     ## TEXT_NODE or CDATA_SECTION_NODE
1736     if ($child->data =~ /\A[\x09-\x0D\x20]+\z/) { # Inter-element whitespace
1737     next;
1738     }
1739     undef $term;
1740     last;
1741     }
1742     }
1743     unless (defined $term) {
1744     $term = $node->text_content;
1745     }
1746     }
1747     if ($self->{term}->{$term}) {
1748     $self->{onerror}->(node => $node, type => 'duplicate term');
1749     push @{$self->{term}->{$term}}, $node;
1750     } else {
1751     $self->{term}->{$term} = [$node];
1752     }
1753     ## ISSUE: The HTML5 algorithm does not work with |ruby| unless |dfn|
1754     ## has |title|.
1755    
1756     return ($sib, $ch);
1757     },
1758     };
1759    
1760     $Element->{$HTML_NS}->{abbr} = {
1761     attrs_checker => $GetHTMLAttrsChecker->({
1762     ## NOTE: |title| has special semantics for |abbr|s, but is syntactically
1763     ## not different. The spec says that the |title| MAY be omitted
1764     ## if there is a |dfn| whose defining term is the abbreviation,
1765     ## but it does not prohibit |abbr| w/o |title| in other cases.
1766     }),
1767     checker => $HTMLStrictlyInlineChecker,
1768     };
1769    
1770     $Element->{$HTML_NS}->{time} = {
1771     attrs_checker => $GetHTMLAttrsChecker->({
1772     datetime => sub { 1 }, # checked in |checker|
1773     }),
1774     ## TODO: Write tests
1775     checker => sub {
1776     my ($self, $todo) = @_;
1777    
1778     my $attr = $todo->{node}->get_attribute_node_ns (undef, 'datetime');
1779     my $input;
1780     my $reg_sp;
1781     my $input_node;
1782     if ($attr) {
1783     $input = $attr->value;
1784     $reg_sp = qr/[\x09-\x0D\x20]*/;
1785     $input_node = $attr;
1786     } else {
1787     $input = $todo->{node}->text_content;
1788     $reg_sp = qr/\p{Zs}*/;
1789     $input_node = $todo->{node};
1790    
1791     ## ISSUE: What is the definition for "successfully extracts a date
1792     ## or time"? If the algorithm says the string is invalid but
1793     ## return some date or time, is it "successfully"?
1794     }
1795    
1796     my $hour;
1797     my $minute;
1798     my $second;
1799     if ($input =~ /
1800     \A
1801     [\x09-\x0D\x20]*
1802     ([0-9]+) # 1
1803     (?>
1804     -([0-9]+) # 2
1805     -([0-9]+) # 3
1806     [\x09-\x0D\x20]*
1807     (?>
1808     T
1809     [\x09-\x0D\x20]*
1810     )?
1811     ([0-9]+) # 4
1812     :([0-9]+) # 5
1813     (?>
1814     :([0-9]+(?>\.[0-9]*)?|\.[0-9]*) # 6
1815     )?
1816     [\x09-\x0D\x20]*
1817     (?>
1818     Z
1819     [\x09-\x0D\x20]*
1820     |
1821     [+-]([0-9]+):([0-9]+) # 7, 8
1822     [\x09-\x0D\x20]*
1823     )?
1824     \z
1825     |
1826     :([0-9]+) # 9
1827     (?>
1828     :([0-9]+(?>\.[0-9]*)?|\.[0-9]*) # 10
1829     )?
1830     [\x09-\x0D\x20]*\z
1831     )
1832     /x) {
1833     if (defined $2) { ## YYYY-MM-DD T? hh:mm
1834     if (length $1 != 4 or length $2 != 2 or length $3 != 2 or
1835     length $4 != 2 or length $5 != 2) {
1836     $self->{onerror}->(node => $input_node,
1837     type => 'dateortime:syntax error');
1838     }
1839    
1840     if (1 <= $2 and $2 <= 12) {
1841     $self->{onerror}->(node => $input_node, type => 'datetime:bad day')
1842     if $3 < 1 or
1843     $3 > [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]->[$2];
1844     $self->{onerror}->(node => $input_node, type => 'datetime:bad day')
1845     if $2 == 2 and $3 == 29 and
1846     not ($1 % 400 == 0 or ($1 % 4 == 0 and $1 % 100 != 0));
1847     } else {
1848     $self->{onerror}->(node => $input_node,
1849     type => 'datetime:bad month');
1850     }
1851    
1852     ($hour, $minute, $second) = ($4, $5, $6);
1853    
1854     if (defined $7) { ## [+-]hh:mm
1855     if (length $7 != 2 or length $8 != 2) {
1856     $self->{onerror}->(node => $input_node,
1857     type => 'dateortime:syntax error');
1858     }
1859    
1860     $self->{onerror}->(node => $input_node,
1861     type => 'datetime:bad timezone hour')
1862     if $7 > 23;
1863     $self->{onerror}->(node => $input_node,
1864     type => 'datetime:bad timezone minute')
1865     if $8 > 59;
1866     }
1867     } else { ## hh:mm
1868     if (length $1 != 2 or length $9 != 2) {
1869     $self->{onerror}->(node => $input_node,
1870     type => qq'dateortime:syntax error');
1871     }
1872    
1873     ($hour, $minute, $second) = ($1, $9, $10);
1874     }
1875    
1876     $self->{onerror}->(node => $input_node, type => 'datetime:bad hour')
1877     if $hour > 23;
1878     $self->{onerror}->(node => $input_node, type => 'datetime:bad minute')
1879     if $minute > 59;
1880    
1881     if (defined $second) { ## s
1882     ## NOTE: Integer part of second don't have to have length of two.
1883    
1884     if (substr ($second, 0, 1) eq '.') {
1885     $self->{onerror}->(node => $input_node,
1886     type => 'dateortime:syntax error');
1887     }
1888    
1889     $self->{onerror}->(node => $input_node, type => 'datetime:bad second')
1890     if $second >= 60;
1891     }
1892     } else {
1893     $self->{onerror}->(node => $input_node,
1894     type => 'dateortime:syntax error');
1895     }
1896    
1897     return $HTMLStrictlyInlineChecker->($self, $todo);
1898     },
1899     };
1900    
1901     $Element->{$HTML_NS}->{meter} = { ## TODO: "The recommended way of giving the value is to include it as contents of the element"
1902     attrs_checker => $GetHTMLAttrsChecker->({
1903     value => $GetHTMLFloatingPointNumberAttrChecker->(sub { 1 }),
1904     min => $GetHTMLFloatingPointNumberAttrChecker->(sub { 1 }),
1905     low => $GetHTMLFloatingPointNumberAttrChecker->(sub { 1 }),
1906     high => $GetHTMLFloatingPointNumberAttrChecker->(sub { 1 }),
1907     max => $GetHTMLFloatingPointNumberAttrChecker->(sub { 1 }),
1908     optimum => $GetHTMLFloatingPointNumberAttrChecker->(sub { 1 }),
1909     }),
1910     checker => $HTMLStrictlyInlineChecker,
1911     };
1912    
1913     $Element->{$HTML_NS}->{progress} = { ## TODO: recommended to use content
1914     attrs_checker => $GetHTMLAttrsChecker->({
1915     value => $GetHTMLFloatingPointNumberAttrChecker->(sub { shift >= 0 }),
1916     max => $GetHTMLFloatingPointNumberAttrChecker->(sub { shift > 0 }),
1917     }),
1918     checker => $HTMLStrictlyInlineChecker,
1919     };
1920    
1921     $Element->{$HTML_NS}->{code} = {
1922     attrs_checker => $GetHTMLAttrsChecker->({}),
1923     ## NOTE: Though |title| has special semantics,
1924     ## syntatically same as the |title| as global attribute.
1925     checker => $HTMLInlineOrStrictlyInlineChecker,
1926     };
1927    
1928     $Element->{$HTML_NS}->{var} = {
1929     attrs_checker => $GetHTMLAttrsChecker->({}),
1930     ## NOTE: Though |title| has special semantics,
1931     ## syntatically same as the |title| as global attribute.
1932     checker => $HTMLStrictlyInlineChecker,
1933     };
1934    
1935     $Element->{$HTML_NS}->{samp} = {
1936     attrs_checker => $GetHTMLAttrsChecker->({}),
1937     ## NOTE: Though |title| has special semantics,
1938     ## syntatically same as the |title| as global attribute.
1939     checker => $HTMLInlineOrStrictlyInlineChecker,
1940     };
1941    
1942     $Element->{$HTML_NS}->{kbd} = {
1943     attrs_checker => $GetHTMLAttrsChecker->({}),
1944     checker => $HTMLStrictlyInlineChecker,
1945     };
1946    
1947     $Element->{$HTML_NS}->{sub} = {
1948     attrs_checker => $GetHTMLAttrsChecker->({}),
1949     checker => $HTMLStrictlyInlineChecker,
1950     };
1951    
1952     $Element->{$HTML_NS}->{sup} = {
1953     attrs_checker => $GetHTMLAttrsChecker->({}),
1954     checker => $HTMLStrictlyInlineChecker,
1955     };
1956    
1957     $Element->{$HTML_NS}->{span} = {
1958     attrs_checker => $GetHTMLAttrsChecker->({}),
1959     ## NOTE: Though |title| has special semantics,
1960     ## syntatically same as the |title| as global attribute.
1961     checker => $HTMLInlineOrStrictlyInlineChecker,
1962     };
1963    
1964     $Element->{$HTML_NS}->{i} = {
1965     attrs_checker => $GetHTMLAttrsChecker->({}),
1966     ## NOTE: Though |title| has special semantics,
1967     ## syntatically same as the |title| as global attribute.
1968     checker => $HTMLStrictlyInlineChecker,
1969     };
1970    
1971     $Element->{$HTML_NS}->{b} = {
1972     attrs_checker => $GetHTMLAttrsChecker->({}),
1973     checker => $HTMLStrictlyInlineChecker,
1974     };
1975    
1976     $Element->{$HTML_NS}->{bdo} = {
1977     attrs_checker => sub {
1978     my ($self, $todo) = @_;
1979     $GetHTMLAttrsChecker->({})->($self, $todo);
1980     unless ($todo->{node}->has_attribute_ns (undef, 'dir')) {
1981     $self->{onerror}->(node => $todo->{node}, type => 'attribute missing:dir');
1982     }
1983     },
1984     ## ISSUE: The spec does not directly say that |dir| is a enumerated attr.
1985     checker => $HTMLStrictlyInlineChecker,
1986     };
1987    
1988     $Element->{$HTML_NS}->{ins} = {
1989     attrs_checker => $GetHTMLAttrsChecker->({
1990     cite => $HTMLURIAttrChecker,
1991     datetime => $HTMLDatetimeAttrChecker,
1992     }),
1993     checker => $HTMLTransparentChecker,
1994     };
1995    
1996     $Element->{$HTML_NS}->{del} = {
1997     attrs_checker => $GetHTMLAttrsChecker->({
1998     cite => $HTMLURIAttrChecker,
1999     datetime => $HTMLDatetimeAttrChecker,
2000     }),
2001     checker => sub {
2002     my ($self, $todo) = @_;
2003    
2004     my $parent = $todo->{node}->manakai_parent_element;
2005     if (defined $parent) {
2006     my $nsuri = $parent->namespace_uri;
2007     $nsuri = '' unless defined $nsuri;
2008     my $ln = $parent->manakai_local_name;
2009     my $eldef = $Element->{$nsuri}->{$ln} ||
2010     $Element->{$nsuri}->{''} ||
2011     $ElementDefault;
2012     return $eldef->{checker}->($self, $todo);
2013     } else {
2014     return $HTMLBlockOrInlineChecker->($self, $todo);
2015     }
2016     },
2017     };
2018    
2019     ## TODO: figure
2020    
2021     $Element->{$HTML_NS}->{img} = {
2022     attrs_checker => sub {
2023     my ($self, $todo) = @_;
2024     $GetHTMLAttrsChecker->({
2025     alt => sub { }, ## NOTE: No syntactical requirement
2026     src => $HTMLURIAttrChecker,
2027     usemap => $HTMLUsemapAttrChecker,
2028     ismap => sub {
2029     my ($self, $attr, $parent_todo) = @_;
2030     if (not $todo->{flag}->{has_a}) {
2031     $self->{onerror}->(node => $attr, type => 'attribute not allowed');
2032     }
2033     $GetHTMLBooleanAttrChecker->('ismap')->($self, $attr, $parent_todo);
2034     },
2035     ## TODO: height
2036     ## TODO: width
2037     })->($self, $todo);
2038     unless ($todo->{node}->has_attribute_ns (undef, 'alt')) {
2039     $self->{onerror}->(node => $todo->{node}, type => 'attribute missing:alt');
2040     }
2041     unless ($todo->{node}->has_attribute_ns (undef, 'src')) {
2042     $self->{onerror}->(node => $todo->{node}, type => 'attribute missing:src');
2043     }
2044     },
2045     checker => $HTMLEmptyChecker,
2046     };
2047    
2048     $Element->{$HTML_NS}->{iframe} = {
2049     attrs_checker => $GetHTMLAttrsChecker->({
2050     src => $HTMLURIAttrChecker,
2051     }),
2052     checker => $HTMLTextChecker,
2053     };
2054    
2055     $Element->{$HTML_NS}->{embed} = {
2056     attrs_checker => sub {
2057     my ($self, $todo) = @_;
2058     my $has_src;
2059     for my $attr (@{$todo->{node}->attributes}) {
2060     my $attr_ns = $attr->namespace_uri;
2061     $attr_ns = '' unless defined $attr_ns;
2062     my $attr_ln = $attr->manakai_local_name;
2063     my $checker;
2064     if ($attr_ns eq '') {
2065     if ($attr_ln eq 'src') {
2066     $checker = $HTMLURIAttrChecker;
2067     $has_src = 1;
2068     } elsif ($attr_ln eq 'type') {
2069     $checker = $HTMLIMTAttrChecker;
2070     } else {
2071     ## TODO: height
2072     ## TODO: width
2073     $checker = $HTMLAttrChecker->{$attr_ln}
2074     || sub { }; ## NOTE: Any local attribute is ok.
2075     }
2076     }
2077     $checker ||= $AttrChecker->{$attr_ns}->{$attr_ln}
2078     || $AttrChecker->{$attr_ns}->{''};
2079     if ($checker) {
2080     $checker->($self, $attr);
2081     } else {
2082     $self->{onerror}->(node => $attr, level => 'unsupported',
2083     type => 'attribute');
2084     ## ISSUE: No comformance createria for global attributes in the spec
2085     }
2086     }
2087    
2088     unless ($has_src) {
2089     $self->{onerror}->(node => $todo->{node},
2090     type => 'attribute missing:src');
2091     }
2092     },
2093     checker => $HTMLEmptyChecker,
2094     };
2095    
2096     $Element->{$HTML_NS}->{object} = {
2097     attrs_checker => sub {
2098     my ($self, $todo) = @_;
2099     $GetHTMLAttrsChecker->({
2100     data => $HTMLURIAttrChecker,
2101     type => $HTMLIMTAttrChecker,
2102     usemap => $HTMLUsemapAttrChecker,
2103     ## TODO: width
2104     ## TODO: height
2105     })->($self, $todo);
2106     unless ($todo->{node}->has_attribute_ns (undef, 'data')) {
2107     unless ($todo->{node}->has_attribute_ns (undef, 'type')) {
2108     $self->{onerror}->(node => $todo->{node},
2109     type => 'attribute missing:data|type');
2110     }
2111     }
2112     },
2113     checker => $ElementDefault->{checker}, ## TODO
2114     };
2115    
2116     $Element->{$HTML_NS}->{param} = {
2117     attrs_checker => sub {
2118     my ($self, $todo) = @_;
2119     $GetHTMLAttrsChecker->({
2120     name => sub { },
2121     value => sub { },
2122     })->($self, $todo);
2123     unless ($todo->{node}->has_attribute_ns (undef, 'name')) {
2124     $self->{onerror}->(node => $todo->{node},
2125     type => 'attribute missing:name');
2126     }
2127     unless ($todo->{node}->has_attribute_ns (undef, 'value')) {
2128     $self->{onerror}->(node => $todo->{node},
2129     type => 'attribute missing:value');
2130     }
2131     },
2132     checker => $HTMLEmptyChecker,
2133     };
2134    
2135     $Element->{$HTML_NS}->{video} = {
2136     attrs_checker => $GetHTMLAttrsChecker->({
2137     src => $HTMLURIAttrChecker,
2138     ## TODO: start, loopstart, loopend, end
2139     ## ISSUE: they MUST be "value time offset"s. Value?
2140     ## ISSUE: loopcount has no conformance creteria
2141     autoplay => $GetHTMLBooleanAttrChecker->('autoplay'),
2142     controls => $GetHTMLBooleanAttrChecker->('controls'),
2143     }),
2144     checker => sub {
2145     my ($self, $todo) = @_;
2146    
2147     if ($todo->{node}->has_attribute_ns (undef, 'src')) {
2148     return $HTMLBlockOrInlineChecker->($self, $todo);
2149     } else {
2150     return $GetHTMLZeroOrMoreThenBlockOrInlineChecker->($HTML_NS, 'source')
2151     ->($self, $todo);
2152     }
2153     },
2154     };
2155    
2156     $Element->{$HTML_NS}->{audio} = {
2157     attrs_checker => $Element->{$HTML_NS}->{video}->{attrs_checker},
2158     checker => $Element->{$HTML_NS}->{video}->{checker},
2159     };
2160    
2161     $Element->{$HTML_NS}->{source} = {
2162     attrs_checker => sub {
2163     my ($self, $todo) = @_;
2164     $GetHTMLAttrsChecker->({
2165     src => $HTMLURIAttrChecker,
2166     type => $HTMLIMTAttrChecker,
2167     media => $HTMLMQAttrChecker,
2168     })->($self, $todo);
2169     unless ($todo->{node}->has_attribute_ns (undef, 'src')) {
2170     $self->{onerror}->(node => $todo->{node},
2171     type => 'attribute missing:src');
2172     }
2173     },
2174     checker => $HTMLEmptyChecker,
2175     };
2176    
2177     $Element->{$HTML_NS}->{canvas} = {
2178     attrs_checker => $GetHTMLAttrsChecker->({
2179     height => $GetHTMLNonNegativeIntegerAttrChecker->(sub { 1 }),
2180     width => $GetHTMLNonNegativeIntegerAttrChecker->(sub { 1 }),
2181     }),
2182     checker => $HTMLInlineChecker,
2183     };
2184    
2185     $Element->{$HTML_NS}->{map} = {
2186     attrs_checker => $GetHTMLAttrsChecker->({
2187     id => sub {
2188     ## NOTE: same as global |id=""|, with |$self->{map}| registeration
2189     my ($self, $attr) = @_;
2190     my $value = $attr->value;
2191     if (length $value > 0) {
2192     if ($self->{id}->{$value}) {
2193     $self->{onerror}->(node => $attr, type => 'duplicate ID');
2194     push @{$self->{id}->{$value}}, $attr;
2195     } else {
2196     $self->{id}->{$value} = [$attr];
2197     }
2198     } else {
2199     ## NOTE: MUST contain at least one character
2200     $self->{onerror}->(node => $attr, type => 'empty attribute value');
2201     }
2202     if ($value =~ /[\x09-\x0D\x20]/) {
2203     $self->{onerror}->(node => $attr, type => 'space in ID');
2204     }
2205     $self->{map}->{$value} ||= $attr;
2206     },
2207     }),
2208     checker => $HTMLBlockChecker,
2209 wakaba 1.2 ## TODO: |id| is required.
2210 wakaba 1.1 };
2211    
2212     $Element->{$HTML_NS}->{area} = {
2213     attrs_checker => sub {
2214     my ($self, $todo) = @_;
2215     my %attr;
2216     my $coords;
2217     for my $attr (@{$todo->{node}->attributes}) {
2218     my $attr_ns = $attr->namespace_uri;
2219     $attr_ns = '' unless defined $attr_ns;
2220     my $attr_ln = $attr->manakai_local_name;
2221     my $checker;
2222     if ($attr_ns eq '') {
2223     $checker = {
2224     alt => sub { },
2225     ## NOTE: |alt| value has no conformance creteria.
2226     shape => $GetHTMLEnumeratedAttrChecker->({
2227     circ => -1, circle => 1,
2228     default => 1,
2229     poly => 1, polygon => -1,
2230     rect => 1, rectangle => -1,
2231     }),
2232     coords => sub {
2233     my ($self, $attr) = @_;
2234     my $value = $attr->value;
2235     if ($value =~ /\A-?[0-9]+(?>,-?[0-9]+)*\z/) {
2236     $coords = [split /,/, $value];
2237     } else {
2238     $self->{onerror}->(node => $attr,
2239     type => 'coords:syntax error');
2240     }
2241     },
2242     target => $HTMLTargetAttrChecker,
2243     href => $HTMLURIAttrChecker,
2244     ping => $HTMLSpaceURIsAttrChecker,
2245     rel => sub { $HTMLLinkTypesAttrChecker->(1, @_) },
2246     media => $HTMLMQAttrChecker,
2247     hreflang => $HTMLLanguageTagAttrChecker,
2248     type => $HTMLIMTAttrChecker,
2249     }->{$attr_ln};
2250     if ($checker) {
2251     $attr{$attr_ln} = $attr;
2252     } else {
2253     $checker = $HTMLAttrChecker->{$attr_ln};
2254     }
2255     }
2256     $checker ||= $AttrChecker->{$attr_ns}->{$attr_ln}
2257     || $AttrChecker->{$attr_ns}->{''};
2258     if ($checker) {
2259     $checker->($self, $attr) if ref $checker;
2260     } else {
2261     $self->{onerror}->(node => $attr, level => 'unsupported',
2262     type => 'attribute');
2263     ## ISSUE: No comformance createria for unknown attributes in the spec
2264     }
2265     }
2266    
2267     if (defined $attr{href}) {
2268     unless (defined $attr{alt}) {
2269     $self->{onerror}->(node => $todo->{node},
2270     type => 'attribute missing:alt');
2271     }
2272     } else {
2273     for (qw/target ping rel media hreflang type alt/) {
2274     if (defined $attr{$_}) {
2275     $self->{onerror}->(node => $attr{$_},
2276     type => 'attribute not allowed');
2277     }
2278     }
2279     }
2280    
2281     my $shape = 'rectangle';
2282     if (defined $attr{shape}) {
2283     $shape = {
2284     circ => 'circle', circle => 'circle',
2285     default => 'default',
2286     poly => 'polygon', polygon => 'polygon',
2287     rect => 'rectangle', rectangle => 'rectangle',
2288     }->{lc $attr{shape}->value} || 'rectangle';
2289     ## TODO: ASCII lowercase?
2290     }
2291    
2292     if ($shape eq 'circle') {
2293     if (defined $attr{coords}) {
2294     if (defined $coords) {
2295     if (@$coords == 3) {
2296     if ($coords->[2] < 0) {
2297     $self->{onerror}->(node => $attr{coords},
2298     type => 'coords:out of range:2');
2299     }
2300     } else {
2301     $self->{onerror}->(node => $attr{coords},
2302     type => 'coords:number:3:'.@$coords);
2303     }
2304     } else {
2305     ## NOTE: A syntax error has been reported.
2306     }
2307     } else {
2308     $self->{onerror}->(node => $todo->{node},
2309     type => 'attribute missing:coords');
2310     }
2311     } elsif ($shape eq 'default') {
2312     if (defined $attr{coords}) {
2313     $self->{onerror}->(node => $attr{coords},
2314     type => 'attribute not allowed');
2315     }
2316     } elsif ($shape eq 'polygon') {
2317     if (defined $attr{coords}) {
2318     if (defined $coords) {
2319     if (@$coords >= 6) {
2320     unless (@$coords % 2 == 0) {
2321     $self->{onerror}->(node => $attr{coords},
2322     type => 'coords:number:even:'.@$coords);
2323     }
2324     } else {
2325     $self->{onerror}->(node => $attr{coords},
2326     type => 'coords:number:>=6:'.@$coords);
2327     }
2328     } else {
2329     ## NOTE: A syntax error has been reported.
2330     }
2331     } else {
2332     $self->{onerror}->(node => $todo->{node},
2333     type => 'attribute missing:coords');
2334     }
2335     } elsif ($shape eq 'rectangle') {
2336     if (defined $attr{coords}) {
2337     if (defined $coords) {
2338     if (@$coords == 4) {
2339     unless ($coords->[0] < $coords->[2]) {
2340     $self->{onerror}->(node => $attr{coords},
2341     type => 'coords:out of range:0');
2342     }
2343     unless ($coords->[1] < $coords->[3]) {
2344     $self->{onerror}->(node => $attr{coords},
2345     type => 'coords:out of range:1');
2346     }
2347     } else {
2348     $self->{onerror}->(node => $attr{coords},
2349     type => 'coords:number:4:'.@$coords);
2350     }
2351     } else {
2352     ## NOTE: A syntax error has been reported.
2353     }
2354     } else {
2355     $self->{onerror}->(node => $todo->{node},
2356     type => 'attribute missing:coords');
2357     }
2358     }
2359     },
2360     checker => $HTMLEmptyChecker,
2361     };
2362     ## TODO: only in map
2363    
2364     $Element->{$HTML_NS}->{table} = {
2365     attrs_checker => $GetHTMLAttrsChecker->({}),
2366     checker => sub {
2367     my ($self, $todo) = @_;
2368     my $el = $todo->{node};
2369     my $new_todos = [];
2370     my @nodes = (@{$el->child_nodes});
2371    
2372     my $phase = 'before caption';
2373     my $has_tfoot;
2374     while (@nodes) {
2375     my $node = shift @nodes;
2376     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
2377    
2378     my $nt = $node->node_type;
2379     if ($nt == 1) {
2380     my $node_ns = $node->namespace_uri;
2381     $node_ns = '' unless defined $node_ns;
2382     my $node_ln = $node->manakai_local_name;
2383     ## NOTE: |minuses| list is not checked since redundant
2384     if ($phase eq 'in tbodys') {
2385     if ($node_ns eq $HTML_NS and $node_ln eq 'tbody') {
2386     #$phase = 'in tbodys';
2387     } elsif (not $has_tfoot and
2388     $node_ns eq $HTML_NS and $node_ln eq 'tfoot') {
2389     $phase = 'after tfoot';
2390     $has_tfoot = 1;
2391     } else {
2392     $self->{onerror}->(node => $node, type => 'element not allowed');
2393     }
2394     } elsif ($phase eq 'in trs') {
2395     if ($node_ns eq $HTML_NS and $node_ln eq 'tr') {
2396     #$phase = 'in trs';
2397     } elsif (not $has_tfoot and
2398     $node_ns eq $HTML_NS and $node_ln eq 'tfoot') {
2399     $phase = 'after tfoot';
2400     $has_tfoot = 1;
2401     } else {
2402     $self->{onerror}->(node => $node, type => 'element not allowed');
2403     }
2404     } elsif ($phase eq 'after thead') {
2405     if ($node_ns eq $HTML_NS and $node_ln eq 'tbody') {
2406     $phase = 'in tbodys';
2407     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tr') {
2408     $phase = 'in trs';
2409     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tfoot') {
2410     $phase = 'in tbodys';
2411     $has_tfoot = 1;
2412     } else {
2413     $self->{onerror}->(node => $node, type => 'element not allowed');
2414     }
2415     } elsif ($phase eq 'in colgroup') {
2416     if ($node_ns eq $HTML_NS and $node_ln eq 'colgroup') {
2417     $phase = 'in colgroup';
2418     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'thead') {
2419     $phase = 'after thead';
2420     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tbody') {
2421     $phase = 'in tbodys';
2422     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tr') {
2423     $phase = 'in trs';
2424     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tfoot') {
2425     $phase = 'in tbodys';
2426     $has_tfoot = 1;
2427     } else {
2428     $self->{onerror}->(node => $node, type => 'element not allowed');
2429     }
2430     } elsif ($phase eq 'before caption') {
2431     if ($node_ns eq $HTML_NS and $node_ln eq 'caption') {
2432     $phase = 'in colgroup';
2433     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'colgroup') {
2434     $phase = 'in colgroup';
2435     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'thead') {
2436     $phase = 'after thead';
2437     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tbody') {
2438     $phase = 'in tbodys';
2439     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tr') {
2440     $phase = 'in trs';
2441     } elsif ($node_ns eq $HTML_NS and $node_ln eq 'tfoot') {
2442     $phase = 'in tbodys';
2443     $has_tfoot = 1;
2444     } else {
2445     $self->{onerror}->(node => $node, type => 'element not allowed');
2446     }
2447     } else { # after tfoot
2448     $self->{onerror}->(node => $node, type => 'element not allowed');
2449     }
2450     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
2451     unshift @nodes, @$sib;
2452     push @$new_todos, @$ch;
2453     } elsif ($nt == 3 or $nt == 4) {
2454     if ($node->data =~ /[^\x09-\x0D\x20]/) {
2455     $self->{onerror}->(node => $node, type => 'character not allowed');
2456     }
2457     } elsif ($nt == 5) {
2458     unshift @nodes, @{$node->child_nodes};
2459     }
2460     }
2461    
2462     ## Table model errors
2463     require Whatpm::HTMLTable;
2464     Whatpm::HTMLTable->form_table ($todo->{node}, sub {
2465     my %opt = @_;
2466     $self->{onerror}->(type => 'table:'.$opt{type}, node => $opt{node});
2467     });
2468     push @{$self->{return}->{table}}, $todo->{node};
2469    
2470     return ($new_todos);
2471     },
2472     };
2473    
2474     $Element->{$HTML_NS}->{caption} = {
2475     attrs_checker => $GetHTMLAttrsChecker->({}),
2476     checker => $HTMLSignificantStrictlyInlineChecker,
2477     };
2478    
2479     $Element->{$HTML_NS}->{colgroup} = {
2480     attrs_checker => $GetHTMLAttrsChecker->({
2481     span => $GetHTMLNonNegativeIntegerAttrChecker->(sub { shift > 0 }),
2482     ## NOTE: Defined only if "the |colgroup| element contains no |col| elements"
2483     ## TODO: "attribute not supported" if |col|.
2484     ## ISSUE: MUST NOT if any |col|?
2485     ## ISSUE: MUST NOT for |<colgroup span="1"><any><col/></any></colgroup>| (though non-conforming)?
2486     }),
2487     checker => sub {
2488     my ($self, $todo) = @_;
2489     my $el = $todo->{node};
2490     my $new_todos = [];
2491     my @nodes = (@{$el->child_nodes});
2492    
2493     while (@nodes) {
2494     my $node = shift @nodes;
2495     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
2496    
2497     my $nt = $node->node_type;
2498     if ($nt == 1) {
2499     my $node_ns = $node->namespace_uri;
2500     $node_ns = '' unless defined $node_ns;
2501     my $node_ln = $node->manakai_local_name;
2502     ## NOTE: |minuses| list is not checked since redundant
2503     unless ($node_ns eq $HTML_NS and $node_ln eq 'col') {
2504     $self->{onerror}->(node => $node, type => 'element not allowed');
2505     }
2506     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
2507     unshift @nodes, @$sib;
2508     push @$new_todos, @$ch;
2509     } elsif ($nt == 3 or $nt == 4) {
2510     if ($node->data =~ /[^\x09-\x0D\x20]/) {
2511     $self->{onerror}->(node => $node, type => 'character not allowed');
2512     }
2513     } elsif ($nt == 5) {
2514     unshift @nodes, @{$node->child_nodes};
2515     }
2516     }
2517     return ($new_todos);
2518     },
2519     };
2520    
2521     $Element->{$HTML_NS}->{col} = {
2522     attrs_checker => $GetHTMLAttrsChecker->({
2523     span => $GetHTMLNonNegativeIntegerAttrChecker->(sub { shift > 0 }),
2524     }),
2525     checker => $HTMLEmptyChecker,
2526     };
2527    
2528     $Element->{$HTML_NS}->{tbody} = {
2529     attrs_checker => $GetHTMLAttrsChecker->({}),
2530     checker => sub {
2531     my ($self, $todo) = @_;
2532     my $el = $todo->{node};
2533     my $new_todos = [];
2534     my @nodes = (@{$el->child_nodes});
2535    
2536     my $has_tr;
2537     while (@nodes) {
2538     my $node = shift @nodes;
2539     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
2540    
2541     my $nt = $node->node_type;
2542     if ($nt == 1) {
2543     my $node_ns = $node->namespace_uri;
2544     $node_ns = '' unless defined $node_ns;
2545     my $node_ln = $node->manakai_local_name;
2546     ## NOTE: |minuses| list is not checked since redundant
2547     if ($node_ns eq $HTML_NS and $node_ln eq 'tr') {
2548     $has_tr = 1;
2549     } else {
2550     $self->{onerror}->(node => $node, type => 'element not allowed');
2551     }
2552     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
2553     unshift @nodes, @$sib;
2554     push @$new_todos, @$ch;
2555     } elsif ($nt == 3 or $nt == 4) {
2556     if ($node->data =~ /[^\x09-\x0D\x20]/) {
2557     $self->{onerror}->(node => $node, type => 'character not allowed');
2558     }
2559     } elsif ($nt == 5) {
2560     unshift @nodes, @{$node->child_nodes};
2561     }
2562     }
2563     unless ($has_tr) {
2564     $self->{onerror}->(node => $el, type => 'child element missing:tr');
2565     }
2566     return ($new_todos);
2567     },
2568     };
2569    
2570     $Element->{$HTML_NS}->{thead} = {
2571     attrs_checker => $GetHTMLAttrsChecker->({}),
2572     checker => $Element->{$HTML_NS}->{tbody}->{checker},
2573     };
2574    
2575     $Element->{$HTML_NS}->{tfoot} = {
2576     attrs_checker => $GetHTMLAttrsChecker->({}),
2577     checker => $Element->{$HTML_NS}->{tbody}->{checker},
2578     };
2579    
2580     $Element->{$HTML_NS}->{tr} = {
2581     attrs_checker => $GetHTMLAttrsChecker->({}),
2582     checker => sub {
2583     my ($self, $todo) = @_;
2584     my $el = $todo->{node};
2585     my $new_todos = [];
2586     my @nodes = (@{$el->child_nodes});
2587    
2588     my $has_td;
2589     while (@nodes) {
2590     my $node = shift @nodes;
2591     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
2592    
2593     my $nt = $node->node_type;
2594     if ($nt == 1) {
2595     my $node_ns = $node->namespace_uri;
2596     $node_ns = '' unless defined $node_ns;
2597     my $node_ln = $node->manakai_local_name;
2598     ## NOTE: |minuses| list is not checked since redundant
2599     if ($node_ns eq $HTML_NS and ($node_ln eq 'td' or $node_ln eq 'th')) {
2600     $has_td = 1;
2601     } else {
2602     $self->{onerror}->(node => $node, type => 'element not allowed');
2603     }
2604     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
2605     unshift @nodes, @$sib;
2606     push @$new_todos, @$ch;
2607     } elsif ($nt == 3 or $nt == 4) {
2608     if ($node->data =~ /[^\x09-\x0D\x20]/) {
2609     $self->{onerror}->(node => $node, type => 'character not allowed');
2610     }
2611     } elsif ($nt == 5) {
2612     unshift @nodes, @{$node->child_nodes};
2613     }
2614     }
2615     unless ($has_td) {
2616     $self->{onerror}->(node => $el, type => 'child element missing:td|th');
2617     }
2618     return ($new_todos);
2619     },
2620     };
2621    
2622     $Element->{$HTML_NS}->{td} = {
2623     attrs_checker => $GetHTMLAttrsChecker->({
2624     colspan => $GetHTMLNonNegativeIntegerAttrChecker->(sub { shift > 0 }),
2625     rowspan => $GetHTMLNonNegativeIntegerAttrChecker->(sub { shift > 0 }),
2626     }),
2627     checker => $HTMLBlockOrInlineChecker,
2628     };
2629    
2630     $Element->{$HTML_NS}->{th} = {
2631     attrs_checker => $GetHTMLAttrsChecker->({
2632     colspan => $GetHTMLNonNegativeIntegerAttrChecker->(sub { shift > 0 }),
2633     rowspan => $GetHTMLNonNegativeIntegerAttrChecker->(sub { shift > 0 }),
2634     scope => $GetHTMLEnumeratedAttrChecker
2635     ->({row => 1, col => 1, rowgroup => 1, colgroup => 1}),
2636     }),
2637     checker => $HTMLBlockOrInlineChecker,
2638     };
2639    
2640     ## TODO: forms
2641    
2642     $Element->{$HTML_NS}->{script} = {
2643     attrs_checker => sub {
2644     my ($self, $todo) = @_;
2645     $GetHTMLAttrsChecker->({
2646     src => $HTMLURIAttrChecker,
2647     defer => $GetHTMLBooleanAttrChecker->('defer'),
2648     async => $GetHTMLBooleanAttrChecker->('async'),
2649     type => $HTMLIMTAttrChecker,
2650     })->($self, $todo);
2651     if ($todo->{node}->has_attribute_ns (undef, 'defer')) {
2652     my $async_attr = $todo->{node}->get_attribute_node_ns (undef, 'async');
2653     if ($async_attr) {
2654     $self->{onerror}->(node => $async_attr,
2655     type => 'attribute not allowed'); # MUST NOT
2656     }
2657     }
2658     },
2659     checker => sub {
2660     my ($self, $todo) = @_;
2661    
2662     if ($todo->{node}->has_attribute_ns (undef, 'src')) {
2663     return $HTMLEmptyChecker->($self, $todo);
2664     } else {
2665     ## NOTE: No content model conformance in HTML5 spec.
2666     my $type = $todo->{node}->get_attribute_ns (undef, 'type');
2667     my $language = $todo->{node}->get_attribute_ns (undef, 'language');
2668     if ((defined $type and $type eq '') or
2669     (defined $language and $language eq '')) {
2670     $type = 'text/javascript';
2671     } elsif (defined $type) {
2672     #
2673     } elsif (defined $language) {
2674     $type = 'text/' . $language;
2675     } else {
2676     $type = 'text/javascript';
2677     }
2678     $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
2679     type => 'script:'.$type); ## TODO: $type normalization
2680     return $AnyChecker->($self, $todo);
2681     }
2682     },
2683     };
2684    
2685     ## NOTE: When script is disabled.
2686     $Element->{$HTML_NS}->{noscript} = {
2687     attrs_checker => $GetHTMLAttrsChecker->({}),
2688     checker => sub {
2689     my ($self, $todo) = @_;
2690    
2691     my $end = $self->_add_minuses ({$HTML_NS => {noscript => 1}});
2692     my ($sib, $ch) = $HTMLBlockOrInlineChecker->($self, $todo);
2693     push @$sib, $end;
2694     return ($sib, $ch);
2695     },
2696     };
2697     ## TODO: noscript in head
2698     ## TODO: noscript in XHTML
2699    
2700     $Element->{$HTML_NS}->{'event-source'} = {
2701     attrs_checker => $GetHTMLAttrsChecker->({
2702     src => $HTMLURIAttrChecker,
2703     }),
2704     checker => $HTMLEmptyChecker,
2705     };
2706    
2707     $Element->{$HTML_NS}->{details} = {
2708     attrs_checker => $GetHTMLAttrsChecker->({
2709     open => $GetHTMLBooleanAttrChecker->('open'),
2710     }),
2711     checker => sub {
2712     my ($self, $todo) = @_;
2713    
2714     my $end = $self->_add_minuses ({$HTML_NS => {a => 1, datagrid => 1}});
2715     my ($sib, $ch)
2716     = $GetHTMLZeroOrMoreThenBlockOrInlineChecker->($HTML_NS, 'legend')
2717     ->($self, $todo);
2718     push @$sib, $end;
2719     return ($sib, $ch);
2720     },
2721     };
2722    
2723     $Element->{$HTML_NS}->{datagrid} = {
2724     attrs_checker => $GetHTMLAttrsChecker->({
2725     disabled => $GetHTMLBooleanAttrChecker->('disabled'),
2726     multiple => $GetHTMLBooleanAttrChecker->('multiple'),
2727     }),
2728     checker => sub {
2729     my ($self, $todo) = @_;
2730     my $el = $todo->{node};
2731     my $new_todos = [];
2732     my @nodes = (@{$el->child_nodes});
2733    
2734     my $end = $self->_add_minuses ({$HTML_NS => {a => 1, datagrid => 1}});
2735    
2736     ## Block-table Block* | table | select | datalist | Empty
2737     my $mode = 'any';
2738     while (@nodes) {
2739     my $node = shift @nodes;
2740     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
2741    
2742     my $nt = $node->node_type;
2743     if ($nt == 1) {
2744     my $node_ns = $node->namespace_uri;
2745     $node_ns = '' unless defined $node_ns;
2746     my $node_ln = $node->manakai_local_name;
2747     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
2748     if ($mode eq 'block') {
2749     $not_allowed = 1
2750     unless $HTMLBlockLevelElements->{$node_ns}->{$node_ln};
2751     } elsif ($mode eq 'any') {
2752     if ($node_ns eq $HTML_NS and
2753     {table => 1, select => 1, datalist => 1}->{$node_ln}) {
2754     $mode = 'none';
2755     } elsif ($HTMLBlockLevelElements->{$node_ns}->{$node_ln}) {
2756     $mode = 'block';
2757     } else {
2758     $not_allowed = 1;
2759     }
2760     } else {
2761     $not_allowed = 1;
2762     }
2763     $self->{onerror}->(node => $node, type => 'element not allowed')
2764     if $not_allowed;
2765     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
2766     unshift @nodes, @$sib;
2767     push @$new_todos, @$ch;
2768     } elsif ($nt == 3 or $nt == 4) {
2769     if ($node->data =~ /[^\x09-\x0D\x20]/) {
2770     $self->{onerror}->(node => $node, type => 'character not allowed');
2771     }
2772     } elsif ($nt == 5) {
2773     unshift @nodes, @{$node->child_nodes};
2774     }
2775     }
2776    
2777     push @$new_todos, $end;
2778     return ($new_todos);
2779     },
2780     };
2781    
2782     $Element->{$HTML_NS}->{command} = {
2783     attrs_checker => $GetHTMLAttrsChecker->({
2784     checked => $GetHTMLBooleanAttrChecker->('checked'),
2785     default => $GetHTMLBooleanAttrChecker->('default'),
2786     disabled => $GetHTMLBooleanAttrChecker->('disabled'),
2787     hidden => $GetHTMLBooleanAttrChecker->('hidden'),
2788     icon => $HTMLURIAttrChecker,
2789     label => sub { }, ## NOTE: No conformance creteria
2790     radiogroup => sub { }, ## NOTE: No conformance creteria
2791     ## NOTE: |title| has special semantics, but no syntactical difference
2792     type => sub {
2793     my ($self, $attr) = @_;
2794     my $value = $attr->value;
2795     unless ({command => 1, checkbox => 1, radio => 1}->{$value}) {
2796     $self->{onerror}->(node => $attr, type => 'attribute value not allowed');
2797     }
2798     },
2799     }),
2800     checker => $HTMLEmptyChecker,
2801     };
2802    
2803     $Element->{$HTML_NS}->{menu} = {
2804     attrs_checker => $GetHTMLAttrsChecker->({
2805     autosubmit => $GetHTMLBooleanAttrChecker->('autosubmit'),
2806     id => sub {
2807     ## NOTE: same as global |id=""|, with |$self->{menu}| registeration
2808     my ($self, $attr) = @_;
2809     my $value = $attr->value;
2810     if (length $value > 0) {
2811     if ($self->{id}->{$value}) {
2812     $self->{onerror}->(node => $attr, type => 'duplicate ID');
2813     push @{$self->{id}->{$value}}, $attr;
2814     } else {
2815     $self->{id}->{$value} = [$attr];
2816     }
2817     } else {
2818     ## NOTE: MUST contain at least one character
2819     $self->{onerror}->(node => $attr, type => 'empty attribute value');
2820     }
2821     if ($value =~ /[\x09-\x0D\x20]/) {
2822     $self->{onerror}->(node => $attr, type => 'space in ID');
2823     }
2824     $self->{menu}->{$value} ||= $attr;
2825     ## ISSUE: <menu id=""><p contextmenu=""> match?
2826     },
2827     label => sub { }, ## NOTE: No conformance creteria
2828     type => $GetHTMLEnumeratedAttrChecker->({context => 1, toolbar => 1}),
2829     }),
2830     checker => sub {
2831     my ($self, $todo) = @_;
2832     my $el = $todo->{node};
2833     my $new_todos = [];
2834     my @nodes = (@{$el->child_nodes});
2835    
2836     my $content = 'li or inline';
2837     while (@nodes) {
2838     my $node = shift @nodes;
2839     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
2840    
2841     my $nt = $node->node_type;
2842     if ($nt == 1) {
2843     my $node_ns = $node->namespace_uri;
2844     $node_ns = '' unless defined $node_ns;
2845     my $node_ln = $node->manakai_local_name;
2846     my $not_allowed = $self->{minuses}->{$node_ns}->{$node_ln};
2847     if ($node_ns eq $HTML_NS and $node_ln eq 'li') {
2848     if ($content eq 'inline') {
2849     $not_allowed = 1;
2850     } elsif ($content eq 'li or inline') {
2851     $content = 'li';
2852     }
2853     } else {
2854     if ($HTMLStrictlyInlineLevelElements->{$node_ns}->{$node_ln} or
2855     $HTMLStructuredInlineLevelElements->{$node_ns}->{$node_ln}) {
2856     $content = 'inline';
2857     } else {
2858     $not_allowed = 1;
2859     }
2860     }
2861     $self->{onerror}->(node => $node, type => 'element not allowed')
2862     if $not_allowed;
2863     my ($sib, $ch) = $self->_check_get_children ($node, $todo);
2864     unshift @nodes, @$sib;
2865     push @$new_todos, @$ch;
2866     } elsif ($nt == 3 or $nt == 4) {
2867     if ($node->data =~ /[^\x09-\x0D\x20]/) {
2868     if ($content eq 'li') {
2869     $self->{onerror}->(node => $node, type => 'character not allowed');
2870     } elsif ($content eq 'li or inline') {
2871     $content = 'inline';
2872     }
2873     }
2874     } elsif ($nt == 5) {
2875     unshift @nodes, @{$node->child_nodes};
2876     }
2877     }
2878    
2879     for (@$new_todos) {
2880     $_->{inline} = 1;
2881     }
2882     return ($new_todos);
2883     },
2884     };
2885    
2886     $Element->{$HTML_NS}->{legend} = {
2887     attrs_checker => $GetHTMLAttrsChecker->({}),
2888     checker => sub {
2889     my ($self, $todo) = @_;
2890    
2891     my $parent = $todo->{node}->manakai_parent_element;
2892     if (defined $parent) {
2893     my $nsuri = $parent->namespace_uri;
2894     $nsuri = '' unless defined $nsuri;
2895     my $ln = $parent->manakai_local_name;
2896     if ($nsuri eq $HTML_NS and $ln eq 'figure') {
2897     return $HTMLInlineChecker->($self, $todo);
2898     } else {
2899     return $HTMLSignificantStrictlyInlineChecker->($self, $todo);
2900     }
2901     } else {
2902     return $HTMLInlineChecker->($self, $todo);
2903     }
2904    
2905     ## ISSUE: Content model is defined only for fieldset/legend,
2906     ## details/legend, and figure/legend.
2907     },
2908     };
2909    
2910     $Element->{$HTML_NS}->{div} = {
2911     attrs_checker => $GetHTMLAttrsChecker->({}),
2912     checker => $GetHTMLZeroOrMoreThenBlockOrInlineChecker->($HTML_NS, 'style'),
2913     };
2914    
2915     $Element->{$HTML_NS}->{font} = {
2916     attrs_checker => $GetHTMLAttrsChecker->({}), ## TODO
2917     checker => $HTMLTransparentChecker,
2918     };
2919    
2920     $Whatpm::ContentChecker::Namespace->{$HTML_NS}->{loaded} = 1;
2921    
2922     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24