/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.50 - (hide annotations) (download)
Sun Oct 14 09:21:46 2007 UTC (17 years ago) by wakaba
Branch: MAIN
Changes since 1.49: +46 -6 lines
++ whatpm/t/ChangeLog	14 Oct 2007 09:21:32 -0000
2007-10-14  Wakaba  <wakaba@suika.fam.cx>

	* content-model-1.dat, content-model-2.dat: New test
	data for |rule|, |nest|, and |datatemplate| elements.

++ whatpm/Whatpm/ChangeLog	14 Oct 2007 09:20:23 -0000
2007-10-14  Wakaba  <wakaba@suika.fam.cx>

	* ContentChecker.pm (check_document): Support for
	new |is_xml_root| flag.
	(check_element): Support for new |pluses| state.
	(_add_pluses): New method.
	(_remove_minuses): Support for new |minus| item.

++ whatpm/Whatpm/ContentChecker/ChangeLog	14 Oct 2007 09:20:50 -0000
2007-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Atom.pm, HTML.pm: Support for |html:nest|, |html:datatemplate|,
	and |html:rule| elements.

1 wakaba 1.1 package Whatpm::ContentChecker;
2     use strict;
3 wakaba 1.50 our $VERSION=do{my @r=(q$Revision: 1.49 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.18 require Whatpm::URIChecker;
6    
7 wakaba 1.13 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8     ## be applied to an in-memory representation (i.e. DOM)?
9    
10 wakaba 1.50 ## TODO: Conformance of an HTML document with non-html root element.
11    
## Namespace URIs referred to throughout this module.
my ($HTML_NS, $XML_NS, $XMLNS_NS) = (
  'http://www.w3.org/1999/xhtml',
  'http://www.w3.org/XML/1998/namespace',
  'http://www.w3.org/2000/xmlns/',
);

## Namespace URI -> loading information.  |module| names the Perl
## module that is |require|d when an element in the namespace is
## encountered; |loaded| marks namespaces for which nothing (more)
## has to be loaded.
my $Namespace = {};
$Namespace->{'http://www.w3.org/2005/Atom'}
    = {module => 'Whatpm::ContentChecker::Atom'};
$Namespace->{$HTML_NS} = {module => 'Whatpm::ContentChecker::HTML'};
$Namespace->{$_} = {loaded => 1} for $XML_NS, $XMLNS_NS;
22    
## Attribute checkers, keyed by namespace URI and then by local name.
## A checker is invoked as |$checker->($self, $attr)| and reports any
## problem through |$self->{onerror}|.  A checker stored under the
## empty local name is the fallback for the whole namespace.
our $AttrChecker = {};

## xml:space - only |default| and |preserve| are allowed.
$AttrChecker->{$XML_NS}->{space} = sub {
  my ($self, $attr) = @_;
  my $value = $attr->value;
  unless ($value eq 'default' or $value eq 'preserve') {
    ## NOTE: An XML "error"
    $self->{onerror}->(node => $attr, level => 'error',
                       type => 'invalid attribute value');
  }
};

## xml:lang - an RFC 3066 language tag or the empty string.
$AttrChecker->{$XML_NS}->{lang} = sub {
  my ($self, $attr) = @_;
  my $value = $attr->value;
  if ($value ne '') {
    require Whatpm::LangTag;
    ## Errors found by the language tag checker are forwarded to
    ## |onerror| with a |LangTag:| prefixed error type.
    my $forward = sub {
      my %opt = @_;
      my $type = 'LangTag:'.$opt{type};
      if (defined $opt{subtag}) {
        $type .= ':' . $opt{subtag};
      }
      $self->{onerror}->(node => $attr, type => $type,
                         value => $opt{value}, level => $opt{level});
    };
    Whatpm::LangTag->check_rfc3066_language_tag ($value, $forward);
  }

  ## NOTE: "The values of the attribute are language identifiers
  ## as defined by [IETF RFC 3066], Tags for the Identification
  ## of Languages, or its successor; in addition, the empty string
  ## may be specified." ("may" in lower case)
  ## NOTE: Is an RFC 3066-valid (but RFC 4647-invalid) language tag
  ## allowed today?

  ## TODO: test data

  my $in_html = $attr->owner_document->manakai_is_html;
  if ($in_html) { # MUST NOT
    $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang');
    ## TODO: Test data...
  }
};

## xml:base
$AttrChecker->{$XML_NS}->{base} = sub {
  my ($self, $attr) = @_;
  my $value = $attr->value;
  ## ISSUE: Should we disallow noncharacters?
  if ($value =~ /[^\x{0000}-\x{10FFFF}]/) {
    $self->{onerror}->(node => $attr,
                       type => 'invalid attribute value');
  }
  ## NOTE: Conformance to URI standard is not checked since there is
  ## no author requirement on conformance in the XML Base specification.
};

## xml:id - the whitespace-normalized value is recorded in
## |$self->{id}|; a value that has already been seen is an error.
$AttrChecker->{$XML_NS}->{id} = sub {
  my ($self, $attr) = @_;
  my $value = $attr->value;
  for ($value) {
    s/[\x09\x0A\x0D\x20]+/ /g;
    s/^\x20//;
    s/\x20$//;
  }
  ## TODO: NCName in XML 1.0 or 1.1
  ## TODO: declared type is ID?
  my $seen = $self->{id}->{$value};
  if ($seen) { ## NOTE: An xml:id error
    $self->{onerror}->(node => $attr, level => 'error',
                       type => 'duplicate ID');
    push @$seen, $attr;
  } else {
    $self->{id}->{$value} = [$attr];
  }
};

## xmlns:* - fallback checker for prefixed namespace declarations.
$AttrChecker->{$XMLNS_NS}->{''} = sub {
  my ($self, $attr) = @_;
  my $ln = $attr->manakai_local_name;
  my $value = $attr->value;
  my $report = sub {
    $self->{onerror}->(node => $attr, level => 'NC', type => $_[0]);
  };

  ## Reserved namespace names bound to a wrong prefix
  if ($value eq $XML_NS and $ln ne 'xml') {
    $report->('Reserved Prefixes and Namespace Names:=xml');
  } elsif ($value eq $XMLNS_NS) {
    $report->('Reserved Prefixes and Namespace Names:=xmlns');
  }

  ## Reserved prefixes bound to a wrong namespace name
  if ($ln eq 'xml' and $value ne $XML_NS) {
    $report->('Reserved Prefixes and Namespace Names:xmlns:xml=');
  } elsif ($ln eq 'xmlns') {
    $report->('Reserved Prefixes and Namespace Names:xmlns:xmlns=');
  }
  ## TODO: If XML 1.0 and empty
};

## xmlns - the default namespace declaration.
$AttrChecker->{$XMLNS_NS}->{xmlns} = sub {
  my ($self, $attr) = @_;
  ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
  ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
  ## TODO: relative references are deprecated
  my $value = $attr->value;
  my $type;
  if ($value eq $XML_NS) {
    $type = 'Reserved Prefixes and Namespace Names:=xml';
  } elsif ($value eq $XMLNS_NS) {
    $type = 'Reserved Prefixes and Namespace Names:=xmlns';
  }
  if (defined $type) {
    $self->{onerror}->(node => $attr, level => 'NC', type => $type);
  }
};

## The null-namespace attributes whose names literally are |xml:space|,
## |xml:lang|, |xml:base|, and |xml:id| share the checkers above.
## ISSUE: Should we really allow these attributes?
for my $name (qw/space lang base id/) {
  $AttrChecker->{''}->{'xml:' . $name} = $AttrChecker->{$XML_NS}->{$name};
}
142    
143 wakaba 1.3 ## ANY
## Content checker that accepts any content.  It walks the children of
## |$todo->{node}| (descending into entity reference nodes), reports
## child elements that are currently listed in |$self->{minuses}|, and
## returns a reference to the list of |element| todo items for the
## child elements found.
our $AnyChecker = sub {
  my ($self, $todo) = @_;
  my @element_todos;
  my @queue = @{$todo->{node}->child_nodes};
  NODE: while (@queue) {
    my $item = shift @queue;

    ## A hash reference in the queue is a plus/minus undo item, not a node.
    if (ref $item eq 'HASH') {
      $self->_remove_minuses ($item) and next NODE;
    }

    my $node_type = $item->node_type;
    if ($node_type == 5) {
      ## Entity reference: its children are processed in its place.
      unshift @queue, @{$item->child_nodes};
      next NODE;
    }
    next NODE unless $node_type == 1;

    my $ns = $item->namespace_uri;
    $ns = '' unless defined $ns;
    my $ln = $item->manakai_local_name;
    $self->{onerror}->(node => $item, type => 'element not allowed')
        if $self->{minuses}->{$ns}->{$ln};
    push @element_todos, {type => 'element', node => $item};
  }
  return (\@element_todos);
}; # $AnyChecker
168    
## Fallback element definition, used for elements that have no entry
## in |$Element|.
our $ElementDefault = {};

## The element itself is reported as unsupported; its content is then
## checked with |$AnyChecker|.
$ElementDefault->{checker} = sub {
  my ($self, $todo) = @_;
  $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
                     type => 'element');
  return $AnyChecker->($self, $todo);
};

## Each attribute is checked by the checker registered in
## |$AttrChecker| for its namespace and local name, or by the
## namespace-wide fallback checker; attributes with neither are
## reported as unsupported.
$ElementDefault->{attrs_checker} = sub {
  my ($self, $todo) = @_;
  ATTR: for my $attr (@{$todo->{node}->attributes}) {
    my $ns = $attr->namespace_uri;
    $ns = '' unless defined $ns;
    my $ln = $attr->manakai_local_name;
    my $checker = $AttrChecker->{$ns}->{$ln}
        || $AttrChecker->{$ns}->{''};
    unless ($checker) {
      $self->{onerror}->(node => $attr, level => 'unsupported',
                         type => 'attribute');
      next ATTR;
    }
    $checker->($self, $attr);
  }
};
193    
## Elements whose children are checked as if they were children of
## the parent element (see |_check_get_children|).
my $HTMLTransparentElements = {
  $HTML_NS => {
    ins => 1,
    font => 1,
    ## NOTE: |html:noscript| is transparent if scripting is disabled
    ## and not in |head|.
    noscript => 1,
  },
};

## Element definitions, keyed by namespace URI and then by local name.
## Empty here; other code is expected to add the entries.
our $Element = {};
201 wakaba 1.7
## Checks a document.
##
##   $self    - A checker object or a class name (in which case a new
##              object is created).
##   $doc     - The document node; its |document_element| and
##              |manakai_is_html| are consulted.
##   $onerror - Code reference invoked with named arguments (|node|,
##              |type| and, for some errors, |level|) for each error.
##
## Returns a hash reference with the keys |class|, |id|, |table|, and
## |term|: the result of |check_element| for the document element, or
## empty containers if the document has no document element.
##
## Dies if the module for the document element's namespace cannot be
## loaded.
sub check_document ($$$) {
  my ($self, $doc, $onerror) = @_;
  $self = bless {}, $self unless ref $self;
  $self->{onerror} = $onerror;

  $self->{must_level} = 'm';
  $self->{fact_level} = 'f';
  $self->{should_level} = 's';
  $self->{good_level} = 'g';

  my $docel = $doc->document_element;
  unless (defined $docel) {
    ## ISSUE: Should we check content of Document node?
    $onerror->(node => $doc, type => 'no document element');
    ## ISSUE: Is this non-conforming (to what spec)?  Or just a warning?
    return {
      class => {},
      id => {}, table => [], term => {},
    };
  }

  ## ISSUE: Unexpanded entity references and HTML5 conformance

  my $docel_nsuri = $docel->namespace_uri;
  $docel_nsuri = '' unless defined $docel_nsuri;

  ## Load the checker module for the namespace, once.  The |loaded|
  ## flag is also set after a successful |require| so that the module
  ## lookup is not repeated for every document.
  my $ns_def = $Namespace->{$docel_nsuri} ||= {};
  unless ($ns_def->{loaded}) {
    if (my $module = $ns_def->{module}) {
      (my $file = $module . '.pm') =~ s{::}{/}g;
      require $file; # dies if the module cannot be loaded
    }
    $ns_def->{loaded} = 1;
  }

  my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
      $Element->{$docel_nsuri}->{''} ||
      $ElementDefault;
  if ($docel_def->{is_root}) {
    ## Allowed as the root element of any document.
  } elsif ($docel_def->{is_xml_root}) {
    ## Allowed as the root element of an XML document only.
    if ($doc->manakai_is_html) {
      $onerror->(node => $docel, type => 'element not allowed:root:xml');
    }
  } else {
    $onerror->(node => $docel, type => 'element not allowed:root');
  }

  ## TODO: Check for other items other than document element
  ## (second (erroneous) element, text nodes, PI nodes, doctype nodes)

  return $self->check_element ($docel, $onerror);
} # check_document
254 wakaba 1.1
## Checks an element and its descendants.
##
##   $self    - A checker object or a class name (in which case a new
##              object is created).
##   $el      - The element node to check.
##   $onerror - Code reference invoked with named arguments (|node|,
##              |type| and, for some errors, |level|) for each error.
##
## The check is driven by a queue of "todo" items.  An item of type
## |element| has its attributes and content checked, and the items
## returned by its content checker are put at the head of the queue.
## An item of type |element-attributes| has only its attributes
## checked.  |plus| and |minus| items undo an earlier |_add_minuses|
## or |_add_pluses|, and a |code| item runs its |code| callback.
##
## Returns a hash reference with the keys |class|, |id|, |table|, and
## |term|.  Dies on a todo item of unknown type, or if the module for
## an element's namespace cannot be loaded.
sub check_element ($$$) {
  my ($self, $el, $onerror) = @_;
  $self = bless {}, $self unless ref $self;
  $self->{onerror} = $onerror;

  $self->{must_level} = 'm';
  $self->{fact_level} = 'f';
  $self->{should_level} = 's';
  $self->{good_level} = 'g';

  $self->{pluses} = {};
  $self->{minuses} = {};
  $self->{id} = {};
  $self->{term} = {};
  $self->{usemap} = [];
  $self->{contextmenu} = [];
  $self->{map} = {};
  $self->{menu} = {};
  $self->{has_link_type} = {};
  #$self->{has_uri_attr};
  #$self->{has_hyperlink_element};
  $self->{return} = {
    class => {},
    id => $self->{id}, table => [], term => $self->{term},
  };

  my @todo = ({type => 'element', node => $el});
  while (@todo) {
    my $todo = shift @todo;
    if ($todo->{type} eq 'element') {
      my $eldef = $self->_check_element_get_definition ($todo->{node});
      $eldef->{attrs_checker}->($self, $todo);
      my ($new_todos) = $eldef->{checker}->($self, $todo);
      unshift @todo, @$new_todos;
    } elsif ($todo->{type} eq 'element-attributes') {
      my $eldef = $self->_check_element_get_definition ($todo->{node});
      $eldef->{attrs_checker}->($self, $todo);
    } elsif ($todo->{type} eq 'plus' or $todo->{type} eq 'minus') {
      $self->_remove_minuses ($todo);
    } elsif ($todo->{type} eq 'code') {
      $todo->{code}->();
    } else {
      die "$0: Internal error: Unsupported checking action type |$todo->{type}|";
    }
  }

  ## Each entry is a pair of a referenced name and the referencing node.
  for (@{$self->{usemap}}) {
    unless ($self->{map}->{$_->[0]}) {
      $self->{onerror}->(node => $_->[1], type => 'no referenced map');
    }
  }

  for (@{$self->{contextmenu}}) {
    unless ($self->{menu}->{$_->[0]}) {
      $self->{onerror}->(node => $_->[1], type => 'no referenced menu');
    }
  }

  ## Drop the per-check working state.  |contextmenu| and |menu| are
  ## released in the same way as |usemap| and |map|.
  delete $self->{pluses};
  delete $self->{minuses};
  delete $self->{onerror};
  delete $self->{id};
  delete $self->{usemap};
  delete $self->{map};
  delete $self->{contextmenu};
  delete $self->{menu};
  return $self->{return};
} # check_element

## Returns the definition used to check the element |$node|: the entry
## of |$Element| for its namespace and local name, the namespace-wide
## fallback entry, or |$ElementDefault|.  Before the lookup, the use of
## the reserved |xmlns| prefix is reported and the checker module for
## the namespace, if any, is loaded (once per namespace).
sub _check_element_get_definition ($$) {
  my ($self, $node) = @_;

  my $prefix = $node->prefix;
  if (defined $prefix and $prefix eq 'xmlns') {
    $self->{onerror}
        ->(node => $node, level => 'NC',
           type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
  }

  my $nsuri = $node->namespace_uri;
  $nsuri = '' unless defined $nsuri;
  my $ns_def = $Namespace->{$nsuri} ||= {};
  unless ($ns_def->{loaded}) {
    if (my $module = $ns_def->{module}) {
      (my $file = $module . '.pm') =~ s{::}{/}g;
      require $file; # dies if the module cannot be loaded
    }
    ## Remember the result so that the lookup is not repeated for
    ## every element in the namespace.
    $ns_def->{loaded} = 1;
  }

  my $ln = $node->manakai_local_name;
  return $Element->{$nsuri}->{$ln} ||
      $Element->{$nsuri}->{''} ||
      $ElementDefault;
} # _check_element_get_definition
357    
## Adds the element names in the given lists (hash references of the
## form |{$namespace_uri => {$local_name => ...}}|) to
## |$self->{minuses}|.  Returns a |plus| item listing only the names
## that were newly added, which |_remove_minuses| takes to undo the
## addition.
sub _add_minuses ($@) {
  my ($self, @lists) = @_;
  my %added;
  for my $list (@lists) {
    for my $ns (keys %$list) {
      my @new_names = grep {
        not $self->{minuses}->{$ns}->{$_}
      } keys %{$list->{$ns}};
      for my $ln (@new_names) {
        $self->{minuses}->{$ns}->{$ln} = 1;
        $added{$ns}->{$ln} = 1;
      }
    }
  }
  return {type => 'plus', list => \%added};
} # _add_minuses
373    
## Adds the element names in the given lists (hash references of the
## form |{$namespace_uri => {$local_name => ...}}|) to
## |$self->{pluses}|.  Returns a |minus| item listing only the names
## that were newly added, which |_remove_minuses| takes to undo the
## addition.
sub _add_pluses ($@) {
  my ($self, @lists) = @_;
  my %added;
  for my $list (@lists) {
    for my $ns (keys %$list) {
      my @new_names = grep {
        not $self->{pluses}->{$ns}->{$_}
      } keys %{$list->{$ns}};
      for my $ln (@new_names) {
        $self->{pluses}->{$ns}->{$ln} = 1;
        $added{$ns}->{$ln} = 1;
      }
    }
  }
  return {type => 'minus', list => \%added};
} # _add_pluses
389    
## Undoes an earlier |_add_minuses| or |_add_pluses|.  |$todo| is the
## item returned by one of those methods: a |plus| item removes the
## listed names from |$self->{minuses}|, and a |minus| item removes
## them from |$self->{pluses}|.  Names listed with a false value are
## left alone.  Always returns 1; dies on any other item type.
sub _remove_minuses ($$) {
  my ($self, $todo) = @_;
  my %state_key_for = (minus => 'pluses', plus => 'minuses');
  my $type = $todo->{type};
  my $state_key = $state_key_for{defined $type ? $type : ''};
  die "$0: Unknown +- type: $todo->{type}" unless defined $state_key;

  my $list = $todo->{list};
  for my $ns (keys %$list) {
    for my $ln (keys %{$list->{$ns}}) {
      next unless $list->{$ns}->{$ln};
      delete $self->{$state_key}->{$ns}->{$ln};
    }
  }
  return 1;
} # _remove_minuses
409    
## NOTE: The relative priority of "minuses" and "pluses" is currently
## left undefined and is implemented inconsistently; this is not a
## problem for now, since no element belongs to both lists.
413    
## Decides how the child element |$node| takes part in the check of
## its parent's content.
##
##   $node        - The child element.
##   $parent_todo - The todo item of the parent; its |flag| hash is
##                  copied into every new todo item.
##
## Returns two array references:
##
##   $sib       - Nodes (and plus/minus undo items) that are to be
##                checked as if they were children of the parent,
##                i.e. the transparent content of |$node|.
##   $new_todos - Todo items for |$node| itself: |element-attributes|
##                if all of its children are in |$sib|, |element|
##                otherwise.
sub _check_get_children ($$$) {
  my ($self, $node, $parent_todo) = @_;
  my $new_todos = [];
  my $sib = [];
  TP: {
    my $node_ns = $node->namespace_uri;
    $node_ns = '' unless defined $node_ns;
    my $node_ln = $node->manakai_local_name;
    if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
      if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
        if ($parent_todo->{flag}->{in_head}) {
          ## |noscript| in |head| is not transparent; it is handled
          ## as an ordinary element below.
        } else {
          ## Nested |noscript| is disallowed until the children have
          ## been processed; the undo item goes after the children.
          my $end = $self->_add_minuses ({$HTML_NS => {noscript => 1}});
          push @$sib, $end;

          unshift @$sib, @{$node->child_nodes};
          push @$new_todos, {type => 'element-attributes', node => $node};
          last TP;
        }
      } else {
        unshift @$sib, @{$node->child_nodes};
        push @$new_todos, {type => 'element-attributes', node => $node};
        last TP;
      }
    }
    if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
      if ($node->has_attribute_ns (undef, 'src')) {
        unshift @$sib, @{$node->child_nodes};
        push @$new_todos, {type => 'element-attributes', node => $node};
        last TP;
      } else {
        ## Skip the leading |source| elements (and anything between
        ## them other than elements and non-whitespace text); the rest
        ## is transparent content.  The node that ends the scan is only
        ## peeked at, not removed, so that it stays in the list.
        my @cn = @{$node->child_nodes};
        CN: while (@cn) {
          my $cn = $cn[0];
          my $cnt = $cn->node_type;
          if ($cnt == 1) {
            my $cn_nsuri = $cn->namespace_uri;
            $cn_nsuri = '' unless defined $cn_nsuri;
            unless ($cn_nsuri eq $HTML_NS and
                    $cn->manakai_local_name eq 'source') {
              last CN;
            }
          } elsif ($cnt == 3 or $cnt == 4) {
            if ($cn->data =~ /[^\x09-\x0D\x20]/) {
              last CN;
            }
          }
          shift @cn;
        } # CN
        unshift @$sib, @cn;
      }
    }
    push @$new_todos, {type => 'element', node => $node};
  } # TP

  ## Each new todo item gets its own copy of the parent's flags.
  for my $new_todo (@$new_todos) {
    $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
  }

  return ($sib, $new_todos);
} # _check_get_children
476    
477 wakaba 1.44 =head1 LICENSE
478    
479     Copyright 2007 Wakaba <w@suika.fam.cx>
480    
481     This library is free software; you can redistribute it
482     and/or modify it under the same terms as Perl itself.
483    
484     =cut
485    
486 wakaba 1.1 1;
487 wakaba 1.50 # $Date: 2007/09/29 04:45:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24