/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.49 - (hide annotations) (download)
Sat Sep 29 04:45:09 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.48: +3 -3 lines
++ whatpm/t/ChangeLog	29 Sep 2007 04:36:22 -0000
2007-09-29  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: New tests for invalid
	attribute specifications are added.

++ whatpm/Whatpm/ChangeLog	29 Sep 2007 04:38:17 -0000
	* ContentChecker.pm: Raise specific error for invalid
	root element.

	* SelectorsParser.pm: Pass an empty string as a prefix
	for lookup namespace prefix callback, for loose compatibility
	with the |NSResolver| interface.

2007-09-24  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ContentChecker/ChangeLog	29 Sep 2007 04:38:46 -0000
	* Atom.pm (atom:link@title): Definition was missing.

2007-09-24  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::ContentChecker;
2     use strict;
3 wakaba 1.49 our $VERSION=do{my @r=(q$Revision: 1.48 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.18 require Whatpm::URIChecker;
6    
7 wakaba 1.13 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8     ## be applied to an in-memory representation (i.e. DOM)?
9    
10 wakaba 1.42 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
11 wakaba 1.9 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
12     my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
13    
14 wakaba 1.42 my $Namespace = {
15 wakaba 1.43 q<http://www.w3.org/2005/Atom> => {module => 'Whatpm::ContentChecker::Atom'},
16 wakaba 1.42 $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
17     $XML_NS => {loaded => 1},
18     $XMLNS_NS => {loaded => 1},
19     };
20    
21     our $AttrChecker = {
22 wakaba 1.9 $XML_NS => {
23 wakaba 1.13 space => sub {
24     my ($self, $attr) = @_;
25     my $value = $attr->value;
26     if ($value eq 'default' or $value eq 'preserve') {
27     #
28     } else {
29     ## NOTE: An XML "error"
30 wakaba 1.33 $self->{onerror}->(node => $attr, level => 'error',
31     type => 'invalid attribute value');
32 wakaba 1.13 }
33     },
34     lang => sub {
35 wakaba 1.35 my ($self, $attr) = @_;
36 wakaba 1.47 my $value = $attr->value;
37     if ($value eq '') {
38     #
39     } else {
40     require Whatpm::LangTag;
41     Whatpm::LangTag->check_rfc3066_language_tag ($value, sub {
42     my %opt = @_;
43     my $type = 'LangTag:'.$opt{type};
44     $type .= ':' . $opt{subtag} if defined $opt{subtag};
45     $self->{onerror}->(node => $attr, type => $type,
46     value => $opt{value}, level => $opt{level});
47     });
48     }
49    
50 wakaba 1.13 ## NOTE: "The values of the attribute are language identifiers
51     ## as defined by [IETF RFC 3066], Tags for the Identification
52     ## of Languages, or its successor; in addition, the empty string
53     ## may be specified." ("may" in lower case)
54 wakaba 1.47 ## NOTE: Is an RFC 3066-valid (but RFC 4647-invalid) language tag
55     ## allowed today?
56    
57     ## TODO: test data
58    
59 wakaba 1.35 if ($attr->owner_document->manakai_is_html) { # MUST NOT
60 wakaba 1.36 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang');
61 wakaba 1.35 ## TODO: Test data...
62     }
63 wakaba 1.13 },
64     base => sub {
65     my ($self, $attr) = @_;
66     my $value = $attr->value;
67     if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
68     $self->{onerror}->(node => $attr,
69 wakaba 1.33 type => 'invalid attribute value');
70 wakaba 1.13 }
71 wakaba 1.18 ## NOTE: Conformance to URI standard is not checked since there is
72     ## no author requirement on conformance in the XML Base specification.
73 wakaba 1.13 },
74     id => sub {
75     my ($self, $attr) = @_;
76     my $value = $attr->value;
77     $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
78     $value =~ s/^\x20//;
79     $value =~ s/\x20$//;
80     ## TODO: NCName in XML 1.0 or 1.1
81     ## TODO: declared type is ID?
82 wakaba 1.33 if ($self->{id}->{$value}) { ## NOTE: An xml:id error
83     $self->{onerror}->(node => $attr, level => 'error',
84     type => 'duplicate ID');
85 wakaba 1.37 push @{$self->{id}->{$value}}, $attr;
86 wakaba 1.13 } else {
87 wakaba 1.37 $self->{id}->{$value} = [$attr];
88 wakaba 1.13 }
89     },
90 wakaba 1.9 },
91     $XMLNS_NS => {
92 wakaba 1.13 '' => sub {
93     my ($self, $attr) = @_;
94     my $ln = $attr->manakai_local_name;
95     my $value = $attr->value;
96     if ($value eq $XML_NS and $ln ne 'xml') {
97     $self->{onerror}
98 wakaba 1.33 ->(node => $attr, level => 'NC',
99     type => 'Reserved Prefixes and Namespace Names:=xml');
100 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
101     $self->{onerror}
102 wakaba 1.33 ->(node => $attr, level => 'NC',
103     type => 'Reserved Prefixes and Namespace Names:=xmlns');
104 wakaba 1.13 }
105     if ($ln eq 'xml' and $value ne $XML_NS) {
106     $self->{onerror}
107 wakaba 1.33 ->(node => $attr, level => 'NC',
108     type => 'Reserved Prefixes and Namespace Names:xmlns:xml=');
109 wakaba 1.13 } elsif ($ln eq 'xmlns') {
110     $self->{onerror}
111 wakaba 1.33 ->(node => $attr, level => 'NC',
112     type => 'Reserved Prefixes and Namespace Names:xmlns:xmlns=');
113 wakaba 1.13 }
114     ## TODO: If XML 1.0 and empty
115     },
116     xmlns => sub {
117     my ($self, $attr) = @_;
118     ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
119     ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
120 wakaba 1.18 ## TODO: relative references are deprecated
121 wakaba 1.13 my $value = $attr->value;
122     if ($value eq $XML_NS) {
123     $self->{onerror}
124 wakaba 1.33 ->(node => $attr, level => 'NC',
125     type => 'Reserved Prefixes and Namespace Names:=xml');
126 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
127     $self->{onerror}
128 wakaba 1.33 ->(node => $attr, level => 'NC',
129     type => 'Reserved Prefixes and Namespace Names:=xmlns');
130 wakaba 1.13 }
131     },
132 wakaba 1.9 },
133     };
134    
135 wakaba 1.14 ## ISSUE: Should we really allow these attributes?
136 wakaba 1.13 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
137     $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
138     $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
139     $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
140    
141 wakaba 1.3 ## ANY
142 wakaba 1.42 our $AnyChecker = sub {
143 wakaba 1.4 my ($self, $todo) = @_;
144     my $el = $todo->{node};
145     my $new_todos = [];
146 wakaba 1.3 my @nodes = (@{$el->child_nodes});
147     while (@nodes) {
148     my $node = shift @nodes;
149     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
150    
151     my $nt = $node->node_type;
152     if ($nt == 1) {
153     my $node_ns = $node->namespace_uri;
154     $node_ns = '' unless defined $node_ns;
155     my $node_ln = $node->manakai_local_name;
156     if ($self->{minuses}->{$node_ns}->{$node_ln}) {
157     $self->{onerror}->(node => $node, type => 'element not allowed');
158     }
159 wakaba 1.4 push @$new_todos, {type => 'element', node => $node};
160 wakaba 1.3 } elsif ($nt == 5) {
161     unshift @nodes, @{$node->child_nodes};
162     }
163     }
164 wakaba 1.4 return ($new_todos);
165 wakaba 1.3 }; # $AnyChecker
166    
167 wakaba 1.42 our $ElementDefault = {
168 wakaba 1.1 checker => sub {
169 wakaba 1.4 my ($self, $todo) = @_;
170 wakaba 1.33 $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
171     type => 'element');
172 wakaba 1.4 return $AnyChecker->($self, $todo);
173 wakaba 1.1 },
174 wakaba 1.9 attrs_checker => sub {
175     my ($self, $todo) = @_;
176     for my $attr (@{$todo->{node}->attributes}) {
177     my $attr_ns = $attr->namespace_uri;
178     $attr_ns = '' unless defined $attr_ns;
179     my $attr_ln = $attr->manakai_local_name;
180     my $checker = $AttrChecker->{$attr_ns}->{$attr_ln}
181     || $AttrChecker->{$attr_ns}->{''};
182     if ($checker) {
183     $checker->($self, $attr);
184 wakaba 1.17 } else {
185 wakaba 1.33 $self->{onerror}->(node => $attr, level => 'unsupported',
186     type => 'attribute');
187 wakaba 1.9 }
188     }
189     },
190 wakaba 1.1 };
191    
192 wakaba 1.7 my $HTMLTransparentElements = {
193     $HTML_NS => {qw/ins 1 font 1 noscript 1/},
194 wakaba 1.29 ## NOTE: |html:noscript| is transparent if scripting is disabled
195     ## and not in |head|.
196 wakaba 1.7 };
197    
198 wakaba 1.42 our $Element = {};
199 wakaba 1.7
200 wakaba 1.42 sub check_document ($$$) {
201     my ($self, $doc, $onerror) = @_;
202     $self = bless {}, $self unless ref $self;
203     $self->{onerror} = $onerror;
204 wakaba 1.1
205 wakaba 1.48 $self->{must_level} = 'm';
206     $self->{fact_level} = 'f';
207     $self->{should_level} = 's';
208     $self->{good_level} = 'g';
209    
210 wakaba 1.42 my $docel = $doc->document_element;
211     unless (defined $docel) {
212     ## ISSUE: Should we check content of Document node?
213     $onerror->(node => $doc, type => 'no document element');
214     ## ISSUE: Is this non-conforming (to what spec)? Or just a warning?
215     return {
216     class => {},
217     id => {}, table => [], term => {},
218     };
219 wakaba 1.1 }
220    
221 wakaba 1.42 ## ISSUE: Unexpanded entity references and HTML5 conformance
222 wakaba 1.1
223 wakaba 1.42 my $docel_nsuri = $docel->namespace_uri;
224     $docel_nsuri = '' unless defined $docel_nsuri;
225 wakaba 1.43 unless ($Namespace->{$docel_nsuri}->{loaded}) {
226     if ($Namespace->{$docel_nsuri}->{module}) {
227     eval qq{ require $Namespace->{$docel_nsuri}->{module} } or die $@;
228     } else {
229     $Namespace->{$docel_nsuri}->{loaded} = 1;
230     }
231     }
232 wakaba 1.42 my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
233     $Element->{$docel_nsuri}->{''} ||
234     $ElementDefault;
235     if ($docel_def->{is_root}) {
236     #
237     } else {
238 wakaba 1.49 $onerror->(node => $docel, type => 'element not allowed:root');
239 wakaba 1.1 }
240    
241 wakaba 1.42 ## TODO: Check for other items other than document element
242     ## (second (errorous) element, text nodes, PI nodes, doctype nodes)
243 wakaba 1.2
244 wakaba 1.42 return $self->check_element ($docel, $onerror);
245     } # check_document
246 wakaba 1.1
247 wakaba 1.42 sub check_element ($$$) {
248     my ($self, $el, $onerror) = @_;
249     $self = bless {}, $self unless ref $self;
250     $self->{onerror} = $onerror;
251 wakaba 1.2
252 wakaba 1.48 $self->{must_level} = 'm';
253     $self->{fact_level} = 'f';
254     $self->{should_level} = 's';
255     $self->{good_level} = 'g';
256    
257 wakaba 1.42 $self->{minuses} = {};
258     $self->{id} = {};
259     $self->{term} = {};
260     $self->{usemap} = [];
261     $self->{contextmenu} = [];
262     $self->{map} = {};
263     $self->{menu} = {};
264     $self->{has_link_type} = {};
265 wakaba 1.46 #$self->{has_uri_attr};
266     #$self->{has_hyperlink_element};
267 wakaba 1.42 $self->{return} = {
268     class => {},
269     id => $self->{id}, table => [], term => $self->{term},
270     };
271 wakaba 1.4
272 wakaba 1.42 my @todo = ({type => 'element', node => $el});
273     while (@todo) {
274     my $todo = shift @todo;
275     if ($todo->{type} eq 'element') {
276     my $prefix = $todo->{node}->prefix;
277     if (defined $prefix and $prefix eq 'xmlns') {
278     $self->{onerror}
279     ->(node => $todo->{node}, level => 'NC',
280     type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
281 wakaba 1.7 }
282 wakaba 1.42 my $nsuri = $todo->{node}->namespace_uri;
283     $nsuri = '' unless defined $nsuri;
284     unless ($Namespace->{$nsuri}->{loaded}) {
285     if ($Namespace->{$nsuri}->{module}) {
286     eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
287     } else {
288     $Namespace->{$nsuri}->{loaded} = 1;
289 wakaba 1.1 }
290     }
291 wakaba 1.42 my $ln = $todo->{node}->manakai_local_name;
292     my $eldef = $Element->{$nsuri}->{$ln} ||
293     $Element->{$nsuri}->{''} ||
294     $ElementDefault;
295     $eldef->{attrs_checker}->($self, $todo);
296     my ($new_todos) = $eldef->{checker}->($self, $todo);
297     unshift @todo, @$new_todos;
298     } elsif ($todo->{type} eq 'element-attributes') {
299     my $prefix = $todo->{node}->prefix;
300     if (defined $prefix and $prefix eq 'xmlns') {
301     $self->{onerror}
302     ->(node => $todo->{node}, level => 'NC',
303     type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
304     }
305     my $nsuri = $todo->{node}->namespace_uri;
306     $nsuri = '' unless defined $nsuri;
307     unless ($Namespace->{$nsuri}->{loaded}) {
308     if ($Namespace->{$nsuri}->{module}) {
309     eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
310 wakaba 1.1 } else {
311 wakaba 1.42 $Namespace->{$nsuri}->{loaded} = 1;
312 wakaba 1.1 }
313     }
314 wakaba 1.9 my $ln = $todo->{node}->manakai_local_name;
315     my $eldef = $Element->{$nsuri}->{$ln} ||
316     $Element->{$nsuri}->{''} ||
317     $ElementDefault;
318     $eldef->{attrs_checker}->($self, $todo);
319 wakaba 1.4 } elsif ($todo->{type} eq 'plus') {
320     $self->_remove_minuses ($todo);
321 wakaba 1.30 } elsif ($todo->{type} eq 'code') {
322     $todo->{code}->();
323     } else {
324     die "$0: Internal error: Unsupported checking action type |$todo->{type}|";
325 wakaba 1.4 }
326 wakaba 1.1 }
327 wakaba 1.17
328     for (@{$self->{usemap}}) {
329     unless ($self->{map}->{$_->[0]}) {
330     $self->{onerror}->(node => $_->[1], type => 'no referenced map');
331     }
332     }
333    
334 wakaba 1.32 for (@{$self->{contextmenu}}) {
335     unless ($self->{menu}->{$_->[0]}) {
336     $self->{onerror}->(node => $_->[1], type => 'no referenced menu');
337     }
338     }
339    
340 wakaba 1.17 delete $self->{minuses};
341     delete $self->{onerror};
342     delete $self->{id};
343     delete $self->{usemap};
344     delete $self->{map};
345 wakaba 1.33 return $self->{return};
346 wakaba 1.1 } # check_element
347    
348 wakaba 1.2 sub _add_minuses ($@) {
349     my $self = shift;
350     my $r = {};
351     for my $list (@_) {
352     for my $ns (keys %$list) {
353     for my $ln (keys %{$list->{$ns}}) {
354     unless ($self->{minuses}->{$ns}->{$ln}) {
355     $self->{minuses}->{$ns}->{$ln} = 1;
356     $r->{$ns}->{$ln} = 1;
357     }
358     }
359     }
360     }
361 wakaba 1.4 return {type => 'plus', list => $r};
362 wakaba 1.2 } # _add_minuses
363    
364     sub _remove_minuses ($$) {
365 wakaba 1.4 my ($self, $todo) = @_;
366     for my $ns (keys %{$todo->{list}}) {
367     for my $ln (keys %{$todo->{list}->{$ns}}) {
368     delete $self->{minuses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
369 wakaba 1.2 }
370     }
371     1;
372     } # _remove_minuses
373    
374 wakaba 1.30 sub _check_get_children ($$$) {
375     my ($self, $node, $parent_todo) = @_;
376 wakaba 1.4 my $new_todos = [];
377 wakaba 1.2 my $sib = [];
378     TP: {
379     my $node_ns = $node->namespace_uri;
380     $node_ns = '' unless defined $node_ns;
381     my $node_ln = $node->manakai_local_name;
382 wakaba 1.45 if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
383     if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
384     if ($parent_todo->{flag}->{in_head}) {
385     #
386     } else {
387     my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
388     push @$sib, $end;
389    
390     unshift @$sib, @{$node->child_nodes};
391     push @$new_todos, {type => 'element-attributes', node => $node};
392     last TP;
393     }
394     } else {
395     unshift @$sib, @{$node->child_nodes};
396     push @$new_todos, {type => 'element-attributes', node => $node};
397     last TP;
398 wakaba 1.2 }
399     }
400 wakaba 1.8 if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
401 wakaba 1.2 if ($node->has_attribute_ns (undef, 'src')) {
402     unshift @$sib, @{$node->child_nodes};
403 wakaba 1.9 push @$new_todos, {type => 'element-attributes', node => $node};
404 wakaba 1.2 last TP;
405     } else {
406     my @cn = @{$node->child_nodes};
407     CN: while (@cn) {
408     my $cn = shift @cn;
409     my $cnt = $cn->node_type;
410     if ($cnt == 1) {
411 wakaba 1.8 my $cn_nsuri = $cn->namespace_uri;
412     $cn_nsuri = '' unless defined $cn_nsuri;
413     if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'source') {
414 wakaba 1.2 #
415     } else {
416     last CN;
417     }
418     } elsif ($cnt == 3 or $cnt == 4) {
419     if ($cn->data =~ /[^\x09-\x0D\x20]/) {
420     last CN;
421     }
422     }
423     } # CN
424     unshift @$sib, @cn;
425     }
426     }
427 wakaba 1.4 push @$new_todos, {type => 'element', node => $node};
428 wakaba 1.2 } # TP
429 wakaba 1.30
430     for my $new_todo (@$new_todos) {
431     $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
432     }
433    
434 wakaba 1.4 return ($sib, $new_todos);
435 wakaba 1.2 } # _check_get_children
436    
437 wakaba 1.44 =head1 LICENSE
438    
439     Copyright 2007 Wakaba <w@suika.fam.cx>
440    
441     This library is free software; you can redistribute it
442     and/or modify it under the same terms as Perl itself.
443    
444     =cut
445    
446 wakaba 1.1 1;
447 wakaba 1.49 # $Date: 2007/09/24 04:23:44 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24