/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.42 - (hide annotations) (download)
Sat Aug 4 13:48:25 2007 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.41: +91 -2973 lines
++ whatpm/Whatpm/ChangeLog	4 Aug 2007 13:32:41 -0000
	* ContentChecker/: New directory.

2007-08-04  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ContentChecker/ChangeLog	4 Aug 2007 13:48:19 -0000
2007-08-04  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm: New Perl module, split from |Whatpm/ContentChecker.pm|.

	* ChangeLog: New file.

1 wakaba 1.1 package Whatpm::ContentChecker;
2     use strict;
3    
4 wakaba 1.18 require Whatpm::URIChecker;
5    
6 wakaba 1.13 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
7     ## be applied to an in-memory representation (i.e. DOM)?
8    
9 wakaba 1.42 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
10 wakaba 1.9 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
11     my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
12    
13 wakaba 1.42 my $Namespace = {
14     $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
15     $XML_NS => {loaded => 1},
16     $XMLNS_NS => {loaded => 1},
17     };
18    
19     our $AttrChecker = {
20 wakaba 1.9 $XML_NS => {
21 wakaba 1.13 space => sub {
22     my ($self, $attr) = @_;
23     my $value = $attr->value;
24     if ($value eq 'default' or $value eq 'preserve') {
25     #
26     } else {
27     ## NOTE: An XML "error"
28 wakaba 1.33 $self->{onerror}->(node => $attr, level => 'error',
29     type => 'invalid attribute value');
30 wakaba 1.13 }
31     },
32     lang => sub {
33 wakaba 1.35 my ($self, $attr) = @_;
34 wakaba 1.13 ## NOTE: "The values of the attribute are language identifiers
35     ## as defined by [IETF RFC 3066], Tags for the Identification
36     ## of Languages, or its successor; in addition, the empty string
37     ## may be specified." ("may" in lower case)
38 wakaba 1.36 $self->{onerror}->(node => $attr, level => 'unsupported',
39     type => 'language tag');
40 wakaba 1.35 if ($attr->owner_document->manakai_is_html) { # MUST NOT
41 wakaba 1.36 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang');
42 wakaba 1.35 ## TODO: Test data...
43     }
44 wakaba 1.13 },
45     base => sub {
46     my ($self, $attr) = @_;
47     my $value = $attr->value;
48     if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
49     $self->{onerror}->(node => $attr,
50 wakaba 1.33 type => 'invalid attribute value');
51 wakaba 1.13 }
52 wakaba 1.18 ## NOTE: Conformance to URI standard is not checked since there is
53     ## no author requirement on conformance in the XML Base specification.
54 wakaba 1.13 },
55     id => sub {
56     my ($self, $attr) = @_;
57     my $value = $attr->value;
58     $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
59     $value =~ s/^\x20//;
60     $value =~ s/\x20$//;
61     ## TODO: NCName in XML 1.0 or 1.1
62     ## TODO: declared type is ID?
63 wakaba 1.33 if ($self->{id}->{$value}) { ## NOTE: An xml:id error
64     $self->{onerror}->(node => $attr, level => 'error',
65     type => 'duplicate ID');
66 wakaba 1.37 push @{$self->{id}->{$value}}, $attr;
67 wakaba 1.13 } else {
68 wakaba 1.37 $self->{id}->{$value} = [$attr];
69 wakaba 1.13 }
70     },
71 wakaba 1.9 },
72     $XMLNS_NS => {
73 wakaba 1.13 '' => sub {
74     my ($self, $attr) = @_;
75     my $ln = $attr->manakai_local_name;
76     my $value = $attr->value;
77     if ($value eq $XML_NS and $ln ne 'xml') {
78     $self->{onerror}
79 wakaba 1.33 ->(node => $attr, level => 'NC',
80     type => 'Reserved Prefixes and Namespace Names:=xml');
81 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
82     $self->{onerror}
83 wakaba 1.33 ->(node => $attr, level => 'NC',
84     type => 'Reserved Prefixes and Namespace Names:=xmlns');
85 wakaba 1.13 }
86     if ($ln eq 'xml' and $value ne $XML_NS) {
87     $self->{onerror}
88 wakaba 1.33 ->(node => $attr, level => 'NC',
89     type => 'Reserved Prefixes and Namespace Names:xmlns:xml=');
90 wakaba 1.13 } elsif ($ln eq 'xmlns') {
91     $self->{onerror}
92 wakaba 1.33 ->(node => $attr, level => 'NC',
93     type => 'Reserved Prefixes and Namespace Names:xmlns:xmlns=');
94 wakaba 1.13 }
95     ## TODO: If XML 1.0 and empty
96     },
97     xmlns => sub {
98     my ($self, $attr) = @_;
99     ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
100     ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
101 wakaba 1.18 ## TODO: relative references are deprecated
102 wakaba 1.13 my $value = $attr->value;
103     if ($value eq $XML_NS) {
104     $self->{onerror}
105 wakaba 1.33 ->(node => $attr, level => 'NC',
106     type => 'Reserved Prefixes and Namespace Names:=xml');
107 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
108     $self->{onerror}
109 wakaba 1.33 ->(node => $attr, level => 'NC',
110     type => 'Reserved Prefixes and Namespace Names:=xmlns');
111 wakaba 1.13 }
112     },
113 wakaba 1.9 },
114     };
115    
116 wakaba 1.14 ## ISSUE: Should we really allow these attributes?
117 wakaba 1.13 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
118     $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
119     $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
120     $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
121    
122 wakaba 1.3 ## ANY
123 wakaba 1.42 our $AnyChecker = sub {
124 wakaba 1.4 my ($self, $todo) = @_;
125     my $el = $todo->{node};
126     my $new_todos = [];
127 wakaba 1.3 my @nodes = (@{$el->child_nodes});
128     while (@nodes) {
129     my $node = shift @nodes;
130     $self->_remove_minuses ($node) and next if ref $node eq 'HASH';
131    
132     my $nt = $node->node_type;
133     if ($nt == 1) {
134     my $node_ns = $node->namespace_uri;
135     $node_ns = '' unless defined $node_ns;
136     my $node_ln = $node->manakai_local_name;
137     if ($self->{minuses}->{$node_ns}->{$node_ln}) {
138     $self->{onerror}->(node => $node, type => 'element not allowed');
139     }
140 wakaba 1.4 push @$new_todos, {type => 'element', node => $node};
141 wakaba 1.3 } elsif ($nt == 5) {
142     unshift @nodes, @{$node->child_nodes};
143     }
144     }
145 wakaba 1.4 return ($new_todos);
146 wakaba 1.3 }; # $AnyChecker
147    
148 wakaba 1.42 our $ElementDefault = {
149 wakaba 1.1 checker => sub {
150 wakaba 1.4 my ($self, $todo) = @_;
151 wakaba 1.33 $self->{onerror}->(node => $todo->{node}, level => 'unsupported',
152     type => 'element');
153 wakaba 1.4 return $AnyChecker->($self, $todo);
154 wakaba 1.1 },
155 wakaba 1.9 attrs_checker => sub {
156     my ($self, $todo) = @_;
157     for my $attr (@{$todo->{node}->attributes}) {
158     my $attr_ns = $attr->namespace_uri;
159     $attr_ns = '' unless defined $attr_ns;
160     my $attr_ln = $attr->manakai_local_name;
161     my $checker = $AttrChecker->{$attr_ns}->{$attr_ln}
162     || $AttrChecker->{$attr_ns}->{''};
163     if ($checker) {
164     $checker->($self, $attr);
165 wakaba 1.17 } else {
166 wakaba 1.33 $self->{onerror}->(node => $attr, level => 'unsupported',
167     type => 'attribute');
168 wakaba 1.9 }
169     }
170     },
171 wakaba 1.1 };
172    
173 wakaba 1.7 my $HTMLTransparentElements = {
174     $HTML_NS => {qw/ins 1 font 1 noscript 1/},
175 wakaba 1.29 ## NOTE: |html:noscript| is transparent if scripting is disabled
176     ## and not in |head|.
177 wakaba 1.7 };
178    
179 wakaba 1.42 our $Element = {};
180 wakaba 1.7
181 wakaba 1.42 sub check_document ($$$) {
182     my ($self, $doc, $onerror) = @_;
183     $self = bless {}, $self unless ref $self;
184     $self->{onerror} = $onerror;
185 wakaba 1.1
186 wakaba 1.42 my $docel = $doc->document_element;
187     unless (defined $docel) {
188     ## ISSUE: Should we check content of Document node?
189     $onerror->(node => $doc, type => 'no document element');
190     ## ISSUE: Is this non-conforming (to what spec)? Or just a warning?
191     return {
192     class => {},
193     id => {}, table => [], term => {},
194     };
195 wakaba 1.1 }
196    
197 wakaba 1.42 ## ISSUE: Unexpanded entity references and HTML5 conformance
198 wakaba 1.1
199 wakaba 1.42 my $docel_nsuri = $docel->namespace_uri;
200     $docel_nsuri = '' unless defined $docel_nsuri;
201     my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
202     $Element->{$docel_nsuri}->{''} ||
203     $ElementDefault;
204     if ($docel_def->{is_root}) {
205     #
206     } else {
207     $onerror->(node => $docel, type => 'element not allowed');
208 wakaba 1.1 }
209    
210 wakaba 1.42 ## TODO: Check for other items other than document element
211     ## (second (errorous) element, text nodes, PI nodes, doctype nodes)
212 wakaba 1.2
213 wakaba 1.42 return $self->check_element ($docel, $onerror);
214     } # check_document
215 wakaba 1.1
216 wakaba 1.42 sub check_element ($$$) {
217     my ($self, $el, $onerror) = @_;
218     $self = bless {}, $self unless ref $self;
219     $self->{onerror} = $onerror;
220 wakaba 1.2
221 wakaba 1.42 $self->{minuses} = {};
222     $self->{id} = {};
223     $self->{term} = {};
224     $self->{usemap} = [];
225     $self->{contextmenu} = [];
226     $self->{map} = {};
227     $self->{menu} = {};
228     $self->{has_link_type} = {};
229     $self->{return} = {
230     class => {},
231     id => $self->{id}, table => [], term => $self->{term},
232     };
233 wakaba 1.4
234 wakaba 1.42 my @todo = ({type => 'element', node => $el});
235     while (@todo) {
236     my $todo = shift @todo;
237     if ($todo->{type} eq 'element') {
238     my $prefix = $todo->{node}->prefix;
239     if (defined $prefix and $prefix eq 'xmlns') {
240     $self->{onerror}
241     ->(node => $todo->{node}, level => 'NC',
242     type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
243 wakaba 1.7 }
244 wakaba 1.42 my $nsuri = $todo->{node}->namespace_uri;
245     $nsuri = '' unless defined $nsuri;
246     unless ($Namespace->{$nsuri}->{loaded}) {
247     if ($Namespace->{$nsuri}->{module}) {
248     eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
249     } else {
250     $Namespace->{$nsuri}->{loaded} = 1;
251 wakaba 1.1 }
252     }
253 wakaba 1.42 my $ln = $todo->{node}->manakai_local_name;
254     my $eldef = $Element->{$nsuri}->{$ln} ||
255     $Element->{$nsuri}->{''} ||
256     $ElementDefault;
257     $eldef->{attrs_checker}->($self, $todo);
258     my ($new_todos) = $eldef->{checker}->($self, $todo);
259     unshift @todo, @$new_todos;
260     } elsif ($todo->{type} eq 'element-attributes') {
261     my $prefix = $todo->{node}->prefix;
262     if (defined $prefix and $prefix eq 'xmlns') {
263     $self->{onerror}
264     ->(node => $todo->{node}, level => 'NC',
265     type => 'Reserved Prefixes and Namespace Names:<xmlns:>');
266     }
267     my $nsuri = $todo->{node}->namespace_uri;
268     $nsuri = '' unless defined $nsuri;
269     unless ($Namespace->{$nsuri}->{loaded}) {
270     if ($Namespace->{$nsuri}->{module}) {
271     eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
272 wakaba 1.1 } else {
273 wakaba 1.42 $Namespace->{$nsuri}->{loaded} = 1;
274 wakaba 1.1 }
275     }
276 wakaba 1.9 my $ln = $todo->{node}->manakai_local_name;
277     my $eldef = $Element->{$nsuri}->{$ln} ||
278     $Element->{$nsuri}->{''} ||
279     $ElementDefault;
280     $eldef->{attrs_checker}->($self, $todo);
281 wakaba 1.4 } elsif ($todo->{type} eq 'plus') {
282     $self->_remove_minuses ($todo);
283 wakaba 1.30 } elsif ($todo->{type} eq 'code') {
284     $todo->{code}->();
285     } else {
286     die "$0: Internal error: Unsupported checking action type |$todo->{type}|";
287 wakaba 1.4 }
288 wakaba 1.1 }
289 wakaba 1.17
290     for (@{$self->{usemap}}) {
291     unless ($self->{map}->{$_->[0]}) {
292     $self->{onerror}->(node => $_->[1], type => 'no referenced map');
293     }
294     }
295    
296 wakaba 1.32 for (@{$self->{contextmenu}}) {
297     unless ($self->{menu}->{$_->[0]}) {
298     $self->{onerror}->(node => $_->[1], type => 'no referenced menu');
299     }
300     }
301    
302 wakaba 1.17 delete $self->{minuses};
303     delete $self->{onerror};
304     delete $self->{id};
305     delete $self->{usemap};
306     delete $self->{map};
307 wakaba 1.33 return $self->{return};
308 wakaba 1.1 } # check_element
309    
310 wakaba 1.2 sub _add_minuses ($@) {
311     my $self = shift;
312     my $r = {};
313     for my $list (@_) {
314     for my $ns (keys %$list) {
315     for my $ln (keys %{$list->{$ns}}) {
316     unless ($self->{minuses}->{$ns}->{$ln}) {
317     $self->{minuses}->{$ns}->{$ln} = 1;
318     $r->{$ns}->{$ln} = 1;
319     }
320     }
321     }
322     }
323 wakaba 1.4 return {type => 'plus', list => $r};
324 wakaba 1.2 } # _add_minuses
325    
326     sub _remove_minuses ($$) {
327 wakaba 1.4 my ($self, $todo) = @_;
328     for my $ns (keys %{$todo->{list}}) {
329     for my $ln (keys %{$todo->{list}->{$ns}}) {
330     delete $self->{minuses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
331 wakaba 1.2 }
332     }
333     1;
334     } # _remove_minuses
335    
336 wakaba 1.30 sub _check_get_children ($$$) {
337     my ($self, $node, $parent_todo) = @_;
338 wakaba 1.4 my $new_todos = [];
339 wakaba 1.2 my $sib = [];
340     TP: {
341     my $node_ns = $node->namespace_uri;
342     $node_ns = '' unless defined $node_ns;
343     my $node_ln = $node->manakai_local_name;
344     if ($node_ns eq $HTML_NS) {
345     if ($node_ln eq 'noscript') {
346     my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
347     push @$sib, $end;
348     }
349     }
350 wakaba 1.31 ## TODO: |noscript| is not a transparent element in |head|.
351 wakaba 1.7 if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
352     unshift @$sib, @{$node->child_nodes};
353 wakaba 1.9 push @$new_todos, {type => 'element-attributes', node => $node};
354 wakaba 1.7 last TP;
355 wakaba 1.2 }
356 wakaba 1.8 if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
357 wakaba 1.2 if ($node->has_attribute_ns (undef, 'src')) {
358     unshift @$sib, @{$node->child_nodes};
359 wakaba 1.9 push @$new_todos, {type => 'element-attributes', node => $node};
360 wakaba 1.2 last TP;
361     } else {
362     my @cn = @{$node->child_nodes};
363     CN: while (@cn) {
364     my $cn = shift @cn;
365     my $cnt = $cn->node_type;
366     if ($cnt == 1) {
367 wakaba 1.8 my $cn_nsuri = $cn->namespace_uri;
368     $cn_nsuri = '' unless defined $cn_nsuri;
369     if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'source') {
370 wakaba 1.2 #
371     } else {
372     last CN;
373     }
374     } elsif ($cnt == 3 or $cnt == 4) {
375     if ($cn->data =~ /[^\x09-\x0D\x20]/) {
376     last CN;
377     }
378     }
379     } # CN
380     unshift @$sib, @cn;
381     }
382     }
383 wakaba 1.4 push @$new_todos, {type => 'element', node => $node};
384 wakaba 1.2 } # TP
385 wakaba 1.30
386     for my $new_todo (@$new_todos) {
387     $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
388     }
389    
390 wakaba 1.4 return ($sib, $new_todos);
391 wakaba 1.2 } # _check_get_children
392    
393 wakaba 1.1 1;
394 wakaba 1.42 # $Date: 2007/08/04 13:23:36 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24