/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.88 - (hide annotations) (download)
Sat Aug 30 10:26:39 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.87: +3 -2 lines
++ whatpm/t/ChangeLog	30 Aug 2008 10:22:30 -0000
	* ContentChecker.t: Updated for latest version of the
	Whatpm::ContentChecker module.

	* content-model-1.dat, content-model-2.dat, content-model-3.dat,
	content-model-4.dat, content-model-6.dat, content-model-atom-1.dat,
	content-model-atom-2.dat, content-model-atom-threading-1.dat,
	table-1.dat: Results updated.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	30 Aug 2008 10:24:24 -0000
	* ContentChecker.pm: Error level definition for |xml_id_error|
	was missing.

	* URIChecker.pm: The end of the URL should be marked as the
	error location for an empty path error.  The position
	between the userinfo and the port components should be
	marked as the error location for an empty host error.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ContentChecker/ChangeLog	30 Aug 2008 10:26:28 -0000
2008-08-30  Wakaba  <wakaba@suika.fam.cx>

	* Atom.pm: s/element missing/child element missing/ for
	consistency.

	* HTML.pm: Typos fixed.
	(pre): "No significant content" error was unintentionally
	disabled.  s/element missing/child element missing/ for
	consistency.

1 wakaba 1.1 package Whatpm::ContentChecker;
2     use strict;
3 wakaba 1.88 our $VERSION=do{my @r=(q$Revision: 1.87 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.18 require Whatpm::URIChecker;
6    
7 wakaba 1.13 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8     ## be applied to an in-memory representation (i.e. DOM)?
9    
10 wakaba 1.50 ## TODO: Conformance of an HTML document with non-html root element.
11    
12 wakaba 1.70 ## Stability
13 wakaba 1.67 sub FEATURE_STATUS_REC () { 0b1 } ## Interoperable standard
14     sub FEATURE_STATUS_CR () { 0b10 } ## Call for implementation
15     sub FEATURE_STATUS_LC () { 0b100 } ## Last call for comments
16     sub FEATURE_STATUS_WD () { 0b1000 } ## Working or editor's draft
17    
18 wakaba 1.70 ## Deprecated
19     sub FEATURE_DEPRECATED_SHOULD () { 0b100000 } ## SHOULD-level
20     sub FEATURE_DEPRECATED_INFO () { 0b1000000 } ## Does not affect conformance
21    
22     ## Conformance
23     sub FEATURE_ALLOWED () { 0b10000 }
24    
25 wakaba 1.42 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
26 wakaba 1.9 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
27     my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
28    
29 wakaba 1.42 my $Namespace = {
30 wakaba 1.79 '' => {loaded => 1},
31 wakaba 1.43 q<http://www.w3.org/2005/Atom> => {module => 'Whatpm::ContentChecker::Atom'},
32 wakaba 1.72 q<http://purl.org/syndication/history/1.0>
33     => {module => 'Whatpm::ContentChecker::Atom'},
34     q<http://purl.org/syndication/threading/1.0>
35     => {module => 'Whatpm::ContentChecker::Atom'},
36 wakaba 1.42 $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
37     $XML_NS => {loaded => 1},
38     $XMLNS_NS => {loaded => 1},
39 wakaba 1.73 q<http://www.w3.org/1999/02/22-rdf-syntax-ns#> => {loaded => 1},
40 wakaba 1.42 };
41    
42 wakaba 1.79 sub load_ns_module ($) {
43     my $nsuri = shift; # namespace URI or ''
44     unless ($Namespace->{$nsuri}->{loaded}) {
45     if ($Namespace->{$nsuri}->{module}) {
46     eval qq{ require $Namespace->{$nsuri}->{module} } or die $@;
47     } else {
48     $Namespace->{$nsuri}->{loaded} = 1;
49     }
50     }
51     } # load_ns_module
52    
53 wakaba 1.42 our $AttrChecker = {
54 wakaba 1.9 $XML_NS => {
55 wakaba 1.13 space => sub {
56     my ($self, $attr) = @_;
57     my $value = $attr->value;
58     if ($value eq 'default' or $value eq 'preserve') {
59     #
60     } else {
61     ## NOTE: An XML "error"
62 wakaba 1.83 $self->{onerror}->(node => $attr, level => $self->{level}->{xml_error},
63 wakaba 1.33 type => 'invalid attribute value');
64 wakaba 1.13 }
65     },
66     lang => sub {
67 wakaba 1.35 my ($self, $attr) = @_;
68 wakaba 1.47 my $value = $attr->value;
69     if ($value eq '') {
70     #
71     } else {
72     require Whatpm::LangTag;
73     Whatpm::LangTag->check_rfc3066_language_tag ($value, sub {
74 wakaba 1.83 $self->{onerror}->(@_, node => $attr);
75 wakaba 1.85 }, $self->{level});
76 wakaba 1.47 }
77    
78 wakaba 1.13 ## NOTE: "The values of the attribute are language identifiers
79     ## as defined by [IETF RFC 3066], Tags for the Identification
80     ## of Languages, or its successor; in addition, the empty string
81     ## may be specified." ("may" in lower case)
82 wakaba 1.47 ## NOTE: Is an RFC 3066-valid (but RFC 4647-invalid) language tag
83     ## allowed today?
84    
85     ## TODO: test data
86    
87 wakaba 1.35 if ($attr->owner_document->manakai_is_html) { # MUST NOT
88 wakaba 1.83 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang',
89     level => $self->{level}->{must});
90 wakaba 1.35 ## TODO: Test data...
91     }
92 wakaba 1.13 },
93     base => sub {
94     my ($self, $attr) = @_;
95     my $value = $attr->value;
96     if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
97     $self->{onerror}->(node => $attr,
98 wakaba 1.83 type => 'invalid attribute value',
99     level => $self->{level}->{fact}, ## TODO: correct?
100     );
101 wakaba 1.13 }
102 wakaba 1.18 ## NOTE: Conformance to URI standard is not checked since there is
103     ## no author requirement on conformance in the XML Base specification.
104 wakaba 1.13 },
105     id => sub {
106     my ($self, $attr) = @_;
107     my $value = $attr->value;
108     $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
109     $value =~ s/^\x20//;
110     $value =~ s/\x20$//;
111     ## TODO: NCName in XML 1.0 or 1.1
112     ## TODO: declared type is ID?
113 wakaba 1.83 if ($self->{id}->{$value}) {
114     $self->{onerror}->(node => $attr,
115     type => 'duplicate ID',
116     level => $self->{level}->{xml_id_error});
117 wakaba 1.37 push @{$self->{id}->{$value}}, $attr;
118 wakaba 1.13 } else {
119 wakaba 1.37 $self->{id}->{$value} = [$attr];
120 wakaba 1.13 }
121     },
122 wakaba 1.9 },
123     $XMLNS_NS => {
124 wakaba 1.13 '' => sub {
125     my ($self, $attr) = @_;
126     my $ln = $attr->manakai_local_name;
127     my $value = $attr->value;
128     if ($value eq $XML_NS and $ln ne 'xml') {
129     $self->{onerror}
130 wakaba 1.83 ->(node => $attr,
131     type => 'Reserved Prefixes and Namespace Names:Name',
132     text => $value,
133     level => $self->{level}->{nc});
134 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
135     $self->{onerror}
136 wakaba 1.83 ->(node => $attr,
137     type => 'Reserved Prefixes and Namespace Names:Name',
138     text => $value,
139     level => $self->{level}->{nc});
140 wakaba 1.13 }
141     if ($ln eq 'xml' and $value ne $XML_NS) {
142     $self->{onerror}
143 wakaba 1.83 ->(node => $attr,
144     type => 'Reserved Prefixes and Namespace Names:Prefix',
145     text => $ln,
146     level => $self->{level}->{nc});
147 wakaba 1.13 } elsif ($ln eq 'xmlns') {
148     $self->{onerror}
149 wakaba 1.83 ->(node => $attr,
150     type => 'Reserved Prefixes and Namespace Names:Prefix',
151     text => $ln,
152     level => $self->{level}->{nc});
153 wakaba 1.13 }
154     ## TODO: If XML 1.0 and empty
155     },
156     xmlns => sub {
157     my ($self, $attr) = @_;
158     ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
159     ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
160 wakaba 1.18 ## TODO: relative references are deprecated
161 wakaba 1.13 my $value = $attr->value;
162     if ($value eq $XML_NS) {
163     $self->{onerror}
164 wakaba 1.83 ->(node => $attr,
165     type => 'Reserved Prefixes and Namespace Names:Name',
166     text => $value,
167     level => $self->{level}->{nc});
168 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
169     $self->{onerror}
170 wakaba 1.83 ->(node => $attr,
171     type => 'Reserved Prefixes and Namespace Names:Name',
172     text => $value,
173     level => $self->{level}->{nc});
174 wakaba 1.13 }
175     },
176 wakaba 1.9 },
177     };
178    
179 wakaba 1.14 ## ISSUE: Should we really allow these attributes?
180 wakaba 1.13 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
181     $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
182     $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
183     $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
184    
185 wakaba 1.79 our $AttrStatus;
186    
187     for (qw/space lang base id/) {
188     $AttrStatus->{$XML_NS}->{$_} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
189     $AttrStatus->{''}->{"xml:$_"} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
190     ## XML 1.0: FEATURE_STATUS_CR
191     ## XML 1.1: FEATURE_STATUS_REC
192     ## XML Namespaces 1.0: FEATURE_STATUS_CR
193     ## XML Namespaces 1.1: FEATURE_STATUS_REC
194     ## XML Base: FEATURE_STATUS_REC
195     ## xml:id: FEATURE_STATUS_REC
196     }
197    
198     $AttrStatus->{$XMLNS_NS}->{''} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
199    
200     ## TODO: xsi:schemaLocation for XHTML2 support (very, very low priority)
201    
202 wakaba 1.60 our %AnyChecker = (
203     check_start => sub { },
204     check_attrs => sub {
205     my ($self, $item, $element_state) = @_;
206     for my $attr (@{$item->{node}->attributes}) {
207 wakaba 1.9 my $attr_ns = $attr->namespace_uri;
208     $attr_ns = '' unless defined $attr_ns;
209     my $attr_ln = $attr->manakai_local_name;
210 wakaba 1.79
211     load_ns_module ($attr_ns);
212    
213 wakaba 1.9 my $checker = $AttrChecker->{$attr_ns}->{$attr_ln}
214 wakaba 1.60 || $AttrChecker->{$attr_ns}->{''};
215 wakaba 1.79 my $status = $AttrStatus->{$attr_ns}->{$attr_ln}
216     || $AttrStatus->{$attr_ns}->{''};
217     if (not defined $status) {
218     $status = FEATURE_ALLOWED;
219     ## NOTE: FEATURE_ALLOWED for all attributes, since the element
220     ## is not supported and therefore "attribute not defined" error
221     ## should not raised (too verbose) and global attributes should be
222     ## allowed anyway (if a global attribute has its specified creteria
223     ## for where it may be specified, then it should be checked in it's
224     ## checker function).
225     }
226 wakaba 1.9 if ($checker) {
227     $checker->($self, $attr);
228 wakaba 1.17 } else {
229 wakaba 1.83 $self->{onerror}->(node => $attr,
230     type => 'unknown attribute',
231     level => $self->{level}->{uncertain});
232 wakaba 1.9 }
233 wakaba 1.79 $self->_attr_status_info ($attr, $status);
234 wakaba 1.9 }
235     },
236 wakaba 1.60 check_child_element => sub {
237     my ($self, $item, $child_el, $child_nsuri, $child_ln,
238     $child_is_transparent, $element_state) = @_;
239     if ($self->{minus_elements}->{$child_nsuri}->{$child_ln}) {
240     $self->{onerror}->(node => $child_el,
241     type => 'element not allowed:minus',
242 wakaba 1.83 level => $self->{level}->{must});
243 wakaba 1.60 } elsif ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
244     #
245     } else {
246     #
247     }
248     },
249     check_child_text => sub { },
250     check_end => sub {
251     my ($self, $item, $element_state) = @_;
252 wakaba 1.82 ## NOTE: There is a modified copy of the code below for |html:ruby|.
253 wakaba 1.60 if ($element_state->{has_significant}) {
254 wakaba 1.66 $item->{real_parent_state}->{has_significant} = 1;
255 wakaba 1.60 }
256     },
257     );
258    
259     our $ElementDefault = {
260     %AnyChecker,
261 wakaba 1.70 status => FEATURE_ALLOWED,
262     ## NOTE: No "element not defined" error - it is not supported anyway.
263 wakaba 1.60 check_start => sub {
264     my ($self, $item, $element_state) = @_;
265 wakaba 1.83 $self->{onerror}->(node => $item->{node},
266     type => 'unknown element',
267     level => $self->{level}->{uncertain});
268 wakaba 1.60 },
269 wakaba 1.1 };
270    
271 wakaba 1.60 our $HTMLEmbeddedContent = {
272     ## NOTE: All embedded content is also phrasing content.
273     $HTML_NS => {
274     img => 1, iframe => 1, embed => 1, object => 1, video => 1, audio => 1,
275     canvas => 1,
276     },
277     q<http://www.w3.org/1998/Math/MathML> => {math => 1},
278     q<http://www.w3.org/2000/svg> => {svg => 1},
279     ## NOTE: Foreign elements with content (but no metadata) are
280     ## embedded content.
281     };
282    
283 wakaba 1.7 my $HTMLTransparentElements = {
284 wakaba 1.57 $HTML_NS => {qw/ins 1 del 1 font 1 noscript 1 canvas 1/},
285 wakaba 1.29 ## NOTE: |html:noscript| is transparent if scripting is disabled
286     ## and not in |head|.
287 wakaba 1.7 };
288    
289 wakaba 1.61 my $HTMLSemiTransparentElements = {
290     $HTML_NS => {object => 1, video => 1, audio => 1},
291     };
292 wakaba 1.57
293 wakaba 1.42 our $Element = {};
294 wakaba 1.7
295 wakaba 1.73 $Element->{q<http://www.w3.org/1999/02/22-rdf-syntax-ns#>}->{RDF} = {
296     %AnyChecker,
297     status => FEATURE_STATUS_REC | FEATURE_ALLOWED,
298     is_root => 1, ## ISSUE: Not explicitly allowed for non application/rdf+xml
299     check_start => sub {
300     my ($self, $item, $element_state) = @_;
301     my $triple = [];
302     push @{$self->{return}->{rdf}}, [$item->{node}, $triple];
303     require Whatpm::RDFXML;
304     my $rdf = Whatpm::RDFXML->new;
305 wakaba 1.75 ## TODO: Should we make bnodeid unique in a document?
306 wakaba 1.73 $rdf->{onerror} = $self->{onerror};
307 wakaba 1.84 $rdf->{level} = $self->{level};
308 wakaba 1.73 $rdf->{ontriple} = sub {
309     my %opt = @_;
310     push @$triple,
311     [$opt{node}, $opt{subject}, $opt{predicate}, $opt{object}];
312 wakaba 1.74 if (defined $opt{id}) {
313     push @$triple,
314     [$opt{node},
315     $opt{id},
316     {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#subject>},
317     $opt{subject}];
318     push @$triple,
319     [$opt{node},
320     $opt{id},
321     {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate>},
322     $opt{predicate}];
323     push @$triple,
324     [$opt{node},
325     $opt{id},
326     {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#object>},
327     $opt{object}];
328     push @$triple,
329     [$opt{node},
330     $opt{id},
331     {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>},
332     {uri => q<http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement>}];
333     }
334 wakaba 1.73 };
335     $rdf->convert_rdf_element ($item->{node});
336     },
337     };
338    
339 wakaba 1.83 my $default_error_level = {
340     must => 'm',
341     should => 's',
342     warn => 'w',
343     good => 'w',
344     info => 'i',
345 wakaba 1.85
346 wakaba 1.83 uncertain => 'u',
347    
348 wakaba 1.84 html4_fact => 'm',
349 wakaba 1.83 xml_error => 'm', ## TODO: correct?
350 wakaba 1.88 xml_id_error => 'm', ## TODO: ?
351 wakaba 1.83 nc => 'm', ## XML Namespace Constraints ## TODO: correct?
352 wakaba 1.84
353 wakaba 1.85 uri_syntax => 'm',
354     uri_fact => 'm',
355     uri_lc_must => 'm',
356     uri_lc_should => 'w',
357    
358 wakaba 1.87 mime_must => 'm', # lowercase "must"
359     mime_fact => 'm',
360     mime_strongly_discouraged => 'w',
361     mime_discouraged => 'w',
362    
363 wakaba 1.85 langtag_fact => 'm',
364    
365 wakaba 1.84 rdf_fact => 'm',
366     rdf_grammer => 'm',
367     rdf_lc_must => 'm',
368 wakaba 1.83 };
369    
370 wakaba 1.56 sub check_document ($$$;$) {
371     my ($self, $doc, $onerror, $onsubdoc) = @_;
372 wakaba 1.42 $self = bless {}, $self unless ref $self;
373     $self->{onerror} = $onerror;
374 wakaba 1.56 $self->{onsubdoc} = $onsubdoc || sub {
375     warn "A subdocument is not conformance-checked";
376     };
377 wakaba 1.1
378 wakaba 1.83 $self->{level} ||= $default_error_level;
379 wakaba 1.48
380 wakaba 1.73 ## TODO: If application/rdf+xml, RDF/XML mode should be invoked.
381    
382 wakaba 1.42 my $docel = $doc->document_element;
383     unless (defined $docel) {
384     ## ISSUE: Should we check content of Document node?
385 wakaba 1.83 $onerror->(node => $doc, type => 'no document element',
386     level => $self->{level}->{must});
387 wakaba 1.42 ## ISSUE: Is this non-conforming (to what spec)? Or just a warning?
388     return {
389     class => {},
390     id => {}, table => [], term => {},
391     };
392 wakaba 1.1 }
393    
394 wakaba 1.42 ## ISSUE: Unexpanded entity references and HTML5 conformance
395 wakaba 1.1
396 wakaba 1.42 my $docel_nsuri = $docel->namespace_uri;
397     $docel_nsuri = '' unless defined $docel_nsuri;
398 wakaba 1.79 load_ns_module ($docel_nsuri);
399 wakaba 1.42 my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
400     $Element->{$docel_nsuri}->{''} ||
401     $ElementDefault;
402     if ($docel_def->{is_root}) {
403     #
404 wakaba 1.50 } elsif ($docel_def->{is_xml_root}) {
405     unless ($doc->manakai_is_html) {
406     #
407     } else {
408 wakaba 1.83 $onerror->(node => $docel, type => 'element not allowed:root:xml',
409     level => $self->{level}->{must});
410 wakaba 1.50 }
411 wakaba 1.42 } else {
412 wakaba 1.83 $onerror->(node => $docel, type => 'element not allowed:root',
413     level => $self->{level}->{must});
414 wakaba 1.1 }
415    
416 wakaba 1.42 ## TODO: Check for other items other than document element
417     ## (second (errorous) element, text nodes, PI nodes, doctype nodes)
418 wakaba 1.2
419 wakaba 1.56 my $return = $self->check_element ($docel, $onerror, $onsubdoc);
420 wakaba 1.51
421 wakaba 1.52 ## TODO: Test for these checks are necessary.
422 wakaba 1.51 my $charset_name = $doc->input_encoding;
423     if (defined $charset_name) {
424     require Message::Charset::Info;
425     my $charset = $Message::Charset::Info::IANACharset->{$charset_name};
426    
427 wakaba 1.71 if ($doc->manakai_is_html) {
428     if (not $doc->manakai_has_bom and
429     not defined $doc->manakai_charset) {
430     unless ($charset->{is_html_ascii_superset}) {
431 wakaba 1.86 $onerror->(node => $doc,
432     level => $self->{level}->{must},
433 wakaba 1.83 type => 'non ascii superset',
434     text => $charset_name);
435 wakaba 1.71 }
436    
437     if (not $self->{has_charset} and ## TODO: This does not work now.
438     not $charset->{iana_names}->{'us-ascii'}) {
439 wakaba 1.86 $onerror->(node => $doc,
440     level => $self->{level}->{must},
441 wakaba 1.83 type => 'no character encoding declaration',
442     text => $charset_name);
443 wakaba 1.71 }
444 wakaba 1.51 }
445 wakaba 1.71
446     if ($charset->{iana_names}->{'utf-8'}) {
447     #
448     } elsif ($charset->{iana_names}->{'jis_x0212-1990'} or
449     $charset->{iana_names}->{'x-jis0208'} or
450     $charset->{iana_names}->{'utf-32'} or ## ISSUE: UTF-32BE? UTF-32LE?
451     $charset->{is_ebcdic_based}) {
452     $onerror->(node => $doc,
453 wakaba 1.83 type => 'bad character encoding',
454     text => $charset_name,
455     level => $self->{level}->{should},
456     layer => 'encode');
457 wakaba 1.71 } elsif ($charset->{iana_names}->{'cesu-8'} or
458     $charset->{iana_names}->{'utf-8'} or ## ISSUE: UNICODE-1-1-UTF-7?
459     $charset->{iana_names}->{'bocu-1'} or
460     $charset->{iana_names}->{'scsu'}) {
461     $onerror->(node => $doc,
462 wakaba 1.83 type => 'disallowed character encoding',
463     text => $charset_name,
464     level => $self->{level}->{must},
465     layer => 'encode');
466 wakaba 1.71 } else {
467     $onerror->(node => $doc,
468 wakaba 1.83 type => 'non-utf-8 character encoding',
469     text => $charset_name,
470     level => $self->{level}->{good},
471     layer => 'encode');
472 wakaba 1.51 }
473     }
474 wakaba 1.52 } elsif ($doc->manakai_is_html) {
475     ## NOTE: MUST and SHOULD requirements above cannot be tested,
476     ## since the document has no input charset encoding information.
477     $onerror->(node => $doc,
478 wakaba 1.83 type => 'character encoding unchecked',
479     level => $self->{level}->{info},
480     layer => 'encode');
481 wakaba 1.51 }
482    
483     return $return;
484 wakaba 1.42 } # check_document
485 wakaba 1.1
486 wakaba 1.81 ## Check an element. The element is checked as if it is an orphan node (i.e.
487     ## an element without a parent node).
488 wakaba 1.56 sub check_element ($$$;$) {
489     my ($self, $el, $onerror, $onsubdoc) = @_;
490 wakaba 1.42 $self = bless {}, $self unless ref $self;
491     $self->{onerror} = $onerror;
492 wakaba 1.56 $self->{onsubdoc} = $onsubdoc || sub {
493     warn "A subdocument is not conformance-checked";
494     };
495 wakaba 1.2
496 wakaba 1.83 $self->{level} ||= $default_error_level;
497 wakaba 1.48
498 wakaba 1.61 $self->{plus_elements} = {};
499     $self->{minus_elements} = {};
500 wakaba 1.42 $self->{id} = {};
501     $self->{term} = {};
502     $self->{usemap} = [];
503 wakaba 1.78 $self->{ref} = []; # datetemplate data references
504     $self->{template} = []; # datatemplate template references
505 wakaba 1.42 $self->{contextmenu} = [];
506     $self->{map} = {};
507     $self->{menu} = {};
508     $self->{has_link_type} = {};
509 wakaba 1.60 $self->{flag} = {};
510 wakaba 1.46 #$self->{has_uri_attr};
511     #$self->{has_hyperlink_element};
512 wakaba 1.51 #$self->{has_charset};
513 wakaba 1.57 #$self->{has_base};
514 wakaba 1.42 $self->{return} = {
515     class => {},
516 wakaba 1.80 id => $self->{id},
517     table => [], # table objects returned by Whatpm::HTMLTable
518     term => $self->{term},
519 wakaba 1.76 uri => {}, # URIs other than those in RDF triples
520     ## TODO: xmlns="", SYSTEM "", atom:* src="", xml:base=""
521 wakaba 1.73 rdf => [],
522 wakaba 1.42 };
523 wakaba 1.4
524 wakaba 1.60 my @item = ({type => 'element', node => $el, parent_state => {}});
525 wakaba 1.66 $item[-1]->{real_parent_state} = $item[-1]->{parent_state};
526 wakaba 1.60 while (@item) {
527     my $item = shift @item;
528     if (ref $item eq 'ARRAY') {
529     my $code = shift @$item;
530     next unless $code;## TODO: temp.
531     $code->(@$item);
532     } elsif ($item->{type} eq 'element') {
533     my $el_nsuri = $item->{node}->namespace_uri;
534     $el_nsuri = '' unless defined $el_nsuri;
535     my $el_ln = $item->{node}->manakai_local_name;
536 wakaba 1.79
537     load_ns_module ($el_nsuri);
538 wakaba 1.63
539     my $element_state = {};
540 wakaba 1.60 my $eldef = $Element->{$el_nsuri}->{$el_ln} ||
541     $Element->{$el_nsuri}->{''} ||
542 wakaba 1.42 $ElementDefault;
543 wakaba 1.61 my $content_def = $item->{transparent}
544     ? $item->{parent_def} || $eldef : $eldef;
545 wakaba 1.63 my $content_state = $item->{transparent}
546 wakaba 1.65 ? $item->{parent_def}
547     ? $item->{parent_state} || $element_state : $element_state
548     : $element_state;
549 wakaba 1.60
550 wakaba 1.67 unless ($eldef->{status} & FEATURE_STATUS_REC) {
551     my $status = $eldef->{status} & FEATURE_STATUS_CR ? 'cr' :
552     $eldef->{status} & FEATURE_STATUS_LC ? 'lc' :
553     $eldef->{status} & FEATURE_STATUS_WD ? 'wd' : 'non-standard';
554     $self->{onerror}->(node => $item->{node},
555     type => 'status:'.$status.':element',
556 wakaba 1.83 level => $self->{level}->{info});
557 wakaba 1.67 }
558 wakaba 1.70 if (not ($eldef->{status} & FEATURE_ALLOWED)) {
559     $self->{onerror}->(node => $item->{node},
560     type => 'element not defined',
561 wakaba 1.83 level => $self->{level}->{must});
562 wakaba 1.70 } elsif ($eldef->{status} & FEATURE_DEPRECATED_SHOULD) {
563     $self->{onerror}->(node => $item->{node},
564     type => 'deprecated:element',
565 wakaba 1.83 level => $self->{level}->{should});
566 wakaba 1.70 } elsif ($eldef->{status} & FEATURE_DEPRECATED_INFO) {
567     $self->{onerror}->(node => $item->{node},
568     type => 'deprecated:element',
569 wakaba 1.83 level => $self->{level}->{info});
570 wakaba 1.70 }
571 wakaba 1.67
572 wakaba 1.60 my @new_item;
573     push @new_item, [$eldef->{check_start}, $self, $item, $element_state];
574     push @new_item, [$eldef->{check_attrs}, $self, $item, $element_state];
575 wakaba 1.61
576 wakaba 1.60 my @child = @{$item->{node}->child_nodes};
577     while (@child) {
578     my $child = shift @child;
579     my $child_nt = $child->node_type;
580     if ($child_nt == 1) { # ELEMENT_NODE
581     my $child_nsuri = $child->namespace_uri;
582     $child_nsuri = '' unless defined $child_nsuri;
583     my $child_ln = $child->manakai_local_name;
584     if ($HTMLTransparentElements->{$child_nsuri}->{$child_ln} and
585     not (($self->{flag}->{in_head} or
586 wakaba 1.61 ($el_nsuri eq $HTML_NS and $el_ln eq 'head')) and
587     $child_nsuri eq $HTML_NS and $child_ln eq 'noscript')) {
588 wakaba 1.60 push @new_item, [$content_def->{check_child_element},
589     $self, $item, $child,
590 wakaba 1.66 $child_nsuri, $child_ln, 1,
591     $content_state, $element_state];
592 wakaba 1.60 push @new_item, {type => 'element', node => $child,
593 wakaba 1.65 parent_state => $content_state,
594 wakaba 1.61 parent_def => $content_def,
595 wakaba 1.66 real_parent_state => $element_state,
596 wakaba 1.60 transparent => 1};
597     } else {
598 wakaba 1.65 if ($item->{parent_def} and # has parent
599     $el_nsuri eq $HTML_NS) { ## $HTMLSemiTransparentElements
600 wakaba 1.61 if ($el_ln eq 'object') {
601     if ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
602     #
603     } elsif ($child_nsuri eq $HTML_NS and $child_ln eq 'param') {
604     #
605     } else {
606 wakaba 1.62 $content_def = $item->{parent_def} || $content_def;
607 wakaba 1.63 $content_state = $item->{parent_state} || $content_state;
608 wakaba 1.62 }
609     } elsif ($el_ln eq 'video' or $el_ln eq 'audio') {
610     if ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
611     #
612     } elsif ($child_nsuri eq $HTML_NS and $child_ln eq 'source') {
613     $element_state->{has_source} = 1;
614     } else {
615     $content_def = $item->{parent_def} || $content_def;
616 wakaba 1.63 $content_state = $item->{parent_state} || $content_state;
617 wakaba 1.61 }
618     }
619     }
620    
621 wakaba 1.60 push @new_item, [$content_def->{check_child_element},
622     $self, $item, $child,
623 wakaba 1.64 $child_nsuri, $child_ln,
624     $HTMLSemiTransparentElements
625     ->{$child_nsuri}->{$child_ln},
626 wakaba 1.66 $content_state, $element_state];
627 wakaba 1.60 push @new_item, {type => 'element', node => $child,
628 wakaba 1.65 parent_def => $content_def,
629 wakaba 1.66 real_parent_state => $element_state,
630 wakaba 1.65 parent_state => $content_state};
631 wakaba 1.60 }
632    
633     if ($HTMLEmbeddedContent->{$child_nsuri}->{$child_ln}) {
634     $element_state->{has_significant} = 1;
635     }
636     } elsif ($child_nt == 3 or # TEXT_NODE
637     $child_nt == 4) { # CDATA_SECTION_NODE
638     my $has_significant = ($child->data =~ /[^\x09-\x0D\x20]/);
639     push @new_item, [$content_def->{check_child_text},
640     $self, $item, $child, $has_significant,
641 wakaba 1.66 $content_state, $element_state];
642     $element_state->{has_significant} ||= $has_significant;
643 wakaba 1.61 if ($has_significant and
644     $HTMLSemiTransparentElements->{$el_nsuri}->{$el_ln}) {
645     $content_def = $item->{parent_def} || $content_def;
646     }
647 wakaba 1.60 } elsif ($child_nt == 5) { # ENTITY_REFERENCE_NODE
648     push @child, @{$child->child_nodes};
649 wakaba 1.1 }
650 wakaba 1.60 ## TODO: PI_NODE
651     ## TODO: Unknown node type
652 wakaba 1.1 }
653 wakaba 1.60
654     push @new_item, [$eldef->{check_end}, $self, $item, $element_state];
655    
656     unshift @item, @new_item;
657 wakaba 1.30 } else {
658 wakaba 1.60 die "$0: Internal error: Unsupported checking action type |$item->{type}|";
659 wakaba 1.4 }
660 wakaba 1.1 }
661 wakaba 1.17
662 wakaba 1.78 for (@{$self->{template}}) {
663     ## TODO: If the document is an XML document, ...
664     ## NOTE: If the document is an HTML document:
665     ## ISSUE: We need to percent-decode?
666     F: {
667     if ($self->{id}->{$_->[0]}) {
668     my $el = $self->{id}->{$_->[0]}->[0]->owner_element;
669     if ($el->node_type == 1 and # ELEMENT_NODE
670     $el->manakai_local_name eq 'datatemplate') {
671     my $nsuri = $el->namespace_uri;
672     if (defined $nsuri and $nsuri eq $HTML_NS) {
673     if ($el eq $_->[1]->owner_element) {
674     $self->{onerror}->(node => $_->[1],
675     type => 'fragment points itself',
676 wakaba 1.83 level => $self->{level}->{must});
677 wakaba 1.78 }
678    
679     last F;
680     }
681     }
682     }
683     ## TODO: Should we raise a "fragment points nothing" error instead
684     ## if the fragment identifier identifies no element?
685    
686     $self->{onerror}->(node => $_->[1], type => 'template:not template',
687 wakaba 1.83 level => $self->{level}->{must});
688 wakaba 1.78 } # F
689     }
690    
691     for (@{$self->{ref}}) {
692     ## TOOD: If XML
693     ## NOTE: If it is an HTML document:
694     if ($_->[0] eq '') {
695     ## NOTE: It points the top of the document.
696     } elsif ($self->{id}->{$_->[0]}) {
697     if ($self->{id}->{$_->[0]}->[0]->owner_element
698     eq $_->[1]->owner_element) {
699     $self->{onerror}->(node => $_->[1], type => 'fragment points itself',
700 wakaba 1.83 level => $self->{level}->{must});
701 wakaba 1.78 }
702     } else {
703     $self->{onerror}->(node => $_->[1], type => 'fragment points nothing',
704 wakaba 1.83 level => $self->{level}->{must});
705 wakaba 1.78 }
706     }
707    
708     ## TODO: Maybe we should have $document->manakai_get_by_fragment or something
709    
710 wakaba 1.17 for (@{$self->{usemap}}) {
711     unless ($self->{map}->{$_->[0]}) {
712 wakaba 1.83 $self->{onerror}->(node => $_->[1], type => 'no referenced map',
713     level => $self->{level}->{must});
714 wakaba 1.17 }
715     }
716    
717 wakaba 1.32 for (@{$self->{contextmenu}}) {
718     unless ($self->{menu}->{$_->[0]}) {
719 wakaba 1.83 $self->{onerror}->(node => $_->[1], type => 'no referenced menu',
720     level => $self->{level}->{must});
721 wakaba 1.32 }
722     }
723    
724 wakaba 1.61 delete $self->{plus_elements};
725     delete $self->{minus_elements};
726 wakaba 1.17 delete $self->{onerror};
727     delete $self->{id};
728     delete $self->{usemap};
729 wakaba 1.78 delete $self->{ref};
730     delete $self->{template};
731 wakaba 1.17 delete $self->{map};
732 wakaba 1.33 return $self->{return};
733 wakaba 1.1 } # check_element
734    
735 wakaba 1.60 sub _add_minus_elements ($$@) {
736     my $self = shift;
737     my $element_state = shift;
738     for my $elements (@_) {
739     for my $nsuri (keys %$elements) {
740     for my $ln (keys %{$elements->{$nsuri}}) {
741     unless ($self->{minus_elements}->{$nsuri}->{$ln}) {
742     $element_state->{minus_elements_original}->{$nsuri}->{$ln} = 0;
743     $self->{minus_elements}->{$nsuri}->{$ln} = 1;
744     }
745     }
746     }
747     }
748     } # _add_minus_elements
749    
750     sub _remove_minus_elements ($$) {
751     my $self = shift;
752     my $element_state = shift;
753     for my $nsuri (keys %{$element_state->{minus_elements_original}}) {
754     for my $ln (keys %{$element_state->{minus_elements_original}->{$nsuri}}) {
755     delete $self->{minus_elements}->{$nsuri}->{$ln};
756     }
757     }
758     } # _remove_minus_elements
759    
760     sub _add_plus_elements ($$@) {
761     my $self = shift;
762     my $element_state = shift;
763     for my $elements (@_) {
764     for my $nsuri (keys %$elements) {
765     for my $ln (keys %{$elements->{$nsuri}}) {
766     unless ($self->{plus_elements}->{$nsuri}->{$ln}) {
767     $element_state->{plus_elements_original}->{$nsuri}->{$ln} = 0;
768     $self->{plus_elements}->{$nsuri}->{$ln} = 1;
769     }
770     }
771     }
772     }
773     } # _add_plus_elements
774    
775     sub _remove_plus_elements ($$) {
776     my $self = shift;
777     my $element_state = shift;
778     for my $nsuri (keys %{$element_state->{plus_elements_original}}) {
779     for my $ln (keys %{$element_state->{plus_elements_original}->{$nsuri}}) {
780     delete $self->{plus_elements}->{$nsuri}->{$ln};
781     }
782     }
783     } # _remove_plus_elements
784    
785 wakaba 1.68 sub _attr_status_info ($$$) {
786     my ($self, $attr, $status_code) = @_;
787 wakaba 1.70
788     if (not ($status_code & FEATURE_ALLOWED)) {
789     $self->{onerror}->(node => $attr,
790     type => 'attribute not defined',
791 wakaba 1.83 level => $self->{level}->{must});
792 wakaba 1.70 } elsif ($status_code & FEATURE_DEPRECATED_SHOULD) {
793     $self->{onerror}->(node => $attr,
794     type => 'deprecated:attr',
795 wakaba 1.83 level => $self->{level}->{should});
796 wakaba 1.70 } elsif ($status_code & FEATURE_DEPRECATED_INFO) {
797     $self->{onerror}->(node => $attr,
798     type => 'deprecated:attr',
799 wakaba 1.83 level => $self->{level}->{info});
800 wakaba 1.70 }
801    
802 wakaba 1.68 my $status;
803     if ($status_code & FEATURE_STATUS_REC) {
804     return;
805     } elsif ($status_code & FEATURE_STATUS_CR) {
806     $status = 'cr';
807     } elsif ($status_code & FEATURE_STATUS_LC) {
808     $status = 'lc';
809     } elsif ($status_code & FEATURE_STATUS_WD) {
810     $status = 'wd';
811     } else {
812     $status = 'non-standard';
813     }
814     $self->{onerror}->(node => $attr,
815     type => 'status:'.$status.':attr',
816 wakaba 1.83 level => $self->{level}->{info});
817 wakaba 1.68 } # _attr_status_info
818    
819 wakaba 1.2 sub _add_minuses ($@) {
820     my $self = shift;
821     my $r = {};
822     for my $list (@_) {
823     for my $ns (keys %$list) {
824     for my $ln (keys %{$list->{$ns}}) {
825     unless ($self->{minuses}->{$ns}->{$ln}) {
826     $self->{minuses}->{$ns}->{$ln} = 1;
827     $r->{$ns}->{$ln} = 1;
828     }
829     }
830     }
831     }
832 wakaba 1.4 return {type => 'plus', list => $r};
833 wakaba 1.2 } # _add_minuses
834    
835 wakaba 1.50 sub _add_pluses ($@) {
836     my $self = shift;
837     my $r = {};
838     for my $list (@_) {
839     for my $ns (keys %$list) {
840     for my $ln (keys %{$list->{$ns}}) {
841     unless ($self->{pluses}->{$ns}->{$ln}) {
842     $self->{pluses}->{$ns}->{$ln} = 1;
843     $r->{$ns}->{$ln} = 1;
844     }
845     }
846     }
847     }
848     return {type => 'minus', list => $r};
849     } # _add_pluses
850    
851 wakaba 1.2 sub _remove_minuses ($$) {
852 wakaba 1.4 my ($self, $todo) = @_;
853 wakaba 1.50 if ($todo->{type} eq 'minus') {
854     for my $ns (keys %{$todo->{list}}) {
855     for my $ln (keys %{$todo->{list}->{$ns}}) {
856     delete $self->{pluses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
857     }
858 wakaba 1.2 }
859 wakaba 1.50 } elsif ($todo->{type} eq 'plus') {
860     for my $ns (keys %{$todo->{list}}) {
861     for my $ln (keys %{$todo->{list}->{$ns}}) {
862     delete $self->{minuses}->{$ns}->{$ln} if $todo->{list}->{$ns}->{$ln};
863     }
864     }
865     } else {
866     die "$0: Unknown +- type: $todo->{type}";
867 wakaba 1.2 }
868     1;
869     } # _remove_minuses
870    
871 wakaba 1.50 ## NOTE: Priority for "minuses" and "pluses" are currently left
872     ## undefined and implemented inconsistently; it is not a problem for
873     ## now, since no element belongs to both lists.
874    
875 wakaba 1.30 sub _check_get_children ($$$) {
876     my ($self, $node, $parent_todo) = @_;
877 wakaba 1.4 my $new_todos = [];
878 wakaba 1.2 my $sib = [];
879     TP: {
880     my $node_ns = $node->namespace_uri;
881     $node_ns = '' unless defined $node_ns;
882     my $node_ln = $node->manakai_local_name;
883 wakaba 1.45 if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
884     if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
885     if ($parent_todo->{flag}->{in_head}) {
886     #
887     } else {
888     my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
889     push @$sib, $end;
890    
891     unshift @$sib, @{$node->child_nodes};
892     push @$new_todos, {type => 'element-attributes', node => $node};
893     last TP;
894     }
895 wakaba 1.58 } elsif ($node_ns eq $HTML_NS and $node_ln eq 'del') {
896     my $sig_flag = $parent_todo->{flag}->{has_descendant}->{significant};
897     unshift @$sib, @{$node->child_nodes};
898     push @$new_todos, {type => 'element-attributes', node => $node};
899     push @$new_todos,
900     {type => 'code',
901     code => sub {
902     $parent_todo->{flag}->{has_descendant}->{significant} = 0
903     if not $sig_flag;
904     }};
905     last TP;
906 wakaba 1.45 } else {
907     unshift @$sib, @{$node->child_nodes};
908     push @$new_todos, {type => 'element-attributes', node => $node};
909     last TP;
910 wakaba 1.2 }
911     }
912 wakaba 1.8 if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
913 wakaba 1.2 if ($node->has_attribute_ns (undef, 'src')) {
914     unshift @$sib, @{$node->child_nodes};
915 wakaba 1.9 push @$new_todos, {type => 'element-attributes', node => $node};
916 wakaba 1.2 last TP;
917     } else {
918     my @cn = @{$node->child_nodes};
919     CN: while (@cn) {
920     my $cn = shift @cn;
921     my $cnt = $cn->node_type;
922     if ($cnt == 1) {
923 wakaba 1.8 my $cn_nsuri = $cn->namespace_uri;
924     $cn_nsuri = '' unless defined $cn_nsuri;
925     if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'source') {
926 wakaba 1.2 #
927     } else {
928     last CN;
929     }
930     } elsif ($cnt == 3 or $cnt == 4) {
931     if ($cn->data =~ /[^\x09-\x0D\x20]/) {
932     last CN;
933     }
934     }
935     } # CN
936     unshift @$sib, @cn;
937     }
938 wakaba 1.57 } elsif ($node_ns eq $HTML_NS and $node_ln eq 'object') {
939     my @cn = @{$node->child_nodes};
940     CN: while (@cn) {
941     my $cn = shift @cn;
942     my $cnt = $cn->node_type;
943     if ($cnt == 1) {
944     my $cn_nsuri = $cn->namespace_uri;
945     $cn_nsuri = '' unless defined $cn_nsuri;
946     if ($cn_nsuri eq $HTML_NS and $cn->manakai_local_name eq 'param') {
947     #
948     } else {
949     last CN;
950     }
951     } elsif ($cnt == 3 or $cnt == 4) {
952     if ($cn->data =~ /[^\x09-\x0D\x20]/) {
953     last CN;
954     }
955     }
956     } # CN
957     unshift @$sib, @cn;
958 wakaba 1.2 }
959 wakaba 1.4 push @$new_todos, {type => 'element', node => $node};
960 wakaba 1.2 } # TP
961 wakaba 1.30
962     for my $new_todo (@$new_todos) {
963     $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
964     }
965    
966 wakaba 1.4 return ($sib, $new_todos);
967 wakaba 1.2 } # _check_get_children
968    
969 wakaba 1.44 =head1 LICENSE
970    
971 wakaba 1.56 Copyright 2007-2008 Wakaba <w@suika.fam.cx>
972 wakaba 1.44
973     This library is free software; you can redistribute it
974     and/or modify it under the same terms as Perl itself.
975    
976     =cut
977    
978 wakaba 1.1 1;
979 wakaba 1.88 # $Date: 2008/08/29 13:34:36 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24