/[suikacvs]/markup/html/whatpm/Whatpm/ContentChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/ContentChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.91 - (hide annotations) (download)
Wed Sep 10 10:27:07 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.90: +15 -3 lines
++ whatpm/Whatpm/ChangeLog	10 Sep 2008 10:25:19 -0000
2008-09-10  Wakaba  <wakaba@suika.fam.cx>

	* ContentChecker.pm: Support for charset-layer error levels.

	* HTML.pm.src: Don't specify |text| argument for the
	|chardecode:fallback| error, since it is not the encoding
	being used alternatively.

++ whatpm/Whatpm/Charset/ChangeLog	10 Sep 2008 10:26:52 -0000
2008-09-10  Wakaba  <wakaba@suika.fam.cx>

	* DecodeHandle.pm: Set error levels.

	* WebLatin1.pm: Support for |us-ascii| and |iso-8859-5|
	charsets (this module no longer for Latin1, but for Latin*
	encodings).

	* WebThai.pm: Support for |tis-620| charset.

1 wakaba 1.1 package Whatpm::ContentChecker;
2     use strict;
3 wakaba 1.91 our $VERSION=do{my @r=(q$Revision: 1.90 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.18 require Whatpm::URIChecker;
6    
7 wakaba 1.13 ## ISSUE: How XML and XML Namespaces conformance can (or cannot)
8     ## be applied to an in-memory representation (i.e. DOM)?
9    
10 wakaba 1.50 ## TODO: Conformance of an HTML document with non-html root element.
11    
## Feature status flags.  Each constant occupies its own bit, so the
## values can be combined with |||| and tested with |&|.

## Stability (bits 0-3)
sub FEATURE_STATUS_REC () { 1 << 0 } ## Interoperable standard
sub FEATURE_STATUS_CR () { 1 << 1 } ## Call for implementation
sub FEATURE_STATUS_LC () { 1 << 2 } ## Last call for comments
sub FEATURE_STATUS_WD () { 1 << 3 } ## Working or editor's draft

## Conformance (bit 4)
sub FEATURE_ALLOWED () { 1 << 4 }

## Deprecated (bits 5-6)
sub FEATURE_DEPRECATED_SHOULD () { 1 << 5 } ## SHOULD-level
sub FEATURE_DEPRECATED_INFO () { 1 << 6 } ## Does not affect conformance
24    
25 wakaba 1.42 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
26 wakaba 1.9 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
27     my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
28    
29 wakaba 1.42 my $Namespace = {
30 wakaba 1.79 '' => {loaded => 1},
31 wakaba 1.43 q<http://www.w3.org/2005/Atom> => {module => 'Whatpm::ContentChecker::Atom'},
32 wakaba 1.72 q<http://purl.org/syndication/history/1.0>
33     => {module => 'Whatpm::ContentChecker::Atom'},
34     q<http://purl.org/syndication/threading/1.0>
35     => {module => 'Whatpm::ContentChecker::Atom'},
36 wakaba 1.42 $HTML_NS => {module => 'Whatpm::ContentChecker::HTML'},
37     $XML_NS => {loaded => 1},
38     $XMLNS_NS => {loaded => 1},
39 wakaba 1.73 q<http://www.w3.org/1999/02/22-rdf-syntax-ns#> => {loaded => 1},
40 wakaba 1.42 };
41    
## Ensure that the checker module registered for a namespace (if any)
## has been loaded.
##
## Argument: the namespace URI, or the empty string for the null
## namespace.  Namespaces with no entry in |$Namespace| get an entry
## that is simply marked as loaded.
##
## Dies with the |require| error if a registered module cannot be
## loaded.  The |loaded| flag is set after a successful |require| so
## that the string |eval| is executed at most once per namespace
## rather than once per checked element and attribute.
sub load_ns_module ($) {
  my $nsuri = shift; # namespace URI or ''
  my $ns_def = $Namespace->{$nsuri} ||= {};
  return if $ns_def->{loaded};

  if ($ns_def->{module}) {
    ## NOTE: The module name always comes from the static |$Namespace|
    ## table, never from the document being checked.
    eval qq{ require $ns_def->{module} } or die $@;
  }
  $ns_def->{loaded} = 1;
} # load_ns_module
52    
53 wakaba 1.42 our $AttrChecker = {
54 wakaba 1.9 $XML_NS => {
55 wakaba 1.13 space => sub {
56     my ($self, $attr) = @_;
57     my $value = $attr->value;
58     if ($value eq 'default' or $value eq 'preserve') {
59     #
60     } else {
61     ## NOTE: An XML "error"
62 wakaba 1.83 $self->{onerror}->(node => $attr, level => $self->{level}->{xml_error},
63 wakaba 1.33 type => 'invalid attribute value');
64 wakaba 1.13 }
65     },
66     lang => sub {
67 wakaba 1.35 my ($self, $attr) = @_;
68 wakaba 1.47 my $value = $attr->value;
69     if ($value eq '') {
70     #
71     } else {
72     require Whatpm::LangTag;
73     Whatpm::LangTag->check_rfc3066_language_tag ($value, sub {
74 wakaba 1.83 $self->{onerror}->(@_, node => $attr);
75 wakaba 1.85 }, $self->{level});
76 wakaba 1.47 }
77    
78 wakaba 1.13 ## NOTE: "The values of the attribute are language identifiers
79     ## as defined by [IETF RFC 3066], Tags for the Identification
80     ## of Languages, or its successor; in addition, the empty string
81     ## may be specified." ("may" in lower case)
82 wakaba 1.47 ## NOTE: Is an RFC 3066-valid (but RFC 4647-invalid) language tag
83     ## allowed today?
84    
85     ## TODO: test data
86    
87 wakaba 1.89 my $nsuri = $attr->owner_element->namespace_uri;
88     if (defined $nsuri and $nsuri eq $HTML_NS) {
89     my $lang_attr = $attr->owner_element->get_attribute_node_ns
90     (undef, 'lang');
91     if ($lang_attr) {
92     my $lang_attr_value = $lang_attr->value;
93     $lang_attr_value =~ tr/A-Z/a-z/; ## ASCII case-insensitive
94     my $value = $value;
95     $value =~ tr/A-Z/a-z/; ## ASCII case-insensitive
96     if ($lang_attr_value ne $value) {
97     ## NOTE: HTML5 Section "The |lang| and |xml:lang| attributes"
98     $self->{onerror}->(node => $attr,
99     type => 'xml:lang ne lang',
100     level => $self->{level}->{must});
101     }
102     }
103     }
104    
105 wakaba 1.35 if ($attr->owner_document->manakai_is_html) { # MUST NOT
106 wakaba 1.83 $self->{onerror}->(node => $attr, type => 'in HTML:xml:lang',
107     level => $self->{level}->{must});
108 wakaba 1.35 ## TODO: Test data...
109     }
110 wakaba 1.13 },
111     base => sub {
112     my ($self, $attr) = @_;
113     my $value = $attr->value;
114     if ($value =~ /[^\x{0000}-\x{10FFFF}]/) { ## ISSUE: Should we disallow noncharacters?
115     $self->{onerror}->(node => $attr,
116 wakaba 1.83 type => 'invalid attribute value',
117     level => $self->{level}->{fact}, ## TODO: correct?
118     );
119 wakaba 1.13 }
120 wakaba 1.18 ## NOTE: Conformance to URI standard is not checked since there is
121     ## no author requirement on conformance in the XML Base specification.
122 wakaba 1.13 },
123     id => sub {
124     my ($self, $attr) = @_;
125     my $value = $attr->value;
126     $value =~ s/[\x09\x0A\x0D\x20]+/ /g;
127     $value =~ s/^\x20//;
128     $value =~ s/\x20$//;
129     ## TODO: NCName in XML 1.0 or 1.1
130     ## TODO: declared type is ID?
131 wakaba 1.83 if ($self->{id}->{$value}) {
132     $self->{onerror}->(node => $attr,
133     type => 'duplicate ID',
134     level => $self->{level}->{xml_id_error});
135 wakaba 1.37 push @{$self->{id}->{$value}}, $attr;
136 wakaba 1.13 } else {
137 wakaba 1.37 $self->{id}->{$value} = [$attr];
138 wakaba 1.13 }
139     },
140 wakaba 1.9 },
141     $XMLNS_NS => {
142 wakaba 1.13 '' => sub {
143     my ($self, $attr) = @_;
144     my $ln = $attr->manakai_local_name;
145     my $value = $attr->value;
146     if ($value eq $XML_NS and $ln ne 'xml') {
147     $self->{onerror}
148 wakaba 1.83 ->(node => $attr,
149     type => 'Reserved Prefixes and Namespace Names:Name',
150     text => $value,
151     level => $self->{level}->{nc});
152 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
153     $self->{onerror}
154 wakaba 1.83 ->(node => $attr,
155     type => 'Reserved Prefixes and Namespace Names:Name',
156     text => $value,
157     level => $self->{level}->{nc});
158 wakaba 1.13 }
159     if ($ln eq 'xml' and $value ne $XML_NS) {
160     $self->{onerror}
161 wakaba 1.83 ->(node => $attr,
162     type => 'Reserved Prefixes and Namespace Names:Prefix',
163     text => $ln,
164     level => $self->{level}->{nc});
165 wakaba 1.13 } elsif ($ln eq 'xmlns') {
166     $self->{onerror}
167 wakaba 1.83 ->(node => $attr,
168     type => 'Reserved Prefixes and Namespace Names:Prefix',
169     text => $ln,
170     level => $self->{level}->{nc});
171 wakaba 1.13 }
172     ## TODO: If XML 1.0 and empty
173     },
174     xmlns => sub {
175     my ($self, $attr) = @_;
176     ## TODO: In XML 1.0, URI reference [RFC 3986] or an empty string
177     ## TODO: In XML 1.1, IRI reference [RFC 3987] or an empty string
178 wakaba 1.18 ## TODO: relative references are deprecated
179 wakaba 1.13 my $value = $attr->value;
180     if ($value eq $XML_NS) {
181     $self->{onerror}
182 wakaba 1.83 ->(node => $attr,
183     type => 'Reserved Prefixes and Namespace Names:Name',
184     text => $value,
185     level => $self->{level}->{nc});
186 wakaba 1.13 } elsif ($value eq $XMLNS_NS) {
187     $self->{onerror}
188 wakaba 1.83 ->(node => $attr,
189     type => 'Reserved Prefixes and Namespace Names:Name',
190     text => $value,
191     level => $self->{level}->{nc});
192 wakaba 1.13 }
193     },
194 wakaba 1.9 },
195     };
196    
197 wakaba 1.14 ## ISSUE: Should we really allow these attributes?
198 wakaba 1.13 $AttrChecker->{''}->{'xml:space'} = $AttrChecker->{$XML_NS}->{space};
199     $AttrChecker->{''}->{'xml:lang'} = $AttrChecker->{$XML_NS}->{lang};
200 wakaba 1.89 ## NOTE: Checker for (null, "xml:lang") attribute is shadowed for
201     ## HTML elements in Whatpm::ContentChecker::HTML.
202 wakaba 1.13 $AttrChecker->{''}->{'xml:base'} = $AttrChecker->{$XML_NS}->{base};
203     $AttrChecker->{''}->{'xml:id'} = $AttrChecker->{$XML_NS}->{id};
204    
205 wakaba 1.79 our $AttrStatus;
206    
207     for (qw/space lang base id/) {
208     $AttrStatus->{$XML_NS}->{$_} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
209     $AttrStatus->{''}->{"xml:$_"} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
210     ## XML 1.0: FEATURE_STATUS_CR
211     ## XML 1.1: FEATURE_STATUS_REC
212     ## XML Namespaces 1.0: FEATURE_STATUS_CR
213     ## XML Namespaces 1.1: FEATURE_STATUS_REC
214     ## XML Base: FEATURE_STATUS_REC
215     ## xml:id: FEATURE_STATUS_REC
216     }
217    
218     $AttrStatus->{$XMLNS_NS}->{''} = FEATURE_STATUS_REC | FEATURE_ALLOWED;
219    
220     ## TODO: xsi:schemaLocation for XHTML2 support (very, very low priority)
221    
## Default set of checking callbacks, shared by element definitions
## that have no more specific rule.  Each callback receives the
## checker object, the work item of the element being checked and
## further callback-specific arguments.
our %AnyChecker = (
  ## Nothing is checked when the element starts.
  check_start => sub { },

  ## Run the registered checker (if any) for every attribute and then
  ## report its status.
  check_attrs => sub {
    my ($self, $item, $element_state) = @_;
    foreach my $attr (@{$item->{node}->attributes}) {
      my $ns = $attr->namespace_uri;
      defined $ns or $ns = '';
      my $ln = $attr->manakai_local_name;

      load_ns_module ($ns);

      ## An exact (namespace, local name) entry wins over the
      ## namespace-wide fallback entry keyed by the empty string.
      my $checker = $AttrChecker->{$ns}->{$ln} || $AttrChecker->{$ns}->{''};
      my $status = $AttrStatus->{$ns}->{$ln} || $AttrStatus->{$ns}->{''};

      ## NOTE: FEATURE_ALLOWED for all attributes, since the element
      ## is not supported and therefore an "attribute not defined"
      ## error should not be raised (too verbose) and global attributes
      ## should be allowed anyway (if a global attribute has its own
      ## criteria for where it may be specified, then that should be
      ## checked in its checker function).
      $status = FEATURE_ALLOWED unless defined $status;

      if (not $checker) {
        $self->{onerror}->(node => $attr,
                           type => 'unknown attribute',
                           level => $self->{level}->{uncertain});
      } else {
        $checker->($self, $attr);
      }
      $self->_attr_status_info ($attr, $status);
    }
  },

  ## Any child element is accepted unless it is currently listed in
  ## |minus_elements|.
  check_child_element => sub {
    my ($self, $item, $child_el, $child_nsuri, $child_ln,
        $child_is_transparent, $element_state) = @_;
    if (not $self->{minus_elements}->{$child_nsuri}->{$child_ln}) {
      ## Explicitly added or not, the element is allowed here.
      my $explicitly_added
          = $self->{plus_elements}->{$child_nsuri}->{$child_ln};
    } else {
      $self->{onerror}->(node => $child_el,
                         type => 'element not allowed:minus',
                         level => $self->{level}->{must});
    }
  },

  ## Text content is never restricted.
  check_child_text => sub { },

  ## Propagate the "has significant content" flag to the real parent.
  check_end => sub {
    my ($self, $item, $element_state) = @_;
    ## NOTE: There is a modified copy of the code below for |html:ruby|.
    $item->{real_parent_state}->{has_significant} = 1
        if $element_state->{has_significant};
  },
);
278    
279     our $ElementDefault = {
280     %AnyChecker,
281 wakaba 1.70 status => FEATURE_ALLOWED,
282     ## NOTE: No "element not defined" error - it is not supported anyway.
283 wakaba 1.60 check_start => sub {
284     my ($self, $item, $element_state) = @_;
285 wakaba 1.83 $self->{onerror}->(node => $item->{node},
286     type => 'unknown element',
287     level => $self->{level}->{uncertain});
288 wakaba 1.60 },
289 wakaba 1.1 };
290    
291 wakaba 1.60 our $HTMLEmbeddedContent = {
292     ## NOTE: All embedded content is also phrasing content.
293     $HTML_NS => {
294     img => 1, iframe => 1, embed => 1, object => 1, video => 1, audio => 1,
295     canvas => 1,
296     },
297     q<http://www.w3.org/1998/Math/MathML> => {math => 1},
298     q<http://www.w3.org/2000/svg> => {svg => 1},
299     ## NOTE: Foreign elements with content (but no metadata) are
300     ## embedded content.
301     };
302    
303 wakaba 1.7 my $HTMLTransparentElements = {
304 wakaba 1.90 $HTML_NS => {qw/ins 1 del 1 font 1 noscript 1 canvas 1 a 1/},
305 wakaba 1.29 ## NOTE: |html:noscript| is transparent if scripting is disabled
306     ## and not in |head|.
307 wakaba 1.7 };
308    
309 wakaba 1.61 my $HTMLSemiTransparentElements = {
310     $HTML_NS => {object => 1, video => 1, audio => 1},
311     };
312 wakaba 1.57
313 wakaba 1.42 our $Element = {};
314 wakaba 1.7
## Definition of the |rdf:RDF| element: the subtree is handed over to
## |Whatpm::RDFXML| and the triples it reports are collected in the
## |rdf| member of the checker's return value.
$Element->{q<http://www.w3.org/1999/02/22-rdf-syntax-ns#>}->{RDF} = {
  %AnyChecker,
  status => FEATURE_STATUS_REC | FEATURE_ALLOWED,
  is_root => 1, ## ISSUE: Not explicitly allowed for non application/rdf+xml
  check_start => sub {
    my ($self, $item, $element_state) = @_;
    my $rdf_ns = q<http://www.w3.org/1999/02/22-rdf-syntax-ns#>;
    my $node = $item->{node};

    ## One [element, triples] pair is recorded per |rdf:RDF| element.
    my @triples;
    push @{$self->{return}->{rdf}}, [$node, \@triples];

    require Whatpm::RDFXML;
    my $rdf = Whatpm::RDFXML->new;
    ## TODO: Should we make bnodeid unique in a document?
    $rdf->{onerror} = $self->{onerror};
    $rdf->{level} = $self->{level};
    $rdf->{ontriple} = sub {
      my %arg = @_;
      push @triples, [@arg{qw/node subject predicate object/}];
      if (defined $arg{id}) {
        ## A triple with an |id| is additionally described by the four
        ## reification triples (subject, predicate, object, type).
        for my $role (qw/subject predicate object/) {
          push @triples,
              [$arg{node}, $arg{id}, {uri => $rdf_ns . $role}, $arg{$role}];
        }
        push @triples,
            [$arg{node}, $arg{id},
             {uri => $rdf_ns . 'type'}, {uri => $rdf_ns . 'Statement'}];
      }
    };
    $rdf->convert_rdf_element ($node);
  },
};
358    
359 wakaba 1.83 my $default_error_level = {
360     must => 'm',
361     should => 's',
362     warn => 'w',
363     good => 'w',
364     info => 'i',
365 wakaba 1.85
366 wakaba 1.83 uncertain => 'u',
367    
368 wakaba 1.84 html4_fact => 'm',
369 wakaba 1.89 html5_no_may => 'm',
370    
371 wakaba 1.83 xml_error => 'm', ## TODO: correct?
372 wakaba 1.88 xml_id_error => 'm', ## TODO: ?
373 wakaba 1.83 nc => 'm', ## XML Namespace Constraints ## TODO: correct?
374 wakaba 1.84
375 wakaba 1.91 ## |Whatpm::URIChecker|
376 wakaba 1.85 uri_syntax => 'm',
377     uri_fact => 'm',
378     uri_lc_must => 'm',
379     uri_lc_should => 'w',
380    
381 wakaba 1.91 ## |Whatpm::IMTChecker|
382 wakaba 1.87 mime_must => 'm', # lowercase "must"
383     mime_fact => 'm',
384     mime_strongly_discouraged => 'w',
385     mime_discouraged => 'w',
386    
387 wakaba 1.91 ## |Whatpm::LangTag|
388 wakaba 1.85 langtag_fact => 'm',
389    
390 wakaba 1.91 ## |Whatpm::RDFXML|
391 wakaba 1.84 rdf_fact => 'm',
392     rdf_grammer => 'm',
393     rdf_lc_must => 'm',
394 wakaba 1.91
395     ## |Message::Charset::Info| and |Whatpm::Charset::DecodeHandle|
396     charset_variant => 'm',
397     ## An error caused by use of a variant charset that is not conforming
398     ## to the original charset (e.g. use of 0x80 in an ISO-8859-1 document
399     ## which is interpreted as a Windows-1252 document instead).
400     charset_fact => 'm',
401     iso_shall => 'm',
402 wakaba 1.83 };
403    
## Check the conformance of a document.
##
## Arguments:
##   $self     - A checker object, or a class name (in which case a new
##               object is blessed into that class).
##   $doc      - The document node to check.
##   $onerror  - Code reference invoked with named arguments (|node|,
##               |type|, |level| and, where applicable, |text| and
##               |layer|) for every error that is found.
##   $onsubdoc - Optional code reference stored as the |onsubdoc|
##               handler; the default handler only emits a warning.
##
## Returns the hash reference returned by |check_element| for the
## document element, or a minimal empty result if the document has no
## document element.
sub check_document ($$$;$) {
  my ($self, $doc, $onerror, $onsubdoc) = @_;
  $self = bless {}, $self unless ref $self;
  $self->{onerror} = $onerror;
  $self->{onsubdoc} = $onsubdoc || sub {
    warn "A subdocument is not conformance-checked";
  };

  $self->{level} ||= $default_error_level;

  ## TODO: If application/rdf+xml, RDF/XML mode should be invoked.

  my $docel = $doc->document_element;
  unless (defined $docel) {
    ## ISSUE: Should we check content of Document node?
    $onerror->(node => $doc, type => 'no document element',
               level => $self->{level}->{must});
    ## ISSUE: Is this non-conforming (to what spec)?  Or just a warning?
    return {
      class => {},
      id => {}, table => [], term => {},
    };
  }

  ## ISSUE: Unexpanded entity references and HTML5 conformance

  my $docel_nsuri = $docel->namespace_uri;
  $docel_nsuri = '' unless defined $docel_nsuri;
  load_ns_module ($docel_nsuri);
  my $docel_def = $Element->{$docel_nsuri}->{$docel->manakai_local_name} ||
      $Element->{$docel_nsuri}->{''} ||
      $ElementDefault;
  if ($docel_def->{is_root}) {
    #
  } elsif ($docel_def->{is_xml_root}) {
    unless ($doc->manakai_is_html) {
      #
    } else {
      $onerror->(node => $docel, type => 'element not allowed:root:xml',
                 level => $self->{level}->{must});
    }
  } else {
    $onerror->(node => $docel, type => 'element not allowed:root',
               level => $self->{level}->{must});
  }

  ## TODO: Check for other items other than document element
  ## (second (erroneous) element, text nodes, PI nodes, doctype nodes)

  my $return = $self->check_element ($docel, $onerror, $onsubdoc);

  ## NOTE: |check_element| deletes |$self->{onerror}| before it
  ## returns, so the checks below call |$onerror| directly.

  ## TODO: Test for these checks are necessary.
  my $charset_name = $doc->input_encoding;
  if (defined $charset_name) {
    require Message::Charset::Info;
    ## A name missing from the table is treated as a charset with no
    ## known properties.
    my $charset = $Message::Charset::Info::IANACharset->{$charset_name} || {};

    if ($doc->manakai_is_html) {
      if (not $doc->manakai_has_bom and
          not defined $doc->manakai_charset) {
        unless ($charset->{is_html_ascii_superset}) {
          $onerror->(node => $doc,
                     level => $self->{level}->{must},
                     type => 'non ascii superset',
                     text => $charset_name);
        }

        if (not $self->{has_charset} and ## TODO: This does not work now.
            not $charset->{iana_names}->{'us-ascii'}) {
          $onerror->(node => $doc,
                     level => $self->{level}->{must},
                     type => 'no character encoding declaration',
                     text => $charset_name);
        }
      }

      if ($charset->{iana_names}->{'utf-8'}) {
        #
      } elsif ($charset->{iana_names}->{'jis_x0212-1990'} or
               $charset->{iana_names}->{'x-jis0208'} or
               $charset->{iana_names}->{'utf-32'} or ## ISSUE: UTF-32BE? UTF-32LE?
               (($charset->{category} || 0) &
                Message::Charset::Info::CHARSET_CATEGORY_EBCDIC ())) {
        $onerror->(node => $doc,
                   type => 'bad character encoding',
                   text => $charset_name,
                   level => $self->{level}->{should},
                   layer => 'encode');
      } elsif ($charset->{iana_names}->{'cesu-8'} or
               ## NOTE: This must be |utf-7|; |utf-8| is already handled
               ## by the first branch and could never be true here.
               $charset->{iana_names}->{'utf-7'} or ## ISSUE: UNICODE-1-1-UTF-7?
               $charset->{iana_names}->{'bocu-1'} or
               $charset->{iana_names}->{'scsu'}) {
        $onerror->(node => $doc,
                   type => 'disallowed character encoding',
                   text => $charset_name,
                   level => $self->{level}->{must},
                   layer => 'encode');
      } else {
        $onerror->(node => $doc,
                   type => 'non-utf-8 character encoding',
                   text => $charset_name,
                   level => $self->{level}->{good},
                   layer => 'encode');
      }
    }
  } elsif ($doc->manakai_is_html) {
    ## NOTE: MUST and SHOULD requirements above cannot be tested,
    ## since the document has no input charset encoding information.
    $onerror->(node => $doc,
               type => 'character encoding unchecked',
               level => $self->{level}->{info},
               layer => 'encode');
  }

  return $return;
} # check_document
519 wakaba 1.1
520 wakaba 1.81 ## Check an element. The element is checked as if it is an orphan node (i.e.
521     ## an element without a parent node).
522 wakaba 1.56 sub check_element ($$$;$) {
523     my ($self, $el, $onerror, $onsubdoc) = @_;
524 wakaba 1.42 $self = bless {}, $self unless ref $self;
525     $self->{onerror} = $onerror;
526 wakaba 1.56 $self->{onsubdoc} = $onsubdoc || sub {
527     warn "A subdocument is not conformance-checked";
528     };
529 wakaba 1.2
530 wakaba 1.83 $self->{level} ||= $default_error_level;
531 wakaba 1.48
532 wakaba 1.61 $self->{plus_elements} = {};
533     $self->{minus_elements} = {};
534 wakaba 1.42 $self->{id} = {};
535     $self->{term} = {};
536     $self->{usemap} = [];
537 wakaba 1.78 $self->{ref} = []; # datetemplate data references
538     $self->{template} = []; # datatemplate template references
539 wakaba 1.42 $self->{contextmenu} = [];
540     $self->{map} = {};
541     $self->{menu} = {};
542     $self->{has_link_type} = {};
543 wakaba 1.60 $self->{flag} = {};
544 wakaba 1.46 #$self->{has_uri_attr};
545     #$self->{has_hyperlink_element};
546 wakaba 1.51 #$self->{has_charset};
547 wakaba 1.57 #$self->{has_base};
548 wakaba 1.42 $self->{return} = {
549     class => {},
550 wakaba 1.80 id => $self->{id},
551     table => [], # table objects returned by Whatpm::HTMLTable
552     term => $self->{term},
553 wakaba 1.76 uri => {}, # URIs other than those in RDF triples
554     ## TODO: xmlns="", SYSTEM "", atom:* src="", xml:base=""
555 wakaba 1.73 rdf => [],
556 wakaba 1.42 };
557 wakaba 1.4
558 wakaba 1.60 my @item = ({type => 'element', node => $el, parent_state => {}});
559 wakaba 1.66 $item[-1]->{real_parent_state} = $item[-1]->{parent_state};
560 wakaba 1.60 while (@item) {
561     my $item = shift @item;
562     if (ref $item eq 'ARRAY') {
563     my $code = shift @$item;
564     next unless $code;## TODO: temp.
565     $code->(@$item);
566     } elsif ($item->{type} eq 'element') {
567     my $el_nsuri = $item->{node}->namespace_uri;
568     $el_nsuri = '' unless defined $el_nsuri;
569     my $el_ln = $item->{node}->manakai_local_name;
570 wakaba 1.79
571     load_ns_module ($el_nsuri);
572 wakaba 1.63
573     my $element_state = {};
574 wakaba 1.60 my $eldef = $Element->{$el_nsuri}->{$el_ln} ||
575     $Element->{$el_nsuri}->{''} ||
576 wakaba 1.42 $ElementDefault;
577 wakaba 1.61 my $content_def = $item->{transparent}
578     ? $item->{parent_def} || $eldef : $eldef;
579 wakaba 1.63 my $content_state = $item->{transparent}
580 wakaba 1.65 ? $item->{parent_def}
581     ? $item->{parent_state} || $element_state : $element_state
582     : $element_state;
583 wakaba 1.60
584 wakaba 1.67 unless ($eldef->{status} & FEATURE_STATUS_REC) {
585     my $status = $eldef->{status} & FEATURE_STATUS_CR ? 'cr' :
586     $eldef->{status} & FEATURE_STATUS_LC ? 'lc' :
587     $eldef->{status} & FEATURE_STATUS_WD ? 'wd' : 'non-standard';
588     $self->{onerror}->(node => $item->{node},
589     type => 'status:'.$status.':element',
590 wakaba 1.83 level => $self->{level}->{info});
591 wakaba 1.67 }
592 wakaba 1.70 if (not ($eldef->{status} & FEATURE_ALLOWED)) {
593     $self->{onerror}->(node => $item->{node},
594     type => 'element not defined',
595 wakaba 1.83 level => $self->{level}->{must});
596 wakaba 1.70 } elsif ($eldef->{status} & FEATURE_DEPRECATED_SHOULD) {
597     $self->{onerror}->(node => $item->{node},
598     type => 'deprecated:element',
599 wakaba 1.83 level => $self->{level}->{should});
600 wakaba 1.70 } elsif ($eldef->{status} & FEATURE_DEPRECATED_INFO) {
601     $self->{onerror}->(node => $item->{node},
602     type => 'deprecated:element',
603 wakaba 1.83 level => $self->{level}->{info});
604 wakaba 1.70 }
605 wakaba 1.67
606 wakaba 1.60 my @new_item;
607     push @new_item, [$eldef->{check_start}, $self, $item, $element_state];
608     push @new_item, [$eldef->{check_attrs}, $self, $item, $element_state];
609 wakaba 1.61
610 wakaba 1.60 my @child = @{$item->{node}->child_nodes};
611     while (@child) {
612     my $child = shift @child;
613     my $child_nt = $child->node_type;
614     if ($child_nt == 1) { # ELEMENT_NODE
615     my $child_nsuri = $child->namespace_uri;
616     $child_nsuri = '' unless defined $child_nsuri;
617     my $child_ln = $child->manakai_local_name;
618     if ($HTMLTransparentElements->{$child_nsuri}->{$child_ln} and
619     not (($self->{flag}->{in_head} or
620 wakaba 1.61 ($el_nsuri eq $HTML_NS and $el_ln eq 'head')) and
621     $child_nsuri eq $HTML_NS and $child_ln eq 'noscript')) {
622 wakaba 1.60 push @new_item, [$content_def->{check_child_element},
623     $self, $item, $child,
624 wakaba 1.66 $child_nsuri, $child_ln, 1,
625     $content_state, $element_state];
626 wakaba 1.60 push @new_item, {type => 'element', node => $child,
627 wakaba 1.65 parent_state => $content_state,
628 wakaba 1.61 parent_def => $content_def,
629 wakaba 1.66 real_parent_state => $element_state,
630 wakaba 1.60 transparent => 1};
631     } else {
632 wakaba 1.65 if ($item->{parent_def} and # has parent
633     $el_nsuri eq $HTML_NS) { ## $HTMLSemiTransparentElements
634 wakaba 1.61 if ($el_ln eq 'object') {
635     if ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
636     #
637     } elsif ($child_nsuri eq $HTML_NS and $child_ln eq 'param') {
638     #
639     } else {
640 wakaba 1.62 $content_def = $item->{parent_def} || $content_def;
641 wakaba 1.63 $content_state = $item->{parent_state} || $content_state;
642 wakaba 1.62 }
643     } elsif ($el_ln eq 'video' or $el_ln eq 'audio') {
644     if ($self->{plus_elements}->{$child_nsuri}->{$child_ln}) {
645     #
646     } elsif ($child_nsuri eq $HTML_NS and $child_ln eq 'source') {
647     $element_state->{has_source} = 1;
648     } else {
649     $content_def = $item->{parent_def} || $content_def;
650 wakaba 1.63 $content_state = $item->{parent_state} || $content_state;
651 wakaba 1.61 }
652     }
653     }
654    
655 wakaba 1.60 push @new_item, [$content_def->{check_child_element},
656     $self, $item, $child,
657 wakaba 1.64 $child_nsuri, $child_ln,
658     $HTMLSemiTransparentElements
659     ->{$child_nsuri}->{$child_ln},
660 wakaba 1.66 $content_state, $element_state];
661 wakaba 1.60 push @new_item, {type => 'element', node => $child,
662 wakaba 1.65 parent_def => $content_def,
663 wakaba 1.66 real_parent_state => $element_state,
664 wakaba 1.65 parent_state => $content_state};
665 wakaba 1.60 }
666    
667     if ($HTMLEmbeddedContent->{$child_nsuri}->{$child_ln}) {
668     $element_state->{has_significant} = 1;
669     }
670     } elsif ($child_nt == 3 or # TEXT_NODE
671     $child_nt == 4) { # CDATA_SECTION_NODE
672     my $has_significant = ($child->data =~ /[^\x09-\x0D\x20]/);
673     push @new_item, [$content_def->{check_child_text},
674     $self, $item, $child, $has_significant,
675 wakaba 1.66 $content_state, $element_state];
676     $element_state->{has_significant} ||= $has_significant;
677 wakaba 1.61 if ($has_significant and
678     $HTMLSemiTransparentElements->{$el_nsuri}->{$el_ln}) {
679     $content_def = $item->{parent_def} || $content_def;
680     }
681 wakaba 1.60 } elsif ($child_nt == 5) { # ENTITY_REFERENCE_NODE
682     push @child, @{$child->child_nodes};
683 wakaba 1.1 }
684 wakaba 1.60 ## TODO: PI_NODE
685     ## TODO: Unknown node type
686 wakaba 1.1 }
687 wakaba 1.60
688     push @new_item, [$eldef->{check_end}, $self, $item, $element_state];
689    
690     unshift @item, @new_item;
691 wakaba 1.30 } else {
692 wakaba 1.60 die "$0: Internal error: Unsupported checking action type |$item->{type}|";
693 wakaba 1.4 }
694 wakaba 1.1 }
695 wakaba 1.17
696 wakaba 1.78 for (@{$self->{template}}) {
697     ## TODO: If the document is an XML document, ...
698     ## NOTE: If the document is an HTML document:
699     ## ISSUE: We need to percent-decode?
700     F: {
701     if ($self->{id}->{$_->[0]}) {
702     my $el = $self->{id}->{$_->[0]}->[0]->owner_element;
703     if ($el->node_type == 1 and # ELEMENT_NODE
704     $el->manakai_local_name eq 'datatemplate') {
705     my $nsuri = $el->namespace_uri;
706     if (defined $nsuri and $nsuri eq $HTML_NS) {
707     if ($el eq $_->[1]->owner_element) {
708     $self->{onerror}->(node => $_->[1],
709     type => 'fragment points itself',
710 wakaba 1.83 level => $self->{level}->{must});
711 wakaba 1.78 }
712    
713     last F;
714     }
715     }
716     }
717     ## TODO: Should we raise a "fragment points nothing" error instead
718     ## if the fragment identifier identifies no element?
719    
720     $self->{onerror}->(node => $_->[1], type => 'template:not template',
721 wakaba 1.83 level => $self->{level}->{must});
722 wakaba 1.78 } # F
723     }
724    
725     for (@{$self->{ref}}) {
726     ## TODO: If XML
727     ## NOTE: If it is an HTML document:
728     if ($_->[0] eq '') {
729     ## NOTE: It points the top of the document.
730     } elsif ($self->{id}->{$_->[0]}) {
731     if ($self->{id}->{$_->[0]}->[0]->owner_element
732     eq $_->[1]->owner_element) {
733     $self->{onerror}->(node => $_->[1], type => 'fragment points itself',
734 wakaba 1.83 level => $self->{level}->{must});
735 wakaba 1.78 }
736     } else {
737     $self->{onerror}->(node => $_->[1], type => 'fragment points nothing',
738 wakaba 1.83 level => $self->{level}->{must});
739 wakaba 1.78 }
740     }
741    
742     ## TODO: Maybe we should have $document->manakai_get_by_fragment or something
743    
744 wakaba 1.17 for (@{$self->{usemap}}) {
745     unless ($self->{map}->{$_->[0]}) {
746 wakaba 1.83 $self->{onerror}->(node => $_->[1], type => 'no referenced map',
747     level => $self->{level}->{must});
748 wakaba 1.17 }
749     }
750    
751 wakaba 1.32 for (@{$self->{contextmenu}}) {
752     unless ($self->{menu}->{$_->[0]}) {
753 wakaba 1.83 $self->{onerror}->(node => $_->[1], type => 'no referenced menu',
754     level => $self->{level}->{must});
755 wakaba 1.32 }
756     }
757    
758 wakaba 1.61 delete $self->{plus_elements};
759     delete $self->{minus_elements};
760 wakaba 1.17 delete $self->{onerror};
761     delete $self->{id};
762     delete $self->{usemap};
763 wakaba 1.78 delete $self->{ref};
764     delete $self->{template};
765 wakaba 1.17 delete $self->{map};
766 wakaba 1.33 return $self->{return};
767 wakaba 1.1 } # check_element
768    
## Forbid the given elements for the duration of an element.
##
## Arguments: the checker object, the state hash of the element being
## checked, and any number of {namespace URI => {local name => ...}}
## hash references.  Every listed element that is not forbidden yet is
## flagged in |$self->{minus_elements}| and remembered in the state's
## |minus_elements_original| so that |_remove_minus_elements| can undo
## exactly these additions.
sub _add_minus_elements ($$@) {
  my ($self, $element_state) = splice @_, 0, 2;
  foreach my $set (@_) {
    foreach my $ns (keys %$set) {
      foreach my $local_name (keys %{$set->{$ns}}) {
        ## Already forbidden by someone else: not ours to undo later.
        next if $self->{minus_elements}->{$ns}->{$local_name};
        $element_state->{minus_elements_original}->{$ns}->{$local_name} = 0;
        $self->{minus_elements}->{$ns}->{$local_name} = 1;
      }
    }
  }
} # _add_minus_elements
783    
## Undoes the additions recorded by |_add_minus_elements|.
##
##   $element_state  Hash reference whose |minus_elements_original|
##                   table lists the (namespace URI, local name) pairs
##                   to be removed.
##
## Every pair listed in that table is deleted from
## |$self->{minus_elements}|; other entries are not touched.
sub _remove_minus_elements ($$) {
  my ($self, $element_state) = @_;
  for my $nsuri (keys %{$element_state->{minus_elements_original}}) {
    delete $self->{minus_elements}->{$nsuri}->{$_}
        for keys %{$element_state->{minus_elements_original}->{$nsuri}};
  }
} # _remove_minus_elements
793    
## Marks elements as additionally allowed ("plus") in
## |$self->{plus_elements}|.
##
##   $element_state  Hash reference that receives the bookkeeping
##                   table |plus_elements_original|.
##   (rest)          Any number of tables of the form
##                   |{namespace URI => {local name => ...}}|.
##
## Only the keys of the tables are consulted.  A pair that is already
## true in |$self->{plus_elements}| is left alone; any other pair is
## set to 1 there and is also recorded (with the value 0) in
## |$element_state->{plus_elements_original}|, which is the table that
## |_remove_plus_elements| walks to undo exactly these additions.
sub _add_plus_elements ($$@) {
  my ($self, $element_state, @element_sets) = @_;
  for my $set (@element_sets) {
    for my $nsuri (keys %$set) {
      for my $ln (keys %{$set->{$nsuri}}) {
        ## Already allowed by someone else: not ours to undo later.
        next if $self->{plus_elements}->{$nsuri}->{$ln};

        $element_state->{plus_elements_original}->{$nsuri}->{$ln} = 0;
        $self->{plus_elements}->{$nsuri}->{$ln} = 1;
      }
    }
  }
} # _add_plus_elements
808    
## Undoes the additions recorded by |_add_plus_elements|.
##
##   $element_state  Hash reference whose |plus_elements_original|
##                   table lists the (namespace URI, local name) pairs
##                   to be removed.
##
## Every pair listed in that table is deleted from
## |$self->{plus_elements}|; other entries are not touched.
sub _remove_plus_elements ($$) {
  my ($self, $element_state) = @_;
  for my $nsuri (keys %{$element_state->{plus_elements_original}}) {
    delete $self->{plus_elements}->{$nsuri}->{$_}
        for keys %{$element_state->{plus_elements_original}->{$nsuri}};
  }
} # _remove_plus_elements
818    
## Reports what the FEATURE_* bits in |$status_code| say about an
## attribute, by calling |$self->{onerror}|.
##
##   $attr         The attribute node, passed as |node| to |onerror|.
##   $status_code  Bit set tested against the FEATURE_* constants.
##
## Up to two reports are made:
##
##   1. Conformance / deprecation (first match only):
##      - FEATURE_ALLOWED not set     -> 'attribute not defined' (must)
##      - FEATURE_DEPRECATED_SHOULD   -> 'deprecated:attr' (should)
##      - FEATURE_DEPRECATED_INFO     -> 'deprecated:attr' (info)
##   2. Standardization status, unless FEATURE_STATUS_REC is set:
##      'status:cr:attr', 'status:lc:attr', 'status:wd:attr' or
##      'status:non-standard:attr', always at the info level.
sub _attr_status_info ($$$) {
  my ($self, $attr, $status_code) = @_;

  ## [error type, key into |$self->{level}|], or undef if nothing to say.
  my $problem = !($status_code & FEATURE_ALLOWED)
      ? ['attribute not defined', 'must']
      : ($status_code & FEATURE_DEPRECATED_SHOULD)
      ? ['deprecated:attr', 'should']
      : ($status_code & FEATURE_DEPRECATED_INFO)
      ? ['deprecated:attr', 'info']
      : undef;
  if ($problem) {
    $self->{onerror}->(node => $attr,
                       type => $problem->[0],
                       level => $self->{level}->{$problem->[1]});
  }

  ## A feature with the REC status needs no status report.
  return if ($status_code & FEATURE_STATUS_REC);

  ## The most mature status bit that is set decides the label.
  my $status = ($status_code & FEATURE_STATUS_CR) ? 'cr'
             : ($status_code & FEATURE_STATUS_LC) ? 'lc'
             : ($status_code & FEATURE_STATUS_WD) ? 'wd'
             :                                      'non-standard';
  $self->{onerror}->(node => $attr,
                     type => 'status:'.$status.':attr',
                     level => $self->{level}->{info});
} # _attr_status_info
852    
## Adds (namespace URI, local name) pairs to |$self->{minuses}|.
##
##   (arguments)  Any number of tables of the form
##                |{namespace URI => {local name => ...}}|; only the
##                keys are consulted.
##
## Pairs that are already true in |$self->{minuses}| are skipped.
## Returns an undo record |{type => 'plus', list => {...}}| whose
## |list| holds only the pairs newly added by this call.  The type is
## 'plus' because that is the type for which |_remove_minuses| deletes
## from |$self->{minuses}|.
sub _add_minuses ($@) {
  my ($self, @lists) = @_;
  my %added;
  for my $list (@lists) {
    for my $ns (keys %$list) {
      for my $ln (keys %{$list->{$ns}}) {
        next if $self->{minuses}->{$ns}->{$ln};
        $self->{minuses}->{$ns}->{$ln} = $added{$ns}->{$ln} = 1;
      }
    }
  }
  return {type => 'plus', list => \%added};
} # _add_minuses
868    
## Adds (namespace URI, local name) pairs to |$self->{pluses}|.
##
##   (arguments)  Any number of tables of the form
##                |{namespace URI => {local name => ...}}|; only the
##                keys are consulted.
##
## Pairs that are already true in |$self->{pluses}| are skipped.
## Returns an undo record |{type => 'minus', list => {...}}| whose
## |list| holds only the pairs newly added by this call.  The type is
## 'minus' because that is the type for which |_remove_minuses| deletes
## from |$self->{pluses}|.
sub _add_pluses ($@) {
  my ($self, @lists) = @_;
  my %added;
  for my $list (@lists) {
    for my $ns (keys %$list) {
      for my $ln (keys %{$list->{$ns}}) {
        next if $self->{pluses}->{$ns}->{$ln};
        $self->{pluses}->{$ns}->{$ln} = $added{$ns}->{$ln} = 1;
      }
    }
  }
  return {type => 'minus', list => \%added};
} # _add_pluses
884    
## Applies an undo record produced by |_add_minuses| or |_add_pluses|.
##
##   $todo  Hash reference with the keys |type| and |list|, where
##          |list| is |{namespace URI => {local name => flag}}|.
##
## The |type| names the operation to perform, not the table that was
## originally changed:
##
##   'minus'  deletes the listed pairs from |$self->{pluses}|;
##   'plus'   deletes the listed pairs from |$self->{minuses}|.
##
## Only pairs whose flag in |list| is true are deleted.  Dies on any
## other |type|.  Always returns 1 on success.
sub _remove_minuses ($$) {
  my ($self, $todo) = @_;

  ## Which table of |$self| each undo type operates on.
  my %table_for = (minus => 'pluses', plus => 'minuses');
  my $table = $table_for{$todo->{type}}
      or die "$0: Unknown +- type: $todo->{type}";

  for my $ns (keys %{$todo->{list}}) {
    for my $ln (grep { $todo->{list}->{$ns}->{$_} }
                keys %{$todo->{list}->{$ns}}) {
      delete $self->{$table}->{$ns}->{$ln};
    }
  }
  return 1;
} # _remove_minuses
904    
905 wakaba 1.50 ## NOTE: The relative priority of "minuses" and "pluses" is currently
906     ## left undefined and is implemented inconsistently; this is not a
907     ## problem for now, since no element belongs to both lists.
908    
## Returns the child nodes of |$node| that follow its leading run of
## HTML |$leading_ln| elements.
##
## Starting from the first child, nodes are skipped as long as they
## are HTML elements with the local name |$leading_ln|, text or CDATA
## section nodes (node types 3 and 4) containing nothing but the
## characters U+0009-U+000D and U+0020, or nodes of any type other
## than 1, 3 and 4.  Scanning stops at the first other element or at
## the first text node with other content; that node and everything
## after it are returned, in document order.
##
## This is a plain function, not a method.
sub _children_after_leading_html_elements {
  my ($node, $leading_ln) = @_;
  my @cn = @{$node->child_nodes};
  CN: while (@cn) {
    ## Peek first: the node that ends the leading run must stay in the
    ## result, so it may only be shifted off once it is known to be
    ## part of the run.
    my $cn = $cn[0];
    my $cnt = $cn->node_type;
    if ($cnt == 1) {
      my $cn_nsuri = $cn->namespace_uri;
      $cn_nsuri = '' unless defined $cn_nsuri;
      last CN unless $cn_nsuri eq $HTML_NS and
          $cn->manakai_local_name eq $leading_ln;
    } elsif ($cnt == 3 or $cnt == 4) {
      last CN if $cn->data =~ /[^\x09-\x0D\x20]/;
    }
    shift @cn;
  } # CN
  return @cn;
} # _children_after_leading_html_elements

## Decides how the element |$node| is to be descended into.
##
##   $node         The element node being visited.
##   $parent_todo  The todo item of the parent.  Only its |flag| hash
##                 is used: it is read here, copied into every new
##                 todo, and (for |del|) modified later by a queued
##                 |code| todo.
##
## Returns the list |($sib, $new_todos)|, both array references:
##
##   $sib        Items handed back to the caller -- presumably to be
##               processed as if they were siblings of |$node| (TODO:
##               confirm against the caller).  They are child nodes of
##               |$node| and, for |noscript| outside |head|, the undo
##               record returned by |_add_minuses| after them.
##   $new_todos  Todo hashes of type 'element-attributes', 'element'
##               or 'code'.  Each receives a shallow copy of
##               |$parent_todo->{flag}|.
##
## Cases:
##
##   - Elements listed in |$HTMLTransparentElements| (other than a
##     |noscript| with the |in_head| flag): all children go to |$sib|
##     and only the attributes of |$node| are queued for checking.
##   - |video| / |audio| with a |src| attribute: likewise.
##   - |video| / |audio| without |src|, and |object|: the leading
##     |source| (resp. |param|) children are left to the 'element'
##     todo of |$node|; the remaining children go to |$sib|.
##   - Anything else: only an 'element' todo is queued.
sub _check_get_children ($$$) {
  my ($self, $node, $parent_todo) = @_;
  my $new_todos = [];
  my $sib = [];
  TP: {
    my $node_ns = $node->namespace_uri;
    $node_ns = '' unless defined $node_ns;
    my $node_ln = $node->manakai_local_name;
    if ($HTMLTransparentElements->{$node_ns}->{$node_ln}) {
      if ($node_ns eq $HTML_NS and $node_ln eq 'noscript') {
        if ($parent_todo->{flag}->{in_head}) {
          ## Not treated as transparent here; falls through to the
          ## generic handling below.
        } else {
          ## Forbid nested |noscript|.  The undo record is placed
          ## after the children, so it is reached only once they have
          ## all been taken from |$sib|.
          my $end = $self->_add_minuses ({$HTML_NS, {noscript => 1}});
          push @$sib, $end;

          unshift @$sib, @{$node->child_nodes};
          push @$new_todos, {type => 'element-attributes', node => $node};
          last TP;
        }
      } elsif ($node_ns eq $HTML_NS and $node_ln eq 'del') {
        ## Remember the parent's "significant" flag as it is now; the
        ## queued code resets it to 0 if it was false at this point,
        ## discarding whatever was set in between.
        my $sig_flag = $parent_todo->{flag}->{has_descendant}->{significant};
        unshift @$sib, @{$node->child_nodes};
        push @$new_todos, {type => 'element-attributes', node => $node};
        push @$new_todos,
            {type => 'code',
             code => sub {
               $parent_todo->{flag}->{has_descendant}->{significant} = 0
                   if not $sig_flag;
             }};
        last TP;
      } else {
        unshift @$sib, @{$node->child_nodes};
        push @$new_todos, {type => 'element-attributes', node => $node};
        last TP;
      }
    }
    if ($node_ns eq $HTML_NS and ($node_ln eq 'video' or $node_ln eq 'audio')) {
      if ($node->has_attribute_ns (undef, 'src')) {
        unshift @$sib, @{$node->child_nodes};
        push @$new_todos, {type => 'element-attributes', node => $node};
        last TP;
      } else {
        ## The node that ends the |source| run is kept: the earlier
        ## code shifted it off before testing it, so the first child
        ## after the |source| elements was silently dropped.
        unshift @$sib, _children_after_leading_html_elements ($node, 'source');
      }
    } elsif ($node_ns eq $HTML_NS and $node_ln eq 'object') {
      ## Same as above, with |param| as the leading element.
      unshift @$sib, _children_after_leading_html_elements ($node, 'param');
    }
    push @$new_todos, {type => 'element', node => $node};
  } # TP

  ## Each new todo gets its own (shallow) copy of the parent's flags.
  for my $new_todo (@$new_todos) {
    $new_todo->{flag} = {%{$parent_todo->{flag} or {}}};
  }

  return ($sib, $new_todos);
} # _check_get_children
1002    
1003 wakaba 1.44 =head1 LICENSE
1004    
1005 wakaba 1.56 Copyright 2007-2008 Wakaba <w@suika.fam.cx>
1006 wakaba 1.44
1007     This library is free software; you can redistribute it
1008     and/or modify it under the same terms as Perl itself.
1009    
1010     =cut
1011    
1012 wakaba 1.1 1;
1013 wakaba 1.91 # $Date: 2008/09/09 04:45:13 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24