/[suikacvs]/webroot/gate/2007/html/WebHACC/Language/HTML.pm
Suika

Contents of /webroot/gate/2007/html/WebHACC/Language/HTML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (show annotations) (download)
Sun Sep 21 05:09:35 2008 UTC (15 years, 8 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.11: +15 -3 lines
++ ChangeLog	21 Sep 2008 05:08:51 -0000
2008-09-21  Wakaba  <wakaba@suika.fam.cx>

	* error-description-source.xml: An error for non-supported
	charset="" parameter is added.

++ html/WebHACC/Language/ChangeLog	21 Sep 2008 05:09:27 -0000
2008-09-21  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm (generate_syntax_error_section): Now |charset:not
	supported| is also an error that might invalidate the validation
	result.

1 package WebHACC::Language::HTML;
2 use strict;
3 require WebHACC::Language::DOM;
4 push our @ISA, 'WebHACC::Language::DOM';
5
## Constructor.  Returns a new, empty hash-based object blessed into
## the class on which the method is invoked.
sub new ($) {
  my $class = shift;
  my $self = {};
  return bless $self, $class;
} # new
9
## Parses the input as HTML with |Whatpm::HTML| and reports the parse
## errors.
##
## The work is wrapped in a |parse-errors| section / error list of the
## output object.  Every error handed to the parser's error callback
## is added to the result object with |layer => 'syntax'|.  The
## resulting node (the document, or the context element when the
## input's |{inner_html_element}| is a non-empty string) is stored in
## |$self->{structure}|.
sub generate_syntax_error_section ($) {
  my $self = shift;

  require Message::DOM::DOMImplementation;
  require Encode;
  require Whatpm::HTML;

  my $out = $self->output;
  $out->start_section (role => 'parse-errors');
  $out->start_error_list (role => 'parse-errors');
  $self->result->layer_applicable ('syntax');

  my $input = $self->input;
  my $result = $self->result;

  ## Error types that make the outcome of some layers uncertain,
  ## mapped to the layers in question.
  my %uncertain_layers_for = (
    'chardecode:no error' => ['encode'],
    'chardecode:fallback' => [qw(charset syntax structure semantics)],
    'charset:not supported' => [qw(charset syntax structure semantics)],
  );

  ## Error callback given to the parser: records the error and then
  ## marks the affected layers, if any, as uncertain.
  my $report_error = sub {
    my %error = @_;
    $result->add_error (layer => 'syntax', %error);

    my $layers = $uncertain_layers_for{$error{type}} || [];
    $self->result->layer_uncertain ($_) for @$layers;
  };

  $self->result->layer_applicable ('charset');

  ## Wraps the handle passed by the parser in a Unicode checker
  ## handle created in |html5| mode.
  my $wrap_handle = sub ($) {
    my ($handle) = @_;
    require Whatpm::Charset::UnicodeChecker;
    return Whatpm::Charset::UnicodeChecker->new_handle ($handle, 'html5');
  }; # $wrap_handle

  my $impl = Message::DOM::DOMImplementation->new;
  my $doc = $impl->create_document;
  my $context_name = $input->{inner_html_element};
  if (defined $context_name and length $context_name) {
    ## Fragment mode: parse the input as the |innerHTML| of an element
    ## with the given local name.
    $input->{charset} ||= 'utf-8';
    my $text_ref = \($input->{s});
    unless ($input->{is_char_string}) {
      ## NOTE(review): |Encode::decode| presumably dies when given an
      ## encoding name it does not know; confirm that |{charset}| is
      ## always a supported name by the time we get here.
      $text_ref = \(Encode::decode ($input->{charset}, $$text_ref));
      $self->result->layer_applicable ('encode');
    }

    my $context_el = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, $context_name]);
    Whatpm::HTML->set_inner_html
        ($context_el, $$text_ref, $report_error, $wrap_handle);

    $self->{structure} = $context_el;

    ## NOTE: The document has to be kept alive explicitly.  The
    ## element's |owner_document| is only a weak reference, so without
    ## this the document would be garbage collected while the element
    ## is still in use.
    $self->{_structure_root} = $doc;
  } else {
    ## Document mode: parse the input as an entire document.
    if ($input->{is_char_string}) {
      Whatpm::HTML->parse_char_string
          ($input->{s} => $doc, $report_error, $wrap_handle);
    } else {
      $self->result->layer_applicable ('encode');
      Whatpm::HTML->parse_byte_string
          ($input->{charset}, $input->{s} => $doc,
           $report_error, $wrap_handle);
    }

    $self->{structure} = $doc;
  }

  if (defined $input->{official_charset}) {
    $doc->manakai_charset ($input->{official_charset});
  }

  ## TODO: A warning should be issued if the media type or the charset
  ## is explicitly overridden by the user.

  $doc->document_uri ($input->url);
  $doc->manakai_entity_base_uri ($input->{base_uri});

  ## The |input_encoding| is cleared for |WebHACC::Input::Text| inputs.
  if ($input->isa ('WebHACC::Input::Text')) {
    $doc->input_encoding (undef);
  }

  $out->end_error_list (role => 'parse-errors');
  $out->end_section;
} # generate_syntax_error_section
93
## Returns the charset of the source: the |input_encoding| of the
## document that |$self->{structure}| is, or belongs to, if that value
## is true, or the |{charset}| of the input object otherwise.
##
## TODO: A reliable way to obtain the source charset is necessary.
## The |input_encoding| DOM attribute might be left blank on purpose
## when the input comes from the direct input form, but even in that
## case the charset information should be useful, since the input
## string might be a byte sequence.  Moreover, |input_encoding| does
## not reflect the fallback encoding in use.  On the other hand, the
## |{charset}| property of the |input| object is always the value from
## the lower-level protocol, which might be ignored by the HTML
## sniffer.
sub source_charset ($) {
  my $self = shift;
  my $node = $self->{structure};
  my $doc = $node->owner_document || $node;
  return $doc->input_encoding || $self->input->{charset};
} # source_charset
107
108 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24