| 1 |
## Language module for HTML input.  This class is a subclass of
## |WebHACC::Language::DOM|.
package WebHACC::Language::HTML;
use strict;

## The base class is loaded at run time and then registered as the
## parent class of this package.
require WebHACC::Language::DOM;
our @ISA;
push @ISA, 'WebHACC::Language::DOM';
| 5 |
|
| 6 |
## Creates a new instance backed by an empty hash.  The invocant (the
## first argument) is used as the class the hash is blessed into.
sub new ($) {
  my $class = shift;
  my $self = {};
  return bless $self, $class;
} # new
| 9 |
|
| 10 |
## Parses the input as HTML and reports the parse errors.
##
## The errors are emitted between |start_section| / |start_error_list|
## and |end_error_list| / |end_section| calls on the output object, all
## with the role |parse-errors|, and every error is added to the result
## object on the |syntax| layer.  The parsed tree is saved in
## |$self->{structure}|: the context element when the input specifies a
## non-empty |{inner_html_element}|, the document otherwise.
sub generate_syntax_error_section ($) {
  my $self = shift;

  require Message::DOM::DOMImplementation;
  require Encode;
  require Whatpm::HTML;

  ## Open the error section; it is closed at the end of this method.
  my $out = $self->output;
  $out->start_section (role => 'parse-errors');
  $out->start_error_list (role => 'parse-errors');
  $self->result->layer_applicable ('syntax');

  my $input = $self->input;
  my $result = $self->result;

  ## Error handler handed to the parser.  Every error is recorded on the
  ## |syntax| layer; some character decoding related error types
  ## additionally mark layers as uncertain.
  my $onerror = sub {
    my %error = @_;
    $result->add_error (layer => 'syntax', %error);

    my $type = $error{type};
    if ($type eq 'chardecode:no error') {
      $self->result->layer_uncertain ('encode');
    } elsif ($type eq 'chardecode:fallback' or
             $type eq 'charset:not supported') {
      ## The character encoding in use is not reliable, so every layer
      ## that depends on the decoded text is uncertain, too.
      $self->result->layer_uncertain ('charset');
      $self->result->layer_uncertain ('syntax');
      $self->result->layer_uncertain ('structure');
      $self->result->layer_uncertain ('semantics');
    }
  }; # $onerror

  $self->result->layer_applicable ('charset');

  ## Wraps the handle given by the parser in a Unicode checker handle.
  my $char_checker = sub ($) {
    require Whatpm::Charset::UnicodeChecker;
    return Whatpm::Charset::UnicodeChecker->new_handle ($_[0], 'html5');
  }; # $char_checker

  my $impl = Message::DOM::DOMImplementation->new;
  my $doc = $impl->create_document;

  my $context_name = $input->{inner_html_element};
  if (defined $context_name and length $context_name) {
    ## Fragment mode: the input is parsed as the content of an element
    ## with the given local name in the XHTML namespace.
    $input->{charset} ||= 'utf-8';
    my $text_ref = \($input->{s});
    if (not $input->{is_char_string}) {
      $text_ref = \(Encode::decode ($input->{charset}, $$text_ref));
      $self->result->layer_applicable ('encode');
    }

    my $context = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, $context_name]);
    Whatpm::HTML->set_inner_html
        ($context, $$text_ref, $onerror, $char_checker);

    $self->{structure} = $context;
    ## NOTE: The document has to be kept alive explicitly.  The
    ## |owner_document| of the element is only a weak reference, so the
    ## document would otherwise be garbage collected while the element
    ## is still in use.
    $self->{_structure_root} = $doc;
  } elsif ($input->{is_char_string}) {
    ## Document mode, input is already a character string.
    Whatpm::HTML->parse_char_string
        ($input->{s} => $doc, $onerror, $char_checker);
    $self->{structure} = $doc;
  } else {
    ## Document mode, input is a byte string.
    $self->result->layer_applicable ('encode');
    Whatpm::HTML->parse_byte_string
        ($input->{charset}, $input->{s} => $doc, $onerror, $char_checker);
    $self->{structure} = $doc;
  }

  if (defined $input->{official_charset}) {
    $doc->manakai_charset ($input->{official_charset});
  }

  ## TODO: We need to issue some warning if media type/charset is
  ## explicitly overridden by the user.

  $doc->document_uri ($input->url);
  $doc->manakai_entity_base_uri ($input->{base_uri});

  if ($input->isa ('WebHACC::Input::Text')) {
    $doc->input_encoding (undef);
  }

  $out->end_error_list (role => 'parse-errors');
  $out->end_section;
} # generate_syntax_error_section
| 93 |
|
| 94 |
## Returns the character encoding name of the source.
##
## The |input_encoding| of the document is used if it is true, where
## the document is the |owner_document| of |$self->{structure}| or, if
## there is none, |$self->{structure}| itself.  Otherwise the
## |{charset}| property of the input object is returned.
##
## TODO: We need some way to get the source charset reliably.  The
## |input_encoding| DOM attribute might be intentionally left blank
## when the input is the direct input form, but even in that case the
## charset information should be useful because the input string
## might be a byte sequence.  In addition, the |input_encoding| does
## not reflect the fallback encoding in use.  On the contrary, the
## |{charset}| property of the |input| object is always the value
## from the lower-level protocol and that might be ignored by the
## HTML sniffer.
sub source_charset ($) {
  my $self = shift;

  my $node = $self->{structure};
  my $doc = $node->owner_document || $node;
  return ($doc->input_encoding || $self->input->{charset});
} # source_charset
| 107 |
|
| 108 |
1; |