3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
4 |
use Error qw(:try); |
use Error qw(:try); |
5 |
|
|
6 |
|
## NOTE: This module don't check all HTML5 parse errors; character |
7 |
|
## encoding related parse errors are expected to be handled by relevant |
8 |
|
## modules. |
9 |
|
## Parse errors for control characters that are not allowed in HTML5 |
10 |
|
## documents, for surrogate code points, and for noncharacter code |
11 |
|
## points, as well as U+FFFD substitions for characters whose code points |
12 |
|
## is higher than U+10FFFF may be detected by combining the parser with |
13 |
|
## the checker implemented by Whatpm::Charset::UnicodeChecker (for its |
14 |
|
## usage example, see |t/HTML-tree.t| in the Whatpm package or the |
15 |
|
## WebHACC::Language::HTML module in the WebHACC package). |
16 |
|
|
17 |
## ISSUE: |
## ISSUE: |
18 |
## var doc = implementation.createDocument (null, null, null); |
## var doc = implementation.createDocument (null, null, null); |
19 |
## doc.write (''); |
## doc.write (''); |
582 |
my $wrapped_char_stream = $get_wrapper->($char_stream); |
my $wrapped_char_stream = $get_wrapper->($char_stream); |
583 |
$wrapped_char_stream->onerror ($char_onerror); |
$wrapped_char_stream->onerror ($char_onerror); |
584 |
|
|
585 |
my @args = @_; shift @args; # $s |
my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef; |
586 |
my $return; |
my $return; |
587 |
try { |
try { |
588 |
$return = $self->parse_char_stream ($wrapped_char_stream, @args); |
$return = $self->parse_char_stream ($wrapped_char_stream, @args); |
632 |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
633 |
require Whatpm::Charset::DecodeHandle; |
require Whatpm::Charset::DecodeHandle; |
634 |
my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s); |
my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s); |
|
if ($_[3]) { |
|
|
$input = $_[3]->($input); |
|
|
} |
|
635 |
return $self->parse_char_stream ($input, @_[1..$#_]); |
return $self->parse_char_stream ($input, @_[1..$#_]); |
636 |
} # parse_char_string |
} # parse_char_string |
637 |
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility. |
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility. |
638 |
|
|
639 |
my $disallowed_control_chars = |
sub parse_char_stream ($$$;$$) { |
|
{ |
|
|
0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1, |
|
|
0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1, |
|
|
0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1, |
|
|
0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1, |
|
|
0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1, |
|
|
0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1, |
|
|
0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1, |
|
|
0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1, |
|
|
0x10FFFE => 1, 0x10FFFF => 1, |
|
|
}; |
|
|
$disallowed_control_chars->{$_} = 1 |
|
|
for 0x0001 .. 0x0008, 0x000E .. 0x001F, 0x007F .. 0x009F, |
|
|
0xD800 .. 0xDFFF, 0xFDD0 .. 0xFDDF; |
|
|
## ISSUE: U+FDE0-U+FDEF are not excluded |
|
|
|
|
|
sub parse_char_stream ($$$;$) { |
|
640 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
641 |
my $input = $_[0]; |
my $input = $_[0]; |
642 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
649 |
if defined $self->{input_encoding}; |
if defined $self->{input_encoding}; |
650 |
## TODO: |{input_encoding}| is needless? |
## TODO: |{input_encoding}| is needless? |
651 |
|
|
|
my $i = 0; |
|
652 |
$self->{line_prev} = $self->{line} = 1; |
$self->{line_prev} = $self->{line} = 1; |
653 |
$self->{column_prev} = -1; |
$self->{column_prev} = -1; |
654 |
$self->{column} = 0; |
$self->{column} = 0; |
665 |
$self->{char_buffer_pos} = 0; |
$self->{char_buffer_pos} = 0; |
666 |
|
|
667 |
my $count = $input->manakai_read_until |
my $count = $input->manakai_read_until |
668 |
($self->{char_buffer}, |
($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos}); |
|
qr/(?![\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/, |
|
|
$self->{char_buffer_pos}); |
|
669 |
if ($count) { |
if ($count) { |
670 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
671 |
$self->{column_prev} = $self->{column}; |
$self->{column_prev} = $self->{column}; |
701 |
$self->{next_char} = 0x000A; # LF # MUST |
$self->{next_char} = 0x000A; # LF # MUST |
702 |
$self->{line}++; |
$self->{line}++; |
703 |
$self->{column} = 0; |
$self->{column} = 0; |
|
} elsif ($self->{next_char} > 0x10FFFF) { |
|
|
!!!cp ('j3'); |
|
|
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
|
704 |
} elsif ($self->{next_char} == 0x0000) { # NULL |
} elsif ($self->{next_char} == 0x0000) { # NULL |
705 |
!!!cp ('j4'); |
!!!cp ('j4'); |
706 |
!!!parse-error (type => 'NULL'); |
!!!parse-error (type => 'NULL'); |
707 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
|
} elsif ($disallowed_control_chars->{$self->{next_char}}) { |
|
|
!!!cp ('j5'); |
|
|
if ($self->{next_char} < 0x10000) { |
|
|
!!!parse-error (type => 'control char', |
|
|
text => (sprintf 'U+%04X', $self->{next_char})); |
|
|
} else { |
|
|
!!!parse-error (type => 'control char', |
|
|
text => (sprintf 'U-%08X', $self->{next_char})); |
|
|
} |
|
708 |
} |
} |
709 |
}; |
}; |
710 |
|
|
712 |
#my ($scalar, $specials_range, $offset) = @_; |
#my ($scalar, $specials_range, $offset) = @_; |
713 |
return 0 if defined $self->{next_next_char}; |
return 0 if defined $self->{next_next_char}; |
714 |
|
|
715 |
my $pattern = qr/(?![$_[1]\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/; |
my $pattern = qr/[^$_[1]\x00\x0A\x0D]/; |
716 |
my $offset = $_[2] || 0; |
my $offset = $_[2] || 0; |
717 |
|
|
718 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
756 |
$onerror->(line => $self->{line}, column => $self->{column}, @_); |
$onerror->(line => $self->{line}, column => $self->{column}, @_); |
757 |
}; |
}; |
758 |
|
|
759 |
|
my $char_onerror = sub { |
760 |
|
my (undef, $type, %opt) = @_; |
761 |
|
!!!parse-error (layer => 'encode', |
762 |
|
line => $self->{line}, column => $self->{column} + 1, |
763 |
|
%opt, type => $type); |
764 |
|
}; # $char_onerror |
765 |
|
|
766 |
|
if ($_[3]) { |
767 |
|
$input = $_[3]->($input); |
768 |
|
$input->onerror ($char_onerror); |
769 |
|
} else { |
770 |
|
$input->onerror ($char_onerror) unless defined $input->onerror; |
771 |
|
} |
772 |
|
|
773 |
$self->_initialize_tokenizer; |
$self->_initialize_tokenizer; |
774 |
$self->_initialize_tree_constructor; |
$self->_initialize_tree_constructor; |
775 |
$self->_construct_tree; |
$self->_construct_tree; |
7825 |
$self->{char_buffer_pos} = 0; |
$self->{char_buffer_pos} = 0; |
7826 |
|
|
7827 |
my $count = $input->manakai_read_until |
my $count = $input->manakai_read_until |
7828 |
($self->{char_buffer}, |
($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, |
7829 |
qr/(?![\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/, |
$self->{char_buffer_pos}); |
|
$self->{char_buffer_pos}); |
|
7830 |
if ($count) { |
if ($count) { |
7831 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
7832 |
$self->{column_prev} = $self->{column}; |
$self->{column_prev} = $self->{column}; |
7862 |
$p->{line}++; |
$p->{line}++; |
7863 |
$p->{column} = 0; |
$p->{column} = 0; |
7864 |
!!!cp ('i2'); |
!!!cp ('i2'); |
|
} elsif ($self->{next_char} > 0x10FFFF) { |
|
|
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
|
|
!!!cp ('i3'); |
|
7865 |
} elsif ($self->{next_char} == 0x0000) { # NULL |
} elsif ($self->{next_char} == 0x0000) { # NULL |
7866 |
!!!cp ('i4'); |
!!!cp ('i4'); |
7867 |
!!!parse-error (type => 'NULL'); |
!!!parse-error (type => 'NULL'); |
7868 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
|
} elsif ($disallowed_control_chars->{$self->{next_char}}) { |
|
|
!!!cp ('i4.1'); |
|
|
if ($self->{next_char} < 0x10000) { |
|
|
!!!parse-error (type => 'control char', |
|
|
text => (sprintf 'U+%04X', $self->{next_char})); |
|
|
} else { |
|
|
!!!parse-error (type => 'control char', |
|
|
text => (sprintf 'U-%08X', $self->{next_char})); |
|
|
} |
|
7869 |
} |
} |
7870 |
}; |
}; |
7871 |
|
|
7873 |
#my ($scalar, $specials_range, $offset) = @_; |
#my ($scalar, $specials_range, $offset) = @_; |
7874 |
return 0 if defined $p->{next_next_char}; |
return 0 if defined $p->{next_next_char}; |
7875 |
|
|
7876 |
my $pattern = qr/(?![$_[1]\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/; |
my $pattern = qr/[^$_[1]\x00\x0A\x0D]/; |
7877 |
my $offset = $_[2] || 0; |
my $offset = $_[2] || 0; |
7878 |
|
|
7879 |
if ($p->{char_buffer_pos} < length $p->{char_buffer}) { |
if ($p->{char_buffer_pos} < length $p->{char_buffer}) { |