package Whatpm::HTML::Tokenizer;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export setup, performed at compile time.
##
## This package inherits from |Exporter|.  The token type constant
## names are listed in |@EXPORT_OK| and are also grouped under the
## |:token| export tag.  No default export list (|@EXPORT|) is set here.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## The names are declared once; |@EXPORT_OK| and the |:token| tag
  ## each receive their own copy of the list.
  my @token_names = qw(
    DOCTYPE_TOKEN COMMENT_TOKEN START_TAG_TOKEN END_TAG_TOKEN
    END_OF_FILE_TOKEN CHARACTER_TOKEN PI_TOKEN ABORT_TOKEN
  );

  our @EXPORT_OK = @token_names;
  our %EXPORT_TAGS = (token => [@token_names]);
}
## Token types.
##
## Each constant is a small integer used as the |type| field of a token
## hash.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
  PI_TOKEN          => 7, # XML5
  ABORT_TOKEN       => 8, # Not a token actually
};
package Whatpm::HTML;
BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
## Content model flags.
##
## The three |CM_*| constants are single-bit flags; each content model
## is either zero or a bitwise OR of those flags.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## The models are declared one statement at a time, after the flags,
## because each |use constant| must already see the flags it refers to.
use constant PLAINTEXT_CONTENT_MODEL => 0;
use constant CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP;
use constant RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP;
use constant PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP;
## Tokenizer states.
##
## Each constant is a distinct integer that can be stored in
## |$self->{state}|.  The values 1 and 12 are intentionally left
## unassigned; the states that used to own them are kept below as
## comments.
use constant {
  DATA_STATE                          => 0,
  #ENTITY_DATA_STATE                  => 1,
  TAG_OPEN_STATE                      => 2,
  CLOSE_TAG_OPEN_STATE                => 3,
  TAG_NAME_STATE                      => 4,
  BEFORE_ATTRIBUTE_NAME_STATE         => 5,
  ATTRIBUTE_NAME_STATE                => 6,
  AFTER_ATTRIBUTE_NAME_STATE          => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE        => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE      => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE    => 12,
  MARKUP_DECLARATION_OPEN_STATE       => 13,
  COMMENT_START_STATE                 => 14,
  COMMENT_START_DASH_STATE            => 15,
  COMMENT_STATE                       => 16,
  COMMENT_END_STATE                   => 17,
  COMMENT_END_DASH_STATE              => 18,
  BOGUS_COMMENT_STATE                 => 19,
  DOCTYPE_STATE                       => 20,
  BEFORE_DOCTYPE_NAME_STATE           => 21,
  DOCTYPE_NAME_STATE                  => 22,
  AFTER_DOCTYPE_NAME_STATE            => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                 => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE  => 33,
  SELF_CLOSING_START_TAG_STATE        => 34,
  CDATA_SECTION_STATE                 => 35,

  ## Sub-states of the "markup declaration open state" in the spec
  MD_HYPHEN_STATE                     => 36,
  MD_DOCTYPE_STATE                    => 37,
  MD_CDATA_STATE                      => 38,

  ## "close tag open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE        => 39,

  ## Sub-states of the "CDATA section state" in the spec
  CDATA_SECTION_MSE1_STATE            => 40,
  CDATA_SECTION_MSE2_STATE            => 41,

  ## Sub-states of the "after DOCTYPE name state" in the spec
  PUBLIC_STATE                        => 42,
  SYSTEM_STATE                        => 43,

  ## NOTE: The "entity data state", the "entity in attribute value
  ## state", and the "consume a character reference" algorithm are
  ## jointly implemented using the following six states:
  ENTITY_STATE                        => 44,
  ENTITY_HASH_STATE                   => 45,
  NCR_NUM_STATE                       => 46,
  HEXREF_X_STATE                      => 47,
  HEXREF_HEX_STATE                    => 48,
  ENTITY_NAME_STATE                   => 49,

  ## "data state" in the spec
  PCDATA_STATE                        => 50,
};
## Tree constructor state constants declared in this file.  See
## Whatpm::HTML for the full list and descriptions.
##
## Both constants below have the value 2048 (only bit 11 set).
sub IN_FOREIGN_CONTENT_IM () {
  0b1000_0000_0000
}
sub FOREIGN_EL () {
  0b1000_0000_0000
}
## Character reference mappings.
##
## |$charref_map| maps a code point to the code point that replaces it.
## Code points without an entry are not remapped.  Presumably the table
## is consulted when a numeric character reference is resolved; the
## lookup itself is outside this part of the file.
my $charref_map = {
  ## CR is replaced by LF.
  0x0D => 0x000A,
  ## 0x80..0x9F are replaced by the characters that Windows-1252 assigns
  ## to those byte values; the positions with no assignment (0x81, 0x8D,
  ## 0x8F, 0x90, 0x9D) are replaced by U+FFFD.
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Everything listed below is replaced by U+FFFD: NULL and the C0
## controls other than HT, LF, FF, and CR; DEL; the surrogates; and the
## noncharacters.
##
## NOTE: The noncharacter block is U+FDD0..U+FDEF (32 code points).
## This table formerly stopped at U+FDDF, leaving U+FDE0..U+FDEF
## unmapped; the upper bound is now U+FDEF.
##
## The last two code points of each of the 17 planes (U+FFFE, U+FFFF,
## U+1FFFE, U+1FFFF, ..., U+10FFFE, U+10FFFF) are generated rather than
## written out one by one.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF, 0xFDD0..0xFDEF,
        (map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) }
             0x00..0x10);
## Implementations MUST act as if they used the state machine described
## in the spec.
## Put the tokenizer into its initial condition and ask for the first
## input character.
##
## Invoked as a method; the only argument is the parser object.  The
## value of the last statement, the new empty token queue, is returned.
##
## NOTE: Fields set by the |new| constructor, not here:
##   |level|, |set_nc|, |parse_error|, and |is_xml| (if XML).
## NOTE: Fields not initialized here: |s_kwd| (state keyword),
##   |entity__value|, |entity__match|, and |prev_state| are initialized
##   when used; |next_nc| and |escape| are likewise left untouched.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer MUST start in the data state with the PCDATA content
  ## model.
  $self->{state} = DATA_STATE;
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Forget any token-related data that is still around.
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  delete $self->{self_closing};

  ## Empty the character buffer.
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character

  ## Because the buffer has just been emptied, a character can never be
  ## taken from it at this point, so the |set_nc| callback is always the
  ## one asked for the first input character.  (The former "read from
  ## the buffer" branch was unreachable and has been removed.)
  $self->{set_nc}->($self);

  ## Queue of tokens that are waiting to be returned.
  $self->{token} = [];
} # _initialize_tokenizer
## A token has:
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
## ->{name} (DOCTYPE_TOKEN)
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
## ->{pubid} (DOCTYPE_TOKEN)
## ->{sysid} (DOCTYPE_TOKEN)
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
## ->{name}
## ->{value}
## ->{has_reference} == 1 or 0
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
## The token's own |->{self_closing}| field is used to save the value of
## |$self->{self_closing}| while the token is pushed back to the stack.
## An emitted token MUST be handled immediately by the tree construction stage.
## Before each step, UA MAY check to see if either one of the scripts in
## "list of scripts that will execute as soon as possible" or the first
## script in the "list of scripts that will execute asynchronously",
## has completed loading. If one has, then it MUST be executed
## and removed from the list.
## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
## (This requirement was dropped from HTML5 spec, unfortunately.)
## Lookup table keyed by code point; a true value means the code point
## is treated as a space character.  LINE TABULATION (U+000B) and
## CARRIAGE RETURN (U+000D) are not in the table.
my $is_space = {
  map { $_ => 1 }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF)
    0x0020, # SPACE (SP)
};
sub _get_next_token ($) {
my $self = shift;
if ($self->{self_closing}) {
$self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
## NOTE: The |self_closing| flag is only set by start tag token.
## In addition, when a start tag token is emitted, it is always set to
## |ct|.
delete $self->{self_closing};
}
if (@{$self->{token}}) {
$self->{self_closing} = $self->{token}->[0]->{self_closing};
return shift @{$self->{token}};
}
A: {
if ($self->{state} == PCDATA_STATE) {
## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
if ($self->{nc} == 0x0026) { # &
## NOTE: In the spec, the tokenizer is switched to the
## "entity data state". In this implementation, the tokenizer
## is switched to the |ENTITY_STATE|, which is an implementation
## of the "consume a character reference" algorithm.
$self->{entity_add} = -1;
$self->{prev_state} = DATA_STATE;
$self->{state} = ENTITY_STATE;
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} elsif ($self->{nc} == 0x003C) { # <
$self->{state} = TAG_OPEN_STATE;
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} elsif ($self->{nc} == -1) {
return ({type => END_OF_FILE_TOKEN,
line => $self->{line}, column => $self->{column}});
last A; ## TODO: ok?
} else {
#
}
# Anything else
my $token = {type => CHARACTER_TOKEN,
data => chr $self->{nc},
line => $self->{line}, column => $self->{column},
};
$self->{read_until}->($token->{data}, q[<&], length $token->{data});
## Stay in the state.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
return ($token);
redo A;
} elsif ($self->{state} == DATA_STATE) {
$self->{s_kwd} = '' unless defined $self->{s_kwd};
if ($self->{nc} == 0x0026) { # &
$self->{s_kwd} = '';
if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
not $self->{escape}) {
## NOTE: In the spec, the tokenizer is switched to the
## "entity data state". In this implementation, the tokenizer
## is switched to the |ENTITY_STATE|, which is an implementation
## of the "consume a character reference" algorithm.
$self->{entity_add} = -1;
$self->{prev_state} = DATA_STATE;
$self->{state} = ENTITY_STATE;
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} else {
#
}
} elsif ($self->{nc} == 0x002D) { # -
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
$self->{s_kwd} .= '-';
if ($self->{s_kwd} eq '