| 2 |
use strict; |
use strict; |
| 3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 4 |
|
|
| 5 |
|
BEGIN { |
| 6 |
|
require Exporter; |
| 7 |
|
push our @ISA, 'Exporter'; |
| 8 |
|
|
| 9 |
|
our @EXPORT_OK = qw( |
| 10 |
|
DOCTYPE_TOKEN |
| 11 |
|
COMMENT_TOKEN |
| 12 |
|
START_TAG_TOKEN |
| 13 |
|
END_TAG_TOKEN |
| 14 |
|
END_OF_FILE_TOKEN |
| 15 |
|
CHARACTER_TOKEN |
| 16 |
|
PI_TOKEN |
| 17 |
|
ABORT_TOKEN |
| 18 |
|
); |
| 19 |
|
|
| 20 |
|
our %EXPORT_TAGS = ( |
| 21 |
|
token => [qw( |
| 22 |
|
DOCTYPE_TOKEN |
| 23 |
|
COMMENT_TOKEN |
| 24 |
|
START_TAG_TOKEN |
| 25 |
|
END_TAG_TOKEN |
| 26 |
|
END_OF_FILE_TOKEN |
| 27 |
|
CHARACTER_TOKEN |
| 28 |
|
PI_TOKEN |
| 29 |
|
ABORT_TOKEN |
| 30 |
|
)], |
| 31 |
|
); |
| 32 |
|
} |
| 33 |
|
|
| 34 |
|
## Token types |
| 35 |
|
|
| 36 |
|
sub DOCTYPE_TOKEN () { 1 } |
| 37 |
|
sub COMMENT_TOKEN () { 2 } |
| 38 |
|
sub START_TAG_TOKEN () { 3 } |
| 39 |
|
sub END_TAG_TOKEN () { 4 } |
| 40 |
|
sub END_OF_FILE_TOKEN () { 5 } |
| 41 |
|
sub CHARACTER_TOKEN () { 6 } |
| 42 |
|
sub PI_TOKEN () { 7 } # XML5 |
| 43 |
|
sub ABORT_TOKEN () { 8 } # Not a token actually |
| 44 |
|
|
| 45 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 46 |
|
|
| 47 |
|
BEGIN { Whatpm::HTML::Tokenizer->import (':token') } |
| 48 |
|
|
| 49 |
## Content model flags |
## Content model flags |
| 50 |
|
|
| 51 |
sub CM_ENTITY () { 0b001 } # & markup in data |
sub CM_ENTITY () { 0b001 } # & markup in data |
| 114 |
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
| 115 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
| 116 |
|
|
|
## Token types |
|
|
|
|
|
sub DOCTYPE_TOKEN () { 1 } |
|
|
sub COMMENT_TOKEN () { 2 } |
|
|
sub START_TAG_TOKEN () { 3 } |
|
|
sub END_TAG_TOKEN () { 4 } |
|
|
sub END_OF_FILE_TOKEN () { 5 } |
|
|
sub CHARACTER_TOKEN () { 6 } |
|
|
|
|
| 117 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
| 118 |
## list and descriptions) |
## list and descriptions) |
| 119 |
|
|
| 175 |
#$self->{level} |
#$self->{level} |
| 176 |
#$self->{set_nc} |
#$self->{set_nc} |
| 177 |
#$self->{parse_error} |
#$self->{parse_error} |
| 178 |
|
#$self->{is_xml} (if XML) |
| 179 |
|
|
| 180 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 181 |
#$self->{s_kwd}; # state keyword - initialized when used |
#$self->{s_kwd}; # state keyword - initialized when used |
| 541 |
|
|
| 542 |
$self->{ct} |
$self->{ct} |
| 543 |
= {type => START_TAG_TOKEN, |
= {type => START_TAG_TOKEN, |
| 544 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 545 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 546 |
column => $self->{column_prev}}; |
column => $self->{column_prev}}; |
| 547 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 660 |
|
|
| 661 |
$self->{ct} |
$self->{ct} |
| 662 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
| 663 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 664 |
line => $l, column => $c}; |
line => $l, column => $c}; |
| 665 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 666 |
|
|
| 851 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 852 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 853 |
|
|
| 854 |
$self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020); |
$self->{ct}->{tag_name} |
| 855 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
| 856 |
# start tag or end tag |
# start tag or end tag |
| 857 |
## Stay in this state |
## Stay in this state |
| 858 |
|
|
| 974 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 975 |
|
|
| 976 |
$self->{ca} |
$self->{ca} |
| 977 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 978 |
value => '', |
value => '', |
| 979 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
| 980 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
| 1135 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 1136 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 1137 |
|
|
| 1138 |
$self->{ca}->{name} .= chr ($self->{nc} + 0x0020); |
$self->{ca}->{name} |
| 1139 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
| 1140 |
## Stay in the state |
## Stay in the state |
| 1141 |
|
|
| 1142 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1280 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 1281 |
|
|
| 1282 |
$self->{ca} |
$self->{ca} |
| 1283 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 1284 |
value => '', |
value => '', |
| 1285 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
| 1286 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
| 2013 |
} |
} |
| 2014 |
|
|
| 2015 |
redo A; |
redo A; |
| 2016 |
} elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
} elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
| 2017 |
$self->{open_elements}->[-1]->[1] & FOREIGN_EL and |
$self->{open_elements}->[-1]->[1] & FOREIGN_EL) or |
| 2018 |
|
$self->{is_xml}) and |
| 2019 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2020 |
|
|
| 2021 |
$self->{state} = MD_CDATA_STATE; |
$self->{state} = MD_CDATA_STATE; |