use strict;

## Module version string built from the digit groups found in the
## revision keyword: the first group is printed as-is followed by a
## dot, every later group as two zero-padded digits.
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 |
|
|
5 |
|
## Make this package an Exporter subclass and declare which constants
## it offers to importers.  Done in a BEGIN block so that the export
## lists are in place at compile time.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names that can be imported individually on request.
  our @EXPORT_OK = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
  );

  ## The ":token" tag imports all of the token type constants at once.
  our %EXPORT_TAGS = (
    token => [qw(
      DOCTYPE_TOKEN
      COMMENT_TOKEN
      START_TAG_TOKEN
      END_TAG_TOKEN
      END_OF_FILE_TOKEN
      CHARACTER_TOKEN
      PI_TOKEN
      ABORT_TOKEN
    )],
  );
}
33 |
|
|
34 |
|
## Token types
##
## Numeric codes identifying the kind of a token; the tokenizer stores
## one of them in the |type| member of a token hash (for example
## |type => START_TAG_TOKEN|).  The empty prototype lets perl treat
## each sub as an inlinable constant.

sub DOCTYPE_TOKEN () { 1 }
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }
sub PI_TOKEN () { 7 } # XML5
sub ABORT_TOKEN () { 8 } # Not a token actually
44 |
|
|
package Whatpm::HTML;

## Pull the constants exported under the ":token" tag by
## Whatpm::HTML::Tokenizer into this package at compile time.
BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48 |
|
|
## Content model flags

sub CM_ENTITY () { 0b001 } # & markup in data
## State codes.  NOTE(review): presumably values for |$self->{state}|,
## like the other *_STATE constants used in this file -- confirm.
sub ENTITY_NAME_STATE () { 49 }
sub PCDATA_STATE () { 50 } # "data state" in the spec
|
|
|
## Token types
##
## NOTE(review): these six definitions repeat, with the same values,
## the token type constants that are also defined before the
## |package Whatpm::HTML;| line.  They look like a leftover copy --
## confirm whether this second set is still needed.

sub DOCTYPE_TOKEN () { 1 }
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }
|
|
|
|
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
119 |
|
|
175 |
#$self->{level} |
#$self->{level} |
176 |
#$self->{set_nc} |
#$self->{set_nc} |
177 |
#$self->{parse_error} |
#$self->{parse_error} |
178 |
|
#$self->{is_xml} (if XML) |
179 |
|
|
180 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
181 |
#$self->{s_kwd}; # state keyword - initialized when used |
#$self->{s_kwd}; # state keyword - initialized when used |
441 |
!!!cp (19); |
!!!cp (19); |
442 |
$self->{ct} |
$self->{ct} |
443 |
= {type => START_TAG_TOKEN, |
= {type => START_TAG_TOKEN, |
444 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
445 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
446 |
column => $self->{column_prev}}; |
column => $self->{column_prev}}; |
447 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
530 |
!!!cp (29); |
!!!cp (29); |
531 |
$self->{ct} |
$self->{ct} |
532 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
533 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
534 |
line => $l, column => $c}; |
line => $l, column => $c}; |
535 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
536 |
!!!next-input-character; |
!!!next-input-character; |
661 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
662 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
663 |
!!!cp (38); |
!!!cp (38); |
664 |
$self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020); |
$self->{ct}->{tag_name} |
665 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
666 |
# start tag or end tag |
# start tag or end tag |
667 |
## Stay in this state |
## Stay in this state |
668 |
!!!next-input-character; |
!!!next-input-character; |
734 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
735 |
!!!cp (49); |
!!!cp (49); |
736 |
$self->{ca} |
$self->{ca} |
737 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
738 |
value => '', |
value => '', |
739 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
740 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
835 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
836 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
837 |
!!!cp (63); |
!!!cp (63); |
838 |
$self->{ca}->{name} .= chr ($self->{nc} + 0x0020); |
$self->{ca}->{name} |
839 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
840 |
## Stay in the state |
## Stay in the state |
841 |
!!!next-input-character; |
!!!next-input-character; |
842 |
redo A; |
redo A; |
920 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
921 |
!!!cp (76); |
!!!cp (76); |
922 |
$self->{ca} |
$self->{ca} |
923 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
924 |
value => '', |
value => '', |
925 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
926 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
1393 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{s_kwd} = chr $self->{nc}; |
1394 |
!!!next-input-character; |
!!!next-input-character; |
1395 |
redo A; |
redo A; |
1396 |
} elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
} elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
1397 |
$self->{open_elements}->[-1]->[1] & FOREIGN_EL and |
$self->{open_elements}->[-1]->[1] & FOREIGN_EL) or |
1398 |
|
$self->{is_xml}) and |
1399 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
1400 |
!!!cp (135.4); |
!!!cp (135.4); |
1401 |
$self->{state} = MD_CDATA_STATE; |
$self->{state} = MD_CDATA_STATE; |