2103 |
sub _tree_construction_main ($) { |
sub _tree_construction_main ($) { |
2104 |
my $self = shift; |
my $self = shift; |
2105 |
|
|
|
my $previous_insertion_mode; |
|
|
|
|
2106 |
my $active_formatting_elements = []; |
my $active_formatting_elements = []; |
2107 |
|
|
2108 |
my $reconstruct_active_formatting_elements = sub { # MUST |
my $reconstruct_active_formatting_elements = sub { # MUST |
3348 |
!!!next-token; |
!!!next-token; |
3349 |
redo B; |
redo B; |
3350 |
} elsif ($token->{type} eq 'end-of-file') { |
} elsif ($token->{type} eq 'end-of-file') { |
3351 |
if ($token->{insertion_mode} ne 'trailing end') { |
if ($self->{insertion_mode} eq 'after html body' or |
3352 |
|
$self->{insertion_mode} eq 'after html frameset') { |
3353 |
|
# |
3354 |
|
} else { |
3355 |
## Generate implied end tags |
## Generate implied end tags |
3356 |
if ({ |
if ({ |
3357 |
dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1, |
dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1, |
3378 |
last B; |
last B; |
3379 |
} elsif ($token->{type} eq 'start tag' and |
} elsif ($token->{type} eq 'start tag' and |
3380 |
$token->{tag_name} eq 'html') { |
$token->{tag_name} eq 'html') { |
3381 |
if ($self->{insertion_mode} eq 'trailing end') { |
if ($self->{insertion_mode} eq 'after html body') { |
3382 |
|
## Turn into the main phase |
3383 |
|
!!!parse-error (type => 'after html:html'); |
3384 |
|
$self->{insertion_mode} = 'after body'; |
3385 |
|
} elsif ($self->{insertion_mode} eq 'after html frameset') { |
3386 |
## Turn into the main phase |
## Turn into the main phase |
3387 |
!!!parse-error (type => 'after html:html'); |
!!!parse-error (type => 'after html:html'); |
3388 |
$self->{insertion_mode} = $previous_insertion_mode; |
$self->{insertion_mode} = 'after frameset'; |
3389 |
} |
} |
3390 |
|
|
3391 |
## ISSUE: "aa<html>" is not a parse error. |
## ISSUE: "aa<html>" is not a parse error. |
3405 |
redo B; |
redo B; |
3406 |
} elsif ($token->{type} eq 'comment') { |
} elsif ($token->{type} eq 'comment') { |
3407 |
my $comment = $self->{document}->create_comment ($token->{data}); |
my $comment = $self->{document}->create_comment ($token->{data}); |
3408 |
if ($self->{insertion_mode} eq 'trailing end') { |
if ($self->{insertion_mode} eq 'after html body' or |
3409 |
|
$self->{insertion_mode} eq 'after html frameset') { |
3410 |
$self->{document}->append_child ($comment); |
$self->{document}->append_child ($comment); |
3411 |
} elsif ($self->{insertion_mode} eq 'after body') { |
} elsif ($self->{insertion_mode} eq 'after body') { |
3412 |
$self->{open_elements}->[0]->[0]->append_child ($comment); |
$self->{open_elements}->[0]->[0]->append_child ($comment); |
4112 |
|
|
4113 |
$in_body->($insert_to_current); |
$in_body->($insert_to_current); |
4114 |
redo B; |
redo B; |
4115 |
} elsif ($self->{insertion_mode} eq 'in row' or |
} elsif ($self->{insertion_mode} eq 'in row' or |
4116 |
$self->{insertion_mode} eq 'in table body' or |
$self->{insertion_mode} eq 'in table body' or |
4117 |
$self->{insertion_mode} eq 'in table') { |
$self->{insertion_mode} eq 'in table') { |
4118 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
4119 |
## NOTE: There are "character in table" code clones. |
## NOTE: There are "character in table" code clones. |
4120 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
4678 |
!!!parse-error (type => 'in table:'.$token->{tag_name}); |
!!!parse-error (type => 'in table:'.$token->{tag_name}); |
4679 |
$in_body->($insert_to_foster); |
$in_body->($insert_to_foster); |
4680 |
redo B; |
redo B; |
4681 |
} elsif ($self->{insertion_mode} eq 'in column group') { |
} elsif ($self->{insertion_mode} eq 'in column group') { |
4682 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
4683 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
4684 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
4735 |
## reprocess |
## reprocess |
4736 |
redo B; |
redo B; |
4737 |
} |
} |
4738 |
} elsif ($self->{insertion_mode} eq 'in select') { |
} elsif ($self->{insertion_mode} eq 'in select') { |
4739 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
4740 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data}); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data}); |
4741 |
!!!next-token; |
!!!next-token; |
4909 |
## Ignore the token |
## Ignore the token |
4910 |
!!!next-token; |
!!!next-token; |
4911 |
redo B; |
redo B; |
4912 |
} elsif ($self->{insertion_mode} eq 'after body') { |
} elsif ($self->{insertion_mode} eq 'after body' or |
4913 |
if ($token->{type} eq 'character') { |
$self->{insertion_mode} eq 'after html body') { |
4914 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{type} eq 'character') { |
4915 |
my $data = $1; |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
4916 |
## As if in body |
my $data = $1; |
4917 |
$reconstruct_active_formatting_elements->($insert_to_current); |
## As if in body |
4918 |
|
$reconstruct_active_formatting_elements->($insert_to_current); |
4919 |
|
|
4920 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
4921 |
|
|
4922 |
|
unless (length $token->{data}) { |
4923 |
|
!!!next-token; |
4924 |
|
redo B; |
4925 |
|
} |
4926 |
|
} |
4927 |
|
|
4928 |
|
if ($self->{insertion_mode} eq 'after html body') { |
4929 |
|
!!!parse-error (type => 'after html:#character'); |
4930 |
|
|
4931 |
unless (length $token->{data}) { |
## Reprocess in the "main" phase, "after body" insertion mode... |
4932 |
!!!next-token; |
} |
4933 |
redo B; |
|
4934 |
} |
## "after body" insertion mode |
4935 |
} |
!!!parse-error (type => 'after body:#character'); |
4936 |
|
|
4937 |
# |
$self->{insertion_mode} = 'in body'; |
4938 |
!!!parse-error (type => 'after body:#character'); |
## reprocess |
4939 |
} elsif ($token->{type} eq 'start tag') { |
redo B; |
4940 |
!!!parse-error (type => 'after body:'.$token->{tag_name}); |
} elsif ($token->{type} eq 'start tag') { |
4941 |
# |
if ($self->{insertion_mode} eq 'after html body') { |
4942 |
} elsif ($token->{type} eq 'end tag') { |
!!!parse-error (type => 'after html:'.$token->{tag_name}); |
4943 |
if ($token->{tag_name} eq 'html') { |
|
4944 |
if (defined $self->{inner_html_node}) { |
## Reprocess in the "main" phase, "after body" insertion mode... |
4945 |
!!!parse-error (type => 'unmatched end tag:html'); |
} |
4946 |
## Ignore the token |
|
4947 |
!!!next-token; |
## "after body" insertion mode |
4948 |
redo B; |
!!!parse-error (type => 'after body:'.$token->{tag_name}); |
4949 |
} else { |
|
4950 |
$previous_insertion_mode = $self->{insertion_mode}; |
$self->{insertion_mode} = 'in body'; |
4951 |
$self->{insertion_mode} = 'trailing end'; |
## reprocess |
4952 |
!!!next-token; |
redo B; |
4953 |
redo B; |
} elsif ($token->{type} eq 'end tag') { |
4954 |
} |
if ($self->{insertion_mode} eq 'after html body') { |
4955 |
} else { |
!!!parse-error (type => 'after html:/'.$token->{tag_name}); |
4956 |
!!!parse-error (type => 'after body:/'.$token->{tag_name}); |
|
4957 |
} |
$self->{insertion_mode} = 'after body'; |
4958 |
|
## Reprocess in the "main" phase, "after body" insertion mode... |
4959 |
|
} |
4960 |
|
|
4961 |
|
## "after body" insertion mode |
4962 |
|
if ($token->{tag_name} eq 'html') { |
4963 |
|
if (defined $self->{inner_html_node}) { |
4964 |
|
!!!parse-error (type => 'unmatched end tag:html'); |
4965 |
|
## Ignore the token |
4966 |
|
!!!next-token; |
4967 |
|
redo B; |
4968 |
} else { |
} else { |
4969 |
die "$0: $token->{type}: Unknown token type"; |
$self->{insertion_mode} = 'after html body'; |
4970 |
|
!!!next-token; |
4971 |
|
redo B; |
4972 |
} |
} |
4973 |
|
} else { |
4974 |
|
!!!parse-error (type => 'after body:/'.$token->{tag_name}); |
4975 |
|
|
4976 |
$self->{insertion_mode} = 'in body'; |
$self->{insertion_mode} = 'in body'; |
4977 |
## reprocess |
## reprocess |
4978 |
redo B; |
redo B; |
4979 |
} elsif ($self->{insertion_mode} eq 'in frameset') { |
} |
4980 |
|
} else { |
4981 |
|
die "$0: $token->{type}: Unknown token type"; |
4982 |
|
} |
4983 |
|
} elsif ($self->{insertion_mode} eq 'in frameset' or |
4984 |
|
$self->{insertion_mode} eq 'after frameset' or |
4985 |
|
$self->{insertion_mode} eq 'after html frameset') { |
4986 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
4987 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
4988 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
4989 |
|
|
4990 |
unless (length $token->{data}) { |
unless (length $token->{data}) { |
4991 |
!!!next-token; |
!!!next-token; |
4992 |
redo B; |
redo B; |
4993 |
} |
} |
4994 |
} |
} |
4995 |
|
|
4996 |
|
if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) { |
4997 |
|
if ($self->{insertion_mode} eq 'in frameset') { |
4998 |
|
!!!parse-error (type => 'in frameset:#character'); |
4999 |
|
} elsif ($self->{insertion_mode} eq 'after frameset') { |
5000 |
|
!!!parse-error (type => 'after frameset:#character'); |
5001 |
|
} else { # "after html frameset" |
5002 |
|
!!!parse-error (type => 'after html:#character'); |
5003 |
|
|
5004 |
!!!parse-error (type => 'in frameset:#character'); |
$self->{insertion_mode} = 'after frameset'; |
5005 |
## Ignore the token |
## Reprocess in the "main" phase, "after frameset"... |
5006 |
!!!next-token; |
!!!parse-error (type => 'after frameset:#character'); |
5007 |
redo B; |
} |
5008 |
|
|
5009 |
|
## Ignore the token. |
5010 |
|
if (length $token->{data}) { |
5011 |
|
## reprocess the rest of characters |
5012 |
|
} else { |
5013 |
|
!!!next-token; |
5014 |
|
} |
5015 |
|
redo B; |
5016 |
|
} |
5017 |
|
|
5018 |
|
die qq[$0: Character "$token->{data}"]; |
5019 |
} elsif ($token->{type} eq 'start tag') { |
} elsif ($token->{type} eq 'start tag') { |
5020 |
if ($token->{tag_name} eq 'frameset') { |
if ($self->{insertion_mode} eq 'after html frameset') { |
5021 |
|
!!!parse-error (type => 'after html:'.$token->{tag_name}); |
5022 |
|
|
5023 |
|
$self->{insertion_mode} = 'after frameset'; |
5024 |
|
## Process in the "main" phase, "after frameset" insertion mode... |
5025 |
|
} |
5026 |
|
|
5027 |
|
if ($token->{tag_name} eq 'frameset' and |
5028 |
|
$self->{insertion_mode} eq 'in frameset') { |
5029 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
5030 |
!!!next-token; |
!!!next-token; |
5031 |
redo B; |
redo B; |
5032 |
} elsif ($token->{tag_name} eq 'frame') { |
} elsif ($token->{tag_name} eq 'frame' and |
5033 |
|
$self->{insertion_mode} eq 'in frameset') { |
5034 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
5035 |
pop @{$self->{open_elements}}; |
pop @{$self->{open_elements}}; |
5036 |
!!!next-token; |
!!!next-token; |
5040 |
$parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current); |
$parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current); |
5041 |
redo B; |
redo B; |
5042 |
} else { |
} else { |
5043 |
!!!parse-error (type => 'in frameset:'.$token->{tag_name}); |
if ($self->{insertion_mode} eq 'in frameset') { |
5044 |
|
!!!parse-error (type => 'in frameset:'.$token->{tag_name}); |
5045 |
|
} else { |
5046 |
|
!!!parse-error (type => 'after frameset:'.$token->{tag_name}); |
5047 |
|
} |
5048 |
## Ignore the token |
## Ignore the token |
5049 |
!!!next-token; |
!!!next-token; |
5050 |
redo B; |
redo B; |
5051 |
} |
} |
5052 |
} elsif ($token->{type} eq 'end tag') { |
} elsif ($token->{type} eq 'end tag') { |
5053 |
if ($token->{tag_name} eq 'frameset') { |
if ($self->{insertion_mode} eq 'after html frameset') { |
5054 |
|
!!!parse-error (type => 'after html:/'.$token->{tag_name}); |
5055 |
|
|
5056 |
|
$self->{insertion_mode} = 'after frameset'; |
5057 |
|
## Process in the "main" phase, "after frameset" insertion mode... |
5058 |
|
} |
5059 |
|
|
5060 |
|
if ($token->{tag_name} eq 'frameset' and |
5061 |
|
$self->{insertion_mode} eq 'in frameset') { |
5062 |
if ($self->{open_elements}->[-1]->[1] eq 'html' and |
if ($self->{open_elements}->[-1]->[1] eq 'html' and |
5063 |
@{$self->{open_elements}} == 1) { |
@{$self->{open_elements}} == 1) { |
5064 |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}); |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}); |
5074 |
$self->{insertion_mode} = 'after frameset'; |
$self->{insertion_mode} = 'after frameset'; |
5075 |
} |
} |
5076 |
redo B; |
redo B; |
5077 |
} else { |
} elsif ($token->{tag_name} eq 'html' and |
5078 |
!!!parse-error (type => 'in frameset:/'.$token->{tag_name}); |
$self->{insertion_mode} eq 'after frameset') { |
5079 |
## Ignore the token |
$self->{insertion_mode} = 'after html frameset'; |
5080 |
!!!next-token; |
!!!next-token; |
5081 |
redo B; |
redo B; |
|
} |
|
|
} else { |
|
|
die "$0: $token->{type}: Unknown token type"; |
|
|
} |
|
|
} elsif ($self->{insertion_mode} eq 'after frameset') { |
|
|
if ($token->{type} eq 'character') { |
|
|
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
|
|
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
|
|
|
|
|
unless (length $token->{data}) { |
|
|
!!!next-token; |
|
|
redo B; |
|
|
} |
|
|
} |
|
|
|
|
|
if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) { |
|
|
!!!parse-error (type => 'after frameset:#character'); |
|
|
|
|
|
## Ignore the token. |
|
|
if (length $token->{data}) { |
|
|
## reprocess the rest of characters |
|
|
} else { |
|
|
!!!next-token; |
|
|
} |
|
|
redo B; |
|
|
} |
|
|
|
|
|
die qq[$0: Character "$token->{data}"]; |
|
|
} elsif ($token->{type} eq 'start tag') { |
|
|
if ($token->{tag_name} eq 'noframes') { |
|
|
## NOTE: As if in body. |
|
|
$parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current); |
|
|
redo B; |
|
5082 |
} else { |
} else { |
5083 |
!!!parse-error (type => 'after frameset:'.$token->{tag_name}); |
if ($self->{insertion_mode} eq 'in frameset') { |
5084 |
## Ignore the token |
!!!parse-error (type => 'in frameset:/'.$token->{tag_name}); |
5085 |
!!!next-token; |
} else { |
5086 |
redo B; |
!!!parse-error (type => 'after frameset:/'.$token->{tag_name}); |
5087 |
} |
} |
|
} elsif ($token->{type} eq 'end tag') { |
|
|
if ($token->{tag_name} eq 'html') { |
|
|
$previous_insertion_mode = $self->{insertion_mode}; |
|
|
$self->{insertion_mode} = 'trailing end'; |
|
|
!!!next-token; |
|
|
redo B; |
|
|
} else { |
|
|
!!!parse-error (type => 'after frameset:/'.$token->{tag_name}); |
|
5088 |
## Ignore the token |
## Ignore the token |
5089 |
!!!next-token; |
!!!next-token; |
5090 |
redo B; |
redo B; |
5094 |
} |
} |
5095 |
|
|
5096 |
## ISSUE: An issue in spec here |
## ISSUE: An issue in spec here |
|
} elsif ($self->{insertion_mode} eq 'trailing end') { |
|
|
## states in the main stage is preserved yet # MUST |
|
|
|
|
|
if ($token->{type} eq 'character') { |
|
|
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
|
|
my $data = $1; |
|
|
## As if in the main phase. |
|
|
## NOTE: The insertion mode in the main phase |
|
|
## just before the phase has been changed to the trailing |
|
|
## end phase is either "after body" or "after frameset". |
|
|
$reconstruct_active_formatting_elements->($insert_to_current); |
|
|
|
|
|
$self->{open_elements}->[-1]->[0]->manakai_append_text ($data); |
|
|
|
|
|
unless (length $token->{data}) { |
|
|
!!!next-token; |
|
|
redo B; |
|
|
} |
|
|
} |
|
|
|
|
|
!!!parse-error (type => 'after html:#character'); |
|
|
$self->{insertion_mode} = $previous_insertion_mode; |
|
|
## reprocess |
|
|
redo B; |
|
|
} elsif ($token->{type} eq 'start tag') { |
|
|
!!!parse-error (type => 'after html:'.$token->{tag_name}); |
|
|
$self->{insertion_mode} = $previous_insertion_mode; |
|
|
## reprocess |
|
|
redo B; |
|
|
} elsif ($token->{type} eq 'end tag') { |
|
|
!!!parse-error (type => 'after html:/'.$token->{tag_name}); |
|
|
$self->{insertion_mode} = $previous_insertion_mode; |
|
|
## reprocess |
|
|
redo B; |
|
|
} else { |
|
|
die "$0: $token->{type}: Unknown token"; |
|
|
} |
|
5097 |
} else { |
} else { |
5098 |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
5099 |
} |
} |
5100 |
} # B |
} # B |
5101 |
|
|
5102 |
|
## NOTE: The "trailing end" phase in HTML5 is split into |
5103 |
|
## two insertion modes: "after html body" and "after html frameset". |
5104 |
|
## NOTE: States in the main stage is preserved while |
5105 |
|
## the parser stays in the trailing end phase. # MUST |
5106 |
|
|
5107 |
## Stop parsing # MUST |
## Stop parsing # MUST |
5108 |
|
|
5109 |
## TODO: script stuffs |
## TODO: script stuffs |