| 2103 |
sub _tree_construction_main ($) { |
sub _tree_construction_main ($) { |
| 2104 |
my $self = shift; |
my $self = shift; |
| 2105 |
|
|
|
my $previous_insertion_mode; |
|
|
|
|
| 2106 |
my $active_formatting_elements = []; |
my $active_formatting_elements = []; |
| 2107 |
|
|
| 2108 |
my $reconstruct_active_formatting_elements = sub { # MUST |
my $reconstruct_active_formatting_elements = sub { # MUST |
| 3348 |
!!!next-token; |
!!!next-token; |
| 3349 |
redo B; |
redo B; |
| 3350 |
} elsif ($token->{type} eq 'end-of-file') { |
} elsif ($token->{type} eq 'end-of-file') { |
| 3351 |
if ($token->{insertion_mode} ne 'trailing end') { |
if ($self->{insertion_mode} eq 'after html body' or |
| 3352 |
|
$self->{insertion_mode} eq 'after html frameset') { |
| 3353 |
|
# |
| 3354 |
|
} else { |
| 3355 |
## Generate implied end tags |
## Generate implied end tags |
| 3356 |
if ({ |
if ({ |
| 3357 |
dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1, |
dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1, |
| 3378 |
last B; |
last B; |
| 3379 |
} elsif ($token->{type} eq 'start tag' and |
} elsif ($token->{type} eq 'start tag' and |
| 3380 |
$token->{tag_name} eq 'html') { |
$token->{tag_name} eq 'html') { |
| 3381 |
if ($self->{insertion_mode} eq 'trailing end') { |
if ($self->{insertion_mode} eq 'after html body') { |
| 3382 |
|
## Turn into the main phase |
| 3383 |
|
!!!parse-error (type => 'after html:html'); |
| 3384 |
|
$self->{insertion_mode} = 'after body'; |
| 3385 |
|
} elsif ($self->{insertion_mode} eq 'after html frameset') { |
| 3386 |
## Turn into the main phase |
## Turn into the main phase |
| 3387 |
!!!parse-error (type => 'after html:html'); |
!!!parse-error (type => 'after html:html'); |
| 3388 |
$self->{insertion_mode} = $previous_insertion_mode; |
$self->{insertion_mode} = 'after frameset'; |
| 3389 |
} |
} |
| 3390 |
|
|
| 3391 |
## ISSUE: "aa<html>" is not a parse error. |
## ISSUE: "aa<html>" is not a parse error. |
| 3405 |
redo B; |
redo B; |
| 3406 |
} elsif ($token->{type} eq 'comment') { |
} elsif ($token->{type} eq 'comment') { |
| 3407 |
my $comment = $self->{document}->create_comment ($token->{data}); |
my $comment = $self->{document}->create_comment ($token->{data}); |
| 3408 |
if ($self->{insertion_mode} eq 'trailing end') { |
if ($self->{insertion_mode} eq 'after html body' or |
| 3409 |
|
$self->{insertion_mode} eq 'after html frameset') { |
| 3410 |
$self->{document}->append_child ($comment); |
$self->{document}->append_child ($comment); |
| 3411 |
} elsif ($self->{insertion_mode} eq 'after body') { |
} elsif ($self->{insertion_mode} eq 'after body') { |
| 3412 |
$self->{open_elements}->[0]->[0]->append_child ($comment); |
$self->{open_elements}->[0]->[0]->append_child ($comment); |
| 4112 |
|
|
| 4113 |
$in_body->($insert_to_current); |
$in_body->($insert_to_current); |
| 4114 |
redo B; |
redo B; |
| 4115 |
} elsif ($self->{insertion_mode} eq 'in row' or |
} elsif ($self->{insertion_mode} eq 'in row' or |
| 4116 |
$self->{insertion_mode} eq 'in table body' or |
$self->{insertion_mode} eq 'in table body' or |
| 4117 |
$self->{insertion_mode} eq 'in table') { |
$self->{insertion_mode} eq 'in table') { |
| 4118 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
| 4119 |
## NOTE: There are "character in table" code clones. |
## NOTE: There are "character in table" code clones. |
| 4120 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
| 4678 |
!!!parse-error (type => 'in table:'.$token->{tag_name}); |
!!!parse-error (type => 'in table:'.$token->{tag_name}); |
| 4679 |
$in_body->($insert_to_foster); |
$in_body->($insert_to_foster); |
| 4680 |
redo B; |
redo B; |
| 4681 |
} elsif ($self->{insertion_mode} eq 'in column group') { |
} elsif ($self->{insertion_mode} eq 'in column group') { |
| 4682 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
| 4683 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
| 4684 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
| 4735 |
## reprocess |
## reprocess |
| 4736 |
redo B; |
redo B; |
| 4737 |
} |
} |
| 4738 |
} elsif ($self->{insertion_mode} eq 'in select') { |
} elsif ($self->{insertion_mode} eq 'in select') { |
| 4739 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
| 4740 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data}); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data}); |
| 4741 |
!!!next-token; |
!!!next-token; |
| 4909 |
## Ignore the token |
## Ignore the token |
| 4910 |
!!!next-token; |
!!!next-token; |
| 4911 |
redo B; |
redo B; |
| 4912 |
} elsif ($self->{insertion_mode} eq 'after body') { |
} elsif ($self->{insertion_mode} eq 'after body' or |
| 4913 |
if ($token->{type} eq 'character') { |
$self->{insertion_mode} eq 'after html body') { |
| 4914 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{type} eq 'character') { |
| 4915 |
my $data = $1; |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
| 4916 |
## As if in body |
my $data = $1; |
| 4917 |
$reconstruct_active_formatting_elements->($insert_to_current); |
## As if in body |
| 4918 |
|
$reconstruct_active_formatting_elements->($insert_to_current); |
| 4919 |
|
|
| 4920 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
| 4921 |
|
|
| 4922 |
|
unless (length $token->{data}) { |
| 4923 |
|
!!!next-token; |
| 4924 |
|
redo B; |
| 4925 |
|
} |
| 4926 |
|
} |
| 4927 |
|
|
| 4928 |
|
if ($self->{insertion_mode} eq 'after html body') { |
| 4929 |
|
!!!parse-error (type => 'after html:#character'); |
| 4930 |
|
|
| 4931 |
unless (length $token->{data}) { |
## Reprocess in the "main" phase, "after body" insertion mode... |
| 4932 |
!!!next-token; |
} |
| 4933 |
redo B; |
|
| 4934 |
} |
## "after body" insertion mode |
| 4935 |
} |
!!!parse-error (type => 'after body:#character'); |
| 4936 |
|
|
| 4937 |
# |
$self->{insertion_mode} = 'in body'; |
| 4938 |
!!!parse-error (type => 'after body:#character'); |
## reprocess |
| 4939 |
} elsif ($token->{type} eq 'start tag') { |
redo B; |
| 4940 |
!!!parse-error (type => 'after body:'.$token->{tag_name}); |
} elsif ($token->{type} eq 'start tag') { |
| 4941 |
# |
if ($self->{insertion_mode} eq 'after html body') { |
| 4942 |
} elsif ($token->{type} eq 'end tag') { |
!!!parse-error (type => 'after html:'.$token->{tag_name}); |
| 4943 |
if ($token->{tag_name} eq 'html') { |
|
| 4944 |
if (defined $self->{inner_html_node}) { |
## Reprocess in the "main" phase, "after body" insertion mode... |
| 4945 |
!!!parse-error (type => 'unmatched end tag:html'); |
} |
| 4946 |
## Ignore the token |
|
| 4947 |
!!!next-token; |
## "after body" insertion mode |
| 4948 |
redo B; |
!!!parse-error (type => 'after body:'.$token->{tag_name}); |
| 4949 |
} else { |
|
| 4950 |
$previous_insertion_mode = $self->{insertion_mode}; |
$self->{insertion_mode} = 'in body'; |
| 4951 |
$self->{insertion_mode} = 'trailing end'; |
## reprocess |
| 4952 |
!!!next-token; |
redo B; |
| 4953 |
redo B; |
} elsif ($token->{type} eq 'end tag') { |
| 4954 |
} |
if ($self->{insertion_mode} eq 'after html body') { |
| 4955 |
} else { |
!!!parse-error (type => 'after html:/'.$token->{tag_name}); |
| 4956 |
!!!parse-error (type => 'after body:/'.$token->{tag_name}); |
|
| 4957 |
} |
$self->{insertion_mode} = 'after body'; |
| 4958 |
|
## Reprocess in the "main" phase, "after body" insertion mode... |
| 4959 |
|
} |
| 4960 |
|
|
| 4961 |
|
## "after body" insertion mode |
| 4962 |
|
if ($token->{tag_name} eq 'html') { |
| 4963 |
|
if (defined $self->{inner_html_node}) { |
| 4964 |
|
!!!parse-error (type => 'unmatched end tag:html'); |
| 4965 |
|
## Ignore the token |
| 4966 |
|
!!!next-token; |
| 4967 |
|
redo B; |
| 4968 |
} else { |
} else { |
| 4969 |
die "$0: $token->{type}: Unknown token type"; |
$self->{insertion_mode} = 'after html body'; |
| 4970 |
|
!!!next-token; |
| 4971 |
|
redo B; |
| 4972 |
} |
} |
| 4973 |
|
} else { |
| 4974 |
|
!!!parse-error (type => 'after body:/'.$token->{tag_name}); |
| 4975 |
|
|
| 4976 |
$self->{insertion_mode} = 'in body'; |
$self->{insertion_mode} = 'in body'; |
| 4977 |
## reprocess |
## reprocess |
| 4978 |
redo B; |
redo B; |
| 4979 |
} elsif ($self->{insertion_mode} eq 'in frameset') { |
} |
| 4980 |
|
} else { |
| 4981 |
|
die "$0: $token->{type}: Unknown token type"; |
| 4982 |
|
} |
| 4983 |
|
} elsif ($self->{insertion_mode} eq 'in frameset' or |
| 4984 |
|
$self->{insertion_mode} eq 'after frameset' or |
| 4985 |
|
$self->{insertion_mode} eq 'after html frameset') { |
| 4986 |
if ($token->{type} eq 'character') { |
if ($token->{type} eq 'character') { |
| 4987 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
| 4988 |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
| 4989 |
|
|
| 4990 |
unless (length $token->{data}) { |
unless (length $token->{data}) { |
| 4991 |
!!!next-token; |
!!!next-token; |
| 4992 |
redo B; |
redo B; |
| 4993 |
} |
} |
| 4994 |
} |
} |
| 4995 |
|
|
| 4996 |
|
if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) { |
| 4997 |
|
if ($self->{insertion_mode} eq 'in frameset') { |
| 4998 |
|
!!!parse-error (type => 'in frameset:#character'); |
| 4999 |
|
} elsif ($self->{insertion_mode} eq 'after frameset') { |
| 5000 |
|
!!!parse-error (type => 'after frameset:#character'); |
| 5001 |
|
} else { # "after html frameset" |
| 5002 |
|
!!!parse-error (type => 'after html:#character'); |
| 5003 |
|
|
| 5004 |
!!!parse-error (type => 'in frameset:#character'); |
$self->{insertion_mode} = 'after frameset'; |
| 5005 |
## Ignore the token |
## Reprocess in the "main" phase, "after frameset"... |
| 5006 |
!!!next-token; |
!!!parse-error (type => 'after frameset:#character'); |
| 5007 |
redo B; |
} |
| 5008 |
|
|
| 5009 |
|
## Ignore the token. |
| 5010 |
|
if (length $token->{data}) { |
| 5011 |
|
## reprocess the rest of characters |
| 5012 |
|
} else { |
| 5013 |
|
!!!next-token; |
| 5014 |
|
} |
| 5015 |
|
redo B; |
| 5016 |
|
} |
| 5017 |
|
|
| 5018 |
|
die qq[$0: Character "$token->{data}"]; |
| 5019 |
} elsif ($token->{type} eq 'start tag') { |
} elsif ($token->{type} eq 'start tag') { |
| 5020 |
if ($token->{tag_name} eq 'frameset') { |
if ($self->{insertion_mode} eq 'after html frameset') { |
| 5021 |
|
!!!parse-error (type => 'after html:'.$token->{tag_name}); |
| 5022 |
|
|
| 5023 |
|
$self->{insertion_mode} = 'after frameset'; |
| 5024 |
|
## Process in the "main" phase, "after frameset" insertion mode... |
| 5025 |
|
} |
| 5026 |
|
|
| 5027 |
|
if ($token->{tag_name} eq 'frameset' and |
| 5028 |
|
$self->{insertion_mode} eq 'in frameset') { |
| 5029 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
| 5030 |
!!!next-token; |
!!!next-token; |
| 5031 |
redo B; |
redo B; |
| 5032 |
} elsif ($token->{tag_name} eq 'frame') { |
} elsif ($token->{tag_name} eq 'frame' and |
| 5033 |
|
$self->{insertion_mode} eq 'in frameset') { |
| 5034 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
| 5035 |
pop @{$self->{open_elements}}; |
pop @{$self->{open_elements}}; |
| 5036 |
!!!next-token; |
!!!next-token; |
| 5040 |
$parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current); |
$parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current); |
| 5041 |
redo B; |
redo B; |
| 5042 |
} else { |
} else { |
| 5043 |
!!!parse-error (type => 'in frameset:'.$token->{tag_name}); |
if ($self->{insertion_mode} eq 'in frameset') { |
| 5044 |
|
!!!parse-error (type => 'in frameset:'.$token->{tag_name}); |
| 5045 |
|
} else { |
| 5046 |
|
!!!parse-error (type => 'after frameset:'.$token->{tag_name}); |
| 5047 |
|
} |
| 5048 |
## Ignore the token |
## Ignore the token |
| 5049 |
!!!next-token; |
!!!next-token; |
| 5050 |
redo B; |
redo B; |
| 5051 |
} |
} |
| 5052 |
} elsif ($token->{type} eq 'end tag') { |
} elsif ($token->{type} eq 'end tag') { |
| 5053 |
if ($token->{tag_name} eq 'frameset') { |
if ($self->{insertion_mode} eq 'after html frameset') { |
| 5054 |
|
!!!parse-error (type => 'after html:/'.$token->{tag_name}); |
| 5055 |
|
|
| 5056 |
|
$self->{insertion_mode} = 'after frameset'; |
| 5057 |
|
## Process in the "main" phase, "after frameset" insertion mode... |
| 5058 |
|
} |
| 5059 |
|
|
| 5060 |
|
if ($token->{tag_name} eq 'frameset' and |
| 5061 |
|
$self->{insertion_mode} eq 'in frameset') { |
| 5062 |
if ($self->{open_elements}->[-1]->[1] eq 'html' and |
if ($self->{open_elements}->[-1]->[1] eq 'html' and |
| 5063 |
@{$self->{open_elements}} == 1) { |
@{$self->{open_elements}} == 1) { |
| 5064 |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}); |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}); |
| 5074 |
$self->{insertion_mode} = 'after frameset'; |
$self->{insertion_mode} = 'after frameset'; |
| 5075 |
} |
} |
| 5076 |
redo B; |
redo B; |
| 5077 |
} else { |
} elsif ($token->{tag_name} eq 'html' and |
| 5078 |
!!!parse-error (type => 'in frameset:/'.$token->{tag_name}); |
$self->{insertion_mode} eq 'after frameset') { |
| 5079 |
## Ignore the token |
$self->{insertion_mode} = 'after html frameset'; |
| 5080 |
!!!next-token; |
!!!next-token; |
| 5081 |
redo B; |
redo B; |
|
} |
|
|
} else { |
|
|
die "$0: $token->{type}: Unknown token type"; |
|
|
} |
|
|
} elsif ($self->{insertion_mode} eq 'after frameset') { |
|
|
if ($token->{type} eq 'character') { |
|
|
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
|
|
$self->{open_elements}->[-1]->[0]->manakai_append_text ($1); |
|
|
|
|
|
unless (length $token->{data}) { |
|
|
!!!next-token; |
|
|
redo B; |
|
|
} |
|
|
} |
|
|
|
|
|
if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) { |
|
|
!!!parse-error (type => 'after frameset:#character'); |
|
|
|
|
|
## Ignore the token. |
|
|
if (length $token->{data}) { |
|
|
## reprocess the rest of characters |
|
|
} else { |
|
|
!!!next-token; |
|
|
} |
|
|
redo B; |
|
|
} |
|
|
|
|
|
die qq[$0: Character "$token->{data}"]; |
|
|
} elsif ($token->{type} eq 'start tag') { |
|
|
if ($token->{tag_name} eq 'noframes') { |
|
|
## NOTE: As if in body. |
|
|
$parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current); |
|
|
redo B; |
|
| 5082 |
} else { |
} else { |
| 5083 |
!!!parse-error (type => 'after frameset:'.$token->{tag_name}); |
if ($self->{insertion_mode} eq 'in frameset') { |
| 5084 |
## Ignore the token |
!!!parse-error (type => 'in frameset:/'.$token->{tag_name}); |
| 5085 |
!!!next-token; |
} else { |
| 5086 |
redo B; |
!!!parse-error (type => 'after frameset:/'.$token->{tag_name}); |
| 5087 |
} |
} |
|
} elsif ($token->{type} eq 'end tag') { |
|
|
if ($token->{tag_name} eq 'html') { |
|
|
$previous_insertion_mode = $self->{insertion_mode}; |
|
|
$self->{insertion_mode} = 'trailing end'; |
|
|
!!!next-token; |
|
|
redo B; |
|
|
} else { |
|
|
!!!parse-error (type => 'after frameset:/'.$token->{tag_name}); |
|
| 5088 |
## Ignore the token |
## Ignore the token |
| 5089 |
!!!next-token; |
!!!next-token; |
| 5090 |
redo B; |
redo B; |
| 5094 |
} |
} |
| 5095 |
|
|
| 5096 |
## ISSUE: An issue in spec here |
## ISSUE: An issue in spec here |
|
} elsif ($self->{insertion_mode} eq 'trailing end') { |
|
|
## states in the main stage is preserved yet # MUST |
|
|
|
|
|
if ($token->{type} eq 'character') { |
|
|
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
|
|
my $data = $1; |
|
|
## As if in the main phase. |
|
|
## NOTE: The insertion mode in the main phase |
|
|
## just before the phase has been changed to the trailing |
|
|
## end phase is either "after body" or "after frameset". |
|
|
$reconstruct_active_formatting_elements->($insert_to_current); |
|
|
|
|
|
$self->{open_elements}->[-1]->[0]->manakai_append_text ($data); |
|
|
|
|
|
unless (length $token->{data}) { |
|
|
!!!next-token; |
|
|
redo B; |
|
|
} |
|
|
} |
|
|
|
|
|
!!!parse-error (type => 'after html:#character'); |
|
|
$self->{insertion_mode} = $previous_insertion_mode; |
|
|
## reprocess |
|
|
redo B; |
|
|
} elsif ($token->{type} eq 'start tag') { |
|
|
!!!parse-error (type => 'after html:'.$token->{tag_name}); |
|
|
$self->{insertion_mode} = $previous_insertion_mode; |
|
|
## reprocess |
|
|
redo B; |
|
|
} elsif ($token->{type} eq 'end tag') { |
|
|
!!!parse-error (type => 'after html:/'.$token->{tag_name}); |
|
|
$self->{insertion_mode} = $previous_insertion_mode; |
|
|
## reprocess |
|
|
redo B; |
|
|
} else { |
|
|
die "$0: $token->{type}: Unknown token"; |
|
|
} |
|
| 5097 |
} else { |
} else { |
| 5098 |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
| 5099 |
} |
} |
| 5100 |
} # B |
} # B |
| 5101 |
|
|
| 5102 |
|
## NOTE: The "trailing end" phase in HTML5 is split into |
| 5103 |
|
## two insertion modes: "after html body" and "after html frameset". |
| 5104 |
|
## NOTE: States in the main stage is preserved while |
| 5105 |
|
## the parser stays in the trailing end phase. # MUST |
| 5106 |
|
|
| 5107 |
## Stop parsing # MUST |
## Stop parsing # MUST |
| 5108 |
|
|
| 5109 |
## TODO: script stuffs |
## TODO: script stuffs |