/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.8 by wakaba, Wed Oct 15 04:38:22 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 216  sub _initialize_tokenizer ($) { Line 216  sub _initialize_tokenizer ($) {
216    
217  ## A token has:  ## A token has:
218  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
220  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
221  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222    ##   ->{target} (PI_TOKEN)
223  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
224  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
225  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 226  sub _initialize_tokenizer ($) { Line 227  sub _initialize_tokenizer ($) {
227  ##        ->{name}  ##        ->{name}
228  ##        ->{value}  ##        ->{value}
229  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
230  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
231    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
232  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
233    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
234  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
235  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
236  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 507  sub _get_next_token ($) { Line 510  sub _get_next_token ($) {
510        return  ($token);        return  ($token);
511        redo A;        redo A;
512      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
513          ## XML5: "tag state".
514    
515        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
517                        
# Line 666  sub _get_next_token ($) { Line 671  sub _get_next_token ($) {
671              ## $self->{nc} is intentionally left as is              ## $self->{nc} is intentionally left as is
672              redo A;              redo A;
673            }            }
674          } else {          } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
675                        
676            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
677                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 681  sub _get_next_token ($) { Line 686  sub _get_next_token ($) {
686                     });                     });
687    
688            redo A;            redo A;
689            } else {
690              ## XML5: "<:" is a parse error.
691              
692              $self->{ct} = {type => START_TAG_TOKEN,
693                                        tag_name => chr ($self->{nc}),
694                                        line => $self->{line_prev},
695                                        column => $self->{column_prev}};
696              $self->{state} = TAG_NAME_STATE;
697              
698        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
699          $self->{line_prev} = $self->{line};
700          $self->{column_prev} = $self->{column};
701          $self->{column}++;
702          $self->{nc}
703              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
704        } else {
705          $self->{set_nc}->($self);
706        }
707      
708              redo A;
709          }          }
710        } else {        } else {
711          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 689  sub _get_next_token ($) { Line 714  sub _get_next_token ($) {
714        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
715        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
716    
717          ## XML5: "end tag state".
718    
719        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
720        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
721          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 750  sub _get_next_token ($) { Line 777  sub _get_next_token ($) {
777        
778          redo A;          redo A;
779        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
780          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
781                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
782                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
783          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
784          $self->{s_kwd} = '';          $self->{s_kwd} = '';
785                    if ($self->{is_xml}) {
786              
787              ## XML5: No parse error.
788              
789              ## NOTE: This parser raises a parse error, since it supports
790              ## XML1, not XML5.
791    
792              ## NOTE: A short end tag token.
793              my $ct = {type => END_TAG_TOKEN,
794                        tag_name => '',
795                        line => $self->{line_prev},
796                        column => $self->{column_prev} - 1,
797                       };
798              
799        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
800          $self->{line_prev} = $self->{line};
801          $self->{column_prev} = $self->{column};
802          $self->{column}++;
803          $self->{nc}
804              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
805        } else {
806          $self->{set_nc}->($self);
807        }
808      
809              return  ($ct);
810            } else {
811              
812              
813      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
814        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
815        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 767  sub _get_next_token ($) { Line 820  sub _get_next_token ($) {
820        $self->{set_nc}->($self);        $self->{set_nc}->($self);
821      }      }
822        
823            }
824          redo A;          redo A;
825        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
826                    
# Line 780  sub _get_next_token ($) { Line 834  sub _get_next_token ($) {
834                   });                   });
835    
836          redo A;          redo A;
837        } else {        } elsif (not $self->{is_xml} or
838                   $is_space->{$self->{nc}}) {
839                    
840          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
841                            line => $self->{line_prev}, # "<" of "</"
842                            column => $self->{column_prev} - 1);
843          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
844          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
845                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 795  sub _get_next_token ($) { Line 852  sub _get_next_token ($) {
852          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
853          ## "bogus comment state" entry.          ## "bogus comment state" entry.
854          redo A;          redo A;
855          } else {
856            ## XML5: "</:" is a parse error.
857            
858            $self->{ct} = {type => END_TAG_TOKEN,
859                           tag_name => chr ($self->{nc}),
860                           line => $l, column => $c};
861            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
862            
863        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
864          $self->{line_prev} = $self->{line};
865          $self->{column_prev} = $self->{column};
866          $self->{column}++;
867          $self->{nc}
868              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
869        } else {
870          $self->{set_nc}->($self);
871        }
872      
873            redo A;
874        }        }
875      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
876        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 985  sub _get_next_token ($) { Line 1061  sub _get_next_token ($) {
1061          redo A;          redo A;
1062        }        }
1063      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1064          ## XML5: "Tag attribute name before state".
1065    
1066        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1067                    
1068          ## Stay in the state          ## Stay in the state
# Line 1097  sub _get_next_token ($) { Line 1175  sub _get_next_token ($) {
1175               0x003D => 1, # =               0x003D => 1, # =
1176              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1177                        
1178              ## XML5: Not a parse error.
1179            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1180          } else {          } else {
1181                        
1182              ## XML5: ":" raises a parse error and is ignored.
1183          }          }
1184          $self->{ca}          $self->{ca}
1185              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1120  sub _get_next_token ($) { Line 1200  sub _get_next_token ($) {
1200          redo A;          redo A;
1201        }        }
1202      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1203          ## XML5: "Tag attribute name state".
1204    
1205        my $before_leave = sub {        my $before_leave = sub {
1206          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1207              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1130  sub _get_next_token ($) { Line 1212  sub _get_next_token ($) {
1212                        
1213            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1214              = $self->{ca};              = $self->{ca};
1215              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1216          }          }
1217        }; # $before_leave        }; # $before_leave
1218    
# Line 1166  sub _get_next_token ($) { Line 1249  sub _get_next_token ($) {
1249        
1250          redo A;          redo A;
1251        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1252            if ($self->{is_xml}) {
1253              
1254              ## XML5: Not a parse error.
1255              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1256            } else {
1257              
1258            }
1259    
1260          $before_leave->();          $before_leave->();
1261          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1262                        
# Line 1215  sub _get_next_token ($) { Line 1306  sub _get_next_token ($) {
1306        
1307          redo A;          redo A;
1308        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1309            if ($self->{is_xml}) {
1310              
1311              ## XML5: Not a parse error.
1312              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1313            } else {
1314              
1315            }
1316                    
1317          $before_leave->();          $before_leave->();
1318          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1259  sub _get_next_token ($) { Line 1357  sub _get_next_token ($) {
1357          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1358              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1359                        
1360              ## XML5: Not a parse error.
1361            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1362          } else {          } else {
1363                        
# Line 1279  sub _get_next_token ($) { Line 1378  sub _get_next_token ($) {
1378          redo A;          redo A;
1379        }        }
1380      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1381          ## XML5: "Tag attribute name after state".
1382          
1383        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1384                    
1385          ## Stay in the state          ## Stay in the state
# Line 1310  sub _get_next_token ($) { Line 1411  sub _get_next_token ($) {
1411        
1412          redo A;          redo A;
1413        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1414            if ($self->{is_xml}) {
1415              
1416              ## XML5: Not a parse error.
1417              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1418            } else {
1419              
1420            }
1421    
1422          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1423                        
1424            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1363  sub _get_next_token ($) { Line 1472  sub _get_next_token ($) {
1472        
1473          redo A;          redo A;
1474        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1475            if ($self->{is_xml}) {
1476              
1477              ## XML5: Not a parse error.
1478              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1479            } else {
1480              
1481            }
1482                    
1483          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1484                    
# Line 1402  sub _get_next_token ($) { Line 1518  sub _get_next_token ($) {
1518    
1519          redo A;          redo A;
1520        } else {        } else {
1521            if ($self->{is_xml}) {
1522              
1523              ## XML5: Not a parse error.
1524              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1525            } else {
1526              
1527            }
1528    
1529          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1530              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1531                        
1532              ## XML5: Not a parse error.
1533            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1534          } else {          } else {
1535                        
# Line 1428  sub _get_next_token ($) { Line 1553  sub _get_next_token ($) {
1553          redo A;                  redo A;        
1554        }        }
1555      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1556          ## XML5: "Tag attribute value before state".
1557    
1558        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1559                    
1560          ## Stay in the state          ## Stay in the state
# Line 1539  sub _get_next_token ($) { Line 1666  sub _get_next_token ($) {
1666        } else {        } else {
1667          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1668                        
1669              ## XML5: Not a parse error.
1670            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1671            } elsif ($self->{is_xml}) {
1672              
1673              ## XML5: No parse error.
1674              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1675          } else {          } else {
1676                        
1677          }          }
# Line 1559  sub _get_next_token ($) { Line 1691  sub _get_next_token ($) {
1691          redo A;          redo A;
1692        }        }
1693      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1694          ## XML5: "Tag attribute value double quoted state".
1695          
1696        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1697                    
1698            ## XML5: "Tag attribute name before state".
1699          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1700                    
1701      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1576  sub _get_next_token ($) { Line 1711  sub _get_next_token ($) {
1711          redo A;          redo A;
1712        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1713                    
1714            ## XML5: Not defined yet.
1715    
1716          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1717          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1718          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1620  sub _get_next_token ($) { Line 1757  sub _get_next_token ($) {
1757    
1758          redo A;          redo A;
1759        } else {        } else {
1760                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1761              
1762              ## XML5: Not a parse error.
1763              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1764            } else {
1765              
1766            }
1767          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1768          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1769                                q["&],                                q["&<],
1770                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1771    
1772          ## Stay in the state          ## Stay in the state
# Line 1641  sub _get_next_token ($) { Line 1784  sub _get_next_token ($) {
1784          redo A;          redo A;
1785        }        }
1786      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1787          ## XML5: "Tag attribute value single quoted state".
1788    
1789        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1790                    
1791            ## XML5: "Before attribute name state" (sic).
1792          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1793                    
1794      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1658  sub _get_next_token ($) { Line 1804  sub _get_next_token ($) {
1804          redo A;          redo A;
1805        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1806                    
1807            ## XML5: Not defined yet.
1808    
1809          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1810          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1811          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1702  sub _get_next_token ($) { Line 1850  sub _get_next_token ($) {
1850    
1851          redo A;          redo A;
1852        } else {        } else {
1853                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1854              
1855              ## XML5: Not a parse error.
1856              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1857            } else {
1858              
1859            }
1860          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1861          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1862                                q['&],                                q['&<],
1863                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1864    
1865          ## Stay in the state          ## Stay in the state
# Line 1723  sub _get_next_token ($) { Line 1877  sub _get_next_token ($) {
1877          redo A;          redo A;
1878        }        }
1879      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1880          ## XML5: "Tag attribute value unquoted state".
1881    
1882        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1883                    
1884            ## XML5: "Tag attribute name before state".
1885          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1886                    
1887      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1740  sub _get_next_token ($) { Line 1897  sub _get_next_token ($) {
1897          redo A;          redo A;
1898        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1899                    
1900    
1901            ## XML5: Not defined yet.
1902    
1903          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1904          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1905          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1823  sub _get_next_token ($) { Line 1983  sub _get_next_token ($) {
1983               0x003D => 1, # =               0x003D => 1, # =
1984              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1985                        
1986              ## XML5: Not a parse error.
1987            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1988          } else {          } else {
1989                        
# Line 1939  sub _get_next_token ($) { Line 2100  sub _get_next_token ($) {
2100          redo A;          redo A;
2101        }        }
2102      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2103          ## XML5: "Empty tag state".
2104    
2105        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2106          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2107                        
# Line 1990  sub _get_next_token ($) { Line 2153  sub _get_next_token ($) {
2153          } else {          } else {
2154            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2155          }          }
2156            ## XML5: "Tag attribute name before state".
2157          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2158          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2159          ## Reconsume.          ## Reconsume.
# Line 2132  sub _get_next_token ($) { Line 2296  sub _get_next_token ($) {
2296                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2297                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2298                                   };                                   };
2299          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2300                    
2301      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2302        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2195  sub _get_next_token ($) { Line 2359  sub _get_next_token ($) {
2359        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
2360                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2361                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2362                    if ($self->{s_kwd} ne 'DOCTYP') {
2363              
2364              ## XML5: case-sensitive.
2365              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2366                              text => 'DOCTYPE',
2367                              line => $self->{line_prev},
2368                              column => $self->{column_prev} - 5);
2369            } else {
2370              
2371            }
2372          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2373          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2374                                    quirks => 1,                                    quirks => 1,
# Line 2472  sub _get_next_token ($) { Line 2645  sub _get_next_token ($) {
2645          redo A;          redo A;
2646        }        }
2647      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2648          ## XML5: "comment dash state".
2649    
2650        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2651                    
2652          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2537  sub _get_next_token ($) { Line 2712  sub _get_next_token ($) {
2712          redo A;          redo A;
2713        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2714                    
2715            ## XML5: Not a parse error.
2716          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2717                          line => $self->{line_prev},                          line => $self->{line_prev},
2718                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2566  sub _get_next_token ($) { Line 2742  sub _get_next_token ($) {
2742          redo A;          redo A;
2743        } else {        } else {
2744                    
2745            ## XML5: Not a parse error.
2746          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2747                          line => $self->{line_prev},                          line => $self->{line_prev},
2748                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 3651  sub _get_next_token ($) { Line 3828  sub _get_next_token ($) {
3828        ## NOTE: "CDATA section state" in the spec is jointly implemented        ## NOTE: "CDATA section state" in the spec is jointly implemented
3829        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3830        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
3831    
3832          ## XML5: "CDATA state".
3833                
3834        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3835                    
# Line 3677  sub _get_next_token ($) { Line 3856  sub _get_next_token ($) {
3856    
3857          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3858          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3859                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
3860          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
3861                        
3862            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3720  sub _get_next_token ($) { Line 3889  sub _get_next_token ($) {
3889    
3890        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
3891      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3892          ## XML5: "CDATA bracket state".
3893    
3894        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3895                    
3896          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3737  sub _get_next_token ($) { Line 3908  sub _get_next_token ($) {
3908          redo A;          redo A;
3909        } else {        } else {
3910                    
3911            ## XML5: If EOF, "]" is not appended and changed to the data state.
3912          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
3913          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3914          ## Reconsume.          ## Reconsume.
3915          redo A;          redo A;
3916        }        }
3917      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3918          ## XML5: "CDATA end state".
3919    
3920        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3921          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3922          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3785  sub _get_next_token ($) { Line 3959  sub _get_next_token ($) {
3959                    
3960          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
3961          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
3962          ## Reconsume.          ## Reconsume. ## XML5: Emit.
3963          redo A;          redo A;
3964        }        }
3965      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24