/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC revision 1.13 by wakaba, Thu Oct 16 03:39:57 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19    );    );
20        
21    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 28  BEGIN {
28        CHARACTER_TOKEN        CHARACTER_TOKEN
29        PI_TOKEN        PI_TOKEN
30        ABORT_TOKEN        ABORT_TOKEN
31          END_OF_DOCTYPE_TOKEN
32      )],      )],
33    );    );
34  }  }
35    
36    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37    
38  ## Token types  ## Token types
39    
40  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
42  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
43  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
44  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
45  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
46  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
47  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49    
50    ## XML5: XML5 has "empty tag token".  In this implementation, it is
51    ## represented as a start tag token with $self->{self_closing} flag
52    ## set to true.
53    
54    ## XML5: XML5 has "short end tag token".  In this implementation, it
55    ## is represented as an end tag token with $token->{tag_name} flag set
56    ## to an empty string.
57    
58  package Whatpm::HTML;  package Whatpm::HTML;
59    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 127  sub HEXREF_HEX_STATE () { 48 }
127  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
128  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
129    
130  ## XML states  ## XML-only states
131  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
132  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
133  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
134  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
135  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
136  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
137    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139    sub DOCTYPE_TAG_STATE () { 59 }
140    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141    
142  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
143  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 203  sub _initialize_tokenizer ($) {
203    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
204    
205    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
206    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
207      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
209    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
210    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 216  sub _initialize_tokenizer ($) { Line 234  sub _initialize_tokenizer ($) {
234    
235  ## A token has:  ## A token has:
236  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
237  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
238  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
239  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
240    ##   ->{target} (PI_TOKEN)
241  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
242  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
243  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 226  sub _initialize_tokenizer ($) { Line 245  sub _initialize_tokenizer ($) {
245  ##        ->{name}  ##        ->{name}
246  ##        ->{value}  ##        ->{value}
247  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
248  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
249    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
250  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
251    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
252    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
253    
254  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
255  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
256  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 247  my $is_space = { Line 270  my $is_space = {
270    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
271    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
272    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
273    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
274    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
275    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
276  };  };
# Line 507  sub _get_next_token ($) { Line 530  sub _get_next_token ($) {
530        return  ($token);        return  ($token);
531        redo A;        redo A;
532      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
533          ## XML5: "tag state".
534    
535        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
536          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
537                        
# Line 525  sub _get_next_token ($) { Line 550  sub _get_next_token ($) {
550            redo A;            redo A;
551          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
552                        
553            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
554            #            #
555          } else {          } else {
556                        
557              $self->{s_kwd} = '';
558            #            #
559          }          }
560    
561          ## reconsume          ## reconsume
562          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
563          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
564                    line => $self->{line_prev},                    line => $self->{line_prev},
565                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 709  sub _get_next_token ($) { Line 734  sub _get_next_token ($) {
734        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
735        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
736    
737          ## XML5: "end tag state".
738    
739        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
740        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
741          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
742            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
743            $self->{s_kwd} = '';            $self->{kwd} = '';
744            ## Reconsume.            ## Reconsume.
745            redo A;            redo A;
746          } else {          } else {
# Line 770  sub _get_next_token ($) { Line 797  sub _get_next_token ($) {
797        
798          redo A;          redo A;
799        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
800          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
801                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
802                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
803          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
804          $self->{s_kwd} = '';          $self->{s_kwd} = '';
805                    if ($self->{is_xml}) {
806              
807              ## XML5: No parse error.
808              
809              ## NOTE: This parser raises a parse error, since it supports
810              ## XML1, not XML5.
811    
812              ## NOTE: A short end tag token.
813              my $ct = {type => END_TAG_TOKEN,
814                        tag_name => '',
815                        line => $self->{line_prev},
816                        column => $self->{column_prev} - 1,
817                       };
818              
819        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
820          $self->{line_prev} = $self->{line};
821          $self->{column_prev} = $self->{column};
822          $self->{column}++;
823          $self->{nc}
824              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
825        } else {
826          $self->{set_nc}->($self);
827        }
828      
829              return  ($ct);
830            } else {
831              
832              
833      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
834        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
835        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 787  sub _get_next_token ($) { Line 840  sub _get_next_token ($) {
840        $self->{set_nc}->($self);        $self->{set_nc}->($self);
841      }      }
842        
843            }
844          redo A;          redo A;
845        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
846                    
# Line 800  sub _get_next_token ($) { Line 854  sub _get_next_token ($) {
854                   });                   });
855    
856          redo A;          redo A;
857        } else {        } elsif (not $self->{is_xml} or
858                   $is_space->{$self->{nc}}) {
859                    
860          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
861                            line => $self->{line_prev}, # "<" of "</"
862                            column => $self->{column_prev} - 1);
863          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
864          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
865                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 815  sub _get_next_token ($) { Line 872  sub _get_next_token ($) {
872          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
873          ## "bogus comment state" entry.          ## "bogus comment state" entry.
874          redo A;          redo A;
875          } else {
876            ## XML5: "</:" is a parse error.
877            
878            $self->{ct} = {type => END_TAG_TOKEN,
879                           tag_name => chr ($self->{nc}),
880                           line => $l, column => $c};
881            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
882            
883        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
884          $self->{line_prev} = $self->{line};
885          $self->{column_prev} = $self->{column};
886          $self->{column}++;
887          $self->{nc}
888              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
889        } else {
890          $self->{set_nc}->($self);
891        }
892      
893            redo A;
894        }        }
895      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
896        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
897        if (length $ch) {        if (length $ch) {
898          my $CH = $ch;          my $CH = $ch;
899          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 825  sub _get_next_token ($) { Line 901  sub _get_next_token ($) {
901          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
902                        
903            ## Stay in the state.            ## Stay in the state.
904            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
905                        
906      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
907        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 844  sub _get_next_token ($) { Line 920  sub _get_next_token ($) {
920            $self->{s_kwd} = '';            $self->{s_kwd} = '';
921            ## Reconsume.            ## Reconsume.
922            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
923                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
924                      line => $self->{line_prev},                      line => $self->{line_prev},
925                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
926                     });                     });
927            redo A;            redo A;
928          }          }
# Line 862  sub _get_next_token ($) { Line 938  sub _get_next_token ($) {
938            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
939            $self->{s_kwd} = '';            $self->{s_kwd} = '';
940            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
941                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
942                      line => $self->{line_prev},                      line => $self->{line_prev},
943                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
944                     });                     });
945            redo A;            redo A;
946          } else {          } else {
# Line 873  sub _get_next_token ($) { Line 949  sub _get_next_token ($) {
949                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
950                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
951                   line => $self->{line_prev},                   line => $self->{line_prev},
952                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
953            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
954            ## Reconsume.            ## Reconsume.
955            redo A;            redo A;
# Line 1005  sub _get_next_token ($) { Line 1081  sub _get_next_token ($) {
1081          redo A;          redo A;
1082        }        }
1083      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1084          ## XML5: "Tag attribute name before state".
1085    
1086        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1087                    
1088          ## Stay in the state          ## Stay in the state
# Line 1117  sub _get_next_token ($) { Line 1195  sub _get_next_token ($) {
1195               0x003D => 1, # =               0x003D => 1, # =
1196              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1197                        
1198              ## XML5: Not a parse error.
1199            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1200          } else {          } else {
1201                        
1202              ## XML5: ":" raises a parse error and is ignored.
1203          }          }
1204          $self->{ca}          $self->{ca}
1205              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1140  sub _get_next_token ($) { Line 1220  sub _get_next_token ($) {
1220          redo A;          redo A;
1221        }        }
1222      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1223          ## XML5: "Tag attribute name state".
1224    
1225        my $before_leave = sub {        my $before_leave = sub {
1226          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1227              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1150  sub _get_next_token ($) { Line 1232  sub _get_next_token ($) {
1232                        
1233            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1234              = $self->{ca};              = $self->{ca};
1235              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1236          }          }
1237        }; # $before_leave        }; # $before_leave
1238    
# Line 1186  sub _get_next_token ($) { Line 1269  sub _get_next_token ($) {
1269        
1270          redo A;          redo A;
1271        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1272            if ($self->{is_xml}) {
1273              
1274              ## XML5: Not a parse error.
1275              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1276            } else {
1277              
1278            }
1279    
1280          $before_leave->();          $before_leave->();
1281          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1282                        
# Line 1235  sub _get_next_token ($) { Line 1326  sub _get_next_token ($) {
1326        
1327          redo A;          redo A;
1328        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1329            if ($self->{is_xml}) {
1330              
1331              ## XML5: Not a parse error.
1332              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1333            } else {
1334              
1335            }
1336                    
1337          $before_leave->();          $before_leave->();
1338          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1279  sub _get_next_token ($) { Line 1377  sub _get_next_token ($) {
1377          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1378              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1379                        
1380              ## XML5: Not a parse error.
1381            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1382          } else {          } else {
1383                        
# Line 1299  sub _get_next_token ($) { Line 1398  sub _get_next_token ($) {
1398          redo A;          redo A;
1399        }        }
1400      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1401          ## XML5: "Tag attribute name after state".
1402          
1403        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1404                    
1405          ## Stay in the state          ## Stay in the state
# Line 1330  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431        
1432          redo A;          redo A;
1433        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1434            if ($self->{is_xml}) {
1435              
1436              ## XML5: Not a parse error.
1437              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1438            } else {
1439              
1440            }
1441    
1442          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1443                        
1444            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1383  sub _get_next_token ($) { Line 1492  sub _get_next_token ($) {
1492        
1493          redo A;          redo A;
1494        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1495            if ($self->{is_xml}) {
1496              
1497              ## XML5: Not a parse error.
1498              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1499            } else {
1500              
1501            }
1502                    
1503          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1504                    
# Line 1422  sub _get_next_token ($) { Line 1538  sub _get_next_token ($) {
1538    
1539          redo A;          redo A;
1540        } else {        } else {
1541            if ($self->{is_xml}) {
1542              
1543              ## XML5: Not a parse error.
1544              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1545            } else {
1546              
1547            }
1548    
1549          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1550              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1551                        
1552              ## XML5: Not a parse error.
1553            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1554          } else {          } else {
1555                        
# Line 1448  sub _get_next_token ($) { Line 1573  sub _get_next_token ($) {
1573          redo A;                  redo A;        
1574        }        }
1575      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1576          ## XML5: "Tag attribute value before state".
1577    
1578        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1579                    
1580          ## Stay in the state          ## Stay in the state
# Line 1559  sub _get_next_token ($) { Line 1686  sub _get_next_token ($) {
1686        } else {        } else {
1687          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1688                        
1689              ## XML5: Not a parse error.
1690            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1691            } elsif ($self->{is_xml}) {
1692              
1693              ## XML5: No parse error.
1694              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1695          } else {          } else {
1696                        
1697          }          }
# Line 1579  sub _get_next_token ($) { Line 1711  sub _get_next_token ($) {
1711          redo A;          redo A;
1712        }        }
1713      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1714          ## XML5: "Tag attribute value double quoted state".
1715          
1716        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1717                    
1718            ## XML5: "Tag attribute name before state".
1719          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1720                    
1721      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1596  sub _get_next_token ($) { Line 1731  sub _get_next_token ($) {
1731          redo A;          redo A;
1732        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1733                    
1734            ## XML5: Not defined yet.
1735    
1736          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1737          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1738          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1640  sub _get_next_token ($) { Line 1777  sub _get_next_token ($) {
1777    
1778          redo A;          redo A;
1779        } else {        } else {
1780                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1781              
1782              ## XML5: Not a parse error.
1783              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1784            } else {
1785              
1786            }
1787          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1788          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1789                                q["&],                                q["&<],
1790                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1791    
1792          ## Stay in the state          ## Stay in the state
# Line 1661  sub _get_next_token ($) { Line 1804  sub _get_next_token ($) {
1804          redo A;          redo A;
1805        }        }
1806      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1807          ## XML5: "Tag attribute value single quoted state".
1808    
1809        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1810                    
1811            ## XML5: "Before attribute name state" (sic).
1812          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1813                    
1814      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1678  sub _get_next_token ($) { Line 1824  sub _get_next_token ($) {
1824          redo A;          redo A;
1825        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1826                    
1827            ## XML5: Not defined yet.
1828    
1829          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1830          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1831          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1722  sub _get_next_token ($) { Line 1870  sub _get_next_token ($) {
1870    
1871          redo A;          redo A;
1872        } else {        } else {
1873                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1874              
1875              ## XML5: Not a parse error.
1876              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1877            } else {
1878              
1879            }
1880          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1881          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1882                                q['&],                                q['&<],
1883                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1884    
1885          ## Stay in the state          ## Stay in the state
# Line 1743  sub _get_next_token ($) { Line 1897  sub _get_next_token ($) {
1897          redo A;          redo A;
1898        }        }
1899      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1900          ## XML5: "Tag attribute value unquoted state".
1901    
1902        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1903                    
1904            ## XML5: "Tag attribute name before state".
1905          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1906                    
1907      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1760  sub _get_next_token ($) { Line 1917  sub _get_next_token ($) {
1917          redo A;          redo A;
1918        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1919                    
1920    
1921            ## XML5: Not defined yet.
1922    
1923          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1924          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1925          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1843  sub _get_next_token ($) { Line 2003  sub _get_next_token ($) {
2003               0x003D => 1, # =               0x003D => 1, # =
2004              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2005                        
2006              ## XML5: Not a parse error.
2007            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2008          } else {          } else {
2009                        
# Line 1959  sub _get_next_token ($) { Line 2120  sub _get_next_token ($) {
2120          redo A;          redo A;
2121        }        }
2122      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2123          ## XML5: "Empty tag state".
2124    
2125        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2126          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2127                        
# Line 2010  sub _get_next_token ($) { Line 2173  sub _get_next_token ($) {
2173          } else {          } else {
2174            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2175          }          }
2176            ## XML5: "Tag attribute name before state".
2177          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2178          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2179          ## Reconsume.          ## Reconsume.
# Line 2024  sub _get_next_token ($) { Line 2188  sub _get_next_token ($) {
2188          redo A;          redo A;
2189        }        }
2190      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
       ## (only happen if PCDATA state)  
   
2191        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2192        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2193                
2194        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2195                    if ($self->{in_subset}) {
2196          $self->{state} = DATA_STATE;            
2197          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2198            } else {
2199              
2200              $self->{state} = DATA_STATE;
2201              $self->{s_kwd} = '';
2202            }
2203                    
2204      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2205        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2048  sub _get_next_token ($) { Line 2215  sub _get_next_token ($) {
2215          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2216          redo A;          redo A;
2217        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2218                    if ($self->{in_subset}) {
2219          $self->{state} = DATA_STATE;            
2220          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2221            } else {
2222              
2223              $self->{state} = DATA_STATE;
2224              $self->{s_kwd} = '';
2225            }
2226          ## reconsume          ## reconsume
2227    
2228          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2077  sub _get_next_token ($) { Line 2249  sub _get_next_token ($) {
2249          redo A;          redo A;
2250        }        }
2251      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2252        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state" and "DOCTYPE markup
2253          ## declaration state".
2254                
2255        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2256                    
# Line 2099  sub _get_next_token ($) { Line 2272  sub _get_next_token ($) {
2272          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2273                    
2274          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2275          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2276                    
2277      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2118  sub _get_next_token ($) { Line 2291  sub _get_next_token ($) {
2291                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2292                                                    
2293          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2294          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2295                    
2296      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2297        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2152  sub _get_next_token ($) { Line 2325  sub _get_next_token ($) {
2325                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2326                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2327                                   };                                   };
2328          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2329                    
2330      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2331        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2188  sub _get_next_token ($) { Line 2361  sub _get_next_token ($) {
2361              0x0054, # T              0x0054, # T
2362              0x0059, # Y              0x0059, # Y
2363              0x0050, # P              0x0050, # P
2364            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2365            $self->{nc} == [            $self->{nc} == [
2366              undef,              undef,
2367              0x006F, # o              0x006F, # o
# Line 2196  sub _get_next_token ($) { Line 2369  sub _get_next_token ($) {
2369              0x0074, # t              0x0074, # t
2370              0x0079, # y              0x0079, # y
2371              0x0070, # p              0x0070, # p
2372            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2373                    
2374          ## Stay in the state.          ## Stay in the state.
2375          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2376                    
2377      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2378        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2212  sub _get_next_token ($) { Line 2385  sub _get_next_token ($) {
2385      }      }
2386        
2387          redo A;          redo A;
2388        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2389                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2390                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2391                    if ($self->{is_xml} and
2392                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2393              
2394              ## XML5: case-sensitive.
2395              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2396                              text => 'DOCTYPE',
2397                              line => $self->{line_prev},
2398                              column => $self->{column_prev} - 5);
2399            } else {
2400              
2401            }
2402          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2403          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2404                                    quirks => 1,                                    quirks => 1,
# Line 2238  sub _get_next_token ($) { Line 2421  sub _get_next_token ($) {
2421                                    
2422          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2423                          line => $self->{line_prev},                          line => $self->{line_prev},
2424                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2425          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2426          ## Reconsume.          ## Reconsume.
2427          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2428                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2429                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2430                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2431                                   };                                   };
2432          redo A;          redo A;
2433        }        }
# Line 2255  sub _get_next_token ($) { Line 2438  sub _get_next_token ($) {
2438              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2439              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2440              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2441            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2442                    
2443          ## Stay in the state.          ## Stay in the state.
2444          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2445                    
2446      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2447        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2271  sub _get_next_token ($) { Line 2454  sub _get_next_token ($) {
2454      }      }
2455        
2456          redo A;          redo A;
2457        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2458                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2459          if ($self->{is_xml} and          if ($self->{is_xml} and
2460              not $self->{tainted} and              not $self->{tainted} and
# Line 2306  sub _get_next_token ($) { Line 2489  sub _get_next_token ($) {
2489                    
2490          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2491                          line => $self->{line_prev},                          line => $self->{line_prev},
2492                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2493          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2494          ## Reconsume.          ## Reconsume.
2495          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2496                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2497                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2498                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2499                                   };                                   };
2500          redo A;          redo A;
2501        }        }
# Line 2333  sub _get_next_token ($) { Line 2516  sub _get_next_token ($) {
2516        
2517          redo A;          redo A;
2518        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2519          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2520          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2521          $self->{s_kwd} = '';            
2522              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2523            } else {
2524              
2525              $self->{state} = DATA_STATE;
2526              $self->{s_kwd} = '';
2527            }
2528                    
2529      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2530        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2353  sub _get_next_token ($) { Line 2541  sub _get_next_token ($) {
2541    
2542          redo A;          redo A;
2543        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2544          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2545          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2546          $self->{s_kwd} = '';            
2547              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2548            } else {
2549              
2550              $self->{state} = DATA_STATE;
2551              $self->{s_kwd} = '';
2552            }
2553          ## reconsume          ## reconsume
2554    
2555          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2397  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590        
2591          redo A;          redo A;
2592        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2593          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2594          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2595          $self->{s_kwd} = '';            
2596              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597            } else {
2598              
2599              $self->{state} = DATA_STATE;
2600              $self->{s_kwd} = '';
2601            }
2602                    
2603      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2604        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2417  sub _get_next_token ($) { Line 2615  sub _get_next_token ($) {
2615    
2616          redo A;          redo A;
2617        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2618          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2620          $self->{s_kwd} = '';            
2621              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2622            } else {
2623              
2624              $self->{state} = DATA_STATE;
2625              $self->{s_kwd} = '';
2626            }
2627          ## reconsume          ## reconsume
2628    
2629          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2461  sub _get_next_token ($) { Line 2664  sub _get_next_token ($) {
2664        
2665          redo A;          redo A;
2666        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2667          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2669          $self->{s_kwd} = '';            
2670              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2671            } else {
2672              
2673              $self->{state} = DATA_STATE;
2674              $self->{s_kwd} = '';
2675            }
2676          ## reconsume          ## reconsume
2677    
2678          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2492  sub _get_next_token ($) { Line 2700  sub _get_next_token ($) {
2700          redo A;          redo A;
2701        }        }
2702      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2703          ## XML5: "comment dash state".
2704    
2705        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2706                    
2707          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2508  sub _get_next_token ($) { Line 2718  sub _get_next_token ($) {
2718        
2719          redo A;          redo A;
2720        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2721          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2722          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2723          $self->{state} = DATA_STATE;            
2724          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725            } else {
2726              
2727              $self->{state} = DATA_STATE;
2728              $self->{s_kwd} = '';
2729            }
2730          ## reconsume          ## reconsume
2731    
2732          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2537  sub _get_next_token ($) { Line 2751  sub _get_next_token ($) {
2751        }        }
2752      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2753        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2754                    if ($self->{in_subset}) {
2755          $self->{state} = DATA_STATE;            
2756          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2757            } else {
2758              
2759              $self->{state} = DATA_STATE;
2760              $self->{s_kwd} = '';
2761            }
2762                    
2763      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2557  sub _get_next_token ($) { Line 2776  sub _get_next_token ($) {
2776          redo A;          redo A;
2777        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2778                    
2779            ## XML5: Not a parse error.
2780          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2781                          line => $self->{line_prev},                          line => $self->{line_prev},
2782                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2575  sub _get_next_token ($) { Line 2795  sub _get_next_token ($) {
2795        
2796          redo A;          redo A;
2797        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2798          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2799          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2800          $self->{s_kwd} = '';            
2801              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2802            } else {
2803              
2804              $self->{state} = DATA_STATE;
2805              $self->{s_kwd} = '';
2806            }
2807          ## reconsume          ## reconsume
2808    
2809          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2586  sub _get_next_token ($) { Line 2811  sub _get_next_token ($) {
2811          redo A;          redo A;
2812        } else {        } else {
2813                    
2814            ## XML5: Not a parse error.
2815          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2816                          line => $self->{line_prev},                          line => $self->{line_prev},
2817                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2622  sub _get_next_token ($) { Line 2848  sub _get_next_token ($) {
2848          redo A;          redo A;
2849        } else {        } else {
2850                    
2851            ## XML5: Unless EOF, swith to the bogus comment state.
2852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2853          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2854          ## reconsume          ## reconsume
2855          redo A;          redo A;
2856        }        }
2857      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2858          ## XML5: "DOCTYPE root name before state".
2859    
2860        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2861                    
2862          ## Stay in the state          ## Stay in the state
# Line 2645  sub _get_next_token ($) { Line 2874  sub _get_next_token ($) {
2874          redo A;          redo A;
2875        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2876                    
2877            ## XML5: No parse error.
2878          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2879          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2880          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2673  sub _get_next_token ($) { Line 2903  sub _get_next_token ($) {
2903          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
2904    
2905          redo A;          redo A;
2906          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2907            
2908            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2909            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2911            $self->{in_subset} = 1;
2912            
2913        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2914          $self->{line_prev} = $self->{line};
2915          $self->{column_prev} = $self->{column};
2916          $self->{column}++;
2917          $self->{nc}
2918              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2919        } else {
2920          $self->{set_nc}->($self);
2921        }
2922      
2923            return  ($self->{ct}); # DOCTYPE
2924            redo A;
2925        } else {        } else {
2926                    
2927          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2692  sub _get_next_token ($) { Line 2941  sub _get_next_token ($) {
2941          redo A;          redo A;
2942        }        }
2943      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2944  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2945    
2946          ## ISSUE: Redundant "First," in the spec.
2947    
2948        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2949                    
2950          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2738  sub _get_next_token ($) { Line 2990  sub _get_next_token ($) {
2990          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
2991    
2992          redo A;          redo A;
2993          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2994            
2995            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2996            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2997            $self->{in_subset} = 1;
2998            
2999        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3000          $self->{line_prev} = $self->{line};
3001          $self->{column_prev} = $self->{column};
3002          $self->{column}++;
3003          $self->{nc}
3004              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3005        } else {
3006          $self->{set_nc}->($self);
3007        }
3008      
3009            return  ($self->{ct}); # DOCTYPE
3010            redo A;
3011        } else {        } else {
3012                    
3013          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2757  sub _get_next_token ($) { Line 3027  sub _get_next_token ($) {
3027          redo A;          redo A;
3028        }        }
3029      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3030          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3031          ## state", but implemented differently.
3032    
3033        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3034                    
3035          ## Stay in the state          ## Stay in the state
# Line 2804  sub _get_next_token ($) { Line 3077  sub _get_next_token ($) {
3077          redo A;          redo A;
3078        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3079                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3080            
3081          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3082          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3083                    
3084      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2820  sub _get_next_token ($) { Line 3094  sub _get_next_token ($) {
3094          redo A;          redo A;
3095        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3096                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3097            
3098          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3099          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3100            
3101        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102          $self->{line_prev} = $self->{line};
3103          $self->{column_prev} = $self->{column};
3104          $self->{column}++;
3105          $self->{nc}
3106              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107        } else {
3108          $self->{set_nc}->($self);
3109        }
3110      
3111            redo A;
3112          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3113            
3114            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3116            $self->{in_subset} = 1;
3117                    
3118      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2833  sub _get_next_token ($) { Line 3125  sub _get_next_token ($) {
3125        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3126      }      }
3127        
3128            return  ($self->{ct}); # DOCTYPE
3129          redo A;          redo A;
3130        } else {        } else {
3131                    
# Line 2861  sub _get_next_token ($) { Line 3154  sub _get_next_token ($) {
3154              0x0042, # B              0x0042, # B
3155              0x004C, # L              0x004C, # L
3156              0x0049, # I              0x0049, # I
3157            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3158            $self->{nc} == [            $self->{nc} == [
3159              undef,              undef,
3160              0x0075, # u              0x0075, # u
3161              0x0062, # b              0x0062, # b
3162              0x006C, # l              0x006C, # l
3163              0x0069, # i              0x0069, # i
3164            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3165                    
3166          ## Stay in the state.          ## Stay in the state.
3167          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3168                    
3169      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3170        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2884  sub _get_next_token ($) { Line 3177  sub _get_next_token ($) {
3177      }      }
3178        
3179          redo A;          redo A;
3180        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3181                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3182                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3183                    if ($self->{is_xml} and
3184                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3185              
3186              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3187                              text => 'PUBLIC',
3188                              line => $self->{line_prev},
3189                              column => $self->{column_prev} - 4);
3190            } else {
3191              
3192            }
3193          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3194                    
3195      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2905  sub _get_next_token ($) { Line 3207  sub _get_next_token ($) {
3207                    
3208          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3209                          line => $self->{line_prev},                          line => $self->{line_prev},
3210                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3211          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3212    
3213          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2920  sub _get_next_token ($) { Line 3222  sub _get_next_token ($) {
3222              0x0053, # S              0x0053, # S
3223              0x0054, # T              0x0054, # T
3224              0x0045, # E              0x0045, # E
3225            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3226            $self->{nc} == [            $self->{nc} == [
3227              undef,              undef,
3228              0x0079, # y              0x0079, # y
3229              0x0073, # s              0x0073, # s
3230              0x0074, # t              0x0074, # t
3231              0x0065, # e              0x0065, # e
3232            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3233                    
3234          ## Stay in the state.          ## Stay in the state.
3235          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3236                    
3237      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2943  sub _get_next_token ($) { Line 3245  sub _get_next_token ($) {
3245      }      }
3246        
3247          redo A;          redo A;
3248        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3249                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3250                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3251                    if ($self->{is_xml} and
3252                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3253              
3254              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3255                              text => 'SYSTEM',
3256                              line => $self->{line_prev},
3257                              column => $self->{column_prev} - 4);
3258            } else {
3259              
3260            }
3261          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3262                    
3263      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2964  sub _get_next_token ($) { Line 3275  sub _get_next_token ($) {
3275                    
3276          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3277                          line => $self->{line_prev},                          line => $self->{line_prev},
3278                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3279          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3280    
3281          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 3053  sub _get_next_token ($) { Line 3364  sub _get_next_token ($) {
3364          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3365    
3366          redo A;          redo A;
3367          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3368            
3369            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3370            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3371            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3372            $self->{in_subset} = 1;
3373            
3374        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375          $self->{line_prev} = $self->{line};
3376          $self->{column_prev} = $self->{column};
3377          $self->{column}++;
3378          $self->{nc}
3379              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3380        } else {
3381          $self->{set_nc}->($self);
3382        }
3383      
3384            return  ($self->{ct}); # DOCTYPE
3385            redo A;
3386        } else {        } else {
3387                    
3388          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
# Line 3263  sub _get_next_token ($) { Line 3593  sub _get_next_token ($) {
3593        
3594          redo A;          redo A;
3595        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3596                    if ($self->{is_xml}) {
3597              
3598              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3599            } else {
3600              
3601            }
3602          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3603          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3604                    
# Line 3293  sub _get_next_token ($) { Line 3628  sub _get_next_token ($) {
3628          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3629    
3630          redo A;          redo A;
3631          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3632            
3633            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3634            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3635            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3636            $self->{in_subset} = 1;
3637            
3638        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639          $self->{line_prev} = $self->{line};
3640          $self->{column_prev} = $self->{column};
3641          $self->{column}++;
3642          $self->{nc}
3643              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644        } else {
3645          $self->{set_nc}->($self);
3646        }
3647      
3648            return  ($self->{ct}); # DOCTYPE
3649            redo A;
3650        } else {        } else {
3651                    
3652          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
# Line 3393  sub _get_next_token ($) { Line 3747  sub _get_next_token ($) {
3747          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3748    
3749          redo A;          redo A;
3750          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3751            
3752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3753    
3754            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3756            $self->{in_subset} = 1;
3757            
3758        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3759          $self->{line_prev} = $self->{line};
3760          $self->{column_prev} = $self->{column};
3761          $self->{column}++;
3762          $self->{nc}
3763              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3764        } else {
3765          $self->{set_nc}->($self);
3766        }
3767      
3768            return  ($self->{ct}); # DOCTYPE
3769            redo A;
3770        } else {        } else {
3771                    
3772          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
# Line 3428  sub _get_next_token ($) { Line 3802  sub _get_next_token ($) {
3802      }      }
3803        
3804          redo A;          redo A;
3805        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3806                    
3807          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3808    
# Line 3499  sub _get_next_token ($) { Line 3873  sub _get_next_token ($) {
3873      }      }
3874        
3875          redo A;          redo A;
3876        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3877                    
3878          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3879    
# Line 3600  sub _get_next_token ($) { Line 3974  sub _get_next_token ($) {
3974          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3975    
3976          redo A;          redo A;
3977          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3978            
3979            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3980            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3981            $self->{in_subset} = 1;
3982            
3983        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3984          $self->{line_prev} = $self->{line};
3985          $self->{column_prev} = $self->{column};
3986          $self->{column}++;
3987          $self->{nc}
3988              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3989        } else {
3990          $self->{set_nc}->($self);
3991        }
3992      
3993            return  ($self->{ct}); # DOCTYPE
3994            redo A;
3995        } else {        } else {
3996                    
3997          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
# Line 3639  sub _get_next_token ($) { Line 4031  sub _get_next_token ($) {
4031          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4032    
4033          redo A;          redo A;
4034          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4035            
4036            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4037            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4038            $self->{in_subset} = 1;
4039            
4040        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041          $self->{line_prev} = $self->{line};
4042          $self->{column_prev} = $self->{column};
4043          $self->{column}++;
4044          $self->{nc}
4045              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4046        } else {
4047          $self->{set_nc}->($self);
4048        }
4049      
4050            return  ($self->{ct}); # DOCTYPE
4051            redo A;
4052        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4053                    
4054          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3651  sub _get_next_token ($) { Line 4061  sub _get_next_token ($) {
4061        } else {        } else {
4062                    
4063          my $s = '';          my $s = '';
4064          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4065    
4066          ## Stay in the state          ## Stay in the state
4067                    
# Line 3671  sub _get_next_token ($) { Line 4081  sub _get_next_token ($) {
4081        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4082        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4083        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4084    
4085          ## XML5: "CDATA state".
4086                
4087        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4088                    
# Line 3697  sub _get_next_token ($) { Line 4109  sub _get_next_token ($) {
4109    
4110          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4111          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4112                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4113          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4114                        
4115            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3740  sub _get_next_token ($) { Line 4142  sub _get_next_token ($) {
4142    
4143        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4144      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4145          ## XML5: "CDATA bracket state".
4146    
4147        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4148                    
4149          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3757  sub _get_next_token ($) { Line 4161  sub _get_next_token ($) {
4161          redo A;          redo A;
4162        } else {        } else {
4163                    
4164            ## XML5: If EOF, "]" is not appended and changed to the data state.
4165          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4166          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4167          ## Reconsume.          ## Reconsume.
4168          redo A;          redo A;
4169        }        }
4170      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4171          ## XML5: "CDATA end state".
4172    
4173        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4174          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4175          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3805  sub _get_next_token ($) { Line 4212  sub _get_next_token ($) {
4212                    
4213          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4214          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4215          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4216          redo A;          redo A;
4217        }        }
4218      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3822  sub _get_next_token ($) { Line 4229  sub _get_next_token ($) {
4229        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4230                    
4231          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4232          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4233                    
4234      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3842  sub _get_next_token ($) { Line 4249  sub _get_next_token ($) {
4249                    
4250          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4251          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4252          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4253          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4254          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4255                    
4256      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3893  sub _get_next_token ($) { Line 4300  sub _get_next_token ($) {
4300            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4301                    
4302          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4303          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4304                    
4305      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4306        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3910  sub _get_next_token ($) { Line 4317  sub _get_next_token ($) {
4317                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4318                    
4319          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4320          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4321                    
4322      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4323        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3956  sub _get_next_token ($) { Line 4363  sub _get_next_token ($) {
4363        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4364            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4365                    
4366          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4367          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4368                    
4369          ## Stay in the state.          ## Stay in the state.
4370                    
# Line 3993  sub _get_next_token ($) { Line 4400  sub _get_next_token ($) {
4400          #          #
4401        }        }
4402    
4403        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4404        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4405        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4406        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4036  sub _get_next_token ($) { Line 4443  sub _get_next_token ($) {
4443          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4444                    
4445          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4446          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4447          ## Reconsume.          ## Reconsume.
4448          redo A;          redo A;
4449        } else {        } else {
# Line 4054  sub _get_next_token ($) { Line 4461  sub _get_next_token ($) {
4461            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4462            ## Reconsume.            ## Reconsume.
4463            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4464                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4465                      line => $self->{line_prev},                      line => $self->{line_prev},
4466                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4467                     });                     });
4468            redo A;            redo A;
4469          } else {          } else {
4470                        
4471            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4472            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4473            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4474            ## Reconsume.            ## Reconsume.
# Line 4072  sub _get_next_token ($) { Line 4479  sub _get_next_token ($) {
4479        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4480          # 0..9          # 0..9
4481                    
4482          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4483          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4484          ## Stay in the state.          ## Stay in the state.
4485                    
4486      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4090  sub _get_next_token ($) { Line 4497  sub _get_next_token ($) {
4497        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4498                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4499                    
4500          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4501          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4502          ## Stay in the state.          ## Stay in the state.
4503                    
4504      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4108  sub _get_next_token ($) { Line 4515  sub _get_next_token ($) {
4515        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4516                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4517                    
4518          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4519          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4520          ## Stay in the state.          ## Stay in the state.
4521                    
4522      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4146  sub _get_next_token ($) { Line 4553  sub _get_next_token ($) {
4553          #          #
4554        }        }
4555    
4556        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4557        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4558        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4559        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4183  sub _get_next_token ($) { Line 4590  sub _get_next_token ($) {
4590          redo A;          redo A;
4591        }        }
4592      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4593        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4594            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4595            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
4596              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 4193  sub _get_next_token ($) { Line 4600  sub _get_next_token ($) {
4600              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
4601             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
4602          our $EntityChar;          our $EntityChar;
4603          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4604          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
4605            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
4606                            
4607              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4608              $self->{entity__match} = 1;              $self->{entity__match} = 1;
4609                            
4610      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4213  sub _get_next_token ($) { Line 4620  sub _get_next_token ($) {
4620              #              #
4621            } else {            } else {
4622                            
4623              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4624              $self->{entity__match} = -1;              $self->{entity__match} = -1;
4625              ## Stay in the state.              ## Stay in the state.
4626                            
# Line 4261  sub _get_next_token ($) { Line 4668  sub _get_next_token ($) {
4668          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
4669              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
4670                        
4671            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
4672            #            #
4673          } else {          } else {
4674                        
# Line 4273  sub _get_next_token ($) { Line 4680  sub _get_next_token ($) {
4680                    
4681          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4682                          line => $self->{line_prev},                          line => $self->{line_prev},
4683                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
4684          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
4685          #          #
4686        }        }
4687        
# Line 4297  sub _get_next_token ($) { Line 4704  sub _get_next_token ($) {
4704                    data => $data,                    data => $data,
4705                    has_reference => $has_ref,                    has_reference => $has_ref,
4706                    line => $self->{line_prev},                    line => $self->{line_prev},
4707                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
4708                   });                   });
4709          redo A;          redo A;
4710        } else {        } else {
# Line 4367  sub _get_next_token ($) { Line 4774  sub _get_next_token ($) {
4774          redo A;          redo A;
4775        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4776          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4777          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4778          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4779            } else {
4780              $self->{state} = DATA_STATE;
4781              $self->{s_kwd} = '';
4782            }
4783          ## Reconsume.          ## Reconsume.
4784          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
4785          redo A;          redo A;
# Line 4439  sub _get_next_token ($) { Line 4850  sub _get_next_token ($) {
4850          redo A;          redo A;
4851        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4853          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4854          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4855            } else {
4856              $self->{state} = DATA_STATE;
4857              $self->{s_kwd} = '';
4858            }
4859          ## Reprocess.          ## Reprocess.
4860          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
4861          redo A;          redo A;
# Line 4465  sub _get_next_token ($) { Line 4880  sub _get_next_token ($) {
4880        }        }
4881      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
4882        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4883          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4884          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4885            } else {
4886              $self->{state} = DATA_STATE;
4887              $self->{s_kwd} = '';
4888            }
4889                    
4890      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4891        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4511  sub _get_next_token ($) { Line 4930  sub _get_next_token ($) {
4930      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4931        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" in XML5
4932        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4933          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4934          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935            } else {
4936              $self->{state} = DATA_STATE;
4937              $self->{s_kwd} = '';
4938            }
4939                    
4940      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4941        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4547  sub _get_next_token ($) { Line 4970  sub _get_next_token ($) {
4970          ## Reprocess.          ## Reprocess.
4971          redo A;          redo A;
4972        }        }
4973    
4974        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4975          if ($self->{nc} == 0x003C) { # <
4976            $self->{state} = DOCTYPE_TAG_STATE;
4977            
4978        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4979          $self->{line_prev} = $self->{line};
4980          $self->{column_prev} = $self->{column};
4981          $self->{column}++;
4982          $self->{nc}
4983              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4984        } else {
4985          $self->{set_nc}->($self);
4986        }
4987      
4988            redo A;
4989          } elsif ($self->{nc} == 0x0025) { # %
4990            ## XML5: Not defined yet.
4991    
4992            ## TODO:
4993            
4994        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995          $self->{line_prev} = $self->{line};
4996          $self->{column_prev} = $self->{column};
4997          $self->{column}++;
4998          $self->{nc}
4999              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000        } else {
5001          $self->{set_nc}->($self);
5002        }
5003      
5004            redo A;
5005          } elsif ($self->{nc} == 0x005D) { # ]
5006            delete $self->{in_subset};
5007            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5008            
5009        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5010          $self->{line_prev} = $self->{line};
5011          $self->{column_prev} = $self->{column};
5012          $self->{column}++;
5013          $self->{nc}
5014              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5015        } else {
5016          $self->{set_nc}->($self);
5017        }
5018      
5019            redo A;
5020          } elsif ($is_space->{$self->{nc}}) {
5021            ## Stay in the state.
5022            
5023        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5024          $self->{line_prev} = $self->{line};
5025          $self->{column_prev} = $self->{column};
5026          $self->{column}++;
5027          $self->{nc}
5028              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5029        } else {
5030          $self->{set_nc}->($self);
5031        }
5032      
5033            redo A;
5034          } elsif ($self->{nc} == -1) {
5035            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5036            delete $self->{in_subset};
5037            $self->{state} = DATA_STATE;
5038            $self->{s_kwd} = '';
5039            ## Reconsume.
5040            return  ({type => END_OF_DOCTYPE_TOKEN});
5041            redo A;
5042          } else {
5043            unless ($self->{internal_subset_tainted}) {
5044              ## XML5: No parse error.
5045              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5046              $self->{internal_subset_tainted} = 1;
5047            }
5048            ## Stay in the state.
5049            
5050        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5051          $self->{line_prev} = $self->{line};
5052          $self->{column_prev} = $self->{column};
5053          $self->{column}++;
5054          $self->{nc}
5055              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5056        } else {
5057          $self->{set_nc}->($self);
5058        }
5059      
5060            redo A;
5061          }
5062        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5063          if ($self->{nc} == 0x003E) { # >
5064            $self->{state} = DATA_STATE;
5065            $self->{s_kwd} = '';
5066            
5067        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5068          $self->{line_prev} = $self->{line};
5069          $self->{column_prev} = $self->{column};
5070          $self->{column}++;
5071          $self->{nc}
5072              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5073        } else {
5074          $self->{set_nc}->($self);
5075        }
5076      
5077            return  ({type => END_OF_DOCTYPE_TOKEN});
5078            redo A;
5079          } elsif ($self->{nc} == -1) {
5080            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5081            $self->{state} = DATA_STATE;
5082            $self->{s_kwd} = '';
5083            ## Reconsume.
5084            return  ({type => END_OF_DOCTYPE_TOKEN});
5085            redo A;
5086          } else {
5087            ## XML5: No parse error and stay in the state.
5088            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5089    
5090            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5091            
5092        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5093          $self->{line_prev} = $self->{line};
5094          $self->{column_prev} = $self->{column};
5095          $self->{column}++;
5096          $self->{nc}
5097              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5098        } else {
5099          $self->{set_nc}->($self);
5100        }
5101      
5102            redo A;
5103          }
5104        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5105          if ($self->{nc} == 0x003E) { # >
5106            $self->{state} = DATA_STATE;
5107            $self->{s_kwd} = '';
5108            
5109        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5110          $self->{line_prev} = $self->{line};
5111          $self->{column_prev} = $self->{column};
5112          $self->{column}++;
5113          $self->{nc}
5114              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5115        } else {
5116          $self->{set_nc}->($self);
5117        }
5118      
5119            return  ({type => END_OF_DOCTYPE_TOKEN});
5120            redo A;
5121          } elsif ($self->{nc} == -1) {
5122            $self->{state} = DATA_STATE;
5123            $self->{s_kwd} = '';
5124            ## Reconsume.
5125            return  ({type => END_OF_DOCTYPE_TOKEN});
5126            redo A;
5127          } else {
5128            ## Stay in the state.
5129            
5130        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5131          $self->{line_prev} = $self->{line};
5132          $self->{column_prev} = $self->{column};
5133          $self->{column}++;
5134          $self->{nc}
5135              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5136        } else {
5137          $self->{set_nc}->($self);
5138        }
5139      
5140            redo A;
5141          }
5142        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5143          if ($self->{nc} == 0x0021) { # !
5144            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
5145            
5146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5147          $self->{line_prev} = $self->{line};
5148          $self->{column_prev} = $self->{column};
5149          $self->{column}++;
5150          $self->{nc}
5151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5152        } else {
5153          $self->{set_nc}->($self);
5154        }
5155      
5156            redo A;
5157          } elsif ($self->{nc} == 0x003F) { # ?
5158            $self->{state} = PI_STATE;
5159            
5160        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5161          $self->{line_prev} = $self->{line};
5162          $self->{column_prev} = $self->{column};
5163          $self->{column}++;
5164          $self->{nc}
5165              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5166        } else {
5167          $self->{set_nc}->($self);
5168        }
5169      
5170            redo A;
5171          } elsif ($self->{nc} == -1) {
5172            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5173            $self->{state} = DATA_STATE;
5174            $self->{s_kwd} = '';
5175            ## Reconsume.
5176            redo A;
5177          } else {
5178            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5179                            line => $self->{line_prev},
5180                            column => $self->{column_prev});
5181            $self->{state} = BOGUS_COMMENT_STATE;
5182            $self->{ct} = {type => COMMENT_TOKEN,
5183                           data => '',
5184                          }; ## NOTE: Will be discarded.
5185            
5186        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5187          $self->{line_prev} = $self->{line};
5188          $self->{column_prev} = $self->{column};
5189          $self->{column}++;
5190          $self->{nc}
5191              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5192        } else {
5193          $self->{set_nc}->($self);
5194        }
5195      
5196            redo A;
5197          }
5198                    
5199      } else {      } else {
5200        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.13

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24