/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC revision 1.13 by wakaba, Thu Oct 16 03:39:57 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19    );    );
20        
21    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 28  BEGIN {
28        CHARACTER_TOKEN        CHARACTER_TOKEN
29        PI_TOKEN        PI_TOKEN
30        ABORT_TOKEN        ABORT_TOKEN
31          END_OF_DOCTYPE_TOKEN
32      )],      )],
33    );    );
34  }  }
35    
36    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37    
38  ## Token types  ## Token types
39    
40  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
42  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
43  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
44  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
45  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
46  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
47  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49    
50    ## XML5: XML5 has "empty tag token".  In this implementation, it is
51    ## represented as a start tag token with $self->{self_closing} flag
52    ## set to true.
53    
54    ## XML5: XML5 has "short end tag token".  In this implementation, it
55    ## is represented as an end tag token with $token->{tag_name} flag set
56    ## to an empty string.
57    
58  package Whatpm::HTML;  package Whatpm::HTML;
59    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 127  sub HEXREF_HEX_STATE () { 48 }
127  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
128  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
129    
130  ## XML states  ## XML-only states
131  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
132  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
133  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
134  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
135  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
136  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
137    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139    sub DOCTYPE_TAG_STATE () { 59 }
140    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141    
142  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
143  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 203  sub _initialize_tokenizer ($) {
203    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
204    
205    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
206    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
207      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
209    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
210    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 231  sub _initialize_tokenizer ($) { Line 249  sub _initialize_tokenizer ($) {
249  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
250  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
251  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
252    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
253    
254  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
255  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
256  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 250  my $is_space = { Line 270  my $is_space = {
270    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
271    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
272    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
273    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
274    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
275    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
276  };  };
# Line 530  sub _get_next_token ($) { Line 550  sub _get_next_token ($) {
550            redo A;            redo A;
551          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
552                        
553            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
554            #            #
555          } else {          } else {
556                        
557              $self->{s_kwd} = '';
558            #            #
559          }          }
560    
561          ## reconsume          ## reconsume
562          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
563          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
564                    line => $self->{line_prev},                    line => $self->{line_prev},
565                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 720  sub _get_next_token ($) { Line 740  sub _get_next_token ($) {
740        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
741          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
742            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
743            $self->{s_kwd} = '';            $self->{kwd} = '';
744            ## Reconsume.            ## Reconsume.
745            redo A;            redo A;
746          } else {          } else {
# Line 873  sub _get_next_token ($) { Line 893  sub _get_next_token ($) {
893          redo A;          redo A;
894        }        }
895      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
896        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
897        if (length $ch) {        if (length $ch) {
898          my $CH = $ch;          my $CH = $ch;
899          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 881  sub _get_next_token ($) { Line 901  sub _get_next_token ($) {
901          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
902                        
903            ## Stay in the state.            ## Stay in the state.
904            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
905                        
906      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
907        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 900  sub _get_next_token ($) { Line 920  sub _get_next_token ($) {
920            $self->{s_kwd} = '';            $self->{s_kwd} = '';
921            ## Reconsume.            ## Reconsume.
922            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
923                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
924                      line => $self->{line_prev},                      line => $self->{line_prev},
925                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
926                     });                     });
927            redo A;            redo A;
928          }          }
# Line 918  sub _get_next_token ($) { Line 938  sub _get_next_token ($) {
938            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
939            $self->{s_kwd} = '';            $self->{s_kwd} = '';
940            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
941                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
942                      line => $self->{line_prev},                      line => $self->{line_prev},
943                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
944                     });                     });
945            redo A;            redo A;
946          } else {          } else {
# Line 929  sub _get_next_token ($) { Line 949  sub _get_next_token ($) {
949                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
950                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
951                   line => $self->{line_prev},                   line => $self->{line_prev},
952                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
953            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
954            ## Reconsume.            ## Reconsume.
955            redo A;            redo A;
# Line 2168  sub _get_next_token ($) { Line 2188  sub _get_next_token ($) {
2188          redo A;          redo A;
2189        }        }
2190      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
       ## (only happen if PCDATA state)  
   
2191        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2192        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2193                
2194        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2195                    if ($self->{in_subset}) {
2196          $self->{state} = DATA_STATE;            
2197          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2198            } else {
2199              
2200              $self->{state} = DATA_STATE;
2201              $self->{s_kwd} = '';
2202            }
2203                    
2204      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2205        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2192  sub _get_next_token ($) { Line 2215  sub _get_next_token ($) {
2215          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2216          redo A;          redo A;
2217        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2218                    if ($self->{in_subset}) {
2219          $self->{state} = DATA_STATE;            
2220          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2221            } else {
2222              
2223              $self->{state} = DATA_STATE;
2224              $self->{s_kwd} = '';
2225            }
2226          ## reconsume          ## reconsume
2227    
2228          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2221  sub _get_next_token ($) { Line 2249  sub _get_next_token ($) {
2249          redo A;          redo A;
2250        }        }
2251      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2252        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state" and "DOCTYPE markup
2253          ## declaration state".
2254                
2255        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2256                    
# Line 2243  sub _get_next_token ($) { Line 2272  sub _get_next_token ($) {
2272          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2273                    
2274          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2275          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2276                    
2277      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2262  sub _get_next_token ($) { Line 2291  sub _get_next_token ($) {
2291                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2292                                                    
2293          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2294          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2295                    
2296      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2297        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2332  sub _get_next_token ($) { Line 2361  sub _get_next_token ($) {
2361              0x0054, # T              0x0054, # T
2362              0x0059, # Y              0x0059, # Y
2363              0x0050, # P              0x0050, # P
2364            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2365            $self->{nc} == [            $self->{nc} == [
2366              undef,              undef,
2367              0x006F, # o              0x006F, # o
# Line 2340  sub _get_next_token ($) { Line 2369  sub _get_next_token ($) {
2369              0x0074, # t              0x0074, # t
2370              0x0079, # y              0x0079, # y
2371              0x0070, # p              0x0070, # p
2372            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2373                    
2374          ## Stay in the state.          ## Stay in the state.
2375          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2376                    
2377      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2378        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2356  sub _get_next_token ($) { Line 2385  sub _get_next_token ($) {
2385      }      }
2386        
2387          redo A;          redo A;
2388        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2389                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2390                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2391          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
2392                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2393                        
2394            ## XML5: case-sensitive.            ## XML5: case-sensitive.
2395            $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO            $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
# Line 2391  sub _get_next_token ($) { Line 2421  sub _get_next_token ($) {
2421                                    
2422          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2423                          line => $self->{line_prev},                          line => $self->{line_prev},
2424                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2425          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2426          ## Reconsume.          ## Reconsume.
2427          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2428                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2429                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2430                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2431                                   };                                   };
2432          redo A;          redo A;
2433        }        }
# Line 2408  sub _get_next_token ($) { Line 2438  sub _get_next_token ($) {
2438              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2439              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2440              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2441            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2442                    
2443          ## Stay in the state.          ## Stay in the state.
2444          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2445                    
2446      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2447        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2424  sub _get_next_token ($) { Line 2454  sub _get_next_token ($) {
2454      }      }
2455        
2456          redo A;          redo A;
2457        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2458                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2459          if ($self->{is_xml} and          if ($self->{is_xml} and
2460              not $self->{tainted} and              not $self->{tainted} and
# Line 2459  sub _get_next_token ($) { Line 2489  sub _get_next_token ($) {
2489                    
2490          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2491                          line => $self->{line_prev},                          line => $self->{line_prev},
2492                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2493          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2494          ## Reconsume.          ## Reconsume.
2495          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2496                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2497                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2498                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2499                                   };                                   };
2500          redo A;          redo A;
2501        }        }
# Line 2486  sub _get_next_token ($) { Line 2516  sub _get_next_token ($) {
2516        
2517          redo A;          redo A;
2518        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2519          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2520          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2521          $self->{s_kwd} = '';            
2522              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2523            } else {
2524              
2525              $self->{state} = DATA_STATE;
2526              $self->{s_kwd} = '';
2527            }
2528                    
2529      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2530        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2506  sub _get_next_token ($) { Line 2541  sub _get_next_token ($) {
2541    
2542          redo A;          redo A;
2543        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2544          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2545          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2546          $self->{s_kwd} = '';            
2547              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2548            } else {
2549              
2550              $self->{state} = DATA_STATE;
2551              $self->{s_kwd} = '';
2552            }
2553          ## reconsume          ## reconsume
2554    
2555          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2550  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590        
2591          redo A;          redo A;
2592        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2593          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2594          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2595          $self->{s_kwd} = '';            
2596              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597            } else {
2598              
2599              $self->{state} = DATA_STATE;
2600              $self->{s_kwd} = '';
2601            }
2602                    
2603      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2604        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2570  sub _get_next_token ($) { Line 2615  sub _get_next_token ($) {
2615    
2616          redo A;          redo A;
2617        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2618          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2620          $self->{s_kwd} = '';            
2621              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2622            } else {
2623              
2624              $self->{state} = DATA_STATE;
2625              $self->{s_kwd} = '';
2626            }
2627          ## reconsume          ## reconsume
2628    
2629          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2614  sub _get_next_token ($) { Line 2664  sub _get_next_token ($) {
2664        
2665          redo A;          redo A;
2666        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2667          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2669          $self->{s_kwd} = '';            
2670              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2671            } else {
2672              
2673              $self->{state} = DATA_STATE;
2674              $self->{s_kwd} = '';
2675            }
2676          ## reconsume          ## reconsume
2677    
2678          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2663  sub _get_next_token ($) { Line 2718  sub _get_next_token ($) {
2718        
2719          redo A;          redo A;
2720        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2721          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2722          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2723          $self->{state} = DATA_STATE;            
2724          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725            } else {
2726              
2727              $self->{state} = DATA_STATE;
2728              $self->{s_kwd} = '';
2729            }
2730          ## reconsume          ## reconsume
2731    
2732          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2692  sub _get_next_token ($) { Line 2751  sub _get_next_token ($) {
2751        }        }
2752      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2753        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2754                    if ($self->{in_subset}) {
2755          $self->{state} = DATA_STATE;            
2756          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2757            } else {
2758              
2759              $self->{state} = DATA_STATE;
2760              $self->{s_kwd} = '';
2761            }
2762                    
2763      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2731  sub _get_next_token ($) { Line 2795  sub _get_next_token ($) {
2795        
2796          redo A;          redo A;
2797        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2798          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2799          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2800          $self->{s_kwd} = '';            
2801              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2802            } else {
2803              
2804              $self->{state} = DATA_STATE;
2805              $self->{s_kwd} = '';
2806            }
2807          ## reconsume          ## reconsume
2808    
2809          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2779  sub _get_next_token ($) { Line 2848  sub _get_next_token ($) {
2848          redo A;          redo A;
2849        } else {        } else {
2850                    
2851            ## XML5: Unless EOF, swith to the bogus comment state.
2852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2853          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2854          ## reconsume          ## reconsume
2855          redo A;          redo A;
2856        }        }
2857      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2858          ## XML5: "DOCTYPE root name before state".
2859    
2860        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2861                    
2862          ## Stay in the state          ## Stay in the state
# Line 2802  sub _get_next_token ($) { Line 2874  sub _get_next_token ($) {
2874          redo A;          redo A;
2875        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2876                    
2877            ## XML5: No parse error.
2878          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2879          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2880          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2830  sub _get_next_token ($) { Line 2903  sub _get_next_token ($) {
2903          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
2904    
2905          redo A;          redo A;
2906          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2907            
2908            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2909            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2911            $self->{in_subset} = 1;
2912            
2913        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2914          $self->{line_prev} = $self->{line};
2915          $self->{column_prev} = $self->{column};
2916          $self->{column}++;
2917          $self->{nc}
2918              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2919        } else {
2920          $self->{set_nc}->($self);
2921        }
2922      
2923            return  ($self->{ct}); # DOCTYPE
2924            redo A;
2925        } else {        } else {
2926                    
2927          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2849  sub _get_next_token ($) { Line 2941  sub _get_next_token ($) {
2941          redo A;          redo A;
2942        }        }
2943      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2944  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2945    
2946          ## ISSUE: Redundant "First," in the spec.
2947    
2948        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2949                    
2950          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2895  sub _get_next_token ($) { Line 2990  sub _get_next_token ($) {
2990          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
2991    
2992          redo A;          redo A;
2993          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2994            
2995            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2996            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2997            $self->{in_subset} = 1;
2998            
2999        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3000          $self->{line_prev} = $self->{line};
3001          $self->{column_prev} = $self->{column};
3002          $self->{column}++;
3003          $self->{nc}
3004              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3005        } else {
3006          $self->{set_nc}->($self);
3007        }
3008      
3009            return  ($self->{ct}); # DOCTYPE
3010            redo A;
3011        } else {        } else {
3012                    
3013          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2914  sub _get_next_token ($) { Line 3027  sub _get_next_token ($) {
3027          redo A;          redo A;
3028        }        }
3029      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3030          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3031          ## state", but implemented differently.
3032    
3033        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3034                    
3035          ## Stay in the state          ## Stay in the state
# Line 2961  sub _get_next_token ($) { Line 3077  sub _get_next_token ($) {
3077          redo A;          redo A;
3078        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3079                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3080            
3081          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3082          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3083                    
3084      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2977  sub _get_next_token ($) { Line 3094  sub _get_next_token ($) {
3094          redo A;          redo A;
3095        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3096                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3097            
3098          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3099          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3100            
3101        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102          $self->{line_prev} = $self->{line};
3103          $self->{column_prev} = $self->{column};
3104          $self->{column}++;
3105          $self->{nc}
3106              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107        } else {
3108          $self->{set_nc}->($self);
3109        }
3110      
3111            redo A;
3112          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3113            
3114            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3116            $self->{in_subset} = 1;
3117                    
3118      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2990  sub _get_next_token ($) { Line 3125  sub _get_next_token ($) {
3125        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3126      }      }
3127        
3128            return  ($self->{ct}); # DOCTYPE
3129          redo A;          redo A;
3130        } else {        } else {
3131                    
# Line 3018  sub _get_next_token ($) { Line 3154  sub _get_next_token ($) {
3154              0x0042, # B              0x0042, # B
3155              0x004C, # L              0x004C, # L
3156              0x0049, # I              0x0049, # I
3157            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3158            $self->{nc} == [            $self->{nc} == [
3159              undef,              undef,
3160              0x0075, # u              0x0075, # u
3161              0x0062, # b              0x0062, # b
3162              0x006C, # l              0x006C, # l
3163              0x0069, # i              0x0069, # i
3164            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3165                    
3166          ## Stay in the state.          ## Stay in the state.
3167          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3168                    
3169      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3170        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3041  sub _get_next_token ($) { Line 3177  sub _get_next_token ($) {
3177      }      }
3178        
3179          redo A;          redo A;
3180        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3181                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3182                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3183                    if ($self->{is_xml} and
3184                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3185              
3186              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3187                              text => 'PUBLIC',
3188                              line => $self->{line_prev},
3189                              column => $self->{column_prev} - 4);
3190            } else {
3191              
3192            }
3193          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3194                    
3195      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3062  sub _get_next_token ($) { Line 3207  sub _get_next_token ($) {
3207                    
3208          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3209                          line => $self->{line_prev},                          line => $self->{line_prev},
3210                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3211          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3212    
3213          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 3077  sub _get_next_token ($) { Line 3222  sub _get_next_token ($) {
3222              0x0053, # S              0x0053, # S
3223              0x0054, # T              0x0054, # T
3224              0x0045, # E              0x0045, # E
3225            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3226            $self->{nc} == [            $self->{nc} == [
3227              undef,              undef,
3228              0x0079, # y              0x0079, # y
3229              0x0073, # s              0x0073, # s
3230              0x0074, # t              0x0074, # t
3231              0x0065, # e              0x0065, # e
3232            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3233                    
3234          ## Stay in the state.          ## Stay in the state.
3235          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3236                    
3237      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3100  sub _get_next_token ($) { Line 3245  sub _get_next_token ($) {
3245      }      }
3246        
3247          redo A;          redo A;
3248        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3249                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3250                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3251                    if ($self->{is_xml} and
3252                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3253              
3254              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3255                              text => 'SYSTEM',
3256                              line => $self->{line_prev},
3257                              column => $self->{column_prev} - 4);
3258            } else {
3259              
3260            }
3261          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3262                    
3263      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3121  sub _get_next_token ($) { Line 3275  sub _get_next_token ($) {
3275                    
3276          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3277                          line => $self->{line_prev},                          line => $self->{line_prev},
3278                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3279          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3280    
3281          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 3210  sub _get_next_token ($) { Line 3364  sub _get_next_token ($) {
3364          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3365    
3366          redo A;          redo A;
3367          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3368            
3369            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3370            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3371            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3372            $self->{in_subset} = 1;
3373            
3374        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375          $self->{line_prev} = $self->{line};
3376          $self->{column_prev} = $self->{column};
3377          $self->{column}++;
3378          $self->{nc}
3379              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3380        } else {
3381          $self->{set_nc}->($self);
3382        }
3383      
3384            return  ($self->{ct}); # DOCTYPE
3385            redo A;
3386        } else {        } else {
3387                    
3388          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
# Line 3420  sub _get_next_token ($) { Line 3593  sub _get_next_token ($) {
3593        
3594          redo A;          redo A;
3595        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3596                    if ($self->{is_xml}) {
3597              
3598              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3599            } else {
3600              
3601            }
3602          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3603          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3604                    
# Line 3450  sub _get_next_token ($) { Line 3628  sub _get_next_token ($) {
3628          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3629    
3630          redo A;          redo A;
3631          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3632            
3633            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3634            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3635            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3636            $self->{in_subset} = 1;
3637            
3638        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639          $self->{line_prev} = $self->{line};
3640          $self->{column_prev} = $self->{column};
3641          $self->{column}++;
3642          $self->{nc}
3643              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644        } else {
3645          $self->{set_nc}->($self);
3646        }
3647      
3648            return  ($self->{ct}); # DOCTYPE
3649            redo A;
3650        } else {        } else {
3651                    
3652          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
# Line 3550  sub _get_next_token ($) { Line 3747  sub _get_next_token ($) {
3747          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3748    
3749          redo A;          redo A;
3750          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3751            
3752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3753    
3754            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3756            $self->{in_subset} = 1;
3757            
3758        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3759          $self->{line_prev} = $self->{line};
3760          $self->{column_prev} = $self->{column};
3761          $self->{column}++;
3762          $self->{nc}
3763              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3764        } else {
3765          $self->{set_nc}->($self);
3766        }
3767      
3768            return  ($self->{ct}); # DOCTYPE
3769            redo A;
3770        } else {        } else {
3771                    
3772          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
# Line 3585  sub _get_next_token ($) { Line 3802  sub _get_next_token ($) {
3802      }      }
3803        
3804          redo A;          redo A;
3805        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3806                    
3807          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3808    
# Line 3656  sub _get_next_token ($) { Line 3873  sub _get_next_token ($) {
3873      }      }
3874        
3875          redo A;          redo A;
3876        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3877                    
3878          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3879    
# Line 3757  sub _get_next_token ($) { Line 3974  sub _get_next_token ($) {
3974          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3975    
3976          redo A;          redo A;
3977          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3978            
3979            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3980            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3981            $self->{in_subset} = 1;
3982            
3983        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3984          $self->{line_prev} = $self->{line};
3985          $self->{column_prev} = $self->{column};
3986          $self->{column}++;
3987          $self->{nc}
3988              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3989        } else {
3990          $self->{set_nc}->($self);
3991        }
3992      
3993            return  ($self->{ct}); # DOCTYPE
3994            redo A;
3995        } else {        } else {
3996                    
3997          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
# Line 3796  sub _get_next_token ($) { Line 4031  sub _get_next_token ($) {
4031          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4032    
4033          redo A;          redo A;
4034          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4035            
4036            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4037            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4038            $self->{in_subset} = 1;
4039            
4040        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041          $self->{line_prev} = $self->{line};
4042          $self->{column_prev} = $self->{column};
4043          $self->{column}++;
4044          $self->{nc}
4045              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4046        } else {
4047          $self->{set_nc}->($self);
4048        }
4049      
4050            return  ($self->{ct}); # DOCTYPE
4051            redo A;
4052        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4053                    
4054          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3808  sub _get_next_token ($) { Line 4061  sub _get_next_token ($) {
4061        } else {        } else {
4062                    
4063          my $s = '';          my $s = '';
4064          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4065    
4066          ## Stay in the state          ## Stay in the state
4067                    
# Line 3976  sub _get_next_token ($) { Line 4229  sub _get_next_token ($) {
4229        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4230                    
4231          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4232          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4233                    
4234      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3996  sub _get_next_token ($) { Line 4249  sub _get_next_token ($) {
4249                    
4250          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4251          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4252          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4253          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4254          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4255                    
4256      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4047  sub _get_next_token ($) { Line 4300  sub _get_next_token ($) {
4300            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4301                    
4302          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4303          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4304                    
4305      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4306        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4064  sub _get_next_token ($) { Line 4317  sub _get_next_token ($) {
4317                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4318                    
4319          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4320          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4321                    
4322      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4323        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4110  sub _get_next_token ($) { Line 4363  sub _get_next_token ($) {
4363        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4364            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4365                    
4366          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4367          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4368                    
4369          ## Stay in the state.          ## Stay in the state.
4370                    
# Line 4147  sub _get_next_token ($) { Line 4400  sub _get_next_token ($) {
4400          #          #
4401        }        }
4402    
4403        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4404        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4405        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4406        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4190  sub _get_next_token ($) { Line 4443  sub _get_next_token ($) {
4443          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4444                    
4445          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4446          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4447          ## Reconsume.          ## Reconsume.
4448          redo A;          redo A;
4449        } else {        } else {
# Line 4208  sub _get_next_token ($) { Line 4461  sub _get_next_token ($) {
4461            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4462            ## Reconsume.            ## Reconsume.
4463            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4464                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4465                      line => $self->{line_prev},                      line => $self->{line_prev},
4466                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4467                     });                     });
4468            redo A;            redo A;
4469          } else {          } else {
4470                        
4471            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4472            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4473            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4474            ## Reconsume.            ## Reconsume.
# Line 4226  sub _get_next_token ($) { Line 4479  sub _get_next_token ($) {
4479        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4480          # 0..9          # 0..9
4481                    
4482          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4483          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4484          ## Stay in the state.          ## Stay in the state.
4485                    
4486      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4244  sub _get_next_token ($) { Line 4497  sub _get_next_token ($) {
4497        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4498                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4499                    
4500          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4501          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4502          ## Stay in the state.          ## Stay in the state.
4503                    
4504      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4262  sub _get_next_token ($) { Line 4515  sub _get_next_token ($) {
4515        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4516                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4517                    
4518          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4519          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4520          ## Stay in the state.          ## Stay in the state.
4521                    
4522      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4300  sub _get_next_token ($) { Line 4553  sub _get_next_token ($) {
4553          #          #
4554        }        }
4555    
4556        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4557        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4558        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4559        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4337  sub _get_next_token ($) { Line 4590  sub _get_next_token ($) {
4590          redo A;          redo A;
4591        }        }
4592      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4593        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4594            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4595            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
4596              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 4347  sub _get_next_token ($) { Line 4600  sub _get_next_token ($) {
4600              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
4601             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
4602          our $EntityChar;          our $EntityChar;
4603          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4604          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
4605            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
4606                            
4607              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4608              $self->{entity__match} = 1;              $self->{entity__match} = 1;
4609                            
4610      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4367  sub _get_next_token ($) { Line 4620  sub _get_next_token ($) {
4620              #              #
4621            } else {            } else {
4622                            
4623              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4624              $self->{entity__match} = -1;              $self->{entity__match} = -1;
4625              ## Stay in the state.              ## Stay in the state.
4626                            
# Line 4415  sub _get_next_token ($) { Line 4668  sub _get_next_token ($) {
4668          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
4669              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
4670                        
4671            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
4672            #            #
4673          } else {          } else {
4674                        
# Line 4427  sub _get_next_token ($) { Line 4680  sub _get_next_token ($) {
4680                    
4681          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4682                          line => $self->{line_prev},                          line => $self->{line_prev},
4683                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
4684          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
4685          #          #
4686        }        }
4687        
# Line 4451  sub _get_next_token ($) { Line 4704  sub _get_next_token ($) {
4704                    data => $data,                    data => $data,
4705                    has_reference => $has_ref,                    has_reference => $has_ref,
4706                    line => $self->{line_prev},                    line => $self->{line_prev},
4707                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
4708                   });                   });
4709          redo A;          redo A;
4710        } else {        } else {
# Line 4521  sub _get_next_token ($) { Line 4774  sub _get_next_token ($) {
4774          redo A;          redo A;
4775        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4776          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4777          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4778          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4779            } else {
4780              $self->{state} = DATA_STATE;
4781              $self->{s_kwd} = '';
4782            }
4783          ## Reconsume.          ## Reconsume.
4784          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
4785          redo A;          redo A;
# Line 4593  sub _get_next_token ($) { Line 4850  sub _get_next_token ($) {
4850          redo A;          redo A;
4851        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4853          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4854          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4855            } else {
4856              $self->{state} = DATA_STATE;
4857              $self->{s_kwd} = '';
4858            }
4859          ## Reprocess.          ## Reprocess.
4860          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
4861          redo A;          redo A;
# Line 4619  sub _get_next_token ($) { Line 4880  sub _get_next_token ($) {
4880        }        }
4881      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
4882        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4883          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4884          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4885            } else {
4886              $self->{state} = DATA_STATE;
4887              $self->{s_kwd} = '';
4888            }
4889                    
4890      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4891        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4665  sub _get_next_token ($) { Line 4930  sub _get_next_token ($) {
4930      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4931        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" in XML5
4932        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4933          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4934          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935            } else {
4936              $self->{state} = DATA_STATE;
4937              $self->{s_kwd} = '';
4938            }
4939                    
4940      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4941        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4701  sub _get_next_token ($) { Line 4970  sub _get_next_token ($) {
4970          ## Reprocess.          ## Reprocess.
4971          redo A;          redo A;
4972        }        }
4973    
4974        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4975          if ($self->{nc} == 0x003C) { # <
4976            $self->{state} = DOCTYPE_TAG_STATE;
4977            
4978        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4979          $self->{line_prev} = $self->{line};
4980          $self->{column_prev} = $self->{column};
4981          $self->{column}++;
4982          $self->{nc}
4983              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4984        } else {
4985          $self->{set_nc}->($self);
4986        }
4987      
4988            redo A;
4989          } elsif ($self->{nc} == 0x0025) { # %
4990            ## XML5: Not defined yet.
4991    
4992            ## TODO:
4993            
4994        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995          $self->{line_prev} = $self->{line};
4996          $self->{column_prev} = $self->{column};
4997          $self->{column}++;
4998          $self->{nc}
4999              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000        } else {
5001          $self->{set_nc}->($self);
5002        }
5003      
5004            redo A;
5005          } elsif ($self->{nc} == 0x005D) { # ]
5006            delete $self->{in_subset};
5007            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5008            
5009        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5010          $self->{line_prev} = $self->{line};
5011          $self->{column_prev} = $self->{column};
5012          $self->{column}++;
5013          $self->{nc}
5014              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5015        } else {
5016          $self->{set_nc}->($self);
5017        }
5018      
5019            redo A;
5020          } elsif ($is_space->{$self->{nc}}) {
5021            ## Stay in the state.
5022            
5023        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5024          $self->{line_prev} = $self->{line};
5025          $self->{column_prev} = $self->{column};
5026          $self->{column}++;
5027          $self->{nc}
5028              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5029        } else {
5030          $self->{set_nc}->($self);
5031        }
5032      
5033            redo A;
5034          } elsif ($self->{nc} == -1) {
5035            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5036            delete $self->{in_subset};
5037            $self->{state} = DATA_STATE;
5038            $self->{s_kwd} = '';
5039            ## Reconsume.
5040            return  ({type => END_OF_DOCTYPE_TOKEN});
5041            redo A;
5042          } else {
5043            unless ($self->{internal_subset_tainted}) {
5044              ## XML5: No parse error.
5045              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5046              $self->{internal_subset_tainted} = 1;
5047            }
5048            ## Stay in the state.
5049            
5050        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5051          $self->{line_prev} = $self->{line};
5052          $self->{column_prev} = $self->{column};
5053          $self->{column}++;
5054          $self->{nc}
5055              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5056        } else {
5057          $self->{set_nc}->($self);
5058        }
5059      
5060            redo A;
5061          }
5062        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5063          if ($self->{nc} == 0x003E) { # >
5064            $self->{state} = DATA_STATE;
5065            $self->{s_kwd} = '';
5066            
5067        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5068          $self->{line_prev} = $self->{line};
5069          $self->{column_prev} = $self->{column};
5070          $self->{column}++;
5071          $self->{nc}
5072              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5073        } else {
5074          $self->{set_nc}->($self);
5075        }
5076      
5077            return  ({type => END_OF_DOCTYPE_TOKEN});
5078            redo A;
5079          } elsif ($self->{nc} == -1) {
5080            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5081            $self->{state} = DATA_STATE;
5082            $self->{s_kwd} = '';
5083            ## Reconsume.
5084            return  ({type => END_OF_DOCTYPE_TOKEN});
5085            redo A;
5086          } else {
5087            ## XML5: No parse error and stay in the state.
5088            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5089    
5090            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5091            
5092        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5093          $self->{line_prev} = $self->{line};
5094          $self->{column_prev} = $self->{column};
5095          $self->{column}++;
5096          $self->{nc}
5097              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5098        } else {
5099          $self->{set_nc}->($self);
5100        }
5101      
5102            redo A;
5103          }
5104        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5105          if ($self->{nc} == 0x003E) { # >
5106            $self->{state} = DATA_STATE;
5107            $self->{s_kwd} = '';
5108            
5109        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5110          $self->{line_prev} = $self->{line};
5111          $self->{column_prev} = $self->{column};
5112          $self->{column}++;
5113          $self->{nc}
5114              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5115        } else {
5116          $self->{set_nc}->($self);
5117        }
5118      
5119            return  ({type => END_OF_DOCTYPE_TOKEN});
5120            redo A;
5121          } elsif ($self->{nc} == -1) {
5122            $self->{state} = DATA_STATE;
5123            $self->{s_kwd} = '';
5124            ## Reconsume.
5125            return  ({type => END_OF_DOCTYPE_TOKEN});
5126            redo A;
5127          } else {
5128            ## Stay in the state.
5129            
5130        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5131          $self->{line_prev} = $self->{line};
5132          $self->{column_prev} = $self->{column};
5133          $self->{column}++;
5134          $self->{nc}
5135              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5136        } else {
5137          $self->{set_nc}->($self);
5138        }
5139      
5140            redo A;
5141          }
5142        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5143          if ($self->{nc} == 0x0021) { # !
5144            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
5145            
5146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5147          $self->{line_prev} = $self->{line};
5148          $self->{column_prev} = $self->{column};
5149          $self->{column}++;
5150          $self->{nc}
5151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5152        } else {
5153          $self->{set_nc}->($self);
5154        }
5155      
5156            redo A;
5157          } elsif ($self->{nc} == 0x003F) { # ?
5158            $self->{state} = PI_STATE;
5159            
5160        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5161          $self->{line_prev} = $self->{line};
5162          $self->{column_prev} = $self->{column};
5163          $self->{column}++;
5164          $self->{nc}
5165              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5166        } else {
5167          $self->{set_nc}->($self);
5168        }
5169      
5170            redo A;
5171          } elsif ($self->{nc} == -1) {
5172            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5173            $self->{state} = DATA_STATE;
5174            $self->{s_kwd} = '';
5175            ## Reconsume.
5176            redo A;
5177          } else {
5178            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5179                            line => $self->{line_prev},
5180                            column => $self->{column_prev});
5181            $self->{state} = BOGUS_COMMENT_STATE;
5182            $self->{ct} = {type => COMMENT_TOKEN,
5183                           data => '',
5184                          }; ## NOTE: Will be discarded.
5185            
5186        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5187          $self->{line_prev} = $self->{line};
5188          $self->{column_prev} = $self->{column};
5189          $self->{column}++;
5190          $self->{nc}
5191              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5192        } else {
5193          $self->{set_nc}->($self);
5194        }
5195      
5196            redo A;
5197          }
5198                    
5199      } else {      } else {
5200        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.13

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24