/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.18 by wakaba, Sun Oct 19 06:14:57 2008 UTC | revision 1.34 by wakaba, Sat Sep 5 11:31:58 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 182  sub NDATA_STATE () { 86 } Line 184  sub NDATA_STATE () { 86 }
184  sub AFTER_NDATA_STATE () { 87 }  sub AFTER_NDATA_STATE () { 87 }
185  sub BEFORE_NOTATION_NAME_STATE () { 88 }  sub BEFORE_NOTATION_NAME_STATE () { 88 }
186  sub NOTATION_NAME_STATE () { 89 }  sub NOTATION_NAME_STATE () { 89 }
187  sub AFTER_NOTATION_NAME_STATE () { 90 }  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188  sub BOGUS_MD_STATE () { 91 }  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190    sub AFTER_ELEMENT_NAME_STATE () { 93 }
191    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192    sub CONTENT_KEYWORD_STATE () { 95 }
193    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194    sub CM_ELEMENT_NAME_STATE () { 97 }
195    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197    sub AFTER_MD_DEF_STATE () { 100 }
198    sub BOGUS_MD_STATE () { 101 }
199    
200  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
201  ## list and descriptions)  ## list and descriptions)
# Line 194  sub FOREIGN_EL () { 0b1_00000000000 } Line 206  sub FOREIGN_EL () { 0b1_00000000000 }
206  ## Character reference mappings  ## Character reference mappings
207    
208  my $charref_map = {  my $charref_map = {
209      0x00 => 0xFFFD, # REPLACEMENT CHARACTER
210    0x0D => 0x000A,    0x0D => 0x000A,
211    0x80 => 0x20AC,    0x80 => 0x20AC,
212    0x81 => 0xFFFD,    0x81 => 0x0081,
213    0x82 => 0x201A,    0x82 => 0x201A,
214    0x83 => 0x0192,    0x83 => 0x0192,
215    0x84 => 0x201E,    0x84 => 0x201E,
# Line 208  my $charref_map = { Line 221  my $charref_map = {
221    0x8A => 0x0160,    0x8A => 0x0160,
222    0x8B => 0x2039,    0x8B => 0x2039,
223    0x8C => 0x0152,    0x8C => 0x0152,
224    0x8D => 0xFFFD,    0x8D => 0x008D,
225    0x8E => 0x017D,    0x8E => 0x017D,
226    0x8F => 0xFFFD,    0x8F => 0x008F,
227    0x90 => 0xFFFD,    0x90 => 0x0090,
228    0x91 => 0x2018,    0x91 => 0x2018,
229    0x92 => 0x2019,    0x92 => 0x2019,
230    0x93 => 0x201C,    0x93 => 0x201C,
# Line 224  my $charref_map = { Line 237  my $charref_map = {
237    0x9A => 0x0161,    0x9A => 0x0161,
238    0x9B => 0x203A,    0x9B => 0x203A,
239    0x9C => 0x0153,    0x9C => 0x0153,
240    0x9D => 0xFFFD,    0x9D => 0x009D,
241    0x9E => 0x017E,    0x9E => 0x017E,
242    0x9F => 0x0178,    0x9F => 0x0178,
243  }; # $charref_map  }; # $charref_map
244  $charref_map->{$_} = 0xFFFD  $charref_map->{$_} = $_
245      for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,      for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
246          0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF          0xD800..0xDFFF, 0xFDD0..0xFDEF,
247          0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,          0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
248          0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,          0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
249          0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,          0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
# Line 1090  sub _get_next_token ($) { Line 1103  sub _get_next_token ($) {
1103          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1104          # reconsume          # reconsume
1105    
1106          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1107            #return  ($self->{ct}); # start tag or end tag
1108    
1109          redo A;          redo A;
1110        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 1231  sub _get_next_token ($) { Line 1245  sub _get_next_token ($) {
1245          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1246          # reconsume          # reconsume
1247    
1248          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1249            #return  ($self->{ct}); # start tag or end tag
1250    
1251          redo A;          redo A;
1252        } else {        } else {
1253          if ({          if ({
1254               0x0022 => 1, # "               0x0022 => 1, # "
1255               0x0027 => 1, # '               0x0027 => 1, # '
1256                 0x003C => 1, # <
1257               0x003D => 1, # =               0x003D => 1, # =
1258              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1259                        
# Line 1416  sub _get_next_token ($) { Line 1432  sub _get_next_token ($) {
1432          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1433          # reconsume          # reconsume
1434    
1435          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1436            #return  ($self->{ct}); # start tag or end tag
1437    
1438          redo A;          redo A;
1439        } else {        } else {
1440          if ($self->{nc} == 0x0022 or # "          if ({
1441              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1442                 0x0027 => 1, # '
1443                 0x003C => 1, # <
1444                }->{$self->{nc}}) {
1445                        
1446            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1447            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1580  sub _get_next_token ($) { Line 1600  sub _get_next_token ($) {
1600          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1601          # reconsume          # reconsume
1602    
1603          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1604            #return  ($self->{ct}); # start tag or end tag
1605    
1606          redo A;          redo A;
1607        } else {        } else {
# Line 1592  sub _get_next_token ($) { Line 1613  sub _get_next_token ($) {
1613                        
1614          }          }
1615    
1616          if ($self->{nc} == 0x0022 or # "          if ({
1617              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1618                 0x0027 => 1, # '
1619                 0x003C => 1, # <
1620                }->{$self->{nc}}) {
1621                        
1622            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1623            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1726  sub _get_next_token ($) { Line 1750  sub _get_next_token ($) {
1750          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1751          ## reconsume          ## reconsume
1752    
1753          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1754            #return  ($self->{ct}); # start tag or end tag
1755    
1756          redo A;          redo A;
1757        } else {        } else {
1758          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1759                        
1760            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1761            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
# Line 1806  sub _get_next_token ($) { Line 1831  sub _get_next_token ($) {
1831      }      }
1832        
1833          redo A;          redo A;
1834          } elsif ($self->{is_xml} and
1835                   $is_space->{$self->{nc}}) {
1836            
1837            $self->{ca}->{value} .= ' ';
1838            ## Stay in the state.
1839            
1840        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1841          $self->{line_prev} = $self->{line};
1842          $self->{column_prev} = $self->{column};
1843          $self->{column}++;
1844          $self->{nc}
1845              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1846        } else {
1847          $self->{set_nc}->($self);
1848        }
1849      
1850            redo A;
1851        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1853          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1830  sub _get_next_token ($) { Line 1872  sub _get_next_token ($) {
1872            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1873            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1874            ## reconsume            ## reconsume
1875            return  ($self->{ct}); # end tag  
1876              ## Discard the token.
1877              #return  ($self->{ct}); # end tag
1878    
1879            redo A;            redo A;
1880          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1881            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1882            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1883            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1884            ## Reconsume.            ## Reconsume.
1885            return  ($self->{ct}); # ATTLIST  
1886              ## Discard the token.
1887              #return  ($self->{ct}); # ATTLIST
1888    
1889            redo A;            redo A;
1890          } else {          } else {
1891            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1853  sub _get_next_token ($) { Line 1901  sub _get_next_token ($) {
1901          }          }
1902          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1903          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1904                                q["&<],                                qq["&<\x09\x0C\x20],
1905                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1906    
1907          ## Stay in the state          ## Stay in the state
# Line 1920  sub _get_next_token ($) { Line 1968  sub _get_next_token ($) {
1968      }      }
1969        
1970          redo A;          redo A;
1971          } elsif ($self->{is_xml} and
1972                   $is_space->{$self->{nc}}) {
1973            
1974            $self->{ca}->{value} .= ' ';
1975            ## Stay in the state.
1976            
1977        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1978          $self->{line_prev} = $self->{line};
1979          $self->{column_prev} = $self->{column};
1980          $self->{column}++;
1981          $self->{nc}
1982              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1983        } else {
1984          $self->{set_nc}->($self);
1985        }
1986      
1987            redo A;
1988        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1989          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1990          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1929  sub _get_next_token ($) { Line 1994  sub _get_next_token ($) {
1994            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1995            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1996            ## reconsume            ## reconsume
1997            return  ($self->{ct}); # start tag  
1998              ## Discard the token.
1999              #return  ($self->{ct}); # start tag
2000    
2001            redo A;            redo A;
2002          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2003            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
# Line 1944  sub _get_next_token ($) { Line 2012  sub _get_next_token ($) {
2012            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2013            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2014            ## reconsume            ## reconsume
2015            return  ($self->{ct}); # end tag  
2016              ## Discard the token.
2017              #return  ($self->{ct}); # end tag
2018    
2019            redo A;            redo A;
2020          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2021            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
2022            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2023            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2024            ## Reconsume.            ## Reconsume.
2025            return  ($self->{ct}); # ATTLIST  
2026              ## Discard the token.
2027              #return  ($self->{ct}); # ATTLIST
2028    
2029            redo A;            redo A;
2030          } else {          } else {
2031            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1967  sub _get_next_token ($) { Line 2041  sub _get_next_token ($) {
2041          }          }
2042          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2043          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2044                                q['&<],                                qq['&<\x09\x0C\x20],
2045                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2046    
2047          ## Stay in the state          ## Stay in the state
# Line 2106  sub _get_next_token ($) { Line 2180  sub _get_next_token ($) {
2180            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2181            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2182            ## reconsume            ## reconsume
2183            return  ($self->{ct}); # start tag  
2184              ## Discard the token.
2185              #return  ($self->{ct}); # start tag
2186              
2187            redo A;            redo A;
2188          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2189            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
# Line 2122  sub _get_next_token ($) { Line 2199  sub _get_next_token ($) {
2199            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2200            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2201            ## reconsume            ## reconsume
2202            return  ($self->{ct}); # end tag  
2203              ## Discard the token.
2204              #return  ($self->{ct}); # end tag
2205    
2206            redo A;            redo A;
2207          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2208            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2209            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2210            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2211            ## Reconsume.            ## Reconsume.
2212            return  ($self->{ct}); # ATTLIST  
2213              ## Discard the token.
2214              #return  ($self->{ct}); # ATTLIST
2215    
2216            redo A;            redo A;
2217          } else {          } else {
2218            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 2139  sub _get_next_token ($) { Line 2222  sub _get_next_token ($) {
2222               0x0022 => 1, # "               0x0022 => 1, # "
2223               0x0027 => 1, # '               0x0027 => 1, # '
2224               0x003D => 1, # =               0x003D => 1, # =
2225                 0x003C => 1, # <
2226              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2227                        
2228            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 2148  sub _get_next_token ($) { Line 2232  sub _get_next_token ($) {
2232          }          }
2233          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2234          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2235                                q["'=& >],                                qq["'=& \x09\x0C>],
2236                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2237    
2238          ## Stay in the state          ## Stay in the state
# Line 2248  sub _get_next_token ($) { Line 2332  sub _get_next_token ($) {
2332          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2333          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2334          ## Reconsume.          ## Reconsume.
2335          return  ($self->{ct}); # start tag or end tag  
2336            ## Discard the token.
2337            #return  ($self->{ct}); # start tag or end tag
2338    
2339          redo A;          redo A;
2340        } else {        } else {
2341                    
# Line 2315  sub _get_next_token ($) { Line 2402  sub _get_next_token ($) {
2402          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2403          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2404          ## Reconsume.          ## Reconsume.
2405          return  ($self->{ct}); # start tag or end tag  
2406            ## Discard the token.
2407            #return  ($self->{ct}); # start tag or end tag
2408    
2409          redo A;          redo A;
2410        } else {        } else {
2411                    
# Line 2890  sub _get_next_token ($) { Line 2980  sub _get_next_token ($) {
2980        
2981          redo A;          redo A;
2982        }        }
2983      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2984                 $self->{state} == COMMENT_END_BANG_STATE) {
2985        ## XML5: "Comment end state" and "DOCTYPE comment end state".        ## XML5: "Comment end state" and "DOCTYPE comment end state".
2986          ## (No comment end bang state.)
2987    
2988        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2989          if ($self->{in_subset}) {          if ($self->{in_subset}) {
# Line 2918  sub _get_next_token ($) { Line 3010  sub _get_next_token ($) {
3010    
3011          redo A;          redo A;
3012        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
3013            if ($self->{state} == COMMENT_END_BANG_STATE) {
3014              
3015              $self->{ct}->{data} .= '--!'; # comment
3016              $self->{state} = COMMENT_END_DASH_STATE;
3017            } else {
3018              
3019              ## XML5: Not a parse error.
3020              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3021                              line => $self->{line_prev},
3022                              column => $self->{column_prev});
3023              $self->{ct}->{data} .= '-'; # comment
3024              ## Stay in the state
3025            }
3026                    
3027          ## XML5: Not a parse error.      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3028          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',        $self->{line_prev} = $self->{line};
3029                          line => $self->{line_prev},        $self->{column_prev} = $self->{column};
3030                          column => $self->{column_prev});        $self->{column}++;
3031          $self->{ct}->{data} .= '-'; # comment        $self->{nc}
3032          ## Stay in the state            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3033        } else {
3034          $self->{set_nc}->($self);
3035        }
3036      
3037            redo A;
3038          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3039                   $is_space->{$self->{nc}}) {
3040            
3041            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3042            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3043            $self->{state} = COMMENT_END_SPACE_STATE;
3044            
3045        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3046          $self->{line_prev} = $self->{line};
3047          $self->{column_prev} = $self->{column};
3048          $self->{column}++;
3049          $self->{nc}
3050              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3051        } else {
3052          $self->{set_nc}->($self);
3053        }
3054      
3055            redo A;
3056          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3057                   $self->{nc} == 0x0021) { # !
3058            
3059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3060            $self->{state} = COMMENT_END_BANG_STATE;
3061                    
3062      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3063        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2947  sub _get_next_token ($) { Line 3080  sub _get_next_token ($) {
3080            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
3081            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3082          }          }
3083          ## reconsume          ## Reconsume.
3084    
3085          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3086    
3087          redo A;          redo A;
3088        } else {        } else {
3089                    
3090          ## XML5: Not a parse error.          if ($self->{state} == COMMENT_END_BANG_STATE) {
3091          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',            $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3092                          line => $self->{line_prev},          } else {
3093                          column => $self->{column_prev});            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3094          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          }
3095          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3096                    
3097      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2973  sub _get_next_token ($) { Line 3106  sub _get_next_token ($) {
3106        
3107          redo A;          redo A;
3108        }        }
3109        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3110          ## XML5: Not exist.
3111    
3112          if ($self->{nc} == 0x003E) { # >
3113            if ($self->{in_subset}) {
3114              
3115              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3116            } else {
3117              
3118              $self->{state} = DATA_STATE;
3119              $self->{s_kwd} = '';
3120            }
3121            
3122        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123          $self->{line_prev} = $self->{line};
3124          $self->{column_prev} = $self->{column};
3125          $self->{column}++;
3126          $self->{nc}
3127              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128        } else {
3129          $self->{set_nc}->($self);
3130        }
3131      
3132    
3133            return  ($self->{ct}); # comment
3134    
3135            redo A;
3136          } elsif ($is_space->{$self->{nc}}) {
3137            
3138            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3139            ## Stay in the state.
3140            
3141        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3142          $self->{line_prev} = $self->{line};
3143          $self->{column_prev} = $self->{column};
3144          $self->{column}++;
3145          $self->{nc}
3146              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3147        } else {
3148          $self->{set_nc}->($self);
3149        }
3150      
3151            redo A;
3152          } elsif ($self->{nc} == -1) {
3153            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3154            if ($self->{in_subset}) {
3155              
3156              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3157            } else {
3158              
3159              $self->{state} = DATA_STATE;
3160              $self->{s_kwd} = '';
3161            }
3162            ## Reconsume.
3163    
3164            return  ($self->{ct}); # comment
3165    
3166            redo A;
3167          } else {
3168            
3169            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3170            $self->{state} = COMMENT_STATE;
3171            
3172        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3173          $self->{line_prev} = $self->{line};
3174          $self->{column_prev} = $self->{column};
3175          $self->{column}++;
3176          $self->{nc}
3177              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3178        } else {
3179          $self->{set_nc}->($self);
3180        }
3181      
3182            redo A;
3183          }
3184      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3185        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3186                    
# Line 2989  sub _get_next_token ($) { Line 3197  sub _get_next_token ($) {
3197      }      }
3198        
3199          redo A;          redo A;
3200          } elsif ($self->{nc} == -1) {
3201            
3202            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3203            $self->{ct}->{quirks} = 1;
3204    
3205            $self->{state} = DATA_STATE;
3206            ## Reconsume.
3207            return  ($self->{ct}); # DOCTYPE (quirks)
3208    
3209            redo A;
3210        } else {        } else {
3211                    
3212          ## XML5: Unless EOF, swith to the bogus comment state.          ## XML5: Swith to the bogus comment state.
3213          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3214          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3215          ## reconsume          ## reconsume
# Line 3036  sub _get_next_token ($) { Line 3254  sub _get_next_token ($) {
3254          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3255    
3256          redo A;          redo A;
3257          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3258            
3259            $self->{ct}->{name} # DOCTYPE
3260                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3261            delete $self->{ct}->{quirks};
3262            $self->{state} = DOCTYPE_NAME_STATE;
3263            
3264        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265          $self->{line_prev} = $self->{line};
3266          $self->{column_prev} = $self->{column};
3267          $self->{column}++;
3268          $self->{nc}
3269              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270        } else {
3271          $self->{set_nc}->($self);
3272        }
3273      
3274            redo A;
3275        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3276                    
3277          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 3122  sub _get_next_token ($) { Line 3358  sub _get_next_token ($) {
3358          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3359    
3360          redo A;          redo A;
3361          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3362            
3363            $self->{ct}->{name} # DOCTYPE
3364                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3365            delete $self->{ct}->{quirks};
3366            ## Stay in the state.
3367            
3368        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3369          $self->{line_prev} = $self->{line};
3370          $self->{column_prev} = $self->{column};
3371          $self->{column}++;
3372          $self->{nc}
3373              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3374        } else {
3375          $self->{set_nc}->($self);
3376        }
3377      
3378            redo A;
3379        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3380                    
3381          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 3153  sub _get_next_token ($) { Line 3407  sub _get_next_token ($) {
3407          redo A;          redo A;
3408        } else {        } else {
3409                    
3410          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3411            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3412                    
3413      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3414        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3262  sub _get_next_token ($) { Line 3515  sub _get_next_token ($) {
3515      }      }
3516        
3517          redo A;          redo A;
3518  ## TODO: " and ' for ENTITY        } elsif ($self->{nc} == 0x0022 and # "
3519                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3520                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3521            
3522            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3523            $self->{ct}->{value} = ''; # ENTITY
3524            
3525        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3526          $self->{line_prev} = $self->{line};
3527          $self->{column_prev} = $self->{column};
3528          $self->{column}++;
3529          $self->{nc}
3530              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3531        } else {
3532          $self->{set_nc}->($self);
3533        }
3534      
3535            redo A;
3536          } elsif ($self->{nc} == 0x0027 and # '
3537                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3538                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3539            
3540            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3541            $self->{ct}->{value} = ''; # ENTITY
3542            
3543        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3544          $self->{line_prev} = $self->{line};
3545          $self->{column_prev} = $self->{column};
3546          $self->{column}++;
3547          $self->{nc}
3548              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3549        } else {
3550          $self->{set_nc}->($self);
3551        }
3552      
3553            redo A;
3554        } elsif ($self->{is_xml} and        } elsif ($self->{is_xml} and
3555                 $self->{ct}->{type} == DOCTYPE_TOKEN and                 $self->{ct}->{type} == DOCTYPE_TOKEN and
3556                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
# Line 4583  sub _get_next_token ($) { Line 4871  sub _get_next_token ($) {
4871              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4873            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4874                    if ($self->{is_xml}) {
4875              
4876              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4877                              line => $self->{line_prev},
4878                              column => $self->{column_prev}
4879                                  + ($self->{nc} == -1 ? 1 : 0));
4880            } else {
4881              
4882              ## No error
4883            }
4884          ## Don't consume          ## Don't consume
         ## No error  
4885          ## Return nothing.          ## Return nothing.
4886          #          #
4887        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 4604  sub _get_next_token ($) { Line 4900  sub _get_next_token ($) {
4900      }      }
4901        
4902          redo A;          redo A;
4903        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4904                   (0x0041 <= $self->{nc} and
4905                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4906                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4907                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
# Line 4658  sub _get_next_token ($) { Line 4955  sub _get_next_token ($) {
4955          redo A;          redo A;
4956        }        }
4957      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4958        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
4959            $self->{nc} == 0x0058) { # X          
4960            $self->{state} = HEXREF_X_STATE;
4961            $self->{kwd} .= chr $self->{nc};
4962            
4963        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964          $self->{line_prev} = $self->{line};
4965          $self->{column_prev} = $self->{column};
4966          $self->{column}++;
4967          $self->{nc}
4968              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969        } else {
4970          $self->{set_nc}->($self);
4971        }
4972      
4973            redo A;
4974          } elsif ($self->{nc} == 0x0058) { # X
4975                    
4976            if ($self->{is_xml}) {
4977              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4978            }
4979          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4980          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4981                    
# Line 4765  sub _get_next_token ($) { Line 5080  sub _get_next_token ($) {
5080        my $code = $self->{kwd};        my $code = $self->{kwd};
5081        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5082        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5083        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5084              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5085              ($self->{is_xml} and $code == 0x0000)) {
5086                    
5087          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5088                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4918  sub _get_next_token ($) { Line 5235  sub _get_next_token ($) {
5235        my $code = $self->{kwd};        my $code = $self->{kwd};
5236        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5237        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5238        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5239              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5240              ($self->{is_xml} and $code == 0x0000)) {
5241                    
5242          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5243                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4952  sub _get_next_token ($) { Line 5271  sub _get_next_token ($) {
5271          redo A;          redo A;
5272        }        }
5273      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5274        if (length $self->{kwd} < 30 and        if ((0x0041 <= $self->{nc} and # A
5275            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # Z
5276            ((0x0041 <= $self->{nc} and # A            (0x0061 <= $self->{nc} and # a
5277              $self->{nc} <= 0x005A) or # Z             $self->{nc} <= 0x007A) or # z
5278             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5279              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5280             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5281              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5282             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5283                    {
5284                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5285                      $self->{entity_add} => 1,
5286                    }->{$self->{nc}}))) {
5287          our $EntityChar;          our $EntityChar;
5288          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5289          if (defined $EntityChar->{$self->{kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5290                $self->{ge}->{$self->{kwd}}) {
5291            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5292                            if (defined $self->{ge}->{$self->{kwd}}) {
5293              $self->{entity__value} = $EntityChar->{$self->{kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5294                    
5295                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5296                  } else {
5297                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5298                      
5299                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5300                                      value => $self->{kwd});
5301                    } else {
5302                      
5303                    }
5304                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5305                  }
5306                } else {
5307                  if ($self->{is_xml}) {
5308                    
5309                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5310                                    value => $self->{kwd},
5311                                    level => {
5312                                              'amp;' => $self->{level}->{warn},
5313                                              'quot;' => $self->{level}->{warn},
5314                                              'lt;' => $self->{level}->{warn},
5315                                              'gt;' => $self->{level}->{warn},
5316                                              'apos;' => $self->{level}->{warn},
5317                                             }->{$self->{kwd}} ||
5318                                             $self->{level}->{must});
5319                  } else {
5320                    
5321                  }
5322                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5323                }
5324              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5325                            
5326      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 5362  sub _get_next_token ($) { Line 5716  sub _get_next_token ($) {
5716          ## XML5: Not defined yet.          ## XML5: Not defined yet.
5717    
5718          ## TODO:          ## TODO:
5719    
5720            if (not $self->{stop_processing} and
5721                not $self->{document}->xml_standalone) {
5722              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5723                              level => $self->{level}->{info});
5724              $self->{stop_processing} = 1;
5725            }
5726    
5727                    
5728      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5729        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5796  sub _get_next_token ($) { Line 6158  sub _get_next_token ($) {
6158          }          }
6159          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6160                         line => $self->{line_prev},                         line => $self->{line_prev},
6161                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
6162          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6163                    
6164      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 5864  sub _get_next_token ($) { Line 6226  sub _get_next_token ($) {
6226          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6227                         attrdefs => [],                         attrdefs => [],
6228                         line => $self->{line_prev},                         line => $self->{line_prev},
6229                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
6230          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6231                    
6232      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 5933  sub _get_next_token ($) { Line 6295  sub _get_next_token ($) {
6295          }          }
6296          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
6297                         line => $self->{line_prev},                         line => $self->{line_prev},
6298                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 8};
6299          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6300                    
6301      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 6145  sub _get_next_token ($) { Line 6507  sub _get_next_token ($) {
6507          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6508            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6509          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6510            ## TODO: ...            $self->{state} = AFTER_ELEMENT_NAME_STATE;
           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;  
6511          } else { # ENTITY/NOTATION          } else { # ENTITY/NOTATION
6512            $self->{state} = AFTER_DOCTYPE_NAME_STATE;            $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6513          }          }
# Line 7629  sub _get_next_token ($) { Line 7990  sub _get_next_token ($) {
7990        }        }
7991      } elsif ($self->{state} == NOTATION_NAME_STATE) {      } elsif ($self->{state} == NOTATION_NAME_STATE) {
7992        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
7993          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
7994                    
7995      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7996        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 7689  sub _get_next_token ($) { Line 8050  sub _get_next_token ($) {
8050        
8051          redo A;          redo A;
8052        }        }
8053      } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8054          if ($self->{nc} == 0x0022) { # "
8055            $self->{state} = AFTER_MD_DEF_STATE;
8056            
8057        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8058          $self->{line_prev} = $self->{line};
8059          $self->{column_prev} = $self->{column};
8060          $self->{column}++;
8061          $self->{nc}
8062              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8063        } else {
8064          $self->{set_nc}->($self);
8065        }
8066      
8067            redo A;
8068          } elsif ($self->{nc} == 0x0026) { # &
8069            $self->{prev_state} = $self->{state};
8070            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8071            $self->{entity_add} = 0x0022; # "
8072            
8073        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8074          $self->{line_prev} = $self->{line};
8075          $self->{column_prev} = $self->{column};
8076          $self->{column}++;
8077          $self->{nc}
8078              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8079        } else {
8080          $self->{set_nc}->($self);
8081        }
8082      
8083            redo A;
8084    ## TODO: %
8085          } elsif ($self->{nc} == -1) {
8086            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8087            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8088            ## Reconsume.
8089            return  ($self->{ct}); # ENTITY
8090            redo A;
8091          } else {
8092            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8093            
8094        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095          $self->{line_prev} = $self->{line};
8096          $self->{column_prev} = $self->{column};
8097          $self->{column}++;
8098          $self->{nc}
8099              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100        } else {
8101          $self->{set_nc}->($self);
8102        }
8103      
8104            redo A;
8105          }
8106        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8107          if ($self->{nc} == 0x0027) { # '
8108            $self->{state} = AFTER_MD_DEF_STATE;
8109            
8110        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8111          $self->{line_prev} = $self->{line};
8112          $self->{column_prev} = $self->{column};
8113          $self->{column}++;
8114          $self->{nc}
8115              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8116        } else {
8117          $self->{set_nc}->($self);
8118        }
8119      
8120            redo A;
8121          } elsif ($self->{nc} == 0x0026) { # &
8122            $self->{prev_state} = $self->{state};
8123            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8124            $self->{entity_add} = 0x0027; # '
8125            
8126        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127          $self->{line_prev} = $self->{line};
8128          $self->{column_prev} = $self->{column};
8129          $self->{column}++;
8130          $self->{nc}
8131              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132        } else {
8133          $self->{set_nc}->($self);
8134        }
8135      
8136            redo A;
8137    ## TODO: %
8138          } elsif ($self->{nc} == -1) {
8139            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8140            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141            ## Reconsume.
8142            return  ($self->{ct}); # ENTITY
8143            redo A;
8144          } else {
8145            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8146            
8147        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8148          $self->{line_prev} = $self->{line};
8149          $self->{column_prev} = $self->{column};
8150          $self->{column}++;
8151          $self->{nc}
8152              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8153        } else {
8154          $self->{set_nc}->($self);
8155        }
8156      
8157            redo A;
8158          }
8159        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8160          if ($is_space->{$self->{nc}} or
8161              {
8162                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8163                $self->{entity_add} => 1,
8164              }->{$self->{nc}}) {
8165            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8166                            line => $self->{line_prev},
8167                            column => $self->{column_prev}
8168                                + ($self->{nc} == -1 ? 1 : 0));
8169            ## Don't consume
8170            ## Return nothing.
8171            #
8172          } elsif ($self->{nc} == 0x0023) { # #
8173            $self->{ca} = $self->{ct};
8174            $self->{state} = ENTITY_HASH_STATE;
8175            $self->{kwd} = '#';
8176            
8177        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178          $self->{line_prev} = $self->{line};
8179          $self->{column_prev} = $self->{column};
8180          $self->{column}++;
8181          $self->{nc}
8182              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183        } else {
8184          $self->{set_nc}->($self);
8185        }
8186      
8187            redo A;
8188          } else {
8189            #
8190          }
8191    
8192          $self->{ct}->{value} .= '&';
8193          $self->{state} = $self->{prev_state};
8194          ## Reconsume.
8195          redo A;
8196        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8197        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
8198            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8199            
8200        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8201          $self->{line_prev} = $self->{line};
8202          $self->{column_prev} = $self->{column};
8203          $self->{column}++;
8204          $self->{nc}
8205              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8206        } else {
8207          $self->{set_nc}->($self);
8208        }
8209      
8210            redo A;
8211          } elsif ($self->{nc} == 0x0028) { # (
8212            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8213            $self->{ct}->{content} = ['('];
8214            $self->{group_depth} = 1;
8215            
8216        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8217          $self->{line_prev} = $self->{line};
8218          $self->{column_prev} = $self->{column};
8219          $self->{column}++;
8220          $self->{nc}
8221              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8222        } else {
8223          $self->{set_nc}->($self);
8224        }
8225      
8226            redo A;
8227          } elsif ($self->{nc} == 0x003E) { # >
8228            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8229            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8230            
8231        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8232          $self->{line_prev} = $self->{line};
8233          $self->{column_prev} = $self->{column};
8234          $self->{column}++;
8235          $self->{nc}
8236              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8237        } else {
8238          $self->{set_nc}->($self);
8239        }
8240      
8241            return  ($self->{ct}); # ELEMENT
8242            redo A;
8243          } elsif ($self->{nc} == -1) {
8244            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8245            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8246            
8247        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8248          $self->{line_prev} = $self->{line};
8249          $self->{column_prev} = $self->{column};
8250          $self->{column}++;
8251          $self->{nc}
8252              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8253        } else {
8254          $self->{set_nc}->($self);
8255        }
8256      
8257            return  ($self->{ct}); # ELEMENT
8258            redo A;
8259          } else {
8260            $self->{ct}->{content} = [chr $self->{nc}];
8261            $self->{state} = CONTENT_KEYWORD_STATE;
8262            
8263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264          $self->{line_prev} = $self->{line};
8265          $self->{column_prev} = $self->{column};
8266          $self->{column}++;
8267          $self->{nc}
8268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269        } else {
8270          $self->{set_nc}->($self);
8271        }
8272      
8273            redo A;
8274          }
8275        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8276          if ($is_space->{$self->{nc}}) {
8277            $self->{state} = AFTER_MD_DEF_STATE;
8278            
8279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280          $self->{line_prev} = $self->{line};
8281          $self->{column_prev} = $self->{column};
8282          $self->{column}++;
8283          $self->{nc}
8284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285        } else {
8286          $self->{set_nc}->($self);
8287        }
8288      
8289            redo A;
8290          } elsif ($self->{nc} == 0x003E) { # >
8291            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8292            
8293        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8294          $self->{line_prev} = $self->{line};
8295          $self->{column_prev} = $self->{column};
8296          $self->{column}++;
8297          $self->{nc}
8298              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8299        } else {
8300          $self->{set_nc}->($self);
8301        }
8302      
8303            return  ($self->{ct}); # ELEMENT
8304            redo A;
8305          } elsif ($self->{nc} == -1) {
8306            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8307            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8308            
8309        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8310          $self->{line_prev} = $self->{line};
8311          $self->{column_prev} = $self->{column};
8312          $self->{column}++;
8313          $self->{nc}
8314              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8315        } else {
8316          $self->{set_nc}->($self);
8317        }
8318      
8319            return  ($self->{ct}); # ELEMENT
8320            redo A;
8321          } else {
8322            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8323            ## Stay in the state.
8324            
8325        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8326          $self->{line_prev} = $self->{line};
8327          $self->{column_prev} = $self->{column};
8328          $self->{column}++;
8329          $self->{nc}
8330              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8331        } else {
8332          $self->{set_nc}->($self);
8333        }
8334      
8335            redo A;
8336          }
8337        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8338          if ($is_space->{$self->{nc}}) {
8339            ## Stay in the state.
8340            
8341        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342          $self->{line_prev} = $self->{line};
8343          $self->{column_prev} = $self->{column};
8344          $self->{column}++;
8345          $self->{nc}
8346              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347        } else {
8348          $self->{set_nc}->($self);
8349        }
8350      
8351            redo A;
8352          } elsif ($self->{nc} == 0x0028) { # (
8353            $self->{group_depth}++;
8354            push @{$self->{ct}->{content}}, chr $self->{nc};
8355            ## Stay in the state.
8356            
8357        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358          $self->{line_prev} = $self->{line};
8359          $self->{column_prev} = $self->{column};
8360          $self->{column}++;
8361          $self->{nc}
8362              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363        } else {
8364          $self->{set_nc}->($self);
8365        }
8366      
8367            redo A;
8368          } elsif ($self->{nc} == 0x007C or # |
8369                   $self->{nc} == 0x002C) { # ,
8370            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8371          ## Stay in the state.          ## Stay in the state.
8372                    
8373      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 7704  sub _get_next_token ($) { Line 8381  sub _get_next_token ($) {
8381      }      }
8382        
8383          redo A;          redo A;
8384          } elsif ($self->{nc} == 0x0029) { # )
8385            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8386            push @{$self->{ct}->{content}}, chr $self->{nc};
8387            $self->{group_depth}--;
8388            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8389            
8390        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8391          $self->{line_prev} = $self->{line};
8392          $self->{column_prev} = $self->{column};
8393          $self->{column}++;
8394          $self->{nc}
8395              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8396        } else {
8397          $self->{set_nc}->($self);
8398        }
8399      
8400            redo A;
8401        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
8402            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8403            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8404          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8405                    
8406      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 7717  sub _get_next_token ($) { Line 8413  sub _get_next_token ($) {
8413        $self->{set_nc}->($self);        $self->{set_nc}->($self);
8414      }      }
8415        
8416          return  ($self->{ct}); # ENTITY          return  ($self->{ct}); # ELEMENT
8417          redo A;          redo A;
8418        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
8419          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8420            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8421          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8422                    
8423      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 7733  sub _get_next_token ($) { Line 8430  sub _get_next_token ($) {
8430        $self->{set_nc}->($self);        $self->{set_nc}->($self);
8431      }      }
8432        
8433          return  ($self->{ct}); # ENTITY          return  ($self->{ct}); # ELEMENT
8434            redo A;
8435          } else {
8436            push @{$self->{ct}->{content}}, chr $self->{nc};
8437            $self->{state} = CM_ELEMENT_NAME_STATE;
8438            
8439        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440          $self->{line_prev} = $self->{line};
8441          $self->{column_prev} = $self->{column};
8442          $self->{column}++;
8443          $self->{nc}
8444              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445        } else {
8446          $self->{set_nc}->($self);
8447        }
8448      
8449            redo A;
8450          }
8451        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8452          if ($is_space->{$self->{nc}}) {
8453            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8454            
8455        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8456          $self->{line_prev} = $self->{line};
8457          $self->{column_prev} = $self->{column};
8458          $self->{column}++;
8459          $self->{nc}
8460              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8461        } else {
8462          $self->{set_nc}->($self);
8463        }
8464      
8465            redo A;
8466          } elsif ($self->{nc} == 0x002A or # *
8467                   $self->{nc} == 0x002B or # +
8468                   $self->{nc} == 0x003F) { # ?
8469            push @{$self->{ct}->{content}}, chr $self->{nc};
8470            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8471            
8472        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8473          $self->{line_prev} = $self->{line};
8474          $self->{column_prev} = $self->{column};
8475          $self->{column}++;
8476          $self->{nc}
8477              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8478        } else {
8479          $self->{set_nc}->($self);
8480        }
8481      
8482            redo A;
8483          } elsif ($self->{nc} == 0x007C or # |
8484                   $self->{nc} == 0x002C) { # ,
8485            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8486            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8487            
8488        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8489          $self->{line_prev} = $self->{line};
8490          $self->{column_prev} = $self->{column};
8491          $self->{column}++;
8492          $self->{nc}
8493              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8494        } else {
8495          $self->{set_nc}->($self);
8496        }
8497      
8498            redo A;
8499          } elsif ($self->{nc} == 0x0029) { # )
8500            $self->{group_depth}--;
8501            push @{$self->{ct}->{content}}, chr $self->{nc};
8502            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8503            
8504        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8505          $self->{line_prev} = $self->{line};
8506          $self->{column_prev} = $self->{column};
8507          $self->{column}++;
8508          $self->{nc}
8509              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8510        } else {
8511          $self->{set_nc}->($self);
8512        }
8513      
8514            redo A;
8515          } elsif ($self->{nc} == 0x003E) { # >
8516            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8517            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8519            
8520        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8521          $self->{line_prev} = $self->{line};
8522          $self->{column_prev} = $self->{column};
8523          $self->{column}++;
8524          $self->{nc}
8525              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8526        } else {
8527          $self->{set_nc}->($self);
8528        }
8529      
8530            return  ($self->{ct}); # ELEMENT
8531            redo A;
8532          } elsif ($self->{nc} == -1) {
8533            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8534            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8535            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8536            
8537        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8538          $self->{line_prev} = $self->{line};
8539          $self->{column_prev} = $self->{column};
8540          $self->{column}++;
8541          $self->{nc}
8542              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8543        } else {
8544          $self->{set_nc}->($self);
8545        }
8546      
8547            return  ($self->{ct}); # ELEMENT
8548          redo A;          redo A;
8549        } else {        } else {
8550          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type          $self->{ct}->{content}->[-1] .= chr $self->{nc};
8551            ## Stay in the state.
8552            
8553        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8554          $self->{line_prev} = $self->{line};
8555          $self->{column_prev} = $self->{column};
8556          $self->{column}++;
8557          $self->{nc}
8558              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8559        } else {
8560          $self->{set_nc}->($self);
8561        }
8562      
8563            redo A;
8564          }
8565        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8566          if ($is_space->{$self->{nc}}) {
8567            ## Stay in the state.
8568            
8569        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8570          $self->{line_prev} = $self->{line};
8571          $self->{column_prev} = $self->{column};
8572          $self->{column}++;
8573          $self->{nc}
8574              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8575        } else {
8576          $self->{set_nc}->($self);
8577        }
8578      
8579            redo A;
8580          } elsif ($self->{nc} == 0x007C or # |
8581                   $self->{nc} == 0x002C) { # ,
8582            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8583            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8584            
8585        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8586          $self->{line_prev} = $self->{line};
8587          $self->{column_prev} = $self->{column};
8588          $self->{column}++;
8589          $self->{nc}
8590              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8591        } else {
8592          $self->{set_nc}->($self);
8593        }
8594      
8595            redo A;
8596          } elsif ($self->{nc} == 0x0029) { # )
8597            $self->{group_depth}--;
8598            push @{$self->{ct}->{content}}, chr $self->{nc};
8599            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8600            
8601        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8602          $self->{line_prev} = $self->{line};
8603          $self->{column_prev} = $self->{column};
8604          $self->{column}++;
8605          $self->{nc}
8606              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8607        } else {
8608          $self->{set_nc}->($self);
8609        }
8610      
8611            redo A;
8612          } elsif ($self->{nc} == 0x003E) { # >
8613            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8614            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8615            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8616            
8617        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8618          $self->{line_prev} = $self->{line};
8619          $self->{column_prev} = $self->{column};
8620          $self->{column}++;
8621          $self->{nc}
8622              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8623        } else {
8624          $self->{set_nc}->($self);
8625        }
8626      
8627            return  ($self->{ct}); # ELEMENT
8628            redo A;
8629          } elsif ($self->{nc} == -1) {
8630            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8631            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633            
8634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8635          $self->{line_prev} = $self->{line};
8636          $self->{column_prev} = $self->{column};
8637          $self->{column}++;
8638          $self->{nc}
8639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8640        } else {
8641          $self->{set_nc}->($self);
8642        }
8643      
8644            return  ($self->{ct}); # ELEMENT
8645            redo A;
8646          } else {
8647            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8648            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8649            $self->{state} = BOGUS_MD_STATE;
8650            
8651        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8652          $self->{line_prev} = $self->{line};
8653          $self->{column_prev} = $self->{column};
8654          $self->{column}++;
8655          $self->{nc}
8656              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8657        } else {
8658          $self->{set_nc}->($self);
8659        }
8660      
8661            redo A;
8662          }
8663        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8664          if ($is_space->{$self->{nc}}) {
8665            if ($self->{group_depth}) {
8666              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8667            } else {
8668              $self->{state} = AFTER_MD_DEF_STATE;
8669            }
8670            
8671        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8672          $self->{line_prev} = $self->{line};
8673          $self->{column_prev} = $self->{column};
8674          $self->{column}++;
8675          $self->{nc}
8676              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8677        } else {
8678          $self->{set_nc}->($self);
8679        }
8680      
8681            redo A;
8682          } elsif ($self->{nc} == 0x002A or # *
8683                   $self->{nc} == 0x002B or # +
8684                   $self->{nc} == 0x003F) { # ?
8685            push @{$self->{ct}->{content}}, chr $self->{nc};
8686            if ($self->{group_depth}) {
8687              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8688            } else {
8689              $self->{state} = AFTER_MD_DEF_STATE;
8690            }
8691            
8692        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8693          $self->{line_prev} = $self->{line};
8694          $self->{column_prev} = $self->{column};
8695          $self->{column}++;
8696          $self->{nc}
8697              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8698        } else {
8699          $self->{set_nc}->($self);
8700        }
8701      
8702            redo A;
8703          } elsif ($self->{nc} == 0x0029) { # )
8704            if ($self->{group_depth}) {
8705              $self->{group_depth}--;
8706              push @{$self->{ct}->{content}}, chr $self->{nc};
8707              ## Stay in the state.
8708              
8709        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8710          $self->{line_prev} = $self->{line};
8711          $self->{column_prev} = $self->{column};
8712          $self->{column}++;
8713          $self->{nc}
8714              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8715        } else {
8716          $self->{set_nc}->($self);
8717        }
8718      
8719              redo A;
8720            } else {
8721              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8722              $self->{state} = BOGUS_MD_STATE;
8723              ## Reconsume.
8724              redo A;
8725            }
8726          } elsif ($self->{nc} == 0x003E) { # >
8727            if ($self->{group_depth}) {
8728              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8729              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8730            }
8731            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8732            
8733        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8734          $self->{line_prev} = $self->{line};
8735          $self->{column_prev} = $self->{column};
8736          $self->{column}++;
8737          $self->{nc}
8738              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8739        } else {
8740          $self->{set_nc}->($self);
8741        }
8742      
8743            return  ($self->{ct}); # ELEMENT
8744            redo A;
8745          } elsif ($self->{nc} == -1) {
8746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8747            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8748            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8749            
8750        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8751          $self->{line_prev} = $self->{line};
8752          $self->{column_prev} = $self->{column};
8753          $self->{column}++;
8754          $self->{nc}
8755              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8756        } else {
8757          $self->{set_nc}->($self);
8758        }
8759      
8760            return  ($self->{ct}); # ELEMENT
8761            redo A;
8762          } else {
8763            if ($self->{group_depth}) {
8764              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8765            } else {
8766              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8767              $self->{state} = BOGUS_MD_STATE;
8768            }
8769            ## Reconsume.
8770            redo A;
8771          }
8772        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8773          if ($is_space->{$self->{nc}}) {
8774            ## Stay in the state.
8775            
8776        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8777          $self->{line_prev} = $self->{line};
8778          $self->{column_prev} = $self->{column};
8779          $self->{column}++;
8780          $self->{nc}
8781              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8782        } else {
8783          $self->{set_nc}->($self);
8784        }
8785      
8786            redo A;
8787          } elsif ($self->{nc} == 0x003E) { # >
8788            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8789            
8790        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8791          $self->{line_prev} = $self->{line};
8792          $self->{column_prev} = $self->{column};
8793          $self->{column}++;
8794          $self->{nc}
8795              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8796        } else {
8797          $self->{set_nc}->($self);
8798        }
8799      
8800            return  ($self->{ct}); # ENTITY/ELEMENT
8801            redo A;
8802          } elsif ($self->{nc} == -1) {
8803            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8804            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8805            
8806        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8807          $self->{line_prev} = $self->{line};
8808          $self->{column_prev} = $self->{column};
8809          $self->{column}++;
8810          $self->{nc}
8811              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8812        } else {
8813          $self->{set_nc}->($self);
8814        }
8815      
8816            return  ($self->{ct}); # ENTITY/ELEMENT
8817            redo A;
8818          } else {
8819            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8820          $self->{state} = BOGUS_MD_STATE;          $self->{state} = BOGUS_MD_STATE;
8821          ## Reconsume.          ## Reconsume.
8822          redo A;          redo A;
8823        }        }
   
   
8824      } elsif ($self->{state} == BOGUS_MD_STATE) {      } elsif ($self->{state} == BOGUS_MD_STATE) {
8825        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
8826          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;

Legend:
Removed from v.1.18  
changed lines
  Added in v.1.34

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24