/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC revision 1.14 by wakaba, Fri Oct 17 07:14:29 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
# Line 43  sub END_OF_FILE_TOKEN () { 5 } Line 55  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } ## NOTE: XML only.  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65  ## XML5: XML5 has "empty tag token".  In this implementation, it is  ## XML5: XML5 has "empty tag token".  In this implementation, it is
66  ## represented as a start tag token with $self->{self_closing} flag  ## represented as a start tag token with $self->{self_closing} flag
# Line 133  sub PI_AFTER_STATE () { 55 } Line 151  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    
168  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
169  ## list and descriptions)  ## list and descriptions)
# Line 1563  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594          redo A;          redo A;
1595        }        }
1596      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1597        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1598    
1599        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1600        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
1601                
1602        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1603          !!!cp (124);          if ($self->{in_subset}) {
1604          $self->{state} = DATA_STATE;            !!!cp (123);
1605          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1606            } else {
1607              !!!cp (124);
1608              $self->{state} = DATA_STATE;
1609              $self->{s_kwd} = '';
1610            }
1611          !!!next-input-character;          !!!next-input-character;
1612    
1613          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1614          redo A;          redo A;
1615        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1616          !!!cp (125);          if ($self->{in_subset}) {
1617          $self->{state} = DATA_STATE;            !!!cp (125.1);
1618          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1619            } else {
1620              !!!cp (125);
1621              $self->{state} = DATA_STATE;
1622              $self->{s_kwd} = '';
1623            }
1624          ## reconsume          ## reconsume
1625    
1626          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1596  sub _get_next_token ($) { Line 1637  sub _get_next_token ($) {
1637          redo A;          redo A;
1638        }        }
1639      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1640        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1641                
1642        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1643          !!!cp (133);          !!!cp (133);
# Line 1772  sub _get_next_token ($) { Line 1813  sub _get_next_token ($) {
1813          !!!next-input-character;          !!!next-input-character;
1814          redo A;          redo A;
1815        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1816          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1817          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1818          $self->{s_kwd} = '';            !!!cp (138.1);
1819              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1820            } else {
1821              !!!cp (138);
1822              $self->{state} = DATA_STATE;
1823              $self->{s_kwd} = '';
1824            }
1825          !!!next-input-character;          !!!next-input-character;
1826    
1827          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1828    
1829          redo A;          redo A;
1830        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1831          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1832          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1833          $self->{s_kwd} = '';            !!!cp (139.1);
1834              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1835            } else {
1836              !!!cp (139);
1837              $self->{state} = DATA_STATE;
1838              $self->{s_kwd} = '';
1839            }
1840          ## reconsume          ## reconsume
1841    
1842          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1806  sub _get_next_token ($) { Line 1857  sub _get_next_token ($) {
1857          !!!next-input-character;          !!!next-input-character;
1858          redo A;          redo A;
1859        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1860          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1861          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1862          $self->{s_kwd} = '';            !!!cp (142.1);
1863              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1864            } else {
1865              !!!cp (142);
1866              $self->{state} = DATA_STATE;
1867              $self->{s_kwd} = '';
1868            }
1869          !!!next-input-character;          !!!next-input-character;
1870    
1871          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1872    
1873          redo A;          redo A;
1874        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1875          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1876          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1877          $self->{s_kwd} = '';            !!!cp (143.1);
1878              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1879            } else {
1880              !!!cp (143);
1881              $self->{state} = DATA_STATE;
1882              $self->{s_kwd} = '';
1883            }
1884          ## reconsume          ## reconsume
1885    
1886          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1834  sub _get_next_token ($) { Line 1895  sub _get_next_token ($) {
1895          redo A;          redo A;
1896        }        }
1897      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1898          ## XML5: "Comment state" and "DOCTYPE comment state".
1899    
1900        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1901          !!!cp (145);          !!!cp (145);
1902          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1903          !!!next-input-character;          !!!next-input-character;
1904          redo A;          redo A;
1905        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1906          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1907          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1908          $self->{s_kwd} = '';            !!!cp (146.1);
1909              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1910            } else {
1911              !!!cp (146);
1912              $self->{state} = DATA_STATE;
1913              $self->{s_kwd} = '';
1914            }
1915          ## reconsume          ## reconsume
1916    
1917          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1861  sub _get_next_token ($) { Line 1929  sub _get_next_token ($) {
1929          redo A;          redo A;
1930        }        }
1931      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1932        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
1933    
1934        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1935          !!!cp (148);          !!!cp (148);
# Line 1869  sub _get_next_token ($) { Line 1937  sub _get_next_token ($) {
1937          !!!next-input-character;          !!!next-input-character;
1938          redo A;          redo A;
1939        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
1940          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1941          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1942          $self->{s_kwd} = '';            !!!cp (149.1);
1943              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1944            } else {
1945              !!!cp (149);
1946              $self->{state} = DATA_STATE;
1947              $self->{s_kwd} = '';
1948            }
1949          ## reconsume          ## reconsume
1950    
1951          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1886  sub _get_next_token ($) { Line 1959  sub _get_next_token ($) {
1959          redo A;          redo A;
1960        }        }
1961      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
1962          ## XML5: "Comment end state" and "DOCTYPE comment end state".
1963    
1964        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1965          !!!cp (151);          if ($self->{in_subset}) {
1966          $self->{state} = DATA_STATE;            !!!cp (151.1);
1967          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1968            } else {
1969              !!!cp (151);
1970              $self->{state} = DATA_STATE;
1971              $self->{s_kwd} = '';
1972            }
1973          !!!next-input-character;          !!!next-input-character;
1974    
1975          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1906  sub _get_next_token ($) { Line 1986  sub _get_next_token ($) {
1986          !!!next-input-character;          !!!next-input-character;
1987          redo A;          redo A;
1988        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
1989          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1990          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1991          $self->{s_kwd} = '';            !!!cp (153.1);
1992              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1993            } else {
1994              !!!cp (153);
1995              $self->{state} = DATA_STATE;
1996              $self->{s_kwd} = '';
1997            }
1998          ## reconsume          ## reconsume
1999    
2000          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1973  sub _get_next_token ($) { Line 2058  sub _get_next_token ($) {
2058          !!!cp (159.1);          !!!cp (159.1);
2059          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2060          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2062            $self->{in_subset} = 1;
2063          !!!next-input-character;          !!!next-input-character;
2064            !!!emit ($self->{ct}); # DOCTYPE
2065          redo A;          redo A;
2066        } else {        } else {
2067          !!!cp (160);          !!!cp (160);
# Line 2016  sub _get_next_token ($) { Line 2104  sub _get_next_token ($) {
2104        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2105          !!!cp (163.1);          !!!cp (163.1);
2106          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2107            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2108            $self->{in_subset} = 1;
2109          !!!next-input-character;          !!!next-input-character;
2110            !!!emit ($self->{ct}); # DOCTYPE
2111          redo A;          redo A;
2112        } else {        } else {
2113          !!!cp (164);          !!!cp (164);
# Line 2073  sub _get_next_token ($) { Line 2164  sub _get_next_token ($) {
2164          !!!cp (167.3);          !!!cp (167.3);
2165          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2166          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2167            $self->{in_subset} = 1;
2168          !!!next-input-character;          !!!next-input-character;
2169            !!!emit ($self->{ct}); # DOCTYPE
2170          redo A;          redo A;
2171        } else {        } else {
2172          !!!cp (180);          !!!cp (180);
# Line 2227  sub _get_next_token ($) { Line 2320  sub _get_next_token ($) {
2320          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2321          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2322          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2323            $self->{in_subset} = 1;
2324          !!!next-input-character;          !!!next-input-character;
2325            !!!emit ($self->{ct}); # DOCTYPE
2326          redo A;          redo A;
2327        } else {        } else {
2328          !!!cp (186);          !!!cp (186);
# Line 2369  sub _get_next_token ($) { Line 2464  sub _get_next_token ($) {
2464          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2465          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2466          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2467            $self->{in_subset} = 1;
2468          !!!next-input-character;          !!!next-input-character;
2469            !!!emit ($self->{ct}); # DOCTYPE
2470          redo A;          redo A;
2471        } else {        } else {
2472          !!!cp (200);          !!!cp (200);
# Line 2427  sub _get_next_token ($) { Line 2524  sub _get_next_token ($) {
2524    
2525          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2526          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2527            $self->{in_subset} = 1;
2528          !!!next-input-character;          !!!next-input-character;
2529            !!!emit ($self->{ct}); # DOCTYPE
2530          redo A;          redo A;
2531        } else {        } else {
2532          !!!cp (206);          !!!cp (206);
# Line 2550  sub _get_next_token ($) { Line 2649  sub _get_next_token ($) {
2649          !!!cp (218.1);          !!!cp (218.1);
2650          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2651          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2652            $self->{in_subset} = 1;
2653          !!!next-input-character;          !!!next-input-character;
2654            !!!emit ($self->{ct}); # DOCTYPE
2655          redo A;          redo A;
2656        } else {        } else {
2657          !!!cp (218);          !!!cp (218);
# Line 2572  sub _get_next_token ($) { Line 2673  sub _get_next_token ($) {
2673    
2674          redo A;          redo A;
2675        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2676          if ($self->{ct}->{has_internal_subset}) { # DOCTYPE          !!!cp (220.1);
2677            !!!cp (220.2);          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678            ## Stay in the state.          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2679            !!!next-input-character;          $self->{in_subset} = 1;
2680            redo A;          !!!next-input-character;
2681          } else {          !!!emit ($self->{ct}); # DOCTYPE
2682            !!!cp (220.1);          redo A;
           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;  
           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE  
           !!!next-input-character;  
           redo A;  
         }  
2683        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2684          !!!cp (220);          !!!cp (220);
2685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3065  sub _get_next_token ($) { Line 3161  sub _get_next_token ($) {
3161      ## XML-only states      ## XML-only states
3162    
3163      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
3164          ## XML5: "Pi state" and "DOCTYPE pi state".
3165    
3166        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
3167            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
3168            $self->{nc} == -1) {            $self->{nc} == -1) {
3169            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3170            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3171            ## "DOCTYPE pi state": Parse error, switch to the "data
3172            ## state".
3173          !!!parse-error (type => 'bare pio', ## TODO: type          !!!parse-error (type => 'bare pio', ## TODO: type
3174                          line => $self->{line_prev},                          line => $self->{line_prev},
3175                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 3082  sub _get_next_token ($) { Line 3184  sub _get_next_token ($) {
3184                        };                        };
3185          redo A;          redo A;
3186        } else {        } else {
3187            ## XML5: "DOCTYPE pi state": Stay in the state.
3188          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
3189                         target => chr $self->{nc},                         target => chr $self->{nc},
3190                         data => '',                         data => '',
# Line 3099  sub _get_next_token ($) { Line 3202  sub _get_next_token ($) {
3202          redo A;          redo A;
3203        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3204          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3205          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3206          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3207            } else {
3208              $self->{state} = DATA_STATE;
3209              $self->{s_kwd} = '';
3210            }
3211          ## Reconsume.          ## Reconsume.
3212          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3213          redo A;          redo A;
# Line 3131  sub _get_next_token ($) { Line 3238  sub _get_next_token ($) {
3238          redo A;          redo A;
3239        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3240          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3241          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3242          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3243            } else {
3244              $self->{state} = DATA_STATE;
3245              $self->{s_kwd} = '';
3246            }
3247          ## Reprocess.          ## Reprocess.
3248          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3249          redo A;          redo A;
# Line 3146  sub _get_next_token ($) { Line 3257  sub _get_next_token ($) {
3257          redo A;          redo A;
3258        }        }
3259      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3260          ## XML5: Part of "Pi after state".
3261    
3262        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3263          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3264          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265            } else {
3266              $self->{state} = DATA_STATE;
3267              $self->{s_kwd} = '';
3268            }
3269          !!!next-input-character;          !!!next-input-character;
3270          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3271          redo A;          redo A;
# Line 3171  sub _get_next_token ($) { Line 3288  sub _get_next_token ($) {
3288          redo A;          redo A;
3289        }        }
3290      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3291        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3292    
3293        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3294          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3295          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3296            } else {
3297              $self->{state} = DATA_STATE;
3298              $self->{s_kwd} = '';
3299            }
3300          !!!next-input-character;          !!!next-input-character;
3301          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3302          redo A;          redo A;
# Line 3192  sub _get_next_token ($) { Line 3314  sub _get_next_token ($) {
3314    
3315      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3316        if ($self->{nc} == 0x003C) { # <        if ($self->{nc} == 0x003C) { # <
3317          ## TODO:          $self->{state} = DOCTYPE_TAG_STATE;
3318          !!!next-input-character;          !!!next-input-character;
3319          redo A;          redo A;
3320        } elsif ($self->{nc} == 0x0025) { # %        } elsif ($self->{nc} == 0x0025) { # %
# Line 3202  sub _get_next_token ($) { Line 3324  sub _get_next_token ($) {
3324          !!!next-input-character;          !!!next-input-character;
3325          redo A;          redo A;
3326        } elsif ($self->{nc} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
3327            delete $self->{in_subset};
3328          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3329          !!!next-input-character;          !!!next-input-character;
3330          redo A;          redo A;
# Line 3211  sub _get_next_token ($) { Line 3334  sub _get_next_token ($) {
3334          redo A;          redo A;
3335        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3336          !!!parse-error (type => 'unclosed internal subset'); ## TODO: type          !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3337            delete $self->{in_subset};
3338          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3339          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3340          ## Reconsume.          ## Reconsume.
3341          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3342          redo A;          redo A;
3343        } else {        } else {
3344          unless ($self->{internal_subset_tainted}) {          unless ($self->{internal_subset_tainted}) {
# Line 3231  sub _get_next_token ($) { Line 3355  sub _get_next_token ($) {
3355          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3356          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3357          !!!next-input-character;          !!!next-input-character;
3358          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3359          redo A;          redo A;
3360        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3361          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
3362          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3363          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3364          ## Reconsume.          ## Reconsume.
3365          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3366          redo A;          redo A;
3367        } else {        } else {
3368          ## XML5: No parse error and stay in the state.          ## XML5: No parse error and stay in the state.
3369          !!!parse-error (type => 'string after internal subset'); ## TODO: type          !!!parse-error (type => 'string after internal subset'); ## TODO: type
3370    
3371          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3372          !!!next-input-character;          !!!next-input-character;
3373          redo A;          redo A;
3374        }        }
3375                } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3376          if ($self->{nc} == 0x003E) { # >
3377            $self->{state} = DATA_STATE;
3378            $self->{s_kwd} = '';
3379            !!!next-input-character;
3380            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3381            redo A;
3382          } elsif ($self->{nc} == -1) {
3383            $self->{state} = DATA_STATE;
3384            $self->{s_kwd} = '';
3385            ## Reconsume.
3386            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3387            redo A;
3388          } else {
3389            ## Stay in the state.
3390            !!!next-input-character;
3391            redo A;
3392          }
3393        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3394          if ($self->{nc} == 0x0021) { # !
3395            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3396            !!!next-input-character;
3397            redo A;
3398          } elsif ($self->{nc} == 0x003F) { # ?
3399            $self->{state} = PI_STATE;
3400            !!!next-input-character;
3401            redo A;
3402          } elsif ($self->{nc} == -1) {
3403            !!!parse-error (type => 'bare stago');
3404            $self->{state} = DATA_STATE;
3405            $self->{s_kwd} = '';
3406            ## Reconsume.
3407            redo A;
3408          } else {
3409            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3410                            line => $self->{line_prev},
3411                            column => $self->{column_prev});
3412            $self->{state} = BOGUS_COMMENT_STATE;
3413            $self->{ct} = {type => COMMENT_TOKEN,
3414                           data => '',
3415                          }; ## NOTE: Will be discarded.
3416            !!!next-input-character;
3417            redo A;
3418          }
3419        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3420          ## XML5: "DOCTYPE markup declaration state".
3421          
3422          if ($self->{nc} == 0x002D) { # -
3423            $self->{state} = MD_HYPHEN_STATE;
3424            !!!next-input-character;
3425            redo A;
3426          } elsif ($self->{nc} == 0x0045) { # E
3427            $self->{state} = MD_E_STATE;
3428            $self->{kwd} = chr $self->{nc};
3429            !!!next-input-character;
3430            redo A;
3431          } elsif ($self->{nc} == 0x0041) { # A
3432            $self->{state} = MD_ATTLIST_STATE;
3433            $self->{kwd} = chr $self->{nc};
3434            !!!next-input-character;
3435            redo A;
3436          } elsif ($self->{nc} == 0x004E) { # N
3437            $self->{state} = MD_NOTATION_STATE;
3438            $self->{kwd} = chr $self->{nc};
3439            !!!next-input-character;
3440            redo A;
3441          } else {
3442            #
3443          }
3444          
3445          ## XML5: No parse error.
3446          !!!parse-error (type => 'bogus comment',
3447                          line => $self->{line_prev},
3448                          column => $self->{column_prev} - 1);
3449          ## Reconsume.
3450          $self->{state} = BOGUS_COMMENT_STATE;
3451          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3452          redo A;
3453        } elsif ($self->{state} == MD_E_STATE) {
3454          if ($self->{nc} == 0x004E) { # N
3455            $self->{state} = MD_ENTITY_STATE;
3456            $self->{kwd} .= chr $self->{nc};
3457            !!!next-input-character;
3458            redo A;
3459          } elsif ($self->{nc} == 0x004C) { # L
3460            ## XML5: <!ELEMENT> not supported.
3461            $self->{state} = MD_ELEMENT_STATE;
3462            $self->{kwd} .= chr $self->{nc};
3463            !!!next-input-character;
3464            redo A;
3465          } else {
3466            ## XML5: No parse error.
3467            !!!parse-error (type => 'bogus comment',
3468                            line => $self->{line_prev},
3469                            column => $self->{column_prev} - 2
3470                                + 1 * ($self->{nc} == -1));
3471            ## Reconsume.
3472            $self->{state} = BOGUS_COMMENT_STATE;
3473            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3474            redo A;
3475          }
3476        } elsif ($self->{state} == MD_ENTITY_STATE) {
3477          if ($self->{nc} == {
3478                'EN' => 0x0054, # T
3479                'ENT' => 0x0049, # I
3480                'ENTI' => 0x0054, # T
3481              }->{$self->{kwd}}) {
3482            ## Stay in the state.
3483            $self->{kwd} .= chr $self->{nc};
3484            !!!next-input-character;
3485            redo A;
3486          } elsif ($self->{kwd} eq 'ENTIT' and
3487                   $self->{nc} == 0x0059) { # Y
3488            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3489                           line => $self->{line_prev},
3490                           column => $self->{column_prev} - 6};
3491            $self->{state} = DOCTYPE_MD_STATE;
3492            !!!next-input-character;
3493            redo A;
3494          } else {
3495            !!!parse-error (type => 'bogus comment',
3496                            line => $self->{line_prev},
3497                            column => $self->{column_prev} - 1
3498                                - (length $self->{kwd})
3499                                + 1 * ($self->{nc} == -1));
3500            $self->{state} = BOGUS_COMMENT_STATE;
3501            ## Reconsume.
3502            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3503            redo A;
3504          }
3505        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3506          if ($self->{nc} == {
3507                'EL' => 0x0045, # E
3508                'ELE' => 0x004D, # M
3509                'ELEM' => 0x0045, # E
3510                'ELEME' => 0x004E, # N
3511              }->{$self->{kwd}}) {
3512            ## Stay in the state.
3513            $self->{kwd} .= chr $self->{nc};
3514            !!!next-input-character;
3515            redo A;
3516          } elsif ($self->{kwd} eq 'ELEMEN' and
3517                   $self->{nc} == 0x0054) { # T
3518            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3519                           line => $self->{line_prev},
3520                           column => $self->{column_prev} - 6};
3521            $self->{state} = DOCTYPE_MD_STATE;
3522            !!!next-input-character;
3523            redo A;
3524          } else {
3525            !!!parse-error (type => 'bogus comment',
3526                            line => $self->{line_prev},
3527                            column => $self->{column_prev} - 1
3528                                - (length $self->{kwd})
3529                                + 1 * ($self->{nc} == -1));
3530            $self->{state} = BOGUS_COMMENT_STATE;
3531            ## Reconsume.
3532            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3533            redo A;
3534          }
3535        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3536          if ($self->{nc} == {
3537                'A' => 0x0054, # T
3538                'AT' => 0x0054, # T
3539                'ATT' => 0x004C, # L
3540                'ATTL' => 0x0049, # I
3541                'ATTLI' => 0x0053, # S
3542              }->{$self->{kwd}}) {
3543            ## Stay in the state.
3544            $self->{kwd} .= chr $self->{nc};
3545            !!!next-input-character;
3546            redo A;
3547          } elsif ($self->{kwd} eq 'ATTLIS' and
3548                   $self->{nc} == 0x0054) { # T
3549            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3550                           line => $self->{line_prev},
3551                           column => $self->{column_prev} - 6};
3552            $self->{state} = DOCTYPE_MD_STATE;
3553            !!!next-input-character;
3554            redo A;
3555          } else {
3556            !!!parse-error (type => 'bogus comment',
3557                            line => $self->{line_prev},
3558                            column => $self->{column_prev} - 1
3559                                 - (length $self->{kwd})
3560                                 + 1 * ($self->{nc} == -1));
3561            $self->{state} = BOGUS_COMMENT_STATE;
3562            ## Reconsume.
3563            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3564            redo A;
3565          }
3566        } elsif ($self->{state} == MD_NOTATION_STATE) {
3567          if ($self->{nc} == {
3568                'N' => 0x004F, # O
3569                'NO' => 0x0054, # T
3570                'NOT' => 0x0041, # A
3571                'NOTA' => 0x0054, # T
3572                'NOTAT' => 0x0049, # I
3573                'NOTATI' => 0x004F, # O
3574              }->{$self->{kwd}}) {
3575            ## Stay in the state.
3576            $self->{kwd} .= chr $self->{nc};
3577            !!!next-input-character;
3578            redo A;
3579          } elsif ($self->{kwd} eq 'NOTATIO' and
3580                   $self->{nc} == 0x004E) { # N
3581            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3582                           line => $self->{line_prev},
3583                           column => $self->{column_prev} - 6};
3584            $self->{state} = DOCTYPE_MD_STATE;
3585            !!!next-input-character;
3586            redo A;
3587          } else {
3588            !!!parse-error (type => 'bogus comment',
3589                            line => $self->{line_prev},
3590                            column => $self->{column_prev} - 1
3591                                - (length $self->{kwd})
3592                                + 1 * ($self->{nc} == -1));
3593            $self->{state} = BOGUS_COMMENT_STATE;
3594            ## Reconsume.
3595            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3596            redo A;
3597          }
3598        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3599          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3600          ## "DOCTYPE NOTATION state".
3601    
3602          if ($is_space->{$self->{nc}}) {
3603            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3604            $self->{state} = BEFORE_MD_NAME_STATE;
3605            !!!next-input-character;
3606            redo A;
3607          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3608                   $self->{nc} == 0x0025) { # %
3609            ## XML5: Switch to the "DOCTYPE bogus comment state".
3610            !!!parse-error (type => 'no space before md name'); ## TODO: type
3611            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3612            !!!next-input-character;
3613            redo A;
3614          } elsif ($self->{nc} == -1) {
3615            !!!parse-error (type => 'unclosed md'); ## TODO: type
3616            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3617            ## Reconsume.
3618            redo A;
3619          } elsif ($self->{nc} == 0x003E) { # >
3620            ## XML5: Switch to the "DOCTYPE bogus comment state".
3621            !!!parse-error (type => 'no md name'); ## TODO: type
3622            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3623            !!!next-input-character;
3624            redo A;
3625          } else {
3626            ## XML5: Switch to the "DOCTYPE bogus comment state".
3627            !!!parse-error (type => 'no space before md name'); ## TODO: type
3628            $self->{state} = BEFORE_MD_NAME_STATE;
3629            redo A;
3630          }
3631        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3632          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3633          ## before state", "DOCTYPE ATTLIST name before state".
3634    
3635          if ($is_space->{$self->{nc}}) {
3636            ## Stay in the state.
3637            !!!next-input-character;
3638            redo A;
3639          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3640                   $self->{nc} == 0x0025) { # %
3641            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3642            !!!next-input-character;
3643            redo A;
3644          } elsif ($self->{nc} == 0x003E) { # >
3645            ## XML5: Same as "Anything else".
3646            !!!parse-error (type => 'no md name'); ## TODO: type
3647            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3648            !!!next-input-character;
3649            redo A;
3650          } elsif ($self->{nc} == -1) {
3651            !!!parse-error (type => 'unclosed md'); ## TODO: type
3652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3653            ## Reconsume.
3654            redo A;
3655          } else {
3656            ## XML5: [ATTLIST] Not defined yet.
3657            $self->{ct}->{name} .= chr $self->{nc};
3658            $self->{state} = MD_NAME_STATE;
3659            !!!next-input-character;
3660            redo A;
3661          }
3662        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3663          if ($is_space->{$self->{nc}}) {
3664            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3665            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3666            $self->{state} = BEFORE_MD_NAME_STATE;
3667            !!!next-input-character;
3668            redo A;
3669          } elsif ($self->{nc} == 0x003E) { # >
3670            ## XML5: Same as "Anything else".
3671            !!!parse-error (type => 'no md name'); ## TODO: type
3672            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3673            !!!next-input-character;
3674            redo A;
3675          } elsif ($self->{nc} == -1) {
3676            !!!parse-error (type => 'unclosed md');
3677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3678            ## Reconsume.
3679            redo A;
3680          } else {
3681            ## XML5: No parse error.
3682            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3683            $self->{state} = BOGUS_COMMENT_STATE;
3684            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3685            ## Reconsume.
3686            redo A;
3687          }
3688        } elsif ($self->{state} == MD_NAME_STATE) {
3689          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3690          
3691          if ($is_space->{$self->{nc}}) {
3692            ## TODO:
3693            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3694            !!!next-input-character;
3695            redo A;
3696          } elsif ($self->{nc} == 0x003E) { # >
3697            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3698              #
3699            } else {
3700              !!!parse-error (type => 'no md body'); ## TODO: type
3701            }
3702            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3703            !!!next-input-character;
3704            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3705            redo A;
3706          } elsif ($self->{nc} == -1) {
3707            ## XML5: [ATTLIST] No parse error.
3708            !!!parse-error (type => 'unclosed md');
3709            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3710            ## Reconsume.
3711            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3712            redo A;
3713          } else {
3714            ## XML5: [ATTLIST] Not defined yet.
3715            $self->{ct}->{name} .= chr $self->{nc};
3716            ## Stay in the state.
3717            !!!next-input-character;
3718            redo A;
3719          }
3720        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3721          if ($is_space->{$self->{nc}}) {
3722            ## Stay in the state.
3723            !!!next-input-character;
3724            redo A;
3725          } elsif ($self->{nc} == 0x003E) { # >
3726            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3727            !!!next-input-character;
3728            !!!emit ($self->{ct}); # ATTLIST
3729            redo A;
3730          } elsif ($self->{nc} == -1) {
3731            ## XML5: No parse error.
3732            !!!parse-error (type => 'unclosed md'); ## TODO: type
3733            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3734            redo A;
3735          } else {
3736            ## XML5: Not defined yet.
3737    
3738            ## TODO: ...
3739    
3740            $self->{state} = BOGUS_COMMENT_STATE;
3741            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3742            ## Reconsume.
3743            redo A;
3744          }
3745    
3746      } else {      } else {
3747        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3748      }      }

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.14

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24