/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.16 by wakaba, Sat Oct 18 11:34:49 2008 UTC | revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC
# Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO
177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180  sub BOGUS_MD_STATE () { 85 }  sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188    sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189    sub BOGUS_MD_STATE () { 94 }
190    
191  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
192  ## list and descriptions)  ## list and descriptions)
# Line 2256  sub _get_next_token ($) { Line 2265  sub _get_next_token ($) {
2265          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2266          !!!next-input-character;          !!!next-input-character;
2267          redo A;          redo A;
2268  ## TODO: " and ' for ENTITY        } elsif ($self->{nc} == 0x0022 and # "
2269                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271            !!!cp (167.21);
2272            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273            $self->{ct}->{value} = ''; # ENTITY
2274            !!!next-input-character;
2275            redo A;
2276          } elsif ($self->{nc} == 0x0027 and # '
2277                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279            !!!cp (167.22);
2280            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281            $self->{ct}->{value} = ''; # ENTITY
2282            !!!next-input-character;
2283            redo A;
2284        } elsif ($self->{is_xml} and        } elsif ($self->{is_xml} and
2285                 $self->{ct}->{type} == DOCTYPE_TOKEN and                 $self->{ct}->{type} == DOCTYPE_TOKEN and
2286                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
# Line 2812  sub _get_next_token ($) { Line 2836  sub _get_next_token ($) {
2836        }        }
2837      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2839          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840          ## Stay in the state            !!!cp (215.1);
2841              $self->{state} = BEFORE_NDATA_STATE;
2842            } else {
2843              !!!cp (215);
2844              ## Stay in the state
2845            }
2846          !!!next-input-character;          !!!next-input-character;
2847          redo A;          redo A;
2848        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
# Line 2829  sub _get_next_token ($) { Line 2858  sub _get_next_token ($) {
2858          !!!next-input-character;          !!!next-input-character;
2859          !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION          !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860          redo A;          redo A;
2861  ## TODO: "NDATA"        } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862                   ($self->{nc} == 0x004E or # N
2863                    $self->{nc} == 0x006E)) { # n
2864            !!!cp (216.2);
2865            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866            $self->{state} = NDATA_STATE;
2867            $self->{kwd} = chr $self->{nc};
2868            !!!next-input-character;
2869            redo A;
2870        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2871          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872            !!!cp (217);            !!!cp (217);
# Line 2871  sub _get_next_token ($) { Line 2908  sub _get_next_token ($) {
2908          !!!next-input-character;          !!!next-input-character;
2909          redo A;          redo A;
2910        }        }
2911        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912          if ($is_space->{$self->{nc}}) {
2913            !!!cp (218.3);
2914            ## Stay in the state.
2915            !!!next-input-character;
2916            redo A;
2917          } elsif ($self->{nc} == 0x003E) { # >
2918            !!!cp (218.4);
2919            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920            !!!next-input-character;
2921            !!!emit ($self->{ct}); # ENTITY
2922            redo A;
2923          } elsif ($self->{nc} == 0x004E or # N
2924                   $self->{nc} == 0x006E) { # n
2925            !!!cp (218.5);
2926            $self->{state} = NDATA_STATE;
2927            $self->{kwd} = chr $self->{nc};
2928            !!!next-input-character;
2929            redo A;
2930          } elsif ($self->{nc} == -1) {
2931            !!!cp (218.6);
2932            !!!parse-error (type => 'unclosed md'); ## TODO: type
2933            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934            ## reconsume
2935            !!!emit ($self->{ct}); # ENTITY
2936            redo A;
2937          } else {
2938            !!!cp (218.7);
2939            !!!parse-error (type => 'string after SYSTEM literal');
2940            $self->{state} = BOGUS_MD_STATE;
2941            !!!next-input-character;
2942            redo A;
2943          }
2944      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2945        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2946          !!!cp (219);          !!!cp (219);
# Line 3632  sub _get_next_token ($) { Line 3702  sub _get_next_token ($) {
3702          $self->{state} = MD_HYPHEN_STATE;          $self->{state} = MD_HYPHEN_STATE;
3703          !!!next-input-character;          !!!next-input-character;
3704          redo A;          redo A;
3705        } elsif ($self->{nc} == 0x0045) { # E        } elsif ($self->{nc} == 0x0045 or # E
3706                   $self->{nc} == 0x0065) { # e
3707          $self->{state} = MD_E_STATE;          $self->{state} = MD_E_STATE;
3708          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3709          !!!next-input-character;          !!!next-input-character;
3710          redo A;          redo A;
3711        } elsif ($self->{nc} == 0x0041) { # A        } elsif ($self->{nc} == 0x0041 or # A
3712                   $self->{nc} == 0x0061) { # a
3713          $self->{state} = MD_ATTLIST_STATE;          $self->{state} = MD_ATTLIST_STATE;
3714          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3715          !!!next-input-character;          !!!next-input-character;
3716          redo A;          redo A;
3717        } elsif ($self->{nc} == 0x004E) { # N        } elsif ($self->{nc} == 0x004E or # N
3718                   $self->{nc} == 0x006E) { # n
3719          $self->{state} = MD_NOTATION_STATE;          $self->{state} = MD_NOTATION_STATE;
3720          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3721          !!!next-input-character;          !!!next-input-character;
# Line 3660  sub _get_next_token ($) { Line 3733  sub _get_next_token ($) {
3733        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3734        redo A;        redo A;
3735      } elsif ($self->{state} == MD_E_STATE) {      } elsif ($self->{state} == MD_E_STATE) {
3736        if ($self->{nc} == 0x004E) { # N        if ($self->{nc} == 0x004E or # N
3737              $self->{nc} == 0x006E) { # n
3738          $self->{state} = MD_ENTITY_STATE;          $self->{state} = MD_ENTITY_STATE;
3739          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3740          !!!next-input-character;          !!!next-input-character;
3741          redo A;          redo A;
3742        } elsif ($self->{nc} == 0x004C) { # L        } elsif ($self->{nc} == 0x004C or # L
3743                   $self->{nc} == 0x006C) { # l
3744          ## XML5: <!ELEMENT> not supported.          ## XML5: <!ELEMENT> not supported.
3745          $self->{state} = MD_ELEMENT_STATE;          $self->{state} = MD_ELEMENT_STATE;
3746          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
# Line 3683  sub _get_next_token ($) { Line 3758  sub _get_next_token ($) {
3758          redo A;          redo A;
3759        }        }
3760      } elsif ($self->{state} == MD_ENTITY_STATE) {      } elsif ($self->{state} == MD_ENTITY_STATE) {
3761        if ($self->{nc} == {        if ($self->{nc} == [
3762              'EN' => 0x0054, # T              undef,
3763              'ENT' => 0x0049, # I              undef,
3764              'ENTI' => 0x0054, # T              0x0054, # T
3765            }->{$self->{kwd}}) {              0x0049, # I
3766                0x0054, # T
3767              ]->[length $self->{kwd}] or
3768              $self->{nc} == [
3769                undef,
3770                undef,
3771                0x0074, # t
3772                0x0069, # i
3773                0x0074, # t
3774              ]->[length $self->{kwd}]) {
3775          ## Stay in the state.          ## Stay in the state.
3776          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3777          !!!next-input-character;          !!!next-input-character;
3778          redo A;          redo A;
3779        } elsif ($self->{kwd} eq 'ENTIT' and        } elsif ((length $self->{kwd}) == 5 and
3780                 $self->{nc} == 0x0059) { # Y                 ($self->{nc} == 0x0059 or # Y
3781          $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',                  $self->{nc} == 0x0079)) { # y
3782            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3783              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3784                              text => 'ENTITY',
3785                              line => $self->{line_prev},
3786                              column => $self->{column_prev} - 4);
3787            }
3788            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3789                         line => $self->{line_prev},                         line => $self->{line_prev},
3790                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
3791          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
# Line 3712  sub _get_next_token ($) { Line 3803  sub _get_next_token ($) {
3803          redo A;          redo A;
3804        }        }
3805      } elsif ($self->{state} == MD_ELEMENT_STATE) {      } elsif ($self->{state} == MD_ELEMENT_STATE) {
3806        if ($self->{nc} == {        if ($self->{nc} == [
3807              'EL' => 0x0045, # E             undef,
3808              'ELE' => 0x004D, # M             undef,
3809              'ELEM' => 0x0045, # E             0x0045, # E
3810              'ELEME' => 0x004E, # N             0x004D, # M
3811            }->{$self->{kwd}}) {             0x0045, # E
3812               0x004E, # N
3813              ]->[length $self->{kwd}] or
3814              $self->{nc} == [
3815               undef,
3816               undef,
3817               0x0065, # e
3818               0x006D, # m
3819               0x0065, # e
3820               0x006E, # n
3821              ]->[length $self->{kwd}]) {
3822          ## Stay in the state.          ## Stay in the state.
3823          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3824          !!!next-input-character;          !!!next-input-character;
3825          redo A;          redo A;
3826        } elsif ($self->{kwd} eq 'ELEMEN' and        } elsif ((length $self->{kwd}) == 6 and
3827                 $self->{nc} == 0x0054) { # T                 ($self->{nc} == 0x0054 or # T
3828                    $self->{nc} == 0x0074)) { # t
3829            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3830              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3831                              text => 'ELEMENT',
3832                              line => $self->{line_prev},
3833                              column => $self->{column_prev} - 5);
3834            }
3835          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3836                         line => $self->{line_prev},                         line => $self->{line_prev},
3837                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
# Line 3742  sub _get_next_token ($) { Line 3850  sub _get_next_token ($) {
3850          redo A;          redo A;
3851        }        }
3852      } elsif ($self->{state} == MD_ATTLIST_STATE) {      } elsif ($self->{state} == MD_ATTLIST_STATE) {
3853        if ($self->{nc} == {        if ($self->{nc} == [
3854              'A' => 0x0054, # T             undef,
3855              'AT' => 0x0054, # T             0x0054, # T
3856              'ATT' => 0x004C, # L             0x0054, # T
3857              'ATTL' => 0x0049, # I             0x004C, # L
3858              'ATTLI' => 0x0053, # S             0x0049, # I
3859            }->{$self->{kwd}}) {             0x0053, # S
3860              ]->[length $self->{kwd}] or
3861              $self->{nc} == [
3862               undef,
3863               0x0074, # t
3864               0x0074, # t
3865               0x006C, # l
3866               0x0069, # i
3867               0x0073, # s
3868              ]->[length $self->{kwd}]) {
3869          ## Stay in the state.          ## Stay in the state.
3870          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3871          !!!next-input-character;          !!!next-input-character;
3872          redo A;          redo A;
3873        } elsif ($self->{kwd} eq 'ATTLIS' and        } elsif ((length $self->{kwd}) == 6 and
3874                 $self->{nc} == 0x0054) { # T                 ($self->{nc} == 0x0054 or # T
3875                    $self->{nc} == 0x0074)) { # t
3876            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3877              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3878                              text => 'ATTLIST',
3879                              line => $self->{line_prev},
3880                              column => $self->{column_prev} - 5);
3881            }
3882          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3883                         attrdefs => [],                         attrdefs => [],
3884                         line => $self->{line_prev},                         line => $self->{line_prev},
# Line 3774  sub _get_next_token ($) { Line 3898  sub _get_next_token ($) {
3898          redo A;          redo A;
3899        }        }
3900      } elsif ($self->{state} == MD_NOTATION_STATE) {      } elsif ($self->{state} == MD_NOTATION_STATE) {
3901        if ($self->{nc} == {        if ($self->{nc} == [
3902              'N' => 0x004F, # O             undef,
3903              'NO' => 0x0054, # T             0x004F, # O
3904              'NOT' => 0x0041, # A             0x0054, # T
3905              'NOTA' => 0x0054, # T             0x0041, # A
3906              'NOTAT' => 0x0049, # I             0x0054, # T
3907              'NOTATI' => 0x004F, # O             0x0049, # I
3908            }->{$self->{kwd}}) {             0x004F, # O
3909              ]->[length $self->{kwd}] or
3910              $self->{nc} == [
3911               undef,
3912               0x006F, # o
3913               0x0074, # t
3914               0x0061, # a
3915               0x0074, # t
3916               0x0069, # i
3917               0x006F, # o
3918              ]->[length $self->{kwd}]) {
3919          ## Stay in the state.          ## Stay in the state.
3920          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3921          !!!next-input-character;          !!!next-input-character;
3922          redo A;          redo A;
3923        } elsif ($self->{kwd} eq 'NOTATIO' and        } elsif ((length $self->{kwd}) == 7 and
3924                 $self->{nc} == 0x004E) { # N                 ($self->{nc} == 0x004E or # N
3925                    $self->{nc} == 0x006E)) { # n
3926            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3927              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3928                              text => 'NOTATION',
3929                              line => $self->{line_prev},
3930                              column => $self->{column_prev} - 6);
3931            }
3932          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
3933                         line => $self->{line_prev},                         line => $self->{line_prev},
3934                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
# Line 4431  sub _get_next_token ($) { Line 4572  sub _get_next_token ($) {
4572          ## Reconsume.          ## Reconsume.
4573          redo A;          redo A;
4574        }        }
4575        } elsif ($self->{state} == NDATA_STATE) {
4576          ## ASCII case-insensitive
4577          if ($self->{nc} == [
4578                undef,
4579                0x0044, # D
4580                0x0041, # A
4581                0x0054, # T
4582              ]->[length $self->{kwd}] or
4583              $self->{nc} == [
4584                undef,
4585                0x0064, # d
4586                0x0061, # a
4587                0x0074, # t
4588              ]->[length $self->{kwd}]) {
4589            !!!cp (172.2);
4590            ## Stay in the state.
4591            $self->{kwd} .= chr $self->{nc};
4592            !!!next-input-character;
4593            redo A;
4594          } elsif ((length $self->{kwd}) == 4 and
4595                   ($self->{nc} == 0x0041 or # A
4596                    $self->{nc} == 0x0061)) { # a
4597            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598              !!!cp (172.3);
4599              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600                              text => 'NDATA',
4601                              line => $self->{line_prev},
4602                              column => $self->{column_prev} - 4);
4603            } else {
4604              !!!cp (172.4);
4605            }
4606            $self->{state} = AFTER_NDATA_STATE;
4607            !!!next-input-character;
4608            redo A;
4609          } else {
4610            !!!parse-error (type => 'string after literal', ## TODO: type
4611                            line => $self->{line_prev},
4612                            column => $self->{column_prev} + 1
4613                                - length $self->{kwd});
4614            !!!cp (172.5);
4615            $self->{state} = BOGUS_MD_STATE;
4616            ## Reconsume.
4617            redo A;
4618          }
4619        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620          if ($is_space->{$self->{nc}}) {
4621            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622            !!!next-input-character;
4623            redo A;
4624          } elsif ($self->{nc} == 0x003E) { # >
4625            !!!parse-error (type => 'no notation name'); ## TODO: type
4626            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627            !!!next-input-character;
4628            !!!emit ($self->{ct}); # ENTITY
4629            redo A;
4630          } elsif ($self->{nc} == -1) {
4631            !!!parse-error (type => 'unclosed md'); ## TODO: type
4632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633            !!!next-input-character;
4634            !!!emit ($self->{ct}); # ENTITY
4635            redo A;
4636          } else {
4637            !!!parse-error (type => 'string after literal', ## TODO: type
4638                            line => $self->{line_prev},
4639                            column => $self->{column_prev} + 1
4640                                - length $self->{kwd});
4641            $self->{state} = BOGUS_MD_STATE;
4642            ## Reconsume.
4643            redo A;
4644          }
4645        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646          if ($is_space->{$self->{nc}}) {
4647            ## Stay in the state.
4648            !!!next-input-character;
4649            redo A;
4650          } elsif ($self->{nc} == 0x003E) { # >
4651            !!!parse-error (type => 'no notation name'); ## TODO: type
4652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653            !!!next-input-character;
4654            !!!emit ($self->{ct}); # ENTITY
4655            redo A;
4656          } elsif ($self->{nc} == -1) {
4657            !!!parse-error (type => 'unclosed md'); ## TODO: type
4658            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659            !!!next-input-character;
4660            !!!emit ($self->{ct}); # ENTITY
4661            redo A;
4662          } else {
4663            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664            $self->{state} = NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          }
4668        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669          if ($is_space->{$self->{nc}}) {
4670            $self->{state} = AFTER_NOTATION_NAME_STATE;
4671            !!!next-input-character;
4672            redo A;
4673          } elsif ($self->{nc} == 0x003E) { # >
4674            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675            !!!next-input-character;
4676            !!!emit ($self->{ct}); # ENTITY
4677            redo A;
4678          } elsif ($self->{nc} == -1) {
4679            !!!parse-error (type => 'unclosed md'); ## TODO: type
4680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681            !!!next-input-character;
4682            !!!emit ($self->{ct}); # ENTITY
4683            redo A;
4684          } else {
4685            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686            ## Stay in the state.
4687            !!!next-input-character;
4688            redo A;
4689          }
4690        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691          if ($self->{nc} == 0x0022) { # "
4692            $self->{state} = AFTER_NOTATION_NAME_STATE;
4693            !!!next-input-character;
4694            redo A;
4695          } elsif ($self->{nc} == 0x0026) { # &
4696            $self->{prev_state} = $self->{state};
4697            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698            $self->{entity_add} = 0x0022; # "
4699            !!!next-input-character;
4700            redo A;
4701    ## TODO: %
4702          } elsif ($self->{nc} == -1) {
4703            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705            ## Reconsume.
4706            !!!emit ($self->{ct}); # ENTITY
4707            redo A;
4708          } else {
4709            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710            !!!next-input-character;
4711            redo A;
4712          }
4713        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714          if ($self->{nc} == 0x0027) { # '
4715            $self->{state} = AFTER_NOTATION_NAME_STATE;
4716            !!!next-input-character;
4717            redo A;
4718          } elsif ($self->{nc} == 0x0026) { # &
4719            $self->{prev_state} = $self->{state};
4720            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721            $self->{entity_add} = 0x0027; # '
4722            !!!next-input-character;
4723            redo A;
4724    ## TODO: %
4725          } elsif ($self->{nc} == -1) {
4726            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728            ## Reconsume.
4729            !!!emit ($self->{ct}); # ENTITY
4730            redo A;
4731          } else {
4732            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733            !!!next-input-character;
4734            redo A;
4735          }
4736        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737          ## TODO: XMLize
4738    
4739          if ($is_space->{$self->{nc}} or
4740              {
4741                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742                $self->{entity_add} => 1,
4743              }->{$self->{nc}}) {
4744            ## Don't consume
4745            ## No error
4746            ## Return nothing.
4747            #
4748          } elsif ($self->{nc} == 0x0023) { # #
4749            $self->{ca} = $self->{ct};
4750            $self->{state} = ENTITY_HASH_STATE;
4751            $self->{kwd} = '#';
4752            !!!next-input-character;
4753            redo A;
4754          } elsif ((0x0041 <= $self->{nc} and
4755                    $self->{nc} <= 0x005A) or # A..Z
4756                   (0x0061 <= $self->{nc} and
4757                    $self->{nc} <= 0x007A)) { # a..z
4758            #
4759          } else {
4760            !!!parse-error (type => 'bare ero');
4761            ## Return nothing.
4762            #
4763          }
4764    
4765          $self->{ct}->{value} .= '&';
4766          $self->{state} = $self->{prev_state};
4767          ## Reconsume.
4768          redo A;
4769        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770          if ($is_space->{$self->{nc}}) {
4771            ## Stay in the state.
4772            !!!next-input-character;
4773            redo A;
4774          } elsif ($self->{nc} == 0x003E) { # >
4775            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776            !!!next-input-character;
4777            !!!emit ($self->{ct}); # ENTITY
4778            redo A;
4779          } elsif ($self->{nc} == -1) {
4780            !!!parse-error (type => 'unclosed md'); ## TODO: type
4781            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782            !!!next-input-character;
4783            !!!emit ($self->{ct}); # ENTITY
4784            redo A;
4785          } else {
4786            !!!parse-error (type => 'string after notation name'); ## TODO: type
4787            $self->{state} = BOGUS_MD_STATE;
4788            ## Reconsume.
4789            redo A;
4790          }
4791      } elsif ($self->{state} == BOGUS_MD_STATE) {      } elsif ($self->{state} == BOGUS_MD_STATE) {
4792        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4793          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;

Legend:
Removed from v.1.16  
changed lines
  Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24