/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.7 by wakaba, Tue Oct 14 15:25:50 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 198  sub _initialize_tokenizer ($) { Line 206  sub _initialize_tokenizer ($) {
206    
207  ## A token has:  ## A token has:
208  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
210  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
211  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212    ##   ->{target} (PI_TOKEN)
213  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
214  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
215  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 208  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{name}  ##        ->{name}
218  ##        ->{value}  ##        ->{value}
219  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
220  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
221    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
222  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
224  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
225  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
226  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 429  sub _get_next_token ($) { Line 440  sub _get_next_token ($) {
440        !!!emit ($token);        !!!emit ($token);
441        redo A;        redo A;
442      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
443          ## XML5: "tag state".
444    
445        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
447            !!!cp (15);            !!!cp (15);
# Line 500  sub _get_next_token ($) { Line 513  sub _get_next_token ($) {
513    
514            redo A;            redo A;
515          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
516            !!!cp (22);            if ($self->{is_xml}) {
517            !!!parse-error (type => 'pio',              !!!cp (22.1);
518                            line => $self->{line_prev},              $self->{state} = PI_STATE;
519                            column => $self->{column_prev});              !!!next-input-character;
520            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
521            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
522                                      line => $self->{line_prev},              !!!cp (22);
523                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
524                                     };                              line => $self->{line_prev},
525            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
526            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
527          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
528                               line => $self->{line_prev},
529                               column => $self->{column_prev},
530                              };
531                ## $self->{nc} is intentionally left as is
532                redo A;
533              }
534            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
535            !!!cp (23);            !!!cp (23);
536            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
537                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 526  sub _get_next_token ($) { Line 546  sub _get_next_token ($) {
546                     });                     });
547    
548            redo A;            redo A;
549            } else {
550              ## XML5: "<:" is a parse error.
551              !!!cp (23.1);
552              $self->{ct} = {type => START_TAG_TOKEN,
553                                        tag_name => chr ($self->{nc}),
554                                        line => $self->{line_prev},
555                                        column => $self->{column_prev}};
556              $self->{state} = TAG_NAME_STATE;
557              !!!next-input-character;
558              redo A;
559          }          }
560        } else {        } else {
561          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 534  sub _get_next_token ($) { Line 564  sub _get_next_token ($) {
564        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
565        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
566    
567          ## XML5: "end tag state".
568    
569        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
570        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
571          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 575  sub _get_next_token ($) { Line 607  sub _get_next_token ($) {
607          !!!next-input-character;          !!!next-input-character;
608          redo A;          redo A;
609        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
610          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
611                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
612                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
613          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
614          $self->{s_kwd} = '';          $self->{s_kwd} = '';
615          !!!next-input-character;          if ($self->{is_xml}) {
616              !!!cp (31);
617              ## XML5: No parse error.
618              
619              ## NOTE: This parser raises a parse error, since it supports
620              ## XML1, not XML5.
621    
622              ## NOTE: A short end tag token.
623              my $ct = {type => END_TAG_TOKEN,
624                        tag_name => '',
625                        line => $self->{line_prev},
626                        column => $self->{column_prev} - 1,
627                       };
628              !!!next-input-character;
629              !!!emit ($ct);
630            } else {
631              !!!cp (31.1);
632              !!!next-input-character;
633            }
634          redo A;          redo A;
635        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
636          !!!cp (32);          !!!cp (32);
# Line 595  sub _get_next_token ($) { Line 644  sub _get_next_token ($) {
644                   });                   });
645    
646          redo A;          redo A;
647        } else {        } elsif (not $self->{is_xml} or
648                   $is_space->{$self->{nc}}) {
649          !!!cp (33);          !!!cp (33);
650          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
651                            line => $self->{line_prev}, # "<" of "</"
652                            column => $self->{column_prev} - 1);
653          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
654          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
655                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 610  sub _get_next_token ($) { Line 662  sub _get_next_token ($) {
662          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
663          ## "bogus comment state" entry.          ## "bogus comment state" entry.
664          redo A;          redo A;
665          } else {
666            ## XML5: "</:" is a parse error.
667            !!!cp (30.1);
668            $self->{ct} = {type => END_TAG_TOKEN,
669                           tag_name => chr ($self->{nc}),
670                           line => $l, column => $c};
671            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
672            !!!next-input-character;
673            redo A;
674        }        }
675      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
676        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 740  sub _get_next_token ($) { Line 801  sub _get_next_token ($) {
801          redo A;          redo A;
802        }        }
803      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
804          ## XML5: "Tag attribute name before state".
805    
806        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
807          !!!cp (45);          !!!cp (45);
808          ## Stay in the state          ## Stay in the state
# Line 812  sub _get_next_token ($) { Line 875  sub _get_next_token ($) {
875               0x003D => 1, # =               0x003D => 1, # =
876              }->{$self->{nc}}) {              }->{$self->{nc}}) {
877            !!!cp (55);            !!!cp (55);
878              ## XML5: Not a parse error.
879            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
880          } else {          } else {
881            !!!cp (56);            !!!cp (56);
882              ## XML5: ":" raises a parse error and is ignored.
883          }          }
884          $self->{ca}          $self->{ca}
885              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 825  sub _get_next_token ($) { Line 890  sub _get_next_token ($) {
890          redo A;          redo A;
891        }        }
892      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
893          ## XML5: "Tag attribute name state".
894    
895        my $before_leave = sub {        my $before_leave = sub {
896          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
897              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 835  sub _get_next_token ($) { Line 902  sub _get_next_token ($) {
902            !!!cp (58);            !!!cp (58);
903            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
904              = $self->{ca};              = $self->{ca};
905              $self->{ca}->{index} = ++$self->{ct}->{last_index};
906          }          }
907        }; # $before_leave        }; # $before_leave
908    
# Line 851  sub _get_next_token ($) { Line 919  sub _get_next_token ($) {
919          !!!next-input-character;          !!!next-input-character;
920          redo A;          redo A;
921        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
922            if ($self->{is_xml}) {
923              !!!cp (60.1);
924              ## XML5: Not a parse error.
925              !!!parse-error (type => 'no attr value'); ## TODO: type
926            } else {
927              !!!cp (60.2);
928            }
929    
930          $before_leave->();          $before_leave->();
931          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
932            !!!cp (61);            !!!cp (61);
# Line 880  sub _get_next_token ($) { Line 956  sub _get_next_token ($) {
956          !!!next-input-character;          !!!next-input-character;
957          redo A;          redo A;
958        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
959          !!!cp (64);          if ($self->{is_xml}) {
960              !!!cp (64);
961              ## XML5: Not a parse error.
962              !!!parse-error (type => 'no attr value'); ## TODO: type
963            } else {
964              !!!cp (64.1);
965            }
966            
967          $before_leave->();          $before_leave->();
968          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
969          !!!next-input-character;          !!!next-input-character;
# Line 914  sub _get_next_token ($) { Line 997  sub _get_next_token ($) {
997          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
998              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
999            !!!cp (69);            !!!cp (69);
1000              ## XML5: Not a parse error.
1001            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1002          } else {          } else {
1003            !!!cp (70);            !!!cp (70);
# Line 924  sub _get_next_token ($) { Line 1008  sub _get_next_token ($) {
1008          redo A;          redo A;
1009        }        }
1010      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1011          ## XML5: "Tag attribute name after state".
1012          
1013        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1014          !!!cp (71);          !!!cp (71);
1015          ## Stay in the state          ## Stay in the state
# Line 935  sub _get_next_token ($) { Line 1021  sub _get_next_token ($) {
1021          !!!next-input-character;          !!!next-input-character;
1022          redo A;          redo A;
1023        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1024            if ($self->{is_xml}) {
1025              !!!cp (72.1);
1026              ## XML5: Not a parse error.
1027              !!!parse-error (type => 'no attr value'); ## TODO: type
1028            } else {
1029              !!!cp (72.2);
1030            }
1031    
1032          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033            !!!cp (73);            !!!cp (73);
1034            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 968  sub _get_next_token ($) { Line 1062  sub _get_next_token ($) {
1062          !!!next-input-character;          !!!next-input-character;
1063          redo A;          redo A;
1064        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1065          !!!cp (77);          if ($self->{is_xml}) {
1066              !!!cp (77);
1067              ## XML5: Not a parse error.
1068              !!!parse-error (type => 'no attr value'); ## TODO: type
1069            } else {
1070              !!!cp (77.1);
1071            }
1072            
1073          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1074          !!!next-input-character;          !!!next-input-character;
1075          redo A;          redo A;
# Line 997  sub _get_next_token ($) { Line 1098  sub _get_next_token ($) {
1098    
1099          redo A;          redo A;
1100        } else {        } else {
1101            if ($self->{is_xml}) {
1102              !!!cp (78.1);
1103              ## XML5: Not a parse error.
1104              !!!parse-error (type => 'no attr value'); ## TODO: type
1105            } else {
1106              !!!cp (78.2);
1107            }
1108    
1109          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1110              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1111            !!!cp (78);            !!!cp (78);
1112              ## XML5: Not a parse error.
1113            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1114          } else {          } else {
1115            !!!cp (82);            !!!cp (82);
# Line 1013  sub _get_next_token ($) { Line 1123  sub _get_next_token ($) {
1123          redo A;                  redo A;        
1124        }        }
1125      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1126          ## XML5: "Tag attribute value before state".
1127    
1128        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1129          !!!cp (83);          !!!cp (83);
1130          ## Stay in the state          ## Stay in the state
# Line 1084  sub _get_next_token ($) { Line 1196  sub _get_next_token ($) {
1196        } else {        } else {
1197          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1198            !!!cp (93);            !!!cp (93);
1199              ## XML5: Not a parse error.
1200            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1201            } elsif ($self->{is_xml}) {
1202              !!!cp (93.1);
1203              ## XML5: No parse error.
1204              !!!parse-error (type => 'unquoted attr value'); ## TODO
1205          } else {          } else {
1206            !!!cp (94);            !!!cp (94);
1207          }          }
# Line 1094  sub _get_next_token ($) { Line 1211  sub _get_next_token ($) {
1211          redo A;          redo A;
1212        }        }
1213      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1214          ## XML5: "Tag attribute value double quoted state".
1215          
1216        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1217          !!!cp (95);          !!!cp (95);
1218            ## XML5: "Tag attribute name before state".
1219          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1220          !!!next-input-character;          !!!next-input-character;
1221          redo A;          redo A;
1222        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1223          !!!cp (96);          !!!cp (96);
1224            ## XML5: Not defined yet.
1225    
1226          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1227          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1228          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1135  sub _get_next_token ($) { Line 1257  sub _get_next_token ($) {
1257    
1258          redo A;          redo A;
1259        } else {        } else {
1260          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1261              !!!cp (100);
1262              ## XML5: Not a parse error.
1263              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1264            } else {
1265              !!!cp (100.1);
1266            }
1267          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1268          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1269                                q["&],                                q["&<],
1270                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1271    
1272          ## Stay in the state          ## Stay in the state
# Line 1146  sub _get_next_token ($) { Line 1274  sub _get_next_token ($) {
1274          redo A;          redo A;
1275        }        }
1276      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1277          ## XML5: "Tag attribute value single quoted state".
1278    
1279        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1280          !!!cp (101);          !!!cp (101);
1281            ## XML5: "Before attribute name state" (sic).
1282          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1283          !!!next-input-character;          !!!next-input-character;
1284          redo A;          redo A;
1285        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1286          !!!cp (102);          !!!cp (102);
1287            ## XML5: Not defined yet.
1288    
1289          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1290          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1291          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1187  sub _get_next_token ($) { Line 1320  sub _get_next_token ($) {
1320    
1321          redo A;          redo A;
1322        } else {        } else {
1323          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1324              !!!cp (106);
1325              ## XML5: Not a parse error.
1326              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1327            } else {
1328              !!!cp (106.1);
1329            }
1330          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1331          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1332                                q['&],                                q['&<],
1333                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1334    
1335          ## Stay in the state          ## Stay in the state
# Line 1198  sub _get_next_token ($) { Line 1337  sub _get_next_token ($) {
1337          redo A;          redo A;
1338        }        }
1339      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1340          ## XML5: "Tag attribute value unquoted state".
1341    
1342        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1343          !!!cp (107);          !!!cp (107);
1344            ## XML5: "Tag attribute name before state".
1345          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346          !!!next-input-character;          !!!next-input-character;
1347          redo A;          redo A;
1348        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1349          !!!cp (108);          !!!cp (108);
1350    
1351            ## XML5: Not defined yet.
1352    
1353          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1354          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1355          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1268  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413               0x003D => 1, # =               0x003D => 1, # =
1414              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1415            !!!cp (115);            !!!cp (115);
1416              ## XML5: Not a parse error.
1417            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1418          } else {          } else {
1419            !!!cp (116);            !!!cp (116);
# Line 1344  sub _get_next_token ($) { Line 1490  sub _get_next_token ($) {
1490          redo A;          redo A;
1491        }        }
1492      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1493          ## XML5: "Empty tag state".
1494    
1495        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1496          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1497            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1385  sub _get_next_token ($) { Line 1533  sub _get_next_token ($) {
1533          } else {          } else {
1534            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1535          }          }
1536            ## XML5: "Tag attribute name before state".
1537          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1538          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1539          ## Reconsume.          ## Reconsume.
# Line 1477  sub _get_next_token ($) { Line 1626  sub _get_next_token ($) {
1626                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1627                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1628                                   };                                   };
1629          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1630          !!!next-input-character;          !!!next-input-character;
1631          redo A;          redo A;
1632        } else {        } else {
# Line 1520  sub _get_next_token ($) { Line 1669  sub _get_next_token ($) {
1669        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
1670                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1671                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1672          !!!cp (129);          if ($self->{s_kwd} ne 'DOCTYP') {
1673              !!!cp (129);
1674              ## XML5: case-sensitive.
1675              !!!parse-error (type => 'lowercase keyword', ## TODO
1676                              text => 'DOCTYPE',
1677                              line => $self->{line_prev},
1678                              column => $self->{column_prev} - 5);
1679            } else {
1680              !!!cp (129.1);
1681            }
1682          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1683          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1684                                    quirks => 1,                                    quirks => 1,
# Line 1558  sub _get_next_token ($) { Line 1716  sub _get_next_token ($) {
1716          redo A;          redo A;
1717        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
1718                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
         !!!cp (135.2);  
   
1719          if ($self->{is_xml} and          if ($self->{is_xml} and
1720              not $self->{tainted} and              not $self->{tainted} and
1721              @{$self->{open_elements} or []} == 0) {              @{$self->{open_elements} or []} == 0) {
1722              !!!cp (135.2);
1723            !!!parse-error (type => 'cdata outside of root element',            !!!parse-error (type => 'cdata outside of root element',
1724                            line => $self->{line_prev},                            line => $self->{line_prev},
1725                            column => $self->{column_prev} - 7);                            column => $self->{column_prev} - 7);
1726            $self->{tainted} = 1;            $self->{tainted} = 1;
1727            } else {
1728              !!!cp (135.21);
1729          }          }
1730    
1731          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
# Line 1686  sub _get_next_token ($) { Line 1845  sub _get_next_token ($) {
1845          redo A;          redo A;
1846        }        }
1847      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1848          ## XML5: "comment dash state".
1849    
1850        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1851          !!!cp (148);          !!!cp (148);
1852          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 1721  sub _get_next_token ($) { Line 1882  sub _get_next_token ($) {
1882          redo A;          redo A;
1883        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1884          !!!cp (152);          !!!cp (152);
1885            ## XML5: Not a parse error.
1886          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1887                          line => $self->{line_prev},                          line => $self->{line_prev},
1888                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1740  sub _get_next_token ($) { Line 1902  sub _get_next_token ($) {
1902          redo A;          redo A;
1903        } else {        } else {
1904          !!!cp (154);          !!!cp (154);
1905            ## XML5: Not a parse error.
1906          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1907                          line => $self->{line_prev},                          line => $self->{line_prev},
1908                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2335  sub _get_next_token ($) { Line 2498  sub _get_next_token ($) {
2498        ## NOTE: "CDATA section state" in the spec is jointly implemented        ## NOTE: "CDATA section state" in the spec is jointly implemented
2499        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2500        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2501    
2502          ## XML5: "CDATA state".
2503                
2504        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2505          !!!cp (221.1);          !!!cp (221.1);
# Line 2343  sub _get_next_token ($) { Line 2508  sub _get_next_token ($) {
2508          redo A;          redo A;
2509        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2510          if ($self->{is_xml}) {          if ($self->{is_xml}) {
2511              !!!cp (221.11);
2512            !!!parse-error (type => 'no mse'); ## TODO: type            !!!parse-error (type => 'no mse'); ## TODO: type
2513            } else {
2514              !!!cp (221.12);
2515          }          }
2516    
2517          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2518          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2519          !!!next-input-character;          ## Reconsume.
2520          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2521            !!!cp (221.2);            !!!cp (221.2);
2522            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2371  sub _get_next_token ($) { Line 2539  sub _get_next_token ($) {
2539    
2540        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2541      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2542          ## XML5: "CDATA bracket state".
2543    
2544        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2545          !!!cp (221.5);          !!!cp (221.5);
2546          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2378  sub _get_next_token ($) { Line 2548  sub _get_next_token ($) {
2548          redo A;          redo A;
2549        } else {        } else {
2550          !!!cp (221.6);          !!!cp (221.6);
2551            ## XML5: If EOF, "]" is not appended and changed to the data state.
2552          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2553          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2554          ## Reconsume.          ## Reconsume.
2555          redo A;          redo A;
2556        }        }
2557      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2558          ## XML5: "CDATA end state".
2559    
2560        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2561          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2562          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2406  sub _get_next_token ($) { Line 2579  sub _get_next_token ($) {
2579          !!!cp (221.11);          !!!cp (221.11);
2580          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2581          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2582          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2583          redo A;          redo A;
2584        }        }
2585      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 2780  sub _get_next_token ($) { Line 2953  sub _get_next_token ($) {
2953          ## Reconsume.          ## Reconsume.
2954          redo A;          redo A;
2955        }        }
2956    
2957        ## XML-only states
2958    
2959        } elsif ($self->{state} == PI_STATE) {
2960          if ($is_space->{$self->{nc}} or
2961              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2962              $self->{nc} == -1) {
2963            !!!parse-error (type => 'bare pio', ## TODO: type
2964                            line => $self->{line_prev},
2965                            column => $self->{column_prev}
2966                                - 1 * ($self->{nc} != -1));
2967            $self->{state} = BOGUS_COMMENT_STATE;
2968            ## Reconsume.
2969            $self->{ct} = {type => COMMENT_TOKEN,
2970                           data => '?',
2971                           line => $self->{line_prev},
2972                           column => $self->{column_prev}
2973                               - 1 * ($self->{nc} != -1),
2974                          };
2975            redo A;
2976          } else {
2977            $self->{ct} = {type => PI_TOKEN,
2978                           target => chr $self->{nc},
2979                           data => '',
2980                           line => $self->{line_prev},
2981                           column => $self->{column_prev} - 1,
2982                          };
2983            $self->{state} = PI_TARGET_STATE;
2984            !!!next-input-character;
2985            redo A;
2986          }
2987        } elsif ($self->{state} == PI_TARGET_STATE) {
2988          if ($is_space->{$self->{nc}}) {
2989            $self->{state} = PI_TARGET_AFTER_STATE;
2990            !!!next-input-character;
2991            redo A;
2992          } elsif ($self->{nc} == -1) {
2993            !!!parse-error (type => 'no pic'); ## TODO: type
2994            $self->{state} = DATA_STATE;
2995            $self->{s_kwd} = '';
2996            ## Reconsume.
2997            !!!emit ($self->{ct}); # pi
2998            redo A;
2999          } elsif ($self->{nc} == 0x003F) { # ?
3000            $self->{state} = PI_AFTER_STATE;
3001            !!!next-input-character;
3002            redo A;
3003          } else {
3004            ## XML5: typo ("tag name" -> "target")
3005            $self->{ct}->{target} .= chr $self->{nc}; # pi
3006            !!!next-input-character;
3007            redo A;
3008          }
3009        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3010          if ($is_space->{$self->{nc}}) {
3011            ## Stay in the state.
3012            !!!next-input-character;
3013            redo A;
3014          } else {
3015            $self->{state} = PI_DATA_STATE;
3016            ## Reprocess.
3017            redo A;
3018          }
3019        } elsif ($self->{state} == PI_DATA_STATE) {
3020          if ($self->{nc} == 0x003F) { # ?
3021            $self->{state} = PI_DATA_AFTER_STATE;
3022            !!!next-input-character;
3023            redo A;
3024          } elsif ($self->{nc} == -1) {
3025            !!!parse-error (type => 'no pic'); ## TODO: type
3026            $self->{state} = DATA_STATE;
3027            $self->{s_kwd} = '';
3028            ## Reprocess.
3029            !!!emit ($self->{ct}); # pi
3030            redo A;
3031          } else {
3032            $self->{ct}->{data} .= chr $self->{nc}; # pi
3033            $self->{read_until}->($self->{ct}->{data}, q[?],
3034                                  length $self->{ct}->{data});
3035            ## Stay in the state.
3036            !!!next-input-character;
3037            ## Reprocess.
3038            redo A;
3039          }
3040        } elsif ($self->{state} == PI_AFTER_STATE) {
3041          if ($self->{nc} == 0x003E) { # >
3042            $self->{state} = DATA_STATE;
3043            $self->{s_kwd} = '';
3044            !!!next-input-character;
3045            !!!emit ($self->{ct}); # pi
3046            redo A;
3047          } elsif ($self->{nc} == 0x003F) { # ?
3048            !!!parse-error (type => 'no s after target', ## TODO: type
3049                            line => $self->{line_prev},
3050                            column => $self->{column_prev}); ## XML5: no error
3051            $self->{ct}->{data} .= '?';
3052            $self->{state} = PI_DATA_AFTER_STATE;
3053            !!!next-input-character;
3054            redo A;
3055          } else {
3056            !!!parse-error (type => 'no s after target', ## TODO: type
3057                            line => $self->{line_prev},
3058                            column => $self->{column_prev}
3059                                + 1 * ($self->{nc} == -1)); ## XML5: no error
3060            $self->{ct}->{data} .= '?'; ## XML5: not appended
3061            $self->{state} = PI_DATA_STATE;
3062            ## Reprocess.
3063            redo A;
3064          }
3065        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3066          ## XML5: Same as "pi after state" in XML5
3067          if ($self->{nc} == 0x003E) { # >
3068            $self->{state} = DATA_STATE;
3069            $self->{s_kwd} = '';
3070            !!!next-input-character;
3071            !!!emit ($self->{ct}); # pi
3072            redo A;
3073          } elsif ($self->{nc} == 0x003F) { # ?
3074            $self->{ct}->{data} .= '?';
3075            ## Stay in the state.
3076            !!!next-input-character;
3077            redo A;
3078          } else {
3079            $self->{ct}->{data} .= '?'; ## XML5: not appended
3080            $self->{state} = PI_DATA_STATE;
3081            ## Reprocess.
3082            redo A;
3083          }
3084            
3085      } else {      } else {
3086        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3087      }      }

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24