/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.8 by wakaba, Wed Oct 15 04:38:22 2008 UTC | revision 1.10 by wakaba, Wed Oct 15 08:51:02 2008 UTC
# Line 437  sub _get_next_token ($) { Line 437  sub _get_next_token ($) {
437        !!!emit ($token);        !!!emit ($token);
438        redo A;        redo A;
439      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
440          ## XML5: "tag state".
441    
442        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
443          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
444            !!!cp (15);            !!!cp (15);
# Line 526  sub _get_next_token ($) { Line 528  sub _get_next_token ($) {
528              ## $self->{nc} is intentionally left as is              ## $self->{nc} is intentionally left as is
529              redo A;              redo A;
530            }            }
531          } else {          } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
532            !!!cp (23);            !!!cp (23);
533            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
534                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 541  sub _get_next_token ($) { Line 543  sub _get_next_token ($) {
543                     });                     });
544    
545            redo A;            redo A;
546            } else {
547              ## XML5: "<:" is a parse error.
548              !!!cp (23.1);
549              $self->{ct} = {type => START_TAG_TOKEN,
550                                        tag_name => chr ($self->{nc}),
551                                        line => $self->{line_prev},
552                                        column => $self->{column_prev}};
553              $self->{state} = TAG_NAME_STATE;
554              !!!next-input-character;
555              redo A;
556          }          }
557        } else {        } else {
558          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 549  sub _get_next_token ($) { Line 561  sub _get_next_token ($) {
561        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
562        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
563    
564          ## XML5: "end tag state".
565    
566        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
567        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
568          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 590  sub _get_next_token ($) { Line 604  sub _get_next_token ($) {
604          !!!next-input-character;          !!!next-input-character;
605          redo A;          redo A;
606        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
607          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
608                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
609                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
610          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
611          $self->{s_kwd} = '';          $self->{s_kwd} = '';
612          !!!next-input-character;          if ($self->{is_xml}) {
613              !!!cp (31);
614              ## XML5: No parse error.
615              
616              ## NOTE: This parser raises a parse error, since it supports
617              ## XML1, not XML5.
618    
619              ## NOTE: A short end tag token.
620              my $ct = {type => END_TAG_TOKEN,
621                        tag_name => '',
622                        line => $self->{line_prev},
623                        column => $self->{column_prev} - 1,
624                       };
625              !!!next-input-character;
626              !!!emit ($ct);
627            } else {
628              !!!cp (31.1);
629              !!!next-input-character;
630            }
631          redo A;          redo A;
632        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
633          !!!cp (32);          !!!cp (32);
# Line 610  sub _get_next_token ($) { Line 641  sub _get_next_token ($) {
641                   });                   });
642    
643          redo A;          redo A;
644        } else {        } elsif (not $self->{is_xml} or
645                   $is_space->{$self->{nc}}) {
646          !!!cp (33);          !!!cp (33);
647          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
648                            line => $self->{line_prev}, # "<" of "</"
649                            column => $self->{column_prev} - 1);
650          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
651          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
652                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 625  sub _get_next_token ($) { Line 659  sub _get_next_token ($) {
659          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
660          ## "bogus comment state" entry.          ## "bogus comment state" entry.
661          redo A;          redo A;
662          } else {
663            ## XML5: "</:" is a parse error.
664            !!!cp (30.1);
665            $self->{ct} = {type => END_TAG_TOKEN,
666                           tag_name => chr ($self->{nc}),
667                           line => $l, column => $c};
668            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
669            !!!next-input-character;
670            redo A;
671        }        }
672      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
673        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 1492  sub _get_next_token ($) { Line 1535  sub _get_next_token ($) {
1535                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1536                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1537                                   };                                   };
1538          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1539          !!!next-input-character;          !!!next-input-character;
1540          redo A;          redo A;
1541        } else {        } else {
# Line 1535  sub _get_next_token ($) { Line 1578  sub _get_next_token ($) {
1578        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
1579                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1580                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1581          !!!cp (129);          if ($self->{s_kwd} ne 'DOCTYP') {
1582              !!!cp (129);
1583              ## XML5: case-sensitive.
1584              !!!parse-error (type => 'lowercase keyword', ## TODO
1585                              text => 'DOCTYPE',
1586                              line => $self->{line_prev},
1587                              column => $self->{column_prev} - 5);
1588            } else {
1589              !!!cp (129.1);
1590            }
1591          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1592          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1593                                    quirks => 1,                                    quirks => 1,
# Line 1702  sub _get_next_token ($) { Line 1754  sub _get_next_token ($) {
1754          redo A;          redo A;
1755        }        }
1756      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1757          ## XML5: "comment dash state".
1758    
1759        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1760          !!!cp (148);          !!!cp (148);
1761          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 1737  sub _get_next_token ($) { Line 1791  sub _get_next_token ($) {
1791          redo A;          redo A;
1792        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1793          !!!cp (152);          !!!cp (152);
1794            ## XML5: Not a parse error.
1795          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1796                          line => $self->{line_prev},                          line => $self->{line_prev},
1797                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1756  sub _get_next_token ($) { Line 1811  sub _get_next_token ($) {
1811          redo A;          redo A;
1812        } else {        } else {
1813          !!!cp (154);          !!!cp (154);
1814            ## XML5: Not a parse error.
1815          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1816                          line => $self->{line_prev},                          line => $self->{line_prev},
1817                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2351  sub _get_next_token ($) { Line 2407  sub _get_next_token ($) {
2407        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
2408        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2409        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2410    
2411          ## XML5: "CDATA state".
2412                
2413        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2414          !!!cp (221.1);          !!!cp (221.1);
# Line 2367  sub _get_next_token ($) { Line 2425  sub _get_next_token ($) {
2425    
2426          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2427          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2428          !!!next-input-character;          ## Reconsume.
2429          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2430            !!!cp (221.2);            !!!cp (221.2);
2431            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2390  sub _get_next_token ($) { Line 2448  sub _get_next_token ($) {
2448    
2449        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2450      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2451          ## XML5: "CDATA bracket state".
2452    
2453        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2454          !!!cp (221.5);          !!!cp (221.5);
2455          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2397  sub _get_next_token ($) { Line 2457  sub _get_next_token ($) {
2457          redo A;          redo A;
2458        } else {        } else {
2459          !!!cp (221.6);          !!!cp (221.6);
2460            ## XML5: If EOF, "]" is not appended and changed to the data state.
2461          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2462          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2463          ## Reconsume.          ## Reconsume.
2464          redo A;          redo A;
2465        }        }
2466      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2467          ## XML5: "CDATA end state".
2468    
2469        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2470          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2471          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2425  sub _get_next_token ($) { Line 2488  sub _get_next_token ($) {
2488          !!!cp (221.11);          !!!cp (221.11);
2489          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2490          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2491          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2492          redo A;          redo A;
2493        }        }
2494      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {

Legend:
Removed from v.1.8  
Changed lines
  Added in v.1.10

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24