/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.5 by wakaba, Tue Oct 14 14:38:59 2008 UTC revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 209  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{value}  ##        ->{value}
218  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
219  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
220    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
221  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
223  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 499  sub _get_next_token ($) { Line 508  sub _get_next_token ($) {
508    
509            redo A;            redo A;
510          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
511            !!!cp (22);            if ($self->{is_xml}) {
512            !!!parse-error (type => 'pio',              !!!cp (22.1);
513                            line => $self->{line_prev},              $self->{state} = PI_STATE;
514                            column => $self->{column_prev});              !!!next-input-character;
515            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
516            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
517                                      line => $self->{line_prev},              !!!cp (22);
518                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
519                                     };                              line => $self->{line_prev},
520            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
521            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
522          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
523                               line => $self->{line_prev},
524                               column => $self->{column_prev},
525                              };
526                ## $self->{nc} is intentionally left as is
527                redo A;
528              }
529            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
530            !!!cp (23);            !!!cp (23);
531            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
532                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 525  sub _get_next_token ($) { Line 541  sub _get_next_token ($) {
541                     });                     });
542    
543            redo A;            redo A;
544            } else {
545              ## XML5: "<:" is a parse error.
546              !!!cp (23.1);
547              $self->{ct} = {type => START_TAG_TOKEN,
548                                        tag_name => chr ($self->{nc}),
549                                        line => $self->{line_prev},
550                                        column => $self->{column_prev}};
551              $self->{state} = TAG_NAME_STATE;
552              !!!next-input-character;
553              redo A;
554          }          }
555        } else {        } else {
556          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 1557  sub _get_next_token ($) { Line 1583  sub _get_next_token ($) {
1583          redo A;          redo A;
1584        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
1585                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1586          !!!cp (135.2);          if ($self->{is_xml} and
1587                not $self->{tainted} and
1588                @{$self->{open_elements} or []} == 0) {
1589              !!!cp (135.2);
1590              !!!parse-error (type => 'cdata outside of root element',
1591                              line => $self->{line_prev},
1592                              column => $self->{column_prev} - 7);
1593              $self->{tainted} = 1;
1594            } else {
1595              !!!cp (135.21);
1596            }
1597    
1598          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1599                                    data => '',                                    data => '',
1600                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2331  sub _get_next_token ($) { Line 2368  sub _get_next_token ($) {
2368          !!!next-input-character;          !!!next-input-character;
2369          redo A;          redo A;
2370        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2371            if ($self->{is_xml}) {
2372              !!!cp (221.11);
2373              !!!parse-error (type => 'no mse'); ## TODO: type
2374            } else {
2375              !!!cp (221.12);
2376            }
2377    
2378          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2379          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2380          !!!next-input-character;          !!!next-input-character;
# Line 2542  sub _get_next_token ($) { Line 2586  sub _get_next_token ($) {
2586          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2587          ## Reconsume.          ## Reconsume.
2588          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2589                      has_reference => 1,
2590                    line => $l, column => $c,                    line => $l, column => $c,
2591                   });                   });
2592          redo A;          redo A;
# Line 2654  sub _get_next_token ($) { Line 2699  sub _get_next_token ($) {
2699          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2700          ## Reconsume.          ## Reconsume.
2701          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2702                      has_reference => 1,
2703                    line => $l, column => $c,                    line => $l, column => $c,
2704                   });                   });
2705          redo A;          redo A;
# Line 2749  sub _get_next_token ($) { Line 2795  sub _get_next_token ($) {
2795          ## Reconsume.          ## Reconsume.
2796          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
2797                    data => $data,                    data => $data,
2798                      has_reference => $has_ref,
2799                    line => $self->{line_prev},                    line => $self->{line_prev},
2800                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
2801                   });                   });
# Line 2762  sub _get_next_token ($) { Line 2809  sub _get_next_token ($) {
2809          ## Reconsume.          ## Reconsume.
2810          redo A;          redo A;
2811        }        }
2812    
2813        ## XML-only states
2814    
2815        } elsif ($self->{state} == PI_STATE) {
2816          if ($is_space->{$self->{nc}} or
2817              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2818              $self->{nc} == -1) {
2819            !!!parse-error (type => 'bare pio', ## TODO: type
2820                            line => $self->{line_prev},
2821                            column => $self->{column_prev}
2822                                - 1 * ($self->{nc} != -1));
2823            $self->{state} = BOGUS_COMMENT_STATE;
2824            ## Reconsume.
2825            $self->{ct} = {type => COMMENT_TOKEN,
2826                           data => '?',
2827                           line => $self->{line_prev},
2828                           column => $self->{column_prev}
2829                               - 1 * ($self->{nc} != -1),
2830                          };
2831            redo A;
2832          } else {
2833            $self->{ct} = {type => PI_TOKEN,
2834                           target => chr $self->{nc},
2835                           data => '',
2836                           line => $self->{line_prev},
2837                           column => $self->{column_prev} - 1,
2838                          };
2839            $self->{state} = PI_TARGET_STATE;
2840            !!!next-input-character;
2841            redo A;
2842          }
2843        } elsif ($self->{state} == PI_TARGET_STATE) {
2844          if ($is_space->{$self->{nc}}) {
2845            $self->{state} = PI_TARGET_AFTER_STATE;
2846            !!!next-input-character;
2847            redo A;
2848          } elsif ($self->{nc} == -1) {
2849            !!!parse-error (type => 'no pic'); ## TODO: type
2850            $self->{state} = DATA_STATE;
2851            $self->{s_kwd} = '';
2852            ## Reconsume.
2853            !!!emit ($self->{ct}); # pi
2854            redo A;
2855          } elsif ($self->{nc} == 0x003F) { # ?
2856            $self->{state} = PI_AFTER_STATE;
2857            !!!next-input-character;
2858            redo A;
2859          } else {
2860            ## XML5: typo ("tag name" -> "target")
2861            $self->{ct}->{target} .= chr $self->{nc}; # pi
2862            !!!next-input-character;
2863            redo A;
2864          }
2865        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
2866          if ($is_space->{$self->{nc}}) {
2867            ## Stay in the state.
2868            !!!next-input-character;
2869            redo A;
2870          } else {
2871            $self->{state} = PI_DATA_STATE;
2872            ## Reprocess.
2873            redo A;
2874          }
2875        } elsif ($self->{state} == PI_DATA_STATE) {
2876          if ($self->{nc} == 0x003F) { # ?
2877            $self->{state} = PI_DATA_AFTER_STATE;
2878            !!!next-input-character;
2879            redo A;
2880          } elsif ($self->{nc} == -1) {
2881            !!!parse-error (type => 'no pic'); ## TODO: type
2882            $self->{state} = DATA_STATE;
2883            $self->{s_kwd} = '';
2884            ## Reprocess.
2885            !!!emit ($self->{ct}); # pi
2886            redo A;
2887          } else {
2888            $self->{ct}->{data} .= chr $self->{nc}; # pi
2889            $self->{read_until}->($self->{ct}->{data}, q[?],
2890                                  length $self->{ct}->{data});
2891            ## Stay in the state.
2892            !!!next-input-character;
2893            ## Continue in this state with the newly read character.
2894            redo A;
2895          }
2896        } elsif ($self->{state} == PI_AFTER_STATE) {
2897          if ($self->{nc} == 0x003E) { # >
2898            $self->{state} = DATA_STATE;
2899            $self->{s_kwd} = '';
2900            !!!next-input-character;
2901            !!!emit ($self->{ct}); # pi
2902            redo A;
2903          } elsif ($self->{nc} == 0x003F) { # ?
2904            !!!parse-error (type => 'no s after target', ## TODO: type
2905                            line => $self->{line_prev},
2906                            column => $self->{column_prev}); ## XML5: no error
2907            $self->{ct}->{data} .= '?';
2908            $self->{state} = PI_DATA_AFTER_STATE;
2909            !!!next-input-character;
2910            redo A;
2911          } else {
2912            !!!parse-error (type => 'no s after target', ## TODO: type
2913                            line => $self->{line_prev},
2914                            column => $self->{column_prev}
2915                                + 1 * ($self->{nc} == -1)); ## XML5: no error
2916            $self->{ct}->{data} .= '?'; ## XML5: not appended
2917            $self->{state} = PI_DATA_STATE;
2918            ## Reprocess.
2919            redo A;
2920          }
2921        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
2922          ## XML5: Same as "pi after state" in XML5
2923          if ($self->{nc} == 0x003E) { # >
2924            $self->{state} = DATA_STATE;
2925            $self->{s_kwd} = '';
2926            !!!next-input-character;
2927            !!!emit ($self->{ct}); # pi
2928            redo A;
2929          } elsif ($self->{nc} == 0x003F) { # ?
2930            $self->{ct}->{data} .= '?';
2931            ## Stay in the state.
2932            !!!next-input-character;
2933            redo A;
2934          } else {
2935            $self->{ct}->{data} .= '?'; ## XML5: not appended
2936            $self->{state} = PI_DATA_STATE;
2937            ## Reprocess.
2938            redo A;
2939          }
2940            
2941      } else {      } else {
2942        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
2943      }      }

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.9

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24