/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.2 by wakaba, Sat Sep 8 01:31:44 2007 UTC | revision 1.4 by wakaba, Sat Sep 8 02:58:24 2007 UTC
# Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 } Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 }
17  sub NUMBER_DOT_STATE () { 13 }  sub NUMBER_DOT_STATE () { 13 }
18  sub NUMBER_DOT_NUMBER_STATE () { 14 }  sub NUMBER_DOT_NUMBER_STATE () { 14 }
19  sub DELIM_STATE () { 15 }  sub DELIM_STATE () { 15 }
20    sub URI_UNQUOTED_STATE () { 16 }
21    sub URI_AFTER_WSP_STATE () { 17 }
22    sub AFTER_AT_STATE () { 18 }
23    sub AFTER_AT_HYPHEN_STATE () { 19 }
24    
25  sub IDENT_TOKEN () { 1 }  sub IDENT_TOKEN () { 1 }
26  sub ATKEYWORD_TOKEN () { 2 }  sub ATKEYWORD_TOKEN () { 2 }
# Line 58  sub COMMENT_INVALID_TOKEN () { 37 } Line 62  sub COMMENT_INVALID_TOKEN () { 37 }
62  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
63    
64  our @TokenName = qw(  our @TokenName = qw(
65    0 IDENT ATKWTWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
66    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
67    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH
68    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
# Line 88  sub get_next_token ($) { Line 92  sub get_next_token ($) {
92    my $char;    my $char;
93    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
94    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
95    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
96          ## NOTE:
97          ##   0: in |ident|.
98          ##   1: in |URI| outside of |string|.
99          ##   0x0022: in |string1| or |invalid1|.
100          ##   0x0027: in |string2| or |invalid2|.
101    
102    A: {    A: {
103      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
# Line 116  sub get_next_token ($) { Line 125  sub get_next_token ($) {
125        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
126          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
127          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $current_token = {type => ATKEYWORD_TOKEN, value => ''};
128          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
129          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
130          redo A;          redo A;
131        } elsif ($self->{c} == 0x0022) { # "        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
         ## NOTE: |"| in |string1| in |string| in |STRING|, or  
         ## |"| in |invalid1| in |invalid| in |INVALID|.  
132          $current_token = {type => STRING_TOKEN, value => ''};          $current_token = {type => STRING_TOKEN, value => ''};
133          $self->{state} = STRING_STATE; $q = 1;          $self->{state} = STRING_STATE; $q = $self->{c};
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0027) { # '  
         ## NOTE: |'| in |string2| in |string| in |STRING|, or  
         ## |'| in |invalid2| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
134          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
135          redo A;          redo A;
136        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
# Line 151  sub get_next_token ($) { Line 151  sub get_next_token ($) {
151          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
152          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
153          redo A;          redo A;
154          } elsif ($self->{c} == 0x002F) { # /
155            $self->{c} = $self->{get_char}->();
156            if ($self->{c} == 0x002A) { # *
157              C: {
158                $self->{c} = $self->{get_char}->();
159                if ($self->{c} == 0x002A) { # *
160                  D: {
161                    $self->{c} = $self->{get_char}->();
162                    if ($self->{c} == 0x002F) { # /
163                      #
164                    } elsif ($self->{c} == 0x002A) { # *
165                      redo D;
166                    } else {
167                      redo C;
168                    }
169                  } # D
170                } elsif ($self->{c} == -1) {
171                  # stay in the state
172                  # reprocess
173                  return {type => COMMENT_INVALID_TOKEN};
174                  #redo A;
175                } else {
176                  redo C;
177                }
178              } # C
179    
180              # stay in the state.
181              $self->{c} = $self->{get_char}->();
182              redo A;
183            } else {
184              # stay in the state.
185              # reprocess
186              return {type => DELIM_STATE, value => '/'};
187              #redo A;
188            }        
189        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
190          ## NOTE: |CDO|          ## NOTE: |CDO|
191          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 292  sub get_next_token ($) { Line 327  sub get_next_token ($) {
327          #redo A;          #redo A;
328        }        }
329      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
330        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
331          ## |FUNCTION|)
332        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
333            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
334            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
# Line 326  sub get_next_token ($) { Line 362  sub get_next_token ($) {
362          }          }
363        } else {        } else {
364          if ($current_token->{type} == NUMBER_TOKEN) {          if ($current_token->{type} == NUMBER_TOKEN) {
           ## NOTE: |-| after |num|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {  
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == NUMBER_TOKEN) {  
365            ## NOTE: |-| after |NUMBER|.            ## NOTE: |-| after |NUMBER|.
366            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
367            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
# Line 352  sub get_next_token ($) { Line 376  sub get_next_token ($) {
376            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '-'};
377          }          }
378        }        }
379        } elsif ($self->{state} == AFTER_AT_STATE) {
380          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
381              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
382              $self->{c} == 0x005F or # _
383              $self->{c} > 0x007F) { # nonascii
384            $current_token->{value} .= chr $self->{c};
385            $self->{state} = NAME_STATE;
386            $self->{c} = $self->{get_char}->();
387            redo A;
388          } elsif ($self->{c} == 0x002D) { # -
389            $current_token->{value} .= '-';
390            $self->{state} = AFTER_AT_HYPHEN_STATE;
391            $self->{c} = $self->{get_char}->();
392            redo A;
393          } elsif ($self->{c} == 0x005C) { # \
394            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
395            $self->{c} = $self->{get_char}->();
396            redo A;
397          } else {
398            $self->{state} = BEFORE_TOKEN_STATE;
399            # reprocess
400            return {type => DELIM_TOKEN, value => '@'};
401          }
402        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
403          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
404              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
405              $self->{c} == 0x005F or # _
406              $self->{c} > 0x007F) { # nonascii
407            $current_token->{value} .= chr $self->{c};
408            $self->{state} = NAME_STATE;
409            $self->{c} = $self->{get_char}->();
410            redo A;
411          } elsif ($self->{c} == 0x002D) { # -
412            $self->{c} = $self->{get_char}->();
413            if ($self->{c} == 0x003E) { # >
414              unshift @{$self->{token}}, {type => CDC_TOKEN};
415              $self->{state} = BEFORE_TOKEN_STATE;
416              $self->{c} = $self->{get_char}->();
417              return {type => DELIM_TOKEN, value => '@'};
418              #redo A;
419            } else {
420              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
421              $current_token = {type => IDENT_TOKEN, value => '-'};
422              $self->{state} = BEFORE_NMSTART_STATE;
423              # reprocess
424              return {type => DELIM_TOKEN, value => '@'};
425              #redo A;
426            }
427          } elsif ($self->{c} == 0x005C) { # \
428            ## TODO: @-\{nl}
429            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
430            $self->{c} = $self->{get_char}->();
431            redo A;
432          } else {
433            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
434            $self->{state} = BEFORE_TOKEN_STATE;
435            # reprocess
436            return {type => DELIM_TOKEN, value => '@'};
437          }
438      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
439        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
440          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
# Line 422  sub get_next_token ($) { Line 505  sub get_next_token ($) {
505          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
506          redo A;          redo A;
507        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
508          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
509          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
510          redo A;          redo A;
511        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
512                 $current_token->{type} == IDENT_TOKEN) { # (                 $current_token->{type} == IDENT_TOKEN) { # (
513          if (not $current_token->{has_escape} and          my $func_name = $current_token->{value};
514              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
515               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
516              ->{$current_token->{value}}) {            if ($current_token->{has_escape}) {
517            $current_token->{type} = URI_TOKEN;              ## TODO: warn
518              }
519              $current_token->{type}
520                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
521              $current_token->{value} = '';
522            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
523            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
524            redo A;            redo A;
525          } else {          } else {
526            $current_token->{type} = FUNCTION_TOKEN;            $current_token->{type} = FUNCTION_TOKEN;
# Line 457  sub get_next_token ($) { Line 535  sub get_next_token ($) {
535          return $current_token;          return $current_token;
536          #redo A;          #redo A;
537        }        }
538        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
539          while ({
540                    0x0020 => 1, # SP
541                    0x0009 => 1, # \t
542                    0x000D => 1, # \r
543                    0x000A => 1, # \n
544                    0x000C => 1, # \f
545                 }->{$self->{c}}) {
546            $self->{c} = $self->{get_char}->();
547          }
548          if ($self->{c} == -1) {
549            $current_token->{type} = {
550                URI_TOKEN, URI_INVALID_TOKEN,
551                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
552                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
553                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
554            }->{$current_token->{type}};        
555            $self->{state} = BEFORE_TOKEN_STATE;
556            $self->{c} = $self->{get_char}->();
557            return $current_token;
558            #redo A;
559          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
560            ## TODO: Should we consider matches of "(" and ")"?
561            $current_token->{type} = {
562                URI_TOKEN, URI_INVALID_TOKEN,
563                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
564                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
565                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
566            }->{$current_token->{type}};
567            $self->{state} = URI_UNQUOTED_STATE;
568            $self->{c} = $self->{get_char}->();
569            redo A;
570          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
571            $self->{state} = STRING_STATE; $q = $self->{c};
572            $self->{c} = $self->{get_char}->();
573            redo A;
574          } elsif ($self->{c} == 0x0029) { # )
575            $self->{state} = BEFORE_TOKEN_STATE;
576            $self->{c} = $self->{get_char}->();
577            return $current_token;
578            #redo A;
579          } elsif ($self->{c} == 0x005C) { # \
580            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
581            $self->{c} = $self->{get_char}->();
582            redo A;
583          } else {
584            $current_token->{value} .= chr $self->{c};
585            $self->{state} = URI_UNQUOTED_STATE;
586            $self->{c} = $self->{get_char}->();
587            redo A;
588          }
589        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
590          if ({
591               0x0020 => 1, # SP
592               0x0009 => 1, # \t
593               0x000D => 1, # \r
594               0x000A => 1, # \n
595               0x000C => 1, # \f
596              }->{$self->{c}}) {
597            $self->{state} = URI_AFTER_WSP_STATE;
598            $self->{c} = $self->{get_char}->();
599            redo A;
600          } elsif ($self->{c} == -1) {
601            $current_token->{type} = {
602                URI_TOKEN, URI_INVALID_TOKEN,
603                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
604                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
605                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
606            }->{$current_token->{type}};        
607            $self->{state} = BEFORE_TOKEN_STATE;
608            $self->{c} = $self->{get_char}->();
609            return $current_token;
610            #redo A;
611          } elsif ($self->{c} < 0x0020 or {
612              0x0022 => 1, # "
613              0x0027 => 1, # '
614              0x0028 => 1, # (
615          }->{$self->{c}}) { # C0 or (
616            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
617            $current_token->{type} = {
618                URI_TOKEN, URI_INVALID_TOKEN,
619                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
620                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
621                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
622            }->{$current_token->{type}};
623            # stay in the state.
624            $self->{c} = $self->{get_char}->();
625            redo A;
626          } elsif ($self->{c} == 0x0029) { # )
627            $self->{state} = BEFORE_TOKEN_STATE;
628            $self->{c} = $self->{get_char}->();
629            return $current_token;
630            #redo A;
631          } elsif ($self->{c} == 0x005C) { # \
632            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
633            $self->{c} = $self->{get_char}->();
634            redo A;
635          } else {
636            $current_token->{value} .= chr $self->{c};
637            # stay in the state.
638            $self->{c} = $self->{get_char}->();
639            redo A;
640          }
641        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
642          if ({
643               0x0020 => 1, # SP
644               0x0009 => 1, # \t
645               0x000D => 1, # \r
646               0x000A => 1, # \n
647               0x000C => 1, # \f
648              }->{$self->{c}}) {
649            # stay in the state.
650            $self->{c} = $self->{get_char}->();
651            redo A;
652          } elsif ($self->{c} == -1) {
653            $current_token->{type} = {
654                URI_TOKEN, URI_INVALID_TOKEN,
655                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
656                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
657                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
658            }->{$current_token->{type}};        
659            $self->{state} = BEFORE_TOKEN_STATE;
660            $self->{c} = $self->{get_char}->();
661            return $current_token;
662            #redo A;
663          } elsif ($self->{c} == 0x0029) { # )
664            $self->{state} = BEFORE_TOKEN_STATE;
665            $self->{c} = $self->{get_char}->();
666            return $current_token;
667            #redo A;
668          } elsif ($self->{c} == 0x005C) { # \
669            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
670            $self->{c} = $self->{get_char}->();
671            redo A;
672          } else {
673            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
674            $current_token->{type} = {
675                URI_TOKEN, URI_INVALID_TOKEN,
676                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
677                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
678                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
679            }->{$current_token->{type}};
680            # stay in the state.
681            $self->{c} = $self->{get_char}->();
682            redo A;
683          }
684      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
685        $current_token->{has_escape} = 1;        $current_token->{has_escape} = 1;
686        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
# Line 486  sub get_next_token ($) { Line 710  sub get_next_token ($) {
710            return $current_token;            return $current_token;
711            # reconsume            # reconsume
712            #redo A;            #redo A;
713            } elsif ($q == 1) {
714              ## NOTE: In |escape| in |URI|.
715              $current_token->{type} = {
716                  URI_TOKEN, URI_INVALID_TOKEN,
717                  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
718                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
719                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
720              }->{$current_token->{type}};
721              $current_token->{value} .= chr $self->{c};
722              $self->{state} = URI_UNQUOTED_STATE;
723              $self->{c} = $self->{get_char}->();
724              redo A;
725          } else {          } else {
726            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
727            $current_token->{value} .= chr $self->{c};            $current_token->{value} .= chr $self->{c};
# Line 501  sub get_next_token ($) { Line 737  sub get_next_token ($) {
737            return $current_token;            return $current_token;
738            # reconsume            # reconsume
739            #redo A;            #redo A;
740            } elsif ($q == 1) {
741              $current_token->{type} = {
742                  URI_TOKEN, URI_INVALID_TOKEN,
743                  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
744                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
745                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
746              }->{$current_token->{type}};
747              $current_token->{value} .= "\x0D\x0A";
748              $self->{state} = URI_UNQUOTED_STATE;
749              $self->{c} = $self->{get_char}->();
750              redo A;
751          } else {          } else {
752            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
753            $current_token->{value} .= "\x0D\x0A";            $current_token->{value} .= "\x0D\x0A";
# Line 511  sub get_next_token ($) { Line 758  sub get_next_token ($) {
758        } else {        } else {
759          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
760          $current_token->{value} .= chr $self->{c};          $current_token->{value} .= chr $self->{c};
761          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
762                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
763          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
764          redo A;          redo A;
765        }        }
# Line 537  sub get_next_token ($) { Line 785  sub get_next_token ($) {
785                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
786                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
787          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
788          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
789                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
790          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
791          redo A;          redo A;
792        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 546  sub get_next_token ($) { Line 795  sub get_next_token ($) {
795          redo A;          redo A;
796        } else {        } else {
797          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
798          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
799                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
800          # reconsume          # reconsume
801          redo A;          redo A;
802        }        }
# Line 557  sub get_next_token ($) { Line 807  sub get_next_token ($) {
807            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
808            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
809          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
810          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
811                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
812          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
813          redo A;          redo A;
814        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 566  sub get_next_token ($) { Line 817  sub get_next_token ($) {
817          redo A;          redo A;
818        } else {        } else {
819          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
820          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
821                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
822          # reconsume          # reconsume
823          redo A;          redo A;
824        }        }
# Line 574  sub get_next_token ($) { Line 826  sub get_next_token ($) {
826        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
827        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
828          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
829          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
830                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
831          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
832          redo A;          redo A;
833        } else {        } else {
834          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
835          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
836                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
837          # reconsume          # reconsume
838          redo A;          redo A;
839        }        }
# Line 587  sub get_next_token ($) { Line 841  sub get_next_token ($) {
841        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
842        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
843        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
844          ## Or, in |URI|.
845        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
846          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
847          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
848          redo A;          redo A;
849        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
850          $self->{state} = BEFORE_TOKEN_STATE;          if ($current_token->{type} == STRING_TOKEN) {
851          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
852          return $current_token;            $self->{c} = $self->{get_char}->();
853          #redo A;            return $current_token;
854              #redo A;
855            } else {
856              $self->{state} = URI_AFTER_WSP_STATE;
857              $self->{c} = $self->{get_char}->();
858              redo A;
859            }
860        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
861                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
862                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.4

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24