/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.2 by wakaba, Sat Sep 8 01:31:44 2007 UTC | revision 1.7 by wakaba, Sat Sep 8 10:21:04 2007 UTC
# Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 } Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 }
17  sub NUMBER_DOT_STATE () { 13 }  sub NUMBER_DOT_STATE () { 13 }
18  sub NUMBER_DOT_NUMBER_STATE () { 14 }  sub NUMBER_DOT_NUMBER_STATE () { 14 }
19  sub DELIM_STATE () { 15 }  sub DELIM_STATE () { 15 }
20    sub URI_UNQUOTED_STATE () { 16 }
21    sub URI_AFTER_WSP_STATE () { 17 }
22    sub AFTER_AT_STATE () { 18 }
23    sub AFTER_AT_HYPHEN_STATE () { 19 }
24    
25  sub IDENT_TOKEN () { 1 }  sub IDENT_TOKEN () { 1 }
26  sub ATKEYWORD_TOKEN () { 2 }  sub ATKEYWORD_TOKEN () { 2 }
# Line 32  sub NUMBER_TOKEN () { 11 } Line 36  sub NUMBER_TOKEN () { 11 }
36  sub DIMENSION_TOKEN () { 12 }  sub DIMENSION_TOKEN () { 12 }
37  sub PERCENTAGE_TOKEN () { 13 }  sub PERCENTAGE_TOKEN () { 13 }
38  sub UNICODE_RANGE_TOKEN () { 14 }  sub UNICODE_RANGE_TOKEN () { 14 }
 sub UNICODE_RANGE_INVALID_TOKEN () { 15 }  
39  sub DELIM_TOKEN () { 16 }  sub DELIM_TOKEN () { 16 }
40  sub PLUS_TOKEN () { 17 }  sub PLUS_TOKEN () { 17 }
41  sub GREATER_TOKEN () { 18 }  sub GREATER_TOKEN () { 18 }
# Line 58  sub COMMENT_INVALID_TOKEN () { 37 } Line 61  sub COMMENT_INVALID_TOKEN () { 37 }
61  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
62    
63  our @TokenName = qw(  our @TokenName = qw(
64    0 IDENT ATKWTWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
65    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
66    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
67    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
68    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
69    COMMENT_INVALID EOF    COMMENT_INVALID EOF
# Line 76  sub init ($) { Line 79  sub init ($) {
79    my $self = shift;    my $self = shift;
80    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
81    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->();
82      #$self->{t} = {type => token-type, value => value, number => number};
83  } # init  } # init
84    
85  sub get_next_token ($) {  sub get_next_token ($) {
# Line 84  sub get_next_token ($) { Line 88  sub get_next_token ($) {
88      return shift @{$self->{token}};      return shift @{$self->{token}};
89    }    }
90    
   my $current_token;  
91    my $char;    my $char;
92    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
93    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
94    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
95          ## NOTE:
96          ##   0: in |ident|.
97          ##   1: in |URI| outside of |string|.
98          ##   0x0022: in |string1| or |invalid1|.
99          ##   0x0027: in |string2| or |invalid2|.
100    
101    A: {    A: {
102      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
103        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
104          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
105          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
106          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
107          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
108          redo A;          redo A;
109          } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
110            $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
111            $self->{c} = $self->{get_char}->();
112            if ($self->{c} == 0x002B) { # +
113              $self->{c} = $self->{get_char}->();
114              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
115                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
116                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
117                  $self->{c} == 0x003F) { # ?
118                $self->{t}->{value} .= '+' . chr $self->{c};
119                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
120                $self->{c} = $self->{get_char}->();
121                C: for (2..6) {
122                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
123                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
124                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
125                      $self->{c} == 0x003F) { # ?
126                    $self->{t}->{value} .= chr $self->{c};
127                    $self->{c} = $self->{get_char}->();
128                  } else {
129                    last C;
130                  }
131                } # C
132    
133                if ($self->{c} == 0x002D) { # -
134                  $self->{c} = $self->{get_char}->();
135                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
136                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
137                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
138                    $self->{t}->{value} .= '-' . chr $self->{c};
139                    $self->{c} = $self->{get_char}->();
140                    C: for (2..6) {
141                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
142                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
143                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
144                        $self->{t}->{value} .= chr $self->{c};
145                        $self->{c} = $self->{get_char}->();
146                      } else {
147                        last C;
148                      }
149                    } # C
150                    
151                    #
152                  } else {
153                    my $token = $self->{t};
154                    $self->{t} = {type => IDENT_TOKEN, value => '-'};
155                    $self->{state} = BEFORE_NMSTART_STATE;
156                    # reprocess
157                    return $token;
158                    #redo A;
159                  }
160                }
161    
162                $self->{state} = BEFORE_TOKEN_STATE;
163                # reprocess
164                return $self->{t};
165                #redo A;
166              } else {
167                unshift @{$self->{token}}, {type => PLUS_TOKEN};
168                $self->{state} = BEFORE_TOKEN_STATE;
169                # reprocess
170                return $self->{t};
171                #redo A;
172              }
173            } else {
174              $self->{state} = NAME_STATE;
175              # reprocess
176              redo A;
177            }
178        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
179                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
180                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
181                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
182          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
183          $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
184          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
185          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
186          redo A;          redo A;
187        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
188          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
189          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => ''};
190          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
191          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
192          redo A;          redo A;
193        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
194          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
195          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
196          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0022) { # "  
         ## NOTE: |"| in |string1| in |string| in |STRING|, or  
         ## |"| in |invalid1| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 1;  
197          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
198          redo A;          redo A;
199        } elsif ($self->{c} == 0x0027) { # '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
200          ## NOTE: |'| in |string2| in |string| in |STRING|, or          $self->{t} = {type => STRING_TOKEN, value => ''};
201          ## |'| in |invalid2| in |invalid| in |INVALID|.          $self->{state} = STRING_STATE; $q = $self->{c};
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
202          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
203          redo A;          redo A;
204        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
205          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
206          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => ''};
207          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
208          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
209          redo A;          redo A;
210        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
211          ## NOTE: |num|.          ## NOTE: |num|.
212          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
213          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
214          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
215          redo A;          redo A;
216        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
217          ## NOTE: |num|.          ## NOTE: |num|.
218          $current_token = {type => NUMBER_TOKEN, value => '0'};          $self->{t} = {type => NUMBER_TOKEN, value => '0'};
219          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
220          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
221          redo A;          redo A;
222          } elsif ($self->{c} == 0x002F) { # /
223            $self->{c} = $self->{get_char}->();
224            if ($self->{c} == 0x002A) { # *
225              C: {
226                $self->{c} = $self->{get_char}->();
227                if ($self->{c} == 0x002A) { # *
228                  D: {
229                    $self->{c} = $self->{get_char}->();
230                    if ($self->{c} == 0x002F) { # /
231                      #
232                    } elsif ($self->{c} == 0x002A) { # *
233                      redo D;
234                    } else {
235                      redo C;
236                    }
237                  } # D
238                } elsif ($self->{c} == -1) {
239                  # stay in the state
240                  # reprocess
241                  return {type => COMMENT_INVALID_TOKEN};
242                  #redo A;
243                } else {
244                  redo C;
245                }
246              } # C
247    
248              # stay in the state.
249              $self->{c} = $self->{get_char}->();
250              redo A;
251            } else {
252              # stay in the state.
253              # reprocess
254              return {type => DELIM_STATE, value => '/'};
255              #redo A;
256            }        
257        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
258          ## NOTE: |CDO|          ## NOTE: |CDO|
259          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 166  sub get_next_token ($) { Line 269  sub get_next_token ($) {
269              } else {              } else {
270                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
271                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
272                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
273                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
274                #reprocess                #reprocess
275                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<'};
# Line 286  sub get_next_token ($) { Line 389  sub get_next_token ($) {
389          #redo A;          #redo A;
390        } else {        } else {
391          # stay in the state          # stay in the state
392          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
393          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
394          return $current_token;          return $self->{t};
395          #redo A;          #redo A;
396        }        }
397      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
398        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
399          ## |FUNCTION|)
400        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
401            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
402            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
403            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
404          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
405          $current_token->{type} = DIMENSION_TOKEN          $self->{t}->{type} = DIMENSION_TOKEN
406              if $current_token->{type} == NUMBER_TOKEN;              if $self->{t}->{type} == NUMBER_TOKEN;
407          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
408          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
409          redo A;          redo A;
# Line 309  sub get_next_token ($) { Line 413  sub get_next_token ($) {
413          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
414          redo A;          redo A;
415        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D and # -
416                 $current_token->{type} == IDENT_TOKEN) {                 $self->{t}->{type} == IDENT_TOKEN) {
417          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
418          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
419            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
# Line 318  sub get_next_token ($) { Line 422  sub get_next_token ($) {
422            #redo A;            #redo A;
423          } else {          } else {
424            ## NOTE: |-|, |-|, $self->{c}            ## NOTE: |-|, |-|, $self->{c}
425            #$current_token = {type => IDENT_TOKEN, value => '-'};            #$self->{t} = {type => IDENT_TOKEN, value => '-'};
426            # stay in the state            # stay in the state
427            # reconsume            # reconsume
428            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '-'};
429            #redo A;            #redo A;
430          }          }
431        } else {        } else {
432          if ($current_token->{type} == NUMBER_TOKEN) {          if ($self->{t}->{type} == NUMBER_TOKEN) {
           ## NOTE: |-| after |num|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {  
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == NUMBER_TOKEN) {  
433            ## NOTE: |-| after |NUMBER|.            ## NOTE: |-| after |NUMBER|.
434            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
435            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
436            # reconsume            # reprocess
437            $current_token->{value} = $current_token->{number};            $self->{t}->{value} = $self->{t}->{number};
438            delete $current_token->{number};            delete $self->{t}->{number};
439            return $current_token;            return $self->{t};
440          } else {          } else {
441            ## NOTE: |-| not followed by |nmstart|.            ## NOTE: |-| not followed by |nmstart|.
442            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
443            $self->{c} = $self->{get_char}->();            # reprocess
444            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '-'};
445          }          }
446        }        }
447        } elsif ($self->{state} == AFTER_AT_STATE) {
448          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
449              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
450              $self->{c} == 0x005F or # _
451              $self->{c} > 0x007F) { # nonascii
452            $self->{t}->{value} .= chr $self->{c};
453            $self->{state} = NAME_STATE;
454            $self->{c} = $self->{get_char}->();
455            redo A;
456          } elsif ($self->{c} == 0x002D) { # -
457            $self->{t}->{value} .= '-';
458            $self->{state} = AFTER_AT_HYPHEN_STATE;
459            $self->{c} = $self->{get_char}->();
460            redo A;
461          } elsif ($self->{c} == 0x005C) { # \
462            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
463            $self->{c} = $self->{get_char}->();
464            redo A;
465          } else {
466            $self->{state} = BEFORE_TOKEN_STATE;
467            # reprocess
468            return {type => DELIM_TOKEN, value => '@'};
469          }
470        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
471          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
472              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
473              $self->{c} == 0x005F or # _
474              $self->{c} > 0x007F) { # nonascii
475            $self->{t}->{value} .= chr $self->{c};
476            $self->{state} = NAME_STATE;
477            $self->{c} = $self->{get_char}->();
478            redo A;
479          } elsif ($self->{c} == 0x002D) { # -
480            $self->{c} = $self->{get_char}->();
481            if ($self->{c} == 0x003E) { # >
482              unshift @{$self->{token}}, {type => CDC_TOKEN};
483              $self->{state} = BEFORE_TOKEN_STATE;
484              $self->{c} = $self->{get_char}->();
485              return {type => DELIM_TOKEN, value => '@'};
486              #redo A;
487            } else {
488              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
489              $self->{t} = {type => IDENT_TOKEN, value => '-'};
490              $self->{state} = BEFORE_NMSTART_STATE;
491              # reprocess
492              return {type => DELIM_TOKEN, value => '@'};
493              #redo A;
494            }
495          } elsif ($self->{c} == 0x005C) { # \
496            ## TODO: @-\{nl}
497            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
498            $self->{c} = $self->{get_char}->();
499            redo A;
500          } else {
501            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
502            $self->{state} = BEFORE_TOKEN_STATE;
503            # reprocess
504            return {type => DELIM_TOKEN, value => '@'};
505          }
506      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
507        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
508          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
509          $current_token->{value} = '-';          $self->{t}->{value} = '-';
510          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
511          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
512          redo A;          redo A;
# Line 364  sub get_next_token ($) { Line 515  sub get_next_token ($) {
515                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
516                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
517          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
518          $current_token->{value} = chr $self->{c};          $self->{t}->{value} = chr $self->{c};
519          $current_token->{type} = DIMENSION_TOKEN;          $self->{t}->{type} = DIMENSION_TOKEN;
520          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
521          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
522          redo A;          redo A;
523        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
524          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
525          $current_token->{value} = '';          $self->{t}->{value} = '';
526          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
527          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
528          redo A;          redo A;
529        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
530          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
531          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
532          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
533          return $current_token;          return $self->{t};
534          #redo A;          #redo A;
535        } else {        } else {
536          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
537          # reprocess          # reprocess
538          return $current_token;          return $self->{t};
539          #redo A;          #redo A;
540        }        }
541      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
# Line 395  sub get_next_token ($) { Line 546  sub get_next_token ($) {
546            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
547            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
548            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
549          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
550          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
551          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
552          redo A;          redo A;
# Line 417  sub get_next_token ($) { Line 568  sub get_next_token ($) {
568            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
569            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
570            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
571          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
572          # stay in the state          # stay in the state
573          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
574          redo A;          redo A;
575        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
576          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
577          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
578          redo A;          redo A;
579        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
580                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
581          if (not $current_token->{has_escape} and          my $func_name = $self->{t}->{value};
582              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
583               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
584              ->{$current_token->{value}}) {            if ($self->{t}->{has_escape}) {
585            $current_token->{type} = URI_TOKEN;              ## TODO: warn
586              }
587              $self->{t}->{type}
588                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
589              $self->{t}->{value} = '';
590            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
591            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
592            redo A;            redo A;
593          } else {          } else {
594            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
595            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
596            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
597            return $current_token;            return $self->{t};
598            #redo A;            #redo A;
599          }          }
600        } else {        } else {
601          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
602          # reconsume          # reconsume
603          return $current_token;          return $self->{t};
604            #redo A;
605          }
606        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
607          while ({
608                    0x0020 => 1, # SP
609                    0x0009 => 1, # \t
610                    0x000D => 1, # \r
611                    0x000A => 1, # \n
612                    0x000C => 1, # \f
613                 }->{$self->{c}}) {
614            $self->{c} = $self->{get_char}->();
615          }
616          if ($self->{c} == -1) {
617            $self->{t}->{type} = {
618                URI_TOKEN, URI_INVALID_TOKEN,
619                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
620                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
621                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
622            }->{$self->{t}->{type}};        
623            $self->{state} = BEFORE_TOKEN_STATE;
624            $self->{c} = $self->{get_char}->();
625            return $self->{t};
626            #redo A;
627          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
628            ## TODO: Should we consider matches of "(" and ")"?
629            $self->{t}->{type} = {
630                URI_TOKEN, URI_INVALID_TOKEN,
631                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
632                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
633                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
634            }->{$self->{t}->{type}};
635            $self->{state} = URI_UNQUOTED_STATE;
636            $self->{c} = $self->{get_char}->();
637            redo A;
638          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
639            $self->{state} = STRING_STATE; $q = $self->{c};
640            $self->{c} = $self->{get_char}->();
641            redo A;
642          } elsif ($self->{c} == 0x0029) { # )
643            $self->{state} = BEFORE_TOKEN_STATE;
644            $self->{c} = $self->{get_char}->();
645            return $self->{t};
646            #redo A;
647          } elsif ($self->{c} == 0x005C) { # \
648            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
649            $self->{c} = $self->{get_char}->();
650            redo A;
651          } else {
652            $self->{t}->{value} .= chr $self->{c};
653            $self->{state} = URI_UNQUOTED_STATE;
654            $self->{c} = $self->{get_char}->();
655            redo A;
656          }
657        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
658          if ({
659               0x0020 => 1, # SP
660               0x0009 => 1, # \t
661               0x000D => 1, # \r
662               0x000A => 1, # \n
663               0x000C => 1, # \f
664              }->{$self->{c}}) {
665            $self->{state} = URI_AFTER_WSP_STATE;
666            $self->{c} = $self->{get_char}->();
667            redo A;
668          } elsif ($self->{c} == -1) {
669            $self->{t}->{type} = {
670                URI_TOKEN, URI_INVALID_TOKEN,
671                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
672                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
673                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
674            }->{$self->{t}->{type}};        
675            $self->{state} = BEFORE_TOKEN_STATE;
676            $self->{c} = $self->{get_char}->();
677            return $self->{t};
678            #redo A;
679          } elsif ($self->{c} < 0x0020 or {
680              0x0022 => 1, # "
681              0x0027 => 1, # '
682              0x0028 => 1, # (
683          }->{$self->{c}}) { # C0 or (
684            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
685            $self->{t}->{type} = {
686                URI_TOKEN, URI_INVALID_TOKEN,
687                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690            }->{$self->{t}->{type}};
691            # stay in the state.
692            $self->{c} = $self->{get_char}->();
693            redo A;
694          } elsif ($self->{c} == 0x0029) { # )
695            $self->{state} = BEFORE_TOKEN_STATE;
696            $self->{c} = $self->{get_char}->();
697            return $self->{t};
698            #redo A;
699          } elsif ($self->{c} == 0x005C) { # \
700            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
701            $self->{c} = $self->{get_char}->();
702            redo A;
703          } else {
704            $self->{t}->{value} .= chr $self->{c};
705            # stay in the state.
706            $self->{c} = $self->{get_char}->();
707            redo A;
708          }
709        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
710          if ({
711               0x0020 => 1, # SP
712               0x0009 => 1, # \t
713               0x000D => 1, # \r
714               0x000A => 1, # \n
715               0x000C => 1, # \f
716              }->{$self->{c}}) {
717            # stay in the state.
718            $self->{c} = $self->{get_char}->();
719            redo A;
720          } elsif ($self->{c} == -1) {
721            $self->{t}->{type} = {
722                URI_TOKEN, URI_INVALID_TOKEN,
723                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
724                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
725                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
726            }->{$self->{t}->{type}};        
727            $self->{state} = BEFORE_TOKEN_STATE;
728            $self->{c} = $self->{get_char}->();
729            return $self->{t};
730            #redo A;
731          } elsif ($self->{c} == 0x0029) { # )
732            $self->{state} = BEFORE_TOKEN_STATE;
733            $self->{c} = $self->{get_char}->();
734            return $self->{t};
735          #redo A;          #redo A;
736          } elsif ($self->{c} == 0x005C) { # \
737            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
738            $self->{c} = $self->{get_char}->();
739            redo A;
740          } else {
741            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
742            $self->{t}->{type} = {
743                URI_TOKEN, URI_INVALID_TOKEN,
744                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
745                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
746                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
747            }->{$self->{t}->{type}};
748            # stay in the state.
749            $self->{c} = $self->{get_char}->();
750            redo A;
751        }        }
752      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
753        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
754        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
755          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
756          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
# Line 473  sub get_next_token ($) { Line 765  sub get_next_token ($) {
765          redo A;          redo A;
766        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
767          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
768          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
769          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
770          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
771          redo A;          redo A;
772        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
773                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
774          if ($q == 0) {          if ($q == 0) {
775            ## NOTE: In |escape| in ... in |ident|.            #
776            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
777            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
778            return $current_token;            $self->{t}->{type} = {
779            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
780            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
781                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
782                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
783              }->{$self->{t}->{type}};
784              $self->{t}->{value} .= chr $self->{c};
785              $self->{state} = URI_UNQUOTED_STATE;
786              $self->{c} = $self->{get_char}->();
787              redo A;
788          } else {          } else {
789            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
790            $current_token->{value} .= chr $self->{c};            $self->{t}->{value} .= chr $self->{c};
791            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
792            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
793            redo A;            redo A;
794          }          }
795        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
796          if ($q == 0) {          if ($q == 0) {
797            ## NOTE: In |escape| in ... in |ident|.            #
798            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
799            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
800            return $current_token;            $self->{t}->{type} = {
801            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
802            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
803                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
804                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
805              }->{$self->{t}->{type}};
806              $self->{t}->{value} .= "\x0D\x0A";
807              $self->{state} = URI_UNQUOTED_STATE;
808              $self->{c} = $self->{get_char}->();
809              redo A;
810          } else {          } else {
811            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
812            $current_token->{value} .= "\x0D\x0A";            $self->{t}->{value} .= "\x0D\x0A";
813            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
814            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
815            redo A;            redo A;
816          }          }
817          } elsif ($self->{c} == -1) {
818            #
819        } else {        } else {
820          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
821          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
822          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
823                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
824            $self->{c} = $self->{get_char}->();
825            redo A;
826          }
827    
828          if ($q == 0) {
829            $self->{state} = BEFORE_TOKEN_STATE;
830            # reprocess
831            if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
832              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
833              return {type => DELIM_TOKEN, value => '-'};
834              #redo A;
835            } elsif (length $self->{t}->{value}) {
836              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
837              return $self->{t};
838              #redo A;
839            } else {
840              return {type => DELIM_TOKEN, value => '\\'};
841              #redo A;
842            }
843          } else {
844            $self->{state} = $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
845          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
846          redo A;          redo A;
847        }        }
# Line 528  sub get_next_token ($) { Line 858  sub get_next_token ($) {
858          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
859          redo A;          redo A;
860        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
861          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
862          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
863          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
864          redo A;          redo A;
# Line 536  sub get_next_token ($) { Line 866  sub get_next_token ($) {
866                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
867                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
868                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
869          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
870          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
871                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
872          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
873          redo A;          redo A;
874        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 545  sub get_next_token ($) { Line 876  sub get_next_token ($) {
876          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
877          redo A;          redo A;
878        } else {        } else {
879          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
880          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
881                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
882          # reconsume          # reconsume
883          redo A;          redo A;
884        }        }
# Line 556  sub get_next_token ($) { Line 888  sub get_next_token ($) {
888            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
889            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
890            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
891          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
892          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
893                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
894          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
895          redo A;          redo A;
896        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 565  sub get_next_token ($) { Line 898  sub get_next_token ($) {
898          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
899          redo A;          redo A;
900        } else {        } else {
901          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
902          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
903                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
904          # reconsume          # reconsume
905          redo A;          redo A;
906        }        }
907      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
908        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
909        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
910          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
911          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
912                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
913          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
914          redo A;          redo A;
915        } else {        } else {
916          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
917          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
918                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
919          # reconsume          # reconsume
920          redo A;          redo A;
921        }        }
# Line 587  sub get_next_token ($) { Line 923  sub get_next_token ($) {
923        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
924        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
925        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
926          ## Or, in |URI|.
927        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
928          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
929          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
930          redo A;          redo A;
931        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
932          $self->{state} = BEFORE_TOKEN_STATE;          if ($self->{t}->{type} == STRING_TOKEN) {
933          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
934          return $current_token;            $self->{c} = $self->{get_char}->();
935          #redo A;            return $self->{t};
936              #redo A;
937            } else {
938              $self->{state} = URI_AFTER_WSP_STATE;
939              $self->{c} = $self->{get_char}->();
940              redo A;
941            }
942        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
943                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
944                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
945                 $self->{c} == -1) {                 $self->{c} == -1) {
946          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = INVALID_TOKEN;
947          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
948          # reconsume          # reconsume
949          return $current_token;          return $self->{t};
950          #redo A;          #redo A;
951        } else {        } else {
952          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
953          # stay in the state          # stay in the state
954          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
955          redo A;          redo A;
# Line 614  sub get_next_token ($) { Line 957  sub get_next_token ($) {
957      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
958        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
959        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
960          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
961          # stay in the state          # stay in the state
962          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
963          redo A;          redo A;
# Line 623  sub get_next_token ($) { Line 966  sub get_next_token ($) {
966          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
967          redo A;          redo A;
968        } else {        } else {
969          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
970          $current_token->{value} = '';          $self->{t}->{value} = '';
971          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
972          # reprocess          # reprocess
973          redo A;          redo A;
# Line 632  sub get_next_token ($) { Line 975  sub get_next_token ($) {
975      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
976        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
977        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
978          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
979          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
980          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
981          redo A;          redo A;
982        } else {        } else {
983          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};
984          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
985          $current_token->{value} = '';          $self->{t}->{value} = '';
986          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
987          # reprocess          # reprocess
988          return $current_token;          return $self->{t};
989          #redo A;          #redo A;
990        }        }
991      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
992        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
993        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
994          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
995          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
996          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
997          redo A;          redo A;
# Line 661  sub get_next_token ($) { Line 1004  sub get_next_token ($) {
1004      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1005        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1006        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1007          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1008          # stay in the state          # stay in the state
1009          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1010          redo A;          redo A;
1011        } else {        } else {
1012          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1013          $current_token->{value} = '';          $self->{t}->{value} = '';
1014          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1015          # reprocess          # reprocess
1016          redo A;          redo A;
# Line 676  sub get_next_token ($) { Line 1019  sub get_next_token ($) {
1019        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1020      }      }
1021    } # A    } # A
   
   ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|  
   
1022  } # get_next_token  } # get_next_token
1023    
1024  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.7

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24