/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.14 - (show annotations) (download)
Sat Sep 22 12:16:33 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.13: +19 -1 lines
++ whatpm/Whatpm/CSS/ChangeLog	22 Sep 2007 12:16:08 -0000
2007-09-22  Wakaba  <wakaba@suika.fam.cx>

	* SelectorsParser.pm, SelectorsSerializer.pm: New Perl modules.

	* Tokenizer.pm: Token type constants are exportable.

1 package Whatpm::CSS::Tokenizer;
2 use strict;
3
4 require Exporter;
5 push our @ISA, 'Exporter';
6
## Tokenizer state identifiers.
## Each is a constant sub (empty prototype) returning a distinct small
## integer.  The current state is kept in |$self->{state}|: |init| sets it
## to |BEFORE_TOKEN_STATE| and |get_next_token| dispatches on it with |==|.
## These state constants are not listed in |@EXPORT_OK|.
7 sub BEFORE_TOKEN_STATE () { 0 }
8 sub BEFORE_NMSTART_STATE () { 1 }
9 sub NAME_STATE () { 2 }
10 sub ESCAPE_OPEN_STATE () { 3 }
11 sub STRING_STATE () { 4 }
12 sub HASH_OPEN_STATE () { 5 }
13 sub NUMBER_STATE () { 6 }
14 sub NUMBER_FRACTION_STATE () { 7 }
15 sub AFTER_NUMBER_STATE () { 8 }
16 sub URI_BEFORE_WSP_STATE () { 9 }
17 sub ESCAPE_STATE () { 10 }
18 sub ESCAPE_BEFORE_LF_STATE () { 11 }
19 sub ESCAPE_BEFORE_NL_STATE () { 12 }
20 sub NUMBER_DOT_STATE () { 13 }
21 sub NUMBER_DOT_NUMBER_STATE () { 14 }
22 sub DELIM_STATE () { 15 }
23 sub URI_UNQUOTED_STATE () { 16 }
24 sub URI_AFTER_WSP_STATE () { 17 }
25 sub AFTER_AT_STATE () { 18 }
26 sub AFTER_AT_HYPHEN_STATE () { 19 }
27
## Token type identifiers.
## Each is a constant sub (empty prototype) returning a distinct integer
## that |get_next_token| stores in the |type| field of the token hashes it
## returns.  The same integer is the index of the token's name in
## |@TokenName|.
## NOTE: The numbering starts at 1 and skips 15 (|UNICODE_RANGE_TOKEN| is
## 14, |DELIM_TOKEN| is 16); |@TokenName| has placeholder entries at those
## two indices, so keep both tables in sync when adding a token type.
28 sub IDENT_TOKEN () { 1 }
29 sub ATKEYWORD_TOKEN () { 2 }
30 sub HASH_TOKEN () { 3 }
31 sub FUNCTION_TOKEN () { 4 }
32 sub URI_TOKEN () { 5 }
33 sub URI_INVALID_TOKEN () { 6 }
34 sub URI_PREFIX_TOKEN () { 7 }
35 sub URI_PREFIX_INVALID_TOKEN () { 8 }
36 sub STRING_TOKEN () { 9 }
37 sub INVALID_TOKEN () { 10 }
38 sub NUMBER_TOKEN () { 11 }
39 sub DIMENSION_TOKEN () { 12 }
40 sub PERCENTAGE_TOKEN () { 13 }
41 sub UNICODE_RANGE_TOKEN () { 14 }
42 sub DELIM_TOKEN () { 16 }
43 sub PLUS_TOKEN () { 17 }
44 sub GREATER_TOKEN () { 18 }
45 sub COMMA_TOKEN () { 19 }
46 sub TILDE_TOKEN () { 20 }
47 sub DASHMATCH_TOKEN () { 21 }
48 sub PREFIXMATCH_TOKEN () { 22 }
49 sub SUFFIXMATCH_TOKEN () { 23 }
50 sub SUBSTRINGMATCH_TOKEN () { 24 }
51 sub INCLUDES_TOKEN () { 25 }
52 sub SEMICOLON_TOKEN () { 26 }
53 sub LBRACE_TOKEN () { 27 }
54 sub RBRACE_TOKEN () { 28 }
55 sub LPAREN_TOKEN () { 29 }
56 sub RPAREN_TOKEN () { 30 }
57 sub LBRACKET_TOKEN () { 31 }
58 sub RBRACKET_TOKEN () { 32 }
59 sub S_TOKEN () { 33 }
60 sub CDO_TOKEN () { 34 }
61 sub CDC_TOKEN () { 35 }
62 sub COMMENT_TOKEN () { 36 }
63 sub COMMENT_INVALID_TOKEN () { 37 }
64 sub EOF_TOKEN () { 38 }
65 sub MINUS_TOKEN () { 39 }
66 sub STAR_TOKEN () { 40 }
67 sub VBAR_TOKEN () { 41 }
68 sub DOT_TOKEN () { 42 }
69 sub COLON_TOKEN () { 43 }
70 sub MATCH_TOKEN () { 44 }
71 sub EXCLAMATION_TOKEN () { 45 }
72
## Token type number -> token name (the constant's name without the
## |_TOKEN| suffix), e.g. |$TokenName[IDENT_TOKEN]| is |IDENT|.
## The two |0| words are placeholders for indices 0 and 15, which no token
## type constant uses; they keep the remaining names aligned with the
## constant values.
73 our @TokenName = qw(
74 0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
75 STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
76 0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
77 PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
78 LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
79 COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
80 );
81
## Names that may be imported on request through |Exporter|: every token
## type constant defined above.  Nothing is exported by default (no
## |@EXPORT| is set here), and the |*_STATE| constants are not exportable.
82 our @EXPORT_OK = qw(
83 IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
84 URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
85 STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
86 UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
87 TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
88 SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
89 RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
90 S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
91 MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
92 EXCLAMATION_TOKEN
93 );
94
## The |:token| tag imports the whole |@EXPORT_OK| list at once; the list
## is copied, so later changes to |@EXPORT_OK| do not affect the tag.
95 our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
96
## Constructor.  Invoked as a class method; the invocant is taken with
## |shift| and used as the class to bless into.  Returns a new tokenizer
## object (a blessed hash) with these defaults:
##   token    - empty array; |get_next_token| returns queued tokens from
##              here before reading any further input.
##   get_char - callback returning the next character as a code point;
##              the default always returns -1, which |get_next_token|
##              treats as end of input (it yields |EOF_TOKEN|).
##   onerror  - no-op callback.  NOTE(review): not invoked in the part of
##              the file visible here; presumably an error hook.
## Callers presumably replace |get_char| and then call |init| - TODO confirm.
97 sub new ($) {
98 my $self = bless {token => [], get_char => sub { -1 },
99 onerror => sub { }}, shift;
100 return $self;
101 } # new
102
## Prepares the tokenizer for reading: sets the state to
## |BEFORE_TOKEN_STATE| and primes |$self->{c}| with the first character
## by calling the |get_char| callback once.  There is no explicit
## |return|.  The queue in |$self->{token}| is not cleared here.
103 sub init ($) {
104 my $self = shift;
105 $self->{state} = BEFORE_TOKEN_STATE;
106 $self->{c} = $self->{get_char}->();
107 #$self->{t} = {type => token-type, value => value, number => number};
108 } # init
109
110 sub get_next_token ($) {
111 my $self = shift;
112 if (@{$self->{token}}) {
113 return shift @{$self->{token}};
114 }
115
116 my $char;
117 my $num; # |{num}|, if any.
118 my $i; # |$i + 1|th character in |unicode| in |escape|.
119 my $q;
120 ## NOTE:
121 ## 0: in |ident|.
122 ## 1: in |URI| outside of |string|.
123 ## 0x0022: in |string1| or |invalid1|.
124 ## 0x0027: in |string2| or |invalid2|.
125
126 A: {
127 if ($self->{state} == BEFORE_TOKEN_STATE) {
128 if ($self->{c} == 0x002D) { # -
129 ## NOTE: |-| in |ident| in |IDENT|
130 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
131 $self->{state} = BEFORE_NMSTART_STATE;
132 $self->{c} = $self->{get_char}->();
133 redo A;
134 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
135 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
136 $self->{c} = $self->{get_char}->();
137 if ($self->{c} == 0x002B) { # +
138 $self->{c} = $self->{get_char}->();
139 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
140 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
141 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
142 $self->{c} == 0x003F) { # ?
143 $self->{t}->{value} = chr $self->{c};
144 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
145 $self->{c} = $self->{get_char}->();
146 C: for (2..6) {
147 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
148 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
149 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
150 $self->{c} == 0x003F) { # ?
151 $self->{t}->{value} .= chr $self->{c};
152 $self->{c} = $self->{get_char}->();
153 } else {
154 last C;
155 }
156 } # C
157
158 if ($self->{c} == 0x002D) { # -
159 $self->{c} = $self->{get_char}->();
160 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
161 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
162 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
163 $self->{t}->{value} .= '-' . chr $self->{c};
164 $self->{c} = $self->{get_char}->();
165 C: for (2..6) {
166 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
167 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
168 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
169 $self->{t}->{value} .= chr $self->{c};
170 $self->{c} = $self->{get_char}->();
171 } else {
172 last C;
173 }
174 } # C
175
176 #
177 } else {
178 my $token = $self->{t};
179 $self->{t} = {type => IDENT_TOKEN, value => '-'};
180 $self->{state} = BEFORE_NMSTART_STATE;
181 # reprocess
182 return $token;
183 #redo A;
184 }
185 }
186
187 $self->{state} = BEFORE_TOKEN_STATE;
188 # reprocess
189 return $self->{t};
190 #redo A;
191 } else {
192 unshift @{$self->{token}}, {type => PLUS_TOKEN};
193 $self->{state} = BEFORE_TOKEN_STATE;
194 # reprocess
195 return $self->{t};
196 #redo A;
197 }
198 } else {
199 $self->{state} = NAME_STATE;
200 # reprocess
201 redo A;
202 }
203 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
204 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
205 $self->{c} == 0x005F or # _
206 $self->{c} > 0x007F) { # nonascii
207 ## NOTE: |nmstart| in |ident| in |IDENT|
208 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
209 $self->{state} = NAME_STATE;
210 $self->{c} = $self->{get_char}->();
211 redo A;
212 } elsif ($self->{c} == 0x005C) { # \
213 ## NOTE: |nmstart| in |ident| in |IDENT|
214 $self->{t} = {type => IDENT_TOKEN, value => ''};
215 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
216 $self->{c} = $self->{get_char}->();
217 redo A;
218 } elsif ($self->{c} == 0x0040) { # @
219 ## NOTE: |@| in |ATKEYWORD|
220 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
221 $self->{state} = AFTER_AT_STATE;
222 $self->{c} = $self->{get_char}->();
223 redo A;
224 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
225 $self->{t} = {type => STRING_TOKEN, value => ''};
226 $self->{state} = STRING_STATE; $q = $self->{c};
227 $self->{c} = $self->{get_char}->();
228 redo A;
229 } elsif ($self->{c} == 0x0023) { # #
230 ## NOTE: |#| in |HASH|.
231 $self->{t} = {type => HASH_TOKEN, value => ''};
232 $self->{state} = HASH_OPEN_STATE;
233 $self->{c} = $self->{get_char}->();
234 redo A;
235 } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
236 ## NOTE: |num|.
237 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
238 $self->{state} = NUMBER_STATE;
239 $self->{c} = $self->{get_char}->();
240 redo A;
241 } elsif ($self->{c} == 0x002E) { # .
242 ## NOTE: |num|.
243 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
244 $self->{state} = NUMBER_FRACTION_STATE;
245 $self->{c} = $self->{get_char}->();
246 redo A;
247 } elsif ($self->{c} == 0x002F) { # /
248 $self->{c} = $self->{get_char}->();
249 if ($self->{c} == 0x002A) { # *
250 C: {
251 $self->{c} = $self->{get_char}->();
252 if ($self->{c} == 0x002A) { # *
253 D: {
254 $self->{c} = $self->{get_char}->();
255 if ($self->{c} == 0x002F) { # /
256 #
257 } elsif ($self->{c} == 0x002A) { # *
258 redo D;
259 } else {
260 redo C;
261 }
262 } # D
263 } elsif ($self->{c} == -1) {
264 # stay in the state
265 # reprocess
266 return {type => COMMENT_INVALID_TOKEN};
267 #redo A;
268 } else {
269 redo C;
270 }
271 } # C
272
273 # stay in the state.
274 $self->{c} = $self->{get_char}->();
275 redo A;
276 } else {
277 # stay in the state.
278 # reprocess
279 return {type => DELIM_TOKEN, value => '/'};
280 #redo A;
281 }
282 } elsif ($self->{c} == 0x003C) { # <
283 ## NOTE: |CDO|
284 $self->{c} = $self->{get_char}->();
285 if ($self->{c} == 0x0021) { # !
286 $self->{c} = $self->{get_char}->();
287 if ($self->{c} == 0x002D) { # -
288 $self->{c} = $self->{get_char}->();
289 if ($self->{c} == 0x002D) { # -
290 $self->{state} = BEFORE_TOKEN_STATE;
291 $self->{c} = $self->{get_char}->();
292 return {type => CDO_TOKEN};
293 #redo A;
294 } else {
295 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
296 ## NOTE: |-| in |ident| in |IDENT|
297 $self->{t} = {type => IDENT_TOKEN, value => '-'};
298 $self->{state} = BEFORE_NMSTART_STATE;
299 #reprocess
300 return {type => DELIM_TOKEN, value => '<'};
301 #redo A;
302 }
303 } else {
304 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
305 $self->{state} = BEFORE_TOKEN_STATE;
306 #reprocess
307 return {type => DELIM_TOKEN, value => '<'};
308 #redo A;
309 }
310 } else {
311 $self->{state} = BEFORE_TOKEN_STATE;
312 #reprocess
313 return {type => DELIM_TOKEN, value => '<'};
314 #redo A;
315 }
316 } elsif (my $t = {
317 0x0021 => EXCLAMATION_TOKEN, # !
318 0x002D => MINUS_TOKEN, # -
319 0x002E => DOT_TOKEN, # .
320 0x003A => COLON_TOKEN, # :
321 0x003B => SEMICOLON_TOKEN, # ;
322 0x003D => MATCH_TOKEN, # =
323 0x007B => LBRACE_TOKEN, # {
324 0x007D => RBRACE_TOKEN, # }
325 0x0028 => LPAREN_TOKEN, # (
326 0x0029 => RPAREN_TOKEN, # )
327 0x005B => LBRACKET_TOKEN, # [
328 0x005D => RBRACKET_TOKEN, # ]
329 }->{$self->{c}}) {
330 # stay in the state
331 $self->{c} = $self->{get_char}->();
332 return {type => $t};
333 # redo A;
334 } elsif ({
335 0x0020 => 1, # SP
336 0x0009 => 1, # \t
337 0x000D => 1, # \r
338 0x000A => 1, # \n
339 0x000C => 1, # \f
340 }->{$self->{c}}) {
341 W: {
342 $self->{c} = $self->{get_char}->();
343 if ({
344 0x0020 => 1, # SP
345 0x0009 => 1, # \t
346 0x000D => 1, # \r
347 0x000A => 1, # \n
348 0x000C => 1, # \f
349 }->{$self->{c}}) {
350 redo W;
351 } elsif (my $v = {
352 0x002B => PLUS_TOKEN, # +
353 0x003E => GREATER_TOKEN, # >
354 0x002C => COMMA_TOKEN, # ,
355 0x007E => TILDE_TOKEN, # ~
356 }->{$self->{c}}) {
357 # stay in the state
358 $self->{c} = $self->{get_char}->();
359 return {type => $v};
360 #redo A;
361 } else {
362 # stay in the state
363 # reprocess
364 return {type => S_TOKEN};
365 #redo A;
366 }
367 } # W
368 } elsif (my $v = {
369 0x007C => DASHMATCH_TOKEN, # |
370 0x005E => PREFIXMATCH_TOKEN, # ^
371 0x0024 => SUFFIXMATCH_TOKEN, # $
372 0x002A => SUBSTRINGMATCH_TOKEN, # *
373 }->{$self->{c}}) {
374 my $c = $self->{c};
375 $self->{c} = $self->{get_char}->();
376 if ($self->{c} == 0x003D) { # =
377 # stay in the state
378 $self->{c} = $self->{get_char}->();
379 return {type => $v};
380 #redo A;
381 } elsif ($v = {
382 0x002A => STAR_TOKEN, # *
383 0x007C => VBAR_TOKEN, # |
384 }->{$c}) {
385 # stay in the state.
386 # reprocess
387 return {type => $v};
388 #redo A;
389 } else {
390 # stay in the state
391 # reprocess
392 return {type => DELIM_TOKEN, value => chr $c};
393 #redo A;
394 }
395 } elsif ($self->{c} == 0x002B) { # +
396 # stay in the state
397 $self->{c} = $self->{get_char}->();
398 return {type => PLUS_TOKEN};
399 #redo A;
400 } elsif ($self->{c} == 0x003E) { # >
401 # stay in the state
402 $self->{c} = $self->{get_char}->();
403 return {type => GREATER_TOKEN};
404 #redo A;
405 } elsif ($self->{c} == 0x002C) { # ,
406 # stay in the state
407 $self->{c} = $self->{get_char}->();
408 return {type => COMMA_TOKEN};
409 #redo A;
410 } elsif ($self->{c} == 0x007E) { # ~
411 $self->{c} = $self->{get_char}->();
412 if ($self->{c} == 0x003D) { # =
413 # stay in the state
414 $self->{c} = $self->{get_char}->();
415 return {type => INCLUDES_TOKEN};
416 #redo A;
417 } else {
418 # stay in the state
419 # reprocess
420 return {type => TILDE_TOKEN};
421 #redo A;
422 }
423 } elsif ($self->{c} == -1) {
424 # stay in the state
425 $self->{c} = $self->{get_char}->();
426 return {type => EOF_TOKEN};
427 #redo A;
428 } else {
429 # stay in the state
430 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
431 $self->{c} = $self->{get_char}->();
432 return $self->{t};
433 #redo A;
434 }
435 } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
436 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
437 ## |FUNCTION|)
438 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
439 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
440 $self->{c} == 0x005F or # _
441 $self->{c} > 0x007F) { # nonascii
442 $self->{t}->{value} .= chr $self->{c};
443 $self->{t}->{type} = DIMENSION_TOKEN
444 if $self->{t}->{type} == NUMBER_TOKEN;
445 $self->{state} = NAME_STATE;
446 $self->{c} = $self->{get_char}->();
447 redo A;
448 } elsif ($self->{c} == 0x005C) { # \
449 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
450 $self->{c} = $self->{get_char}->();
451 redo A;
452 } elsif ($self->{c} == 0x002D) { # -
453 if ($self->{t}->{type} == IDENT_TOKEN) {
454 $self->{c} = $self->{get_char}->();
455 if ($self->{c} == 0x003E) { # >
456 $self->{state} = BEFORE_TOKEN_STATE;
457 $self->{c} = $self->{get_char}->();
458 return {type => CDC_TOKEN};
459 #redo A;
460 } else {
461 ## NOTE: |-|, |-|, $self->{c}
462 #$self->{t} = {type => IDENT_TOKEN, value => '-'};
463 # stay in the state
464 # reconsume
465 return {type => MINUS_TOKEN};
466 #redo A;
467 }
468 } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
469 $self->{c} = $self->{get_char}->();
470 if ($self->{c} == 0x003E) { # >
471 unshift @{$self->{token}}, {type => CDC_TOKEN};
472 $self->{t}->{type} = NUMBER_TOKEN;
473 $self->{t}->{value} = '';
474 $self->{state} = BEFORE_TOKEN_STATE;
475 $self->{c} = $self->{get_char}->();
476 return $self->{t};
477 #redo A;
478 } else {
479 ## NOTE: |-|, |-|, $self->{c}
480 my $t = $self->{t};
481 $t->{type} = NUMBER_TOKEN;
482 $t->{value} = '';
483 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
484 unshift @{$self->{token}}, {type => MINUS_TOKEN};
485 # stay in the state
486 # reconsume
487 return $t;
488 #redo A;
489 }
490 } else {
491 #
492 }
493 } else {
494 #
495 }
496
497 if ($self->{t}->{type} == DIMENSION_TOKEN) {
498 ## NOTE: |-| after |NUMBER|.
499 unshift @{$self->{token}}, {type => MINUS_TOKEN};
500 $self->{state} = BEFORE_TOKEN_STATE;
501 # reprocess
502 $self->{t}->{type} = NUMBER_TOKEN;
503 $self->{t}->{value} = '';
504 return $self->{t};
505 } else {
506 ## NOTE: |-| not followed by |nmstart|.
507 $self->{state} = BEFORE_TOKEN_STATE;
508 # reprocess
509 return {type => MINUS_TOKEN};
510 }
511 } elsif ($self->{state} == AFTER_AT_STATE) {
512 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
513 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
514 $self->{c} == 0x005F or # _
515 $self->{c} > 0x007F) { # nonascii
516 $self->{t}->{value} .= chr $self->{c};
517 $self->{state} = NAME_STATE;
518 $self->{c} = $self->{get_char}->();
519 redo A;
520 } elsif ($self->{c} == 0x002D) { # -
521 $self->{t}->{value} .= '-';
522 $self->{state} = AFTER_AT_HYPHEN_STATE;
523 $self->{c} = $self->{get_char}->();
524 redo A;
525 } elsif ($self->{c} == 0x005C) { # \
526 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
527 $self->{c} = $self->{get_char}->();
528 redo A;
529 } else {
530 $self->{state} = BEFORE_TOKEN_STATE;
531 # reprocess
532 return {type => DELIM_TOKEN, value => '@'};
533 }
534 } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
535 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
536 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
537 $self->{c} == 0x005F or # _
538 $self->{c} > 0x007F) { # nonascii
539 $self->{t}->{value} .= chr $self->{c};
540 $self->{state} = NAME_STATE;
541 $self->{c} = $self->{get_char}->();
542 redo A;
543 } elsif ($self->{c} == 0x002D) { # -
544 $self->{c} = $self->{get_char}->();
545 if ($self->{c} == 0x003E) { # >
546 unshift @{$self->{token}}, {type => CDC_TOKEN};
547 $self->{state} = BEFORE_TOKEN_STATE;
548 $self->{c} = $self->{get_char}->();
549 return {type => DELIM_TOKEN, value => '@'};
550 #redo A;
551 } else {
552 unshift @{$self->{token}}, {type => MINUS_TOKEN};
553 $self->{t} = {type => IDENT_TOKEN, value => '-'};
554 $self->{state} = BEFORE_NMSTART_STATE;
555 # reprocess
556 return {type => DELIM_TOKEN, value => '@'};
557 #redo A;
558 }
559 } elsif ($self->{c} == 0x005C) { # \
560 ## TODO: @-\{nl}
561 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
562 $self->{c} = $self->{get_char}->();
563 redo A;
564 } else {
565 unshift @{$self->{token}}, {type => MINUS_TOKEN};
566 $self->{state} = BEFORE_TOKEN_STATE;
567 # reprocess
568 return {type => DELIM_TOKEN, value => '@'};
569 }
570 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
571 if ($self->{c} == 0x002D) { # -
572 ## NOTE: |-| in |ident|.
573 $self->{t}->{hyphen} = 1;
574 $self->{t}->{value} = '-';
575 $self->{t}->{type} = DIMENSION_TOKEN;
576 $self->{state} = BEFORE_NMSTART_STATE;
577 $self->{c} = $self->{get_char}->();
578 redo A;
579 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
580 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
581 $self->{c} == 0x005F or # _
582 $self->{c} > 0x007F) { # nonascii
583 ## NOTE: |nmstart| in |ident|.
584 $self->{t}->{value} = chr $self->{c};
585 $self->{t}->{type} = DIMENSION_TOKEN;
586 $self->{state} = NAME_STATE;
587 $self->{c} = $self->{get_char}->();
588 redo A;
589 } elsif ($self->{c} == 0x005C) { # \
590 ## NOTE: |nmstart| in |ident| in |IDENT|
591 $self->{t}->{value} = '';
592 $self->{t}->{type} = DIMENSION_TOKEN;
593 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
594 $self->{c} = $self->{get_char}->();
595 redo A;
596 } elsif ($self->{c} == 0x0025) { # %
597 $self->{t}->{type} = PERCENTAGE_TOKEN;
598 $self->{state} = BEFORE_TOKEN_STATE;
599 $self->{c} = $self->{get_char}->();
600 return $self->{t};
601 #redo A;
602 } else {
603 $self->{state} = BEFORE_TOKEN_STATE;
604 # reprocess
605 return $self->{t};
606 #redo A;
607 }
608 } elsif ($self->{state} == HASH_OPEN_STATE) {
609 ## NOTE: The first |nmchar| in |name| in |HASH|.
610 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
611 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
612 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
613 $self->{c} == 0x002D or # -
614 $self->{c} == 0x005F or # _
615 $self->{c} > 0x007F) { # nonascii
616 $self->{t}->{value} .= chr $self->{c};
617 $self->{state} = NAME_STATE;
618 $self->{c} = $self->{get_char}->();
619 redo A;
620 } elsif ($self->{c} == 0x005C) { # \
621 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
622 $self->{c} = $self->{get_char}->();
623 redo A;
624 } else {
625 $self->{state} = BEFORE_TOKEN_STATE;
626 # reprocess
627 return {type => DELIM_TOKEN, value => '#'};
628 #redo A;
629 }
630 } elsif ($self->{state} == NAME_STATE) {
631 ## NOTE: |nmchar| in (|ident| or |name|).
632 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
633 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
634 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
635 $self->{c} == 0x005F or # _
636 $self->{c} == 0x002D or # -
637 $self->{c} > 0x007F) { # nonascii
638 $self->{t}->{value} .= chr $self->{c};
639 # stay in the state
640 $self->{c} = $self->{get_char}->();
641 redo A;
642 } elsif ($self->{c} == 0x005C) { # \
643 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
644 $self->{c} = $self->{get_char}->();
645 redo A;
646 } elsif ($self->{c} == 0x0028 and # (
647 $self->{t}->{type} == IDENT_TOKEN) { # (
648 my $func_name = $self->{t}->{value};
649 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
650 if ($func_name eq 'url' or $func_name eq 'url-prefix') {
651 if ($self->{t}->{has_escape}) {
652 ## TODO: warn
653 }
654 $self->{t}->{type}
655 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
656 $self->{t}->{value} = '';
657 $self->{state} = URI_BEFORE_WSP_STATE;
658 $self->{c} = $self->{get_char}->();
659 redo A;
660 } else {
661 $self->{t}->{type} = FUNCTION_TOKEN;
662 $self->{state} = BEFORE_TOKEN_STATE;
663 $self->{c} = $self->{get_char}->();
664 return $self->{t};
665 #redo A;
666 }
667 } else {
668 $self->{state} = BEFORE_TOKEN_STATE;
669 # reconsume
670 return $self->{t};
671 #redo A;
672 }
673 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
674 while ({
675 0x0020 => 1, # SP
676 0x0009 => 1, # \t
677 0x000D => 1, # \r
678 0x000A => 1, # \n
679 0x000C => 1, # \f
680 }->{$self->{c}}) {
681 $self->{c} = $self->{get_char}->();
682 }
683 if ($self->{c} == -1) {
684 $self->{t}->{type} = {
685 URI_TOKEN, URI_INVALID_TOKEN,
686 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
687 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
688 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
689 }->{$self->{t}->{type}};
690 $self->{state} = BEFORE_TOKEN_STATE;
691 $self->{c} = $self->{get_char}->();
692 return $self->{t};
693 #redo A;
694 } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
695 ## TODO: Should we consider matches of "(" and ")"?
696 $self->{t}->{type} = {
697 URI_TOKEN, URI_INVALID_TOKEN,
698 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
699 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
700 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
701 }->{$self->{t}->{type}};
702 $self->{state} = URI_UNQUOTED_STATE;
703 $self->{c} = $self->{get_char}->();
704 redo A;
705 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
706 $self->{state} = STRING_STATE; $q = $self->{c};
707 $self->{c} = $self->{get_char}->();
708 redo A;
709 } elsif ($self->{c} == 0x0029) { # )
710 $self->{state} = BEFORE_TOKEN_STATE;
711 $self->{c} = $self->{get_char}->();
712 return $self->{t};
713 #redo A;
714 } elsif ($self->{c} == 0x005C) { # \
715 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
716 $self->{c} = $self->{get_char}->();
717 redo A;
718 } else {
719 $self->{t}->{value} .= chr $self->{c};
720 $self->{state} = URI_UNQUOTED_STATE;
721 $self->{c} = $self->{get_char}->();
722 redo A;
723 }
724 } elsif ($self->{state} == URI_UNQUOTED_STATE) {
725 if ({
726 0x0020 => 1, # SP
727 0x0009 => 1, # \t
728 0x000D => 1, # \r
729 0x000A => 1, # \n
730 0x000C => 1, # \f
731 }->{$self->{c}}) {
732 $self->{state} = URI_AFTER_WSP_STATE;
733 $self->{c} = $self->{get_char}->();
734 redo A;
735 } elsif ($self->{c} == -1) {
736 $self->{t}->{type} = {
737 URI_TOKEN, URI_INVALID_TOKEN,
738 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
739 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
740 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
741 }->{$self->{t}->{type}};
742 $self->{state} = BEFORE_TOKEN_STATE;
743 $self->{c} = $self->{get_char}->();
744 return $self->{t};
745 #redo A;
746 } elsif ($self->{c} < 0x0020 or {
747 0x0022 => 1, # "
748 0x0027 => 1, # '
749 0x0028 => 1, # (
750 }->{$self->{c}}) { # C0 or (
751 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
752 $self->{t}->{type} = {
753 URI_TOKEN, URI_INVALID_TOKEN,
754 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
755 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
756 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
757 }->{$self->{t}->{type}};
758 # stay in the state.
759 $self->{c} = $self->{get_char}->();
760 redo A;
761 } elsif ($self->{c} == 0x0029) { # )
762 $self->{state} = BEFORE_TOKEN_STATE;
763 $self->{c} = $self->{get_char}->();
764 return $self->{t};
765 #redo A;
766 } elsif ($self->{c} == 0x005C) { # \
767 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
768 $self->{c} = $self->{get_char}->();
769 redo A;
770 } else {
771 $self->{t}->{value} .= chr $self->{c};
772 # stay in the state.
773 $self->{c} = $self->{get_char}->();
774 redo A;
775 }
776 } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
777 if ({
778 0x0020 => 1, # SP
779 0x0009 => 1, # \t
780 0x000D => 1, # \r
781 0x000A => 1, # \n
782 0x000C => 1, # \f
783 }->{$self->{c}}) {
784 # stay in the state.
785 $self->{c} = $self->{get_char}->();
786 redo A;
787 } elsif ($self->{c} == -1) {
788 $self->{t}->{type} = {
789 URI_TOKEN, URI_INVALID_TOKEN,
790 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
791 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
792 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
793 }->{$self->{t}->{type}};
794 $self->{state} = BEFORE_TOKEN_STATE;
795 $self->{c} = $self->{get_char}->();
796 return $self->{t};
797 #redo A;
798 } elsif ($self->{c} == 0x0029) { # )
799 $self->{state} = BEFORE_TOKEN_STATE;
800 $self->{c} = $self->{get_char}->();
801 return $self->{t};
802 #redo A;
803 } elsif ($self->{c} == 0x005C) { # \
804 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
805 $self->{c} = $self->{get_char}->();
806 redo A;
807 } else {
808 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
809 $self->{t}->{type} = {
810 URI_TOKEN, URI_INVALID_TOKEN,
811 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
812 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
813 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
814 }->{$self->{t}->{type}};
815 # stay in the state.
816 $self->{c} = $self->{get_char}->();
817 redo A;
818 }
819 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
820 $self->{t}->{has_escape} = 1;
821 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
822 ## NOTE: second character of |unicode| in |escape|.
823 $char = $self->{c} - 0x0030;
824 $self->{state} = ESCAPE_STATE; $i = 2;
825 $self->{c} = $self->{get_char}->();
826 redo A;
827 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
828 ## NOTE: second character of |unicode| in |escape|.
829 $char = $self->{c} - 0x0041 + 0xA;
830 $self->{state} = ESCAPE_STATE; $i = 2;
831 $self->{c} = $self->{get_char}->();
832 redo A;
833 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
834 ## NOTE: second character of |unicode| in |escape|.
835 $char = $self->{c} - 0x0061 + 0xA;
836 $self->{state} = ESCAPE_STATE; $i = 2;
837 $self->{c} = $self->{get_char}->();
838 redo A;
839 } elsif ($self->{c} == 0x000A or # \n
840 $self->{c} == 0x000C) { # \f
841 if ($q == 0) {
842 #
843 } elsif ($q == 1) {
844 ## NOTE: In |escape| in |URI|.
845 $self->{t}->{type} = {
846 URI_TOKEN, URI_INVALID_TOKEN,
847 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
848 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
849 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
850 }->{$self->{t}->{type}};
851 $self->{t}->{value} .= chr $self->{c};
852 $self->{state} = URI_UNQUOTED_STATE;
853 $self->{c} = $self->{get_char}->();
854 redo A;
855 } else {
856 ## Note: In |nl| in ... in |string| or |ident|.
857 $self->{t}->{value} .= chr $self->{c};
858 $self->{state} = STRING_STATE;
859 $self->{c} = $self->{get_char}->();
860 redo A;
861 }
862 } elsif ($self->{c} == 0x000D) { # \r
863 if ($q == 0) {
864 #
865 } elsif ($q == 1) {
866 ## NOTE: In |escape| in |URI|.
867 $self->{t}->{type} = {
868 URI_TOKEN, URI_INVALID_TOKEN,
869 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872 }->{$self->{t}->{type}};
873 $self->{t}->{value} .= "\x0D";
874 $self->{state} = ESCAPE_BEFORE_LF_STATE;
875 $self->{c} = $self->{get_char}->();
876 redo A;
877 } else {
878 ## Note: In |nl| in ... in |string| or |ident|.
879 $self->{t}->{value} .= "\x0D";
880 $self->{state} = ESCAPE_BEFORE_LF_STATE;
881 $self->{c} = $self->{get_char}->();
882 redo A;
883 }
884 } elsif ($self->{c} == -1) {
885 #
886 } else {
887 ## NOTE: second character of |escape|.
888 $self->{t}->{value} .= chr $self->{c};
889 $self->{state} = $q == 0 ? NAME_STATE :
890 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
891 $self->{c} = $self->{get_char}->();
892 redo A;
893 }
894
895 if ($q == 0) {
896 if ($self->{t}->{type} == DIMENSION_TOKEN) {
897 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
898 $self->{state} = BEFORE_TOKEN_STATE;
899 # reprocess
900 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
901 unshift @{$self->{token}}, {type => MINUS_TOKEN};
902 $self->{t}->{type} = NUMBER_TOKEN;
903 $self->{t}->{value} = '';
904 return $self->{t};
905 #redo A;
906 } elsif (length $self->{t}->{value}) {
907 $self->{state} = BEFORE_TOKEN_STATE;
908 # reprocess
909 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
910 return $self->{t};
911 #redo A;
912 } else {
913 $self->{state} = BEFORE_TOKEN_STATE;
914 # reprocess
915 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
916 $self->{t}->{type} = NUMBER_TOKEN;
917 $self->{t}->{value} = '';
918 return $self->{t};
919 #redo A;
920 }
921 } else {
922 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
923 $self->{state} = BEFORE_TOKEN_STATE;
924 # reprocess
925 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
926 return {type => MINUS_TOKEN};
927 #redo A;
928 } elsif (length $self->{t}->{value}) {
929 $self->{state} = BEFORE_TOKEN_STATE;
930 # reprocess
931 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
932 return $self->{t};
933 #redo A;
934 } else {
935 $self->{state} = BEFORE_TOKEN_STATE;
936 # reprocess
937 return {type => DELIM_TOKEN, value => '\\'};
938 #redo A;
939 }
940 }
941 } elsif ($q == 1) {
942 $self->{state} = URI_UNQUOTED_STATE;
943 $self->{c} = $self->{get_char}->();
944 redo A;
945 } else {
946 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
947 $self->{t}->{type} = {
948 STRING_TOKEN, INVALID_TOKEN,
949 URI_TOKEN, URI_INVALID_TOKEN,
950 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
951 }->{$self->{t}->{type}} || $self->{t}->{type};
952 $self->{state} = BEFORE_TOKEN_STATE;
953 # reprocess
954 return $self->{t};
955 #redo A;
956 }
957 } elsif ($self->{state} == ESCAPE_STATE) {
958 ## NOTE: third..seventh character of |unicode| in |escape|.
959 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
960 $char = $char * 0x10 + $self->{c} - 0x0030;
961 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
962 $self->{c} = $self->{get_char}->();
963 redo A;
964 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
965 $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
966 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
967 $self->{c} = $self->{get_char}->();
968 redo A;
969 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
970 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
971 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
972 $self->{c} = $self->{get_char}->();
973 redo A;
974 } elsif ($self->{c} == 0x0020 or # SP
975 $self->{c} == 0x000A or # \n
976 $self->{c} == 0x0009 or # \t
977 $self->{c} == 0x000C) { # \f
978 $self->{t}->{value} .= chr $char;
979 $self->{state} = $q == 0 ? NAME_STATE :
980 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
981 $self->{c} = $self->{get_char}->();
982 redo A;
983 } elsif ($self->{c} == 0x000D) { # \r
984 $self->{state} = ESCAPE_BEFORE_LF_STATE;
985 $self->{c} = $self->{get_char}->();
986 redo A;
987 } else {
988 $self->{t}->{value} .= chr $char;
989 $self->{state} = $q == 0 ? NAME_STATE :
990 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
991 # reconsume
992 redo A;
993 }
994 } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
995 ## NOTE: eighth character of |unicode| in |escape|.
996 if ($self->{c} == 0x0020 or # SP
997 $self->{c} == 0x000A or # \n
998 $self->{c} == 0x0009 or # \t
999 $self->{c} == 0x000C) { # \f
1000 $self->{t}->{value} .= chr $char;
1001 $self->{state} = $q == 0 ? NAME_STATE :
1002 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1003 $self->{c} = $self->{get_char}->();
1004 redo A;
1005 } elsif ($self->{c} == 0x000D) { # \r
1006 $self->{state} = ESCAPE_BEFORE_NL_STATE;
1007 $self->{c} = $self->{get_char}->();
1008 redo A;
1009 } else {
1010 $self->{t}->{value} .= chr $char;
1011 $self->{state} = $q == 0 ? NAME_STATE :
1012 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1013 # reconsume
1014 redo A;
1015 }
1016 } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1017 ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
1018 if ($self->{c} == 0x000A) { # \n
1019 $self->{t}->{value} .= chr $self->{c};
1020 $self->{state} = $q == 0 ? NAME_STATE :
1021 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1022 $self->{c} = $self->{get_char}->();
1023 redo A;
1024 } else {
1025 $self->{state} = $q == 0 ? NAME_STATE :
1026 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1027 # reprocess
1028 redo A;
1029 }
1030 } elsif ($self->{state} == STRING_STATE) {
1031 ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1032 ## a character in |invalid$Q| in |invalid| in |INVALID|,
1033 ## where |$Q = $q == 0x0022 ? 1 : 2|.
1034 ## Or, in |URI|.
1035 if ($self->{c} == 0x005C) { # \
1036 $self->{state} = ESCAPE_OPEN_STATE;
1037 $self->{c} = $self->{get_char}->();
1038 redo A;
1039 } elsif ($self->{c} == $q) { # " | '
1040 if ($self->{t}->{type} == STRING_TOKEN) {
1041 $self->{state} = BEFORE_TOKEN_STATE;
1042 $self->{c} = $self->{get_char}->();
1043 return $self->{t};
1044 #redo A;
1045 } else {
1046 $self->{state} = URI_AFTER_WSP_STATE;
1047 $self->{c} = $self->{get_char}->();
1048 redo A;
1049 }
1050 } elsif ($self->{c} == 0x000A or # \n
1051 $self->{c} == 0x000D or # \r
1052 $self->{c} == 0x000C or # \f
1053 $self->{c} == -1) {
1054 $self->{t}->{type} = {
1055 STRING_TOKEN, INVALID_TOKEN,
1056 INVALID_TOKEN, INVALID_TOKEN,
1057 URI_TOKEN, URI_INVALID_TOKEN,
1058 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1059 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1060 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1061 }->{$self->{t}->{type}};
1062 $self->{state} = BEFORE_TOKEN_STATE;
1063 # reconsume
1064 return $self->{t};
1065 #redo A;
1066 } else {
1067 $self->{t}->{value} .= chr $self->{c};
1068 # stay in the state
1069 $self->{c} = $self->{get_char}->();
1070 redo A;
1071 }
1072 } elsif ($self->{state} == NUMBER_STATE) {
1073 ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1074 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1075 $self->{t}->{value} .= chr $self->{c};
1076 # stay in the state
1077 $self->{c} = $self->{get_char}->();
1078 redo A;
1079 } elsif ($self->{c} == 0x002E) { # .
1080 $self->{state} = NUMBER_DOT_STATE;
1081 $self->{c} = $self->{get_char}->();
1082 redo A;
1083 } else {
1084 $self->{t}->{number} = $self->{t}->{value};
1085 $self->{t}->{value} = '';
1086 $self->{state} = AFTER_NUMBER_STATE;
1087 # reprocess
1088 redo A;
1089 }
1090 } elsif ($self->{state} == NUMBER_DOT_STATE) {
1091 ## NOTE: The character immediately following |.| in |num|.
1092 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1093 $self->{t}->{value} .= '.' . chr $self->{c};
1094 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1095 $self->{c} = $self->{get_char}->();
1096 redo A;
1097 } else {
1098 unshift @{$self->{token}}, {type => DOT_TOKEN};
1099 $self->{t}->{number} = $self->{t}->{value};
1100 $self->{t}->{value} = '';
1101 $self->{state} = BEFORE_TOKEN_STATE;
1102 # reprocess
1103 return $self->{t};
1104 #redo A;
1105 }
1106 } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1107 ## NOTE: The character immediately following |.| at the beginning of |num|.
1108 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1109 $self->{t}->{value} .= '.' . chr $self->{c};
1110 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1111 $self->{c} = $self->{get_char}->();
1112 redo A;
1113 } else {
1114 $self->{state} = BEFORE_TOKEN_STATE;
1115 # reprocess
1116 return {type => DOT_TOKEN};
1117 #redo A;
1118 }
1119 } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1120 ## NOTE: |[0-9]| in |num| after |.|.
1121 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1122 $self->{t}->{value} .= chr $self->{c};
1123 # stay in the state
1124 $self->{c} = $self->{get_char}->();
1125 redo A;
1126 } else {
1127 $self->{t}->{number} = $self->{t}->{value};
1128 $self->{t}->{value} = '';
1129 $self->{state} = AFTER_NUMBER_STATE;
1130 # reprocess
1131 redo A;
1132 }
1133 } else {
1134 die "$0: Unknown state |$self->{state}|";
1135 }
1136 } # A
1137 } # get_next_token
1138
1139 1;
1140 # $Date: 2007/09/08 17:43:41 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24