/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.16 - (show annotations) (download)
Wed Oct 17 10:46:26 2007 UTC (18 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.15: +11 -1 lines
++ whatpm/Whatpm/ChangeLog	17 Oct 2007 10:45:53 -0000
	* Makefile (clean): New rule.

	* NanoDOM.pm (public_id, system_id): New attributes.

2007-10-17  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::CSS::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Exporter;
6 push our @ISA, 'Exporter';
7
## Tokenizer states.  The current state is kept in |$self->{state}|
## and is compared against these constants by |get_next_token|.
## Each constant is an inlinable constant sub (empty prototype).
sub BEFORE_TOKEN_STATE      () {  0 }
sub BEFORE_NMSTART_STATE    () {  1 }
sub NAME_STATE              () {  2 }
sub ESCAPE_OPEN_STATE       () {  3 }
sub STRING_STATE            () {  4 }
sub HASH_OPEN_STATE         () {  5 }
sub NUMBER_STATE            () {  6 }
sub NUMBER_FRACTION_STATE   () {  7 }
sub AFTER_NUMBER_STATE      () {  8 }
sub URI_BEFORE_WSP_STATE    () {  9 }
sub ESCAPE_STATE            () { 10 }
sub ESCAPE_BEFORE_LF_STATE  () { 11 }
sub ESCAPE_BEFORE_NL_STATE  () { 12 }
sub NUMBER_DOT_STATE        () { 13 }
sub NUMBER_DOT_NUMBER_STATE () { 14 }
sub DELIM_STATE             () { 15 }
sub URI_UNQUOTED_STATE      () { 16 }
sub URI_AFTER_WSP_STATE     () { 17 }
sub AFTER_AT_STATE          () { 18 }
sub AFTER_AT_HYPHEN_STATE   () { 19 }
28
## Token types.  A token returned by |get_next_token| is a hash
## reference whose |type| member holds one of these values.
## NOTE: The value |15| is not assigned to any token type here; the
## numbering goes from |UNICODE_RANGE_TOKEN| (14) straight to
## |DELIM_TOKEN| (16).
sub IDENT_TOKEN              () {  1 }
sub ATKEYWORD_TOKEN          () {  2 }
sub HASH_TOKEN               () {  3 }
sub FUNCTION_TOKEN           () {  4 }
sub URI_TOKEN                () {  5 }
sub URI_INVALID_TOKEN        () {  6 }
sub URI_PREFIX_TOKEN         () {  7 }
sub URI_PREFIX_INVALID_TOKEN () {  8 }
sub STRING_TOKEN             () {  9 }
sub INVALID_TOKEN            () { 10 }
sub NUMBER_TOKEN             () { 11 }
sub DIMENSION_TOKEN          () { 12 }
sub PERCENTAGE_TOKEN         () { 13 }
sub UNICODE_RANGE_TOKEN      () { 14 }

sub DELIM_TOKEN              () { 16 }
sub PLUS_TOKEN               () { 17 }
sub GREATER_TOKEN            () { 18 }
sub COMMA_TOKEN              () { 19 }
sub TILDE_TOKEN              () { 20 }
sub DASHMATCH_TOKEN          () { 21 }
sub PREFIXMATCH_TOKEN        () { 22 }
sub SUFFIXMATCH_TOKEN        () { 23 }
sub SUBSTRINGMATCH_TOKEN     () { 24 }
sub INCLUDES_TOKEN           () { 25 }
sub SEMICOLON_TOKEN          () { 26 }
sub LBRACE_TOKEN             () { 27 }
sub RBRACE_TOKEN             () { 28 }
sub LPAREN_TOKEN             () { 29 }
sub RPAREN_TOKEN             () { 30 }
sub LBRACKET_TOKEN           () { 31 }
sub RBRACKET_TOKEN           () { 32 }
sub S_TOKEN                  () { 33 }
sub CDO_TOKEN                () { 34 }
sub CDC_TOKEN                () { 35 }
sub COMMENT_TOKEN            () { 36 }
sub COMMENT_INVALID_TOKEN    () { 37 }
sub EOF_TOKEN                () { 38 }
sub MINUS_TOKEN              () { 39 }
sub STAR_TOKEN               () { 40 }
sub VBAR_TOKEN               () { 41 }
sub DOT_TOKEN                () { 42 }
sub COLON_TOKEN              () { 43 }
sub MATCH_TOKEN              () { 44 }
sub EXCLAMATION_TOKEN        () { 45 }
73
## Token type names, indexed by token type number, i.e.
## |$TokenName[IDENT_TOKEN]| is |IDENT|.
## NOTE: Indexes |0| and |15| have no corresponding |*_TOKEN|
## constant; the string |0| is stored there as a placeholder.
our @TokenName = qw(
  0
  IDENT ATKEYWORD HASH FUNCTION
  URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID
  NUMBER DIMENSION PERCENTAGE
  UNICODE_RANGE
  0
  DELIM PLUS GREATER COMMA TILDE
  DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES
  SEMICOLON
  LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET
  S CDO CDC
  COMMENT COMMENT_INVALID
  EOF
  MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
);
82
## Names that can be imported on request: every token type constant.
## The tokenizer state constants are not exported.
our @EXPORT_OK = qw(
  IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN
  URI_TOKEN URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
  STRING_TOKEN INVALID_TOKEN
  NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
  UNICODE_RANGE_TOKEN
  DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN TILDE_TOKEN
  DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
  SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN
  SEMICOLON_TOKEN
  LBRACE_TOKEN RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN
  LBRACKET_TOKEN RBRACKET_TOKEN
  S_TOKEN CDO_TOKEN CDC_TOKEN
  COMMENT_TOKEN COMMENT_INVALID_TOKEN
  EOF_TOKEN
  MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
  EXCLAMATION_TOKEN
);

## The |:token| tag selects a copy of the whole |@EXPORT_OK| list.
our %EXPORT_TAGS = (token => [@EXPORT_OK]);
97
## Creates a new tokenizer object blessed into the invocant class.
##
## Initial members:
##   |token|    - Empty array of pending tokens; |get_next_token|
##                returns from this array first when it is not empty.
##   |get_char| - Code reference that supplies the next input character
##                as a number.  The default always returns |-1|, which
##                the tokenizer treats as the end of the input.
##   |onerror|  - Code reference; the default does nothing.
##
## The caller is expected to replace |get_char| (and, if desired,
## |onerror|) before calling |init|.
sub new ($) {
  my $class = shift;
  my %member = (
    token    => [],
    get_char => sub { -1 },
    onerror  => sub { },
  );
  return bless \%member, $class;
} # new
103
## Prepares the tokenizer for reading: sets the state to
## |BEFORE_TOKEN_STATE| and reads the first input character into
## |$self->{c}| by calling the |get_char| code reference once.
##
## NOTE: The assignment to |$self->{c}| is the last statement, so the
## character just read is also the return value of this method.
sub init ($) {
  my ($self) = @_;
  $self->{state} = BEFORE_TOKEN_STATE;
  ## NOTE: The token being constructed is later kept in |$self->{t}|,
  ## in the form |{type => token-type, value => value, number => number}|.
  $self->{c} = $self->{get_char}->();
} # init
110
111 sub get_next_token ($) {
112 my $self = shift;
113 if (@{$self->{token}}) {
114 return shift @{$self->{token}};
115 }
116
117 my $char;
118 my $num; # |{num}|, if any.
119 my $i; # |$i + 1|th character in |unicode| in |escape|.
120 my $q;
121 ## NOTE:
122 ## 0: in |ident|.
123 ## 1: in |URI| outside of |string|.
124 ## 0x0022: in |string1| or |invalid1|.
125 ## 0x0027: in |string2| or |invalid2|.
126
127 A: {
128 if ($self->{state} == BEFORE_TOKEN_STATE) {
129 if ($self->{c} == 0x002D) { # -
130 ## NOTE: |-| in |ident| in |IDENT|
131 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
132 $self->{state} = BEFORE_NMSTART_STATE;
133 $self->{c} = $self->{get_char}->();
134 redo A;
135 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
137 $self->{c} = $self->{get_char}->();
138 if ($self->{c} == 0x002B) { # +
139 $self->{c} = $self->{get_char}->();
140 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
141 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
142 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
143 $self->{c} == 0x003F) { # ?
144 $self->{t}->{value} = chr $self->{c};
145 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
146 $self->{c} = $self->{get_char}->();
147 C: for (2..6) {
148 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
149 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
150 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
151 $self->{c} == 0x003F) { # ?
152 $self->{t}->{value} .= chr $self->{c};
153 $self->{c} = $self->{get_char}->();
154 } else {
155 last C;
156 }
157 } # C
158
159 if ($self->{c} == 0x002D) { # -
160 $self->{c} = $self->{get_char}->();
161 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
162 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
163 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
164 $self->{t}->{value} .= '-' . chr $self->{c};
165 $self->{c} = $self->{get_char}->();
166 C: for (2..6) {
167 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
168 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
169 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
170 $self->{t}->{value} .= chr $self->{c};
171 $self->{c} = $self->{get_char}->();
172 } else {
173 last C;
174 }
175 } # C
176
177 #
178 } else {
179 my $token = $self->{t};
180 $self->{t} = {type => IDENT_TOKEN, value => '-'};
181 $self->{state} = BEFORE_NMSTART_STATE;
182 # reprocess
183 return $token;
184 #redo A;
185 }
186 }
187
188 $self->{state} = BEFORE_TOKEN_STATE;
189 # reprocess
190 return $self->{t};
191 #redo A;
192 } else {
193 unshift @{$self->{token}}, {type => PLUS_TOKEN};
194 $self->{state} = BEFORE_TOKEN_STATE;
195 # reprocess
196 return $self->{t};
197 #redo A;
198 }
199 } else {
200 $self->{state} = NAME_STATE;
201 # reprocess
202 redo A;
203 }
204 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
205 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
206 $self->{c} == 0x005F or # _
207 $self->{c} > 0x007F) { # nonascii
208 ## NOTE: |nmstart| in |ident| in |IDENT|
209 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
210 $self->{state} = NAME_STATE;
211 $self->{c} = $self->{get_char}->();
212 redo A;
213 } elsif ($self->{c} == 0x005C) { # \
214 ## NOTE: |nmstart| in |ident| in |IDENT|
215 $self->{t} = {type => IDENT_TOKEN, value => ''};
216 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
217 $self->{c} = $self->{get_char}->();
218 redo A;
219 } elsif ($self->{c} == 0x0040) { # @
220 ## NOTE: |@| in |ATKEYWORD|
221 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
222 $self->{state} = AFTER_AT_STATE;
223 $self->{c} = $self->{get_char}->();
224 redo A;
225 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
226 $self->{t} = {type => STRING_TOKEN, value => ''};
227 $self->{state} = STRING_STATE; $q = $self->{c};
228 $self->{c} = $self->{get_char}->();
229 redo A;
230 } elsif ($self->{c} == 0x0023) { # #
231 ## NOTE: |#| in |HASH|.
232 $self->{t} = {type => HASH_TOKEN, value => ''};
233 $self->{state} = HASH_OPEN_STATE;
234 $self->{c} = $self->{get_char}->();
235 redo A;
236 } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
237 ## NOTE: |num|.
238 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
239 $self->{state} = NUMBER_STATE;
240 $self->{c} = $self->{get_char}->();
241 redo A;
242 } elsif ($self->{c} == 0x002E) { # .
243 ## NOTE: |num|.
244 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
245 $self->{state} = NUMBER_FRACTION_STATE;
246 $self->{c} = $self->{get_char}->();
247 redo A;
248 } elsif ($self->{c} == 0x002F) { # /
249 $self->{c} = $self->{get_char}->();
250 if ($self->{c} == 0x002A) { # *
251 C: {
252 $self->{c} = $self->{get_char}->();
253 if ($self->{c} == 0x002A) { # *
254 D: {
255 $self->{c} = $self->{get_char}->();
256 if ($self->{c} == 0x002F) { # /
257 #
258 } elsif ($self->{c} == 0x002A) { # *
259 redo D;
260 } else {
261 redo C;
262 }
263 } # D
264 } elsif ($self->{c} == -1) {
265 # stay in the state
266 # reprocess
267 return {type => COMMENT_INVALID_TOKEN};
268 #redo A;
269 } else {
270 redo C;
271 }
272 } # C
273
274 # stay in the state.
275 $self->{c} = $self->{get_char}->();
276 redo A;
277 } else {
278 # stay in the state.
279 # reprocess
280 return {type => DELIM_TOKEN, value => '/'};
281 #redo A;
282 }
283 } elsif ($self->{c} == 0x003C) { # <
284 ## NOTE: |CDO|
285 $self->{c} = $self->{get_char}->();
286 if ($self->{c} == 0x0021) { # !
287 $self->{c} = $self->{get_char}->();
288 if ($self->{c} == 0x002D) { # -
289 $self->{c} = $self->{get_char}->();
290 if ($self->{c} == 0x002D) { # -
291 $self->{state} = BEFORE_TOKEN_STATE;
292 $self->{c} = $self->{get_char}->();
293 return {type => CDO_TOKEN};
294 #redo A;
295 } else {
296 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
297 ## NOTE: |-| in |ident| in |IDENT|
298 $self->{t} = {type => IDENT_TOKEN, value => '-'};
299 $self->{state} = BEFORE_NMSTART_STATE;
300 #reprocess
301 return {type => DELIM_TOKEN, value => '<'};
302 #redo A;
303 }
304 } else {
305 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
306 $self->{state} = BEFORE_TOKEN_STATE;
307 #reprocess
308 return {type => DELIM_TOKEN, value => '<'};
309 #redo A;
310 }
311 } else {
312 $self->{state} = BEFORE_TOKEN_STATE;
313 #reprocess
314 return {type => DELIM_TOKEN, value => '<'};
315 #redo A;
316 }
317 } elsif (my $t = {
318 0x0021 => EXCLAMATION_TOKEN, # !
319 0x002D => MINUS_TOKEN, # -
320 0x002E => DOT_TOKEN, # .
321 0x003A => COLON_TOKEN, # :
322 0x003B => SEMICOLON_TOKEN, # ;
323 0x003D => MATCH_TOKEN, # =
324 0x007B => LBRACE_TOKEN, # {
325 0x007D => RBRACE_TOKEN, # }
326 0x0028 => LPAREN_TOKEN, # (
327 0x0029 => RPAREN_TOKEN, # )
328 0x005B => LBRACKET_TOKEN, # [
329 0x005D => RBRACKET_TOKEN, # ]
330 }->{$self->{c}}) {
331 # stay in the state
332 $self->{c} = $self->{get_char}->();
333 return {type => $t};
334 # redo A;
335 } elsif ({
336 0x0020 => 1, # SP
337 0x0009 => 1, # \t
338 0x000D => 1, # \r
339 0x000A => 1, # \n
340 0x000C => 1, # \f
341 }->{$self->{c}}) {
342 W: {
343 $self->{c} = $self->{get_char}->();
344 if ({
345 0x0020 => 1, # SP
346 0x0009 => 1, # \t
347 0x000D => 1, # \r
348 0x000A => 1, # \n
349 0x000C => 1, # \f
350 }->{$self->{c}}) {
351 redo W;
352 } elsif (my $v = {
353 0x002B => PLUS_TOKEN, # +
354 0x003E => GREATER_TOKEN, # >
355 0x002C => COMMA_TOKEN, # ,
356 0x007E => TILDE_TOKEN, # ~
357 }->{$self->{c}}) {
358 # stay in the state
359 $self->{c} = $self->{get_char}->();
360 return {type => $v};
361 #redo A;
362 } else {
363 # stay in the state
364 # reprocess
365 return {type => S_TOKEN};
366 #redo A;
367 }
368 } # W
369 } elsif (my $v = {
370 0x007C => DASHMATCH_TOKEN, # |
371 0x005E => PREFIXMATCH_TOKEN, # ^
372 0x0024 => SUFFIXMATCH_TOKEN, # $
373 0x002A => SUBSTRINGMATCH_TOKEN, # *
374 }->{$self->{c}}) {
375 my $c = $self->{c};
376 $self->{c} = $self->{get_char}->();
377 if ($self->{c} == 0x003D) { # =
378 # stay in the state
379 $self->{c} = $self->{get_char}->();
380 return {type => $v};
381 #redo A;
382 } elsif ($v = {
383 0x002A => STAR_TOKEN, # *
384 0x007C => VBAR_TOKEN, # |
385 }->{$c}) {
386 # stay in the state.
387 # reprocess
388 return {type => $v};
389 #redo A;
390 } else {
391 # stay in the state
392 # reprocess
393 return {type => DELIM_TOKEN, value => chr $c};
394 #redo A;
395 }
396 } elsif ($self->{c} == 0x002B) { # +
397 # stay in the state
398 $self->{c} = $self->{get_char}->();
399 return {type => PLUS_TOKEN};
400 #redo A;
401 } elsif ($self->{c} == 0x003E) { # >
402 # stay in the state
403 $self->{c} = $self->{get_char}->();
404 return {type => GREATER_TOKEN};
405 #redo A;
406 } elsif ($self->{c} == 0x002C) { # ,
407 # stay in the state
408 $self->{c} = $self->{get_char}->();
409 return {type => COMMA_TOKEN};
410 #redo A;
411 } elsif ($self->{c} == 0x007E) { # ~
412 $self->{c} = $self->{get_char}->();
413 if ($self->{c} == 0x003D) { # =
414 # stay in the state
415 $self->{c} = $self->{get_char}->();
416 return {type => INCLUDES_TOKEN};
417 #redo A;
418 } else {
419 # stay in the state
420 # reprocess
421 return {type => TILDE_TOKEN};
422 #redo A;
423 }
424 } elsif ($self->{c} == -1) {
425 # stay in the state
426 $self->{c} = $self->{get_char}->();
427 return {type => EOF_TOKEN};
428 #redo A;
429 } else {
430 # stay in the state
431 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
432 $self->{c} = $self->{get_char}->();
433 return $self->{t};
434 #redo A;
435 }
436 } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
437 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
438 ## |FUNCTION|)
439 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
440 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
441 $self->{c} == 0x005F or # _
442 $self->{c} > 0x007F) { # nonascii
443 $self->{t}->{value} .= chr $self->{c};
444 $self->{t}->{type} = DIMENSION_TOKEN
445 if $self->{t}->{type} == NUMBER_TOKEN;
446 $self->{state} = NAME_STATE;
447 $self->{c} = $self->{get_char}->();
448 redo A;
449 } elsif ($self->{c} == 0x005C) { # \
450 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
451 $self->{c} = $self->{get_char}->();
452 redo A;
453 } elsif ($self->{c} == 0x002D) { # -
454 if ($self->{t}->{type} == IDENT_TOKEN) {
455 $self->{c} = $self->{get_char}->();
456 if ($self->{c} == 0x003E) { # >
457 $self->{state} = BEFORE_TOKEN_STATE;
458 $self->{c} = $self->{get_char}->();
459 return {type => CDC_TOKEN};
460 #redo A;
461 } else {
462 ## NOTE: |-|, |-|, $self->{c}
463 #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464 # stay in the state
465 # reconsume
466 return {type => MINUS_TOKEN};
467 #redo A;
468 }
469 } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
470 $self->{c} = $self->{get_char}->();
471 if ($self->{c} == 0x003E) { # >
472 unshift @{$self->{token}}, {type => CDC_TOKEN};
473 $self->{t}->{type} = NUMBER_TOKEN;
474 $self->{t}->{value} = '';
475 $self->{state} = BEFORE_TOKEN_STATE;
476 $self->{c} = $self->{get_char}->();
477 return $self->{t};
478 #redo A;
479 } else {
480 ## NOTE: |-|, |-|, $self->{c}
481 my $t = $self->{t};
482 $t->{type} = NUMBER_TOKEN;
483 $t->{value} = '';
484 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485 unshift @{$self->{token}}, {type => MINUS_TOKEN};
486 # stay in the state
487 # reconsume
488 return $t;
489 #redo A;
490 }
491 } else {
492 #
493 }
494 } else {
495 #
496 }
497
498 if ($self->{t}->{type} == DIMENSION_TOKEN) {
499 ## NOTE: |-| after |NUMBER|.
500 unshift @{$self->{token}}, {type => MINUS_TOKEN};
501 $self->{state} = BEFORE_TOKEN_STATE;
502 # reprocess
503 $self->{t}->{type} = NUMBER_TOKEN;
504 $self->{t}->{value} = '';
505 return $self->{t};
506 } else {
507 ## NOTE: |-| not followed by |nmstart|.
508 $self->{state} = BEFORE_TOKEN_STATE;
509 # reprocess
510 return {type => MINUS_TOKEN};
511 }
512 } elsif ($self->{state} == AFTER_AT_STATE) {
513 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515 $self->{c} == 0x005F or # _
516 $self->{c} > 0x007F) { # nonascii
517 $self->{t}->{value} .= chr $self->{c};
518 $self->{state} = NAME_STATE;
519 $self->{c} = $self->{get_char}->();
520 redo A;
521 } elsif ($self->{c} == 0x002D) { # -
522 $self->{t}->{value} .= '-';
523 $self->{state} = AFTER_AT_HYPHEN_STATE;
524 $self->{c} = $self->{get_char}->();
525 redo A;
526 } elsif ($self->{c} == 0x005C) { # \
527 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
528 $self->{c} = $self->{get_char}->();
529 redo A;
530 } else {
531 $self->{state} = BEFORE_TOKEN_STATE;
532 # reprocess
533 return {type => DELIM_TOKEN, value => '@'};
534 }
535 } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
536 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
537 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
538 $self->{c} == 0x005F or # _
539 $self->{c} > 0x007F) { # nonascii
540 $self->{t}->{value} .= chr $self->{c};
541 $self->{state} = NAME_STATE;
542 $self->{c} = $self->{get_char}->();
543 redo A;
544 } elsif ($self->{c} == 0x002D) { # -
545 $self->{c} = $self->{get_char}->();
546 if ($self->{c} == 0x003E) { # >
547 unshift @{$self->{token}}, {type => CDC_TOKEN};
548 $self->{state} = BEFORE_TOKEN_STATE;
549 $self->{c} = $self->{get_char}->();
550 return {type => DELIM_TOKEN, value => '@'};
551 #redo A;
552 } else {
553 unshift @{$self->{token}}, {type => MINUS_TOKEN};
554 $self->{t} = {type => IDENT_TOKEN, value => '-'};
555 $self->{state} = BEFORE_NMSTART_STATE;
556 # reprocess
557 return {type => DELIM_TOKEN, value => '@'};
558 #redo A;
559 }
560 } elsif ($self->{c} == 0x005C) { # \
561 ## TODO: @-\{nl}
562 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
563 $self->{c} = $self->{get_char}->();
564 redo A;
565 } else {
566 unshift @{$self->{token}}, {type => MINUS_TOKEN};
567 $self->{state} = BEFORE_TOKEN_STATE;
568 # reprocess
569 return {type => DELIM_TOKEN, value => '@'};
570 }
571 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
572 if ($self->{c} == 0x002D) { # -
573 ## NOTE: |-| in |ident|.
574 $self->{t}->{hyphen} = 1;
575 $self->{t}->{value} = '-';
576 $self->{t}->{type} = DIMENSION_TOKEN;
577 $self->{state} = BEFORE_NMSTART_STATE;
578 $self->{c} = $self->{get_char}->();
579 redo A;
580 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
581 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
582 $self->{c} == 0x005F or # _
583 $self->{c} > 0x007F) { # nonascii
584 ## NOTE: |nmstart| in |ident|.
585 $self->{t}->{value} = chr $self->{c};
586 $self->{t}->{type} = DIMENSION_TOKEN;
587 $self->{state} = NAME_STATE;
588 $self->{c} = $self->{get_char}->();
589 redo A;
590 } elsif ($self->{c} == 0x005C) { # \
591 ## NOTE: |nmstart| in |ident| in |IDENT|
592 $self->{t}->{value} = '';
593 $self->{t}->{type} = DIMENSION_TOKEN;
594 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
595 $self->{c} = $self->{get_char}->();
596 redo A;
597 } elsif ($self->{c} == 0x0025) { # %
598 $self->{t}->{type} = PERCENTAGE_TOKEN;
599 $self->{state} = BEFORE_TOKEN_STATE;
600 $self->{c} = $self->{get_char}->();
601 return $self->{t};
602 #redo A;
603 } else {
604 $self->{state} = BEFORE_TOKEN_STATE;
605 # reprocess
606 return $self->{t};
607 #redo A;
608 }
609 } elsif ($self->{state} == HASH_OPEN_STATE) {
610 ## NOTE: The first |nmchar| in |name| in |HASH|.
611 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
612 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
613 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
614 $self->{c} == 0x002D or # -
615 $self->{c} == 0x005F or # _
616 $self->{c} > 0x007F) { # nonascii
617 $self->{t}->{value} .= chr $self->{c};
618 $self->{state} = NAME_STATE;
619 $self->{c} = $self->{get_char}->();
620 redo A;
621 } elsif ($self->{c} == 0x005C) { # \
622 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
623 $self->{c} = $self->{get_char}->();
624 redo A;
625 } else {
626 $self->{state} = BEFORE_TOKEN_STATE;
627 # reprocess
628 return {type => DELIM_TOKEN, value => '#'};
629 #redo A;
630 }
631 } elsif ($self->{state} == NAME_STATE) {
632 ## NOTE: |nmchar| in (|ident| or |name|).
633 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
634 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
635 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
636 $self->{c} == 0x005F or # _
637 $self->{c} == 0x002D or # -
638 $self->{c} > 0x007F) { # nonascii
639 $self->{t}->{value} .= chr $self->{c};
640 # stay in the state
641 $self->{c} = $self->{get_char}->();
642 redo A;
643 } elsif ($self->{c} == 0x005C) { # \
644 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
645 $self->{c} = $self->{get_char}->();
646 redo A;
647 } elsif ($self->{c} == 0x0028 and # (
648 $self->{t}->{type} == IDENT_TOKEN) { # (
649 my $func_name = $self->{t}->{value};
650 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
651 if ($func_name eq 'url' or $func_name eq 'url-prefix') {
652 if ($self->{t}->{has_escape}) {
653 ## TODO: warn
654 }
655 $self->{t}->{type}
656 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
657 $self->{t}->{value} = '';
658 $self->{state} = URI_BEFORE_WSP_STATE;
659 $self->{c} = $self->{get_char}->();
660 redo A;
661 } else {
662 $self->{t}->{type} = FUNCTION_TOKEN;
663 $self->{state} = BEFORE_TOKEN_STATE;
664 $self->{c} = $self->{get_char}->();
665 return $self->{t};
666 #redo A;
667 }
668 } else {
669 $self->{state} = BEFORE_TOKEN_STATE;
670 # reconsume
671 return $self->{t};
672 #redo A;
673 }
674 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
675 while ({
676 0x0020 => 1, # SP
677 0x0009 => 1, # \t
678 0x000D => 1, # \r
679 0x000A => 1, # \n
680 0x000C => 1, # \f
681 }->{$self->{c}}) {
682 $self->{c} = $self->{get_char}->();
683 }
684 if ($self->{c} == -1) {
685 $self->{t}->{type} = {
686 URI_TOKEN, URI_INVALID_TOKEN,
687 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690 }->{$self->{t}->{type}};
691 $self->{state} = BEFORE_TOKEN_STATE;
692 $self->{c} = $self->{get_char}->();
693 return $self->{t};
694 #redo A;
695 } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
696 ## TODO: Should we consider matches of "(" and ")"?
697 $self->{t}->{type} = {
698 URI_TOKEN, URI_INVALID_TOKEN,
699 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
700 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
701 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
702 }->{$self->{t}->{type}};
703 $self->{state} = URI_UNQUOTED_STATE;
704 $self->{c} = $self->{get_char}->();
705 redo A;
706 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
707 $self->{state} = STRING_STATE; $q = $self->{c};
708 $self->{c} = $self->{get_char}->();
709 redo A;
710 } elsif ($self->{c} == 0x0029) { # )
711 $self->{state} = BEFORE_TOKEN_STATE;
712 $self->{c} = $self->{get_char}->();
713 return $self->{t};
714 #redo A;
715 } elsif ($self->{c} == 0x005C) { # \
716 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
717 $self->{c} = $self->{get_char}->();
718 redo A;
719 } else {
720 $self->{t}->{value} .= chr $self->{c};
721 $self->{state} = URI_UNQUOTED_STATE;
722 $self->{c} = $self->{get_char}->();
723 redo A;
724 }
725 } elsif ($self->{state} == URI_UNQUOTED_STATE) {
726 if ({
727 0x0020 => 1, # SP
728 0x0009 => 1, # \t
729 0x000D => 1, # \r
730 0x000A => 1, # \n
731 0x000C => 1, # \f
732 }->{$self->{c}}) {
733 $self->{state} = URI_AFTER_WSP_STATE;
734 $self->{c} = $self->{get_char}->();
735 redo A;
736 } elsif ($self->{c} == -1) {
737 $self->{t}->{type} = {
738 URI_TOKEN, URI_INVALID_TOKEN,
739 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
740 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
741 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
742 }->{$self->{t}->{type}};
743 $self->{state} = BEFORE_TOKEN_STATE;
744 $self->{c} = $self->{get_char}->();
745 return $self->{t};
746 #redo A;
747 } elsif ($self->{c} < 0x0020 or {
748 0x0022 => 1, # "
749 0x0027 => 1, # '
750 0x0028 => 1, # (
751 }->{$self->{c}}) { # C0 or (
752 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
753 $self->{t}->{type} = {
754 URI_TOKEN, URI_INVALID_TOKEN,
755 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
756 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
757 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
758 }->{$self->{t}->{type}};
759 # stay in the state.
760 $self->{c} = $self->{get_char}->();
761 redo A;
762 } elsif ($self->{c} == 0x0029) { # )
763 $self->{state} = BEFORE_TOKEN_STATE;
764 $self->{c} = $self->{get_char}->();
765 return $self->{t};
766 #redo A;
767 } elsif ($self->{c} == 0x005C) { # \
768 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
769 $self->{c} = $self->{get_char}->();
770 redo A;
771 } else {
772 $self->{t}->{value} .= chr $self->{c};
773 # stay in the state.
774 $self->{c} = $self->{get_char}->();
775 redo A;
776 }
777 } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
778 if ({
779 0x0020 => 1, # SP
780 0x0009 => 1, # \t
781 0x000D => 1, # \r
782 0x000A => 1, # \n
783 0x000C => 1, # \f
784 }->{$self->{c}}) {
785 # stay in the state.
786 $self->{c} = $self->{get_char}->();
787 redo A;
788 } elsif ($self->{c} == -1) {
789 $self->{t}->{type} = {
790 URI_TOKEN, URI_INVALID_TOKEN,
791 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
792 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
793 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
794 }->{$self->{t}->{type}};
795 $self->{state} = BEFORE_TOKEN_STATE;
796 $self->{c} = $self->{get_char}->();
797 return $self->{t};
798 #redo A;
799 } elsif ($self->{c} == 0x0029) { # )
800 $self->{state} = BEFORE_TOKEN_STATE;
801 $self->{c} = $self->{get_char}->();
802 return $self->{t};
803 #redo A;
804 } elsif ($self->{c} == 0x005C) { # \
805 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
806 $self->{c} = $self->{get_char}->();
807 redo A;
808 } else {
809 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
810 $self->{t}->{type} = {
811 URI_TOKEN, URI_INVALID_TOKEN,
812 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815 }->{$self->{t}->{type}};
816 # stay in the state.
817 $self->{c} = $self->{get_char}->();
818 redo A;
819 }
820 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
821 $self->{t}->{has_escape} = 1;
822 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
823 ## NOTE: second character of |unicode| in |escape|.
824 $char = $self->{c} - 0x0030;
825 $self->{state} = ESCAPE_STATE; $i = 2;
826 $self->{c} = $self->{get_char}->();
827 redo A;
828 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
829 ## NOTE: second character of |unicode| in |escape|.
830 $char = $self->{c} - 0x0041 + 0xA;
831 $self->{state} = ESCAPE_STATE; $i = 2;
832 $self->{c} = $self->{get_char}->();
833 redo A;
834 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
835 ## NOTE: second character of |unicode| in |escape|.
836 $char = $self->{c} - 0x0061 + 0xA;
837 $self->{state} = ESCAPE_STATE; $i = 2;
838 $self->{c} = $self->{get_char}->();
839 redo A;
840 } elsif ($self->{c} == 0x000A or # \n
841 $self->{c} == 0x000C) { # \f
842 if ($q == 0) {
843 #
844 } elsif ($q == 1) {
845 ## NOTE: In |escape| in |URI|.
846 $self->{t}->{type} = {
847 URI_TOKEN, URI_INVALID_TOKEN,
848 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
849 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
850 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
851 }->{$self->{t}->{type}};
852 $self->{t}->{value} .= chr $self->{c};
853 $self->{state} = URI_UNQUOTED_STATE;
854 $self->{c} = $self->{get_char}->();
855 redo A;
856 } else {
857 ## Note: In |nl| in ... in |string| or |ident|.
858 $self->{state} = STRING_STATE;
859 $self->{c} = $self->{get_char}->();
860 redo A;
861 }
862 } elsif ($self->{c} == 0x000D) { # \r
863 if ($q == 0) {
864 #
865 } elsif ($q == 1) {
866 ## NOTE: In |escape| in |URI|.
867 $self->{t}->{type} = {
868 URI_TOKEN, URI_INVALID_TOKEN,
869 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872 }->{$self->{t}->{type}};
873 $self->{state} = ESCAPE_BEFORE_LF_STATE;
874 $self->{c} = $self->{get_char}->();
875 redo A;
876 } else {
877 ## Note: In |nl| in ... in |string| or |ident|.
878 $self->{state} = ESCAPE_BEFORE_LF_STATE;
879 $self->{c} = $self->{get_char}->();
880 redo A;
881 }
882 } elsif ($self->{c} == -1) {
883 #
884 } else {
885 ## NOTE: second character of |escape|.
886 $self->{t}->{value} .= chr $self->{c};
887 $self->{state} = $q == 0 ? NAME_STATE :
888 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
889 $self->{c} = $self->{get_char}->();
890 redo A;
891 }
892
893 if ($q == 0) {
894 if ($self->{t}->{type} == DIMENSION_TOKEN) {
895 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
896 $self->{state} = BEFORE_TOKEN_STATE;
897 # reprocess
898 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899 unshift @{$self->{token}}, {type => MINUS_TOKEN};
900 $self->{t}->{type} = NUMBER_TOKEN;
901 $self->{t}->{value} = '';
902 return $self->{t};
903 #redo A;
904 } elsif (length $self->{t}->{value}) {
905 $self->{state} = BEFORE_TOKEN_STATE;
906 # reprocess
907 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908 return $self->{t};
909 #redo A;
910 } else {
911 $self->{state} = BEFORE_TOKEN_STATE;
912 # reprocess
913 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914 $self->{t}->{type} = NUMBER_TOKEN;
915 $self->{t}->{value} = '';
916 return $self->{t};
917 #redo A;
918 }
919 } else {
920 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
921 $self->{state} = BEFORE_TOKEN_STATE;
922 # reprocess
923 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924 return {type => MINUS_TOKEN};
925 #redo A;
926 } elsif (length $self->{t}->{value}) {
927 $self->{state} = BEFORE_TOKEN_STATE;
928 # reprocess
929 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
930 return $self->{t};
931 #redo A;
932 } else {
933 $self->{state} = BEFORE_TOKEN_STATE;
934 # reprocess
935 return {type => DELIM_TOKEN, value => '\\'};
936 #redo A;
937 }
938 }
939 } elsif ($q == 1) {
940 $self->{state} = URI_UNQUOTED_STATE;
941 $self->{c} = $self->{get_char}->();
942 redo A;
943 } else {
944 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
945 $self->{t}->{type} = {
946 STRING_TOKEN, INVALID_TOKEN,
947 URI_TOKEN, URI_INVALID_TOKEN,
948 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
949 }->{$self->{t}->{type}} || $self->{t}->{type};
950 $self->{state} = BEFORE_TOKEN_STATE;
951 # reprocess
952 return $self->{t};
953 #redo A;
954 }
955 } elsif ($self->{state} == ESCAPE_STATE) {
956 ## NOTE: third..seventh character of |unicode| in |escape|.
957 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
958 $char = $char * 0x10 + $self->{c} - 0x0030;
959 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
960 $self->{c} = $self->{get_char}->();
961 redo A;
962 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
963 $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
964 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
965 $self->{c} = $self->{get_char}->();
966 redo A;
967 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
968 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
969 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
970 $self->{c} = $self->{get_char}->();
971 redo A;
972 } elsif ($self->{c} == 0x0020 or # SP
973 $self->{c} == 0x000A or # \n
974 $self->{c} == 0x0009 or # \t
975 $self->{c} == 0x000C) { # \f
976 $self->{t}->{value} .= chr $char;
977 $self->{state} = $q == 0 ? NAME_STATE :
978 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
979 $self->{c} = $self->{get_char}->();
980 redo A;
981 } elsif ($self->{c} == 0x000D) { # \r
982 $self->{state} = ESCAPE_BEFORE_LF_STATE;
983 $self->{c} = $self->{get_char}->();
984 redo A;
985 } else {
986 $self->{t}->{value} .= chr $char;
987 $self->{state} = $q == 0 ? NAME_STATE :
988 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989 # reconsume
990 redo A;
991 }
992 } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
993 ## NOTE: eighth character of |unicode| in |escape|.
994 if ($self->{c} == 0x0020 or # SP
995 $self->{c} == 0x000A or # \n
996 $self->{c} == 0x0009 or # \t
997 $self->{c} == 0x000C) { # \f
998 $self->{t}->{value} .= chr $char;
999 $self->{state} = $q == 0 ? NAME_STATE :
1000 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1001 $self->{c} = $self->{get_char}->();
1002 redo A;
1003 } elsif ($self->{c} == 0x000D) { # \r
1004 $self->{state} = ESCAPE_BEFORE_NL_STATE;
1005 $self->{c} = $self->{get_char}->();
1006 redo A;
1007 } else {
1008 $self->{t}->{value} .= chr $char;
1009 $self->{state} = $q == 0 ? NAME_STATE :
1010 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1011 # reconsume
1012 redo A;
1013 }
1014 } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016 if ($self->{c} == 0x000A) { # \n
1017 $self->{state} = $q == 0 ? NAME_STATE :
1018 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019 $self->{c} = $self->{get_char}->();
1020 redo A;
1021 } else {
1022 $self->{state} = $q == 0 ? NAME_STATE :
1023 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1024 # reprocess
1025 redo A;
1026 }
1027 } elsif ($self->{state} == STRING_STATE) {
1028 ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1029 ## a character in |invalid$Q| in |invalid| in |INVALID|,
1030 ## where |$Q = $q == 0x0022 ? 1 : 2|.
1031 ## Or, in |URI|.
1032 if ($self->{c} == 0x005C) { # \
1033 $self->{state} = ESCAPE_OPEN_STATE;
1034 $self->{c} = $self->{get_char}->();
1035 redo A;
1036 } elsif ($self->{c} == $q) { # " | '
1037 if ($self->{t}->{type} == STRING_TOKEN) {
1038 $self->{state} = BEFORE_TOKEN_STATE;
1039 $self->{c} = $self->{get_char}->();
1040 return $self->{t};
1041 #redo A;
1042 } else {
1043 $self->{state} = URI_AFTER_WSP_STATE;
1044 $self->{c} = $self->{get_char}->();
1045 redo A;
1046 }
1047 } elsif ($self->{c} == 0x000A or # \n
1048 $self->{c} == 0x000D or # \r
1049 $self->{c} == 0x000C or # \f
1050 $self->{c} == -1) {
1051 $self->{t}->{type} = {
1052 STRING_TOKEN, INVALID_TOKEN,
1053 INVALID_TOKEN, INVALID_TOKEN,
1054 URI_TOKEN, URI_INVALID_TOKEN,
1055 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1056 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1057 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1058 }->{$self->{t}->{type}};
1059 $self->{state} = BEFORE_TOKEN_STATE;
1060 # reconsume
1061 return $self->{t};
1062 #redo A;
1063 } else {
1064 $self->{t}->{value} .= chr $self->{c};
1065 # stay in the state
1066 $self->{c} = $self->{get_char}->();
1067 redo A;
1068 }
1069 } elsif ($self->{state} == NUMBER_STATE) {
1070 ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1071 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1072 $self->{t}->{value} .= chr $self->{c};
1073 # stay in the state
1074 $self->{c} = $self->{get_char}->();
1075 redo A;
1076 } elsif ($self->{c} == 0x002E) { # .
1077 $self->{state} = NUMBER_DOT_STATE;
1078 $self->{c} = $self->{get_char}->();
1079 redo A;
1080 } else {
1081 $self->{t}->{number} = $self->{t}->{value};
1082 $self->{t}->{value} = '';
1083 $self->{state} = AFTER_NUMBER_STATE;
1084 # reprocess
1085 redo A;
1086 }
1087 } elsif ($self->{state} == NUMBER_DOT_STATE) {
1088 ## NOTE: The character immediately following |.| in |num|.
1089 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1090 $self->{t}->{value} .= '.' . chr $self->{c};
1091 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1092 $self->{c} = $self->{get_char}->();
1093 redo A;
1094 } else {
1095 unshift @{$self->{token}}, {type => DOT_TOKEN};
1096 $self->{t}->{number} = $self->{t}->{value};
1097 $self->{t}->{value} = '';
1098 $self->{state} = BEFORE_TOKEN_STATE;
1099 # reprocess
1100 return $self->{t};
1101 #redo A;
1102 }
1103 } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1104 ## NOTE: The character immediately following |.| at the beginning of |num|.
1105 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1106 $self->{t}->{value} .= '.' . chr $self->{c};
1107 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1108 $self->{c} = $self->{get_char}->();
1109 redo A;
1110 } else {
1111 $self->{state} = BEFORE_TOKEN_STATE;
1112 # reprocess
1113 return {type => DOT_TOKEN};
1114 #redo A;
1115 }
1116 } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1117 ## NOTE: |[0-9]| in |num| after |.|.
1118 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1119 $self->{t}->{value} .= chr $self->{c};
1120 # stay in the state
1121 $self->{c} = $self->{get_char}->();
1122 redo A;
1123 } else {
1124 $self->{t}->{number} = $self->{t}->{value};
1125 $self->{t}->{value} = '';
1126 $self->{state} = AFTER_NUMBER_STATE;
1127 # reprocess
1128 redo A;
1129 }
1130 } else {
1131 die "$0: Unknown state |$self->{state}|";
1132 }
1133 } # A
1134 } # get_next_token
1135
1136 =head1 LICENSE
1137
1138 Copyright 2007 Wakaba <w@suika.fam.cx>
1139
1140 This library is free software; you can redistribute it
1141 and/or modify it under the same terms as Perl itself.
1142
1143 =cut
1144
1145 1;
1146 # $Date: 2007/09/30 12:03:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24