/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (show annotations) (download)
Sat Sep 8 05:57:05 2007 UTC (18 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +2 -3 lines
++ whatpm/t/ChangeLog	8 Sep 2007 05:56:49 -0000
2007-09-08  Wakaba  <wakaba@suika.fam.cx>

	* css-token-1.test, CSS-Tokenizer.t: New files.

package Whatpm::CSS::Tokenizer;
use strict;

## Whatpm::CSS::Tokenizer - a hand-written state machine that turns a
## stream of code points into CSS tokens.
##
## Usage, as far as this file shows:
##
##   my $p = Whatpm::CSS::Tokenizer->new;
##   $p->{get_char} = sub { ... };  # returns the next code point as an
##                                  # integer, or -1 at the end of input
##   $p->init;
##   my $token = $p->get_next_token;   # repeat until EOF_TOKEN
##
## A token is a hash reference with a |type| (one of the |*_TOKEN|
## constants below) and, depending on the type, a |value|, a |number|
## (the textual numeric part of NUMBER, DIMENSION, and PERCENTAGE) and
## a |has_escape| flag (set when a backslash escape was seen in it).

## ------ Tokenizer states ------

sub BEFORE_TOKEN_STATE () { 0 }
sub BEFORE_NMSTART_STATE () { 1 }
sub NAME_STATE () { 2 }
sub ESCAPE_OPEN_STATE () { 3 }
sub STRING_STATE () { 4 }
sub HASH_OPEN_STATE () { 5 }
sub NUMBER_STATE () { 6 }
sub NUMBER_FRACTION_STATE () { 7 }
sub AFTER_NUMBER_STATE () { 8 }
sub URI_BEFORE_WSP_STATE () { 9 }
sub ESCAPE_STATE () { 10 }
sub ESCAPE_BEFORE_LF_STATE () { 11 }
sub ESCAPE_BEFORE_NL_STATE () { 12 }
sub NUMBER_DOT_STATE () { 13 }
sub NUMBER_DOT_NUMBER_STATE () { 14 }
sub DELIM_STATE () { 15 }
sub URI_UNQUOTED_STATE () { 16 }
sub URI_AFTER_WSP_STATE () { 17 }
sub AFTER_AT_STATE () { 18 }
sub AFTER_AT_HYPHEN_STATE () { 19 }

## ------ Token types ------
## NOTE: 15 is intentionally unassigned; |@TokenName| has a placeholder
## at that index.

sub IDENT_TOKEN () { 1 }
sub ATKEYWORD_TOKEN () { 2 }
sub HASH_TOKEN () { 3 }
sub FUNCTION_TOKEN () { 4 }
sub URI_TOKEN () { 5 }
sub URI_INVALID_TOKEN () { 6 }
sub URI_PREFIX_TOKEN () { 7 }
sub URI_PREFIX_INVALID_TOKEN () { 8 }
sub STRING_TOKEN () { 9 }
sub INVALID_TOKEN () { 10 }
sub NUMBER_TOKEN () { 11 }
sub DIMENSION_TOKEN () { 12 }
sub PERCENTAGE_TOKEN () { 13 }
sub UNICODE_RANGE_TOKEN () { 14 }
sub DELIM_TOKEN () { 16 }
sub PLUS_TOKEN () { 17 }
sub GREATER_TOKEN () { 18 }
sub COMMA_TOKEN () { 19 }
sub TILDE_TOKEN () { 20 }
sub DASHMATCH_TOKEN () { 21 }
sub PREFIXMATCH_TOKEN () { 22 }
sub SUFFIXMATCH_TOKEN () { 23 }
sub SUBSTRINGMATCH_TOKEN () { 24 }
sub INCLUDES_TOKEN () { 25 }
sub SEMICOLON_TOKEN () { 26 }
sub LBRACE_TOKEN () { 27 }
sub RBRACE_TOKEN () { 28 }
sub LPAREN_TOKEN () { 29 }
sub RPAREN_TOKEN () { 30 }
sub LBRACKET_TOKEN () { 31 }
sub RBRACKET_TOKEN () { 32 }
sub S_TOKEN () { 33 }
sub CDO_TOKEN () { 34 }
sub CDC_TOKEN () { 35 }
sub COMMENT_TOKEN () { 36 }
sub COMMENT_INVALID_TOKEN () { 37 }
sub EOF_TOKEN () { 38 }

## Token type number -> printable name.  Indexes 0 and 15 are not token
## types and hold the placeholder |0|.
our @TokenName = qw(
  0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
  PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
  LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
  COMMENT_INVALID EOF
);

## ------ Lookup tables (built once, keyed by code point) ------

## White space characters.
my %IsWhiteSpace = (
  0x0020 => 1, # SP
  0x0009 => 1, # \t
  0x000D => 1, # \r
  0x000A => 1, # \n
  0x000C => 1, # \f
);

## Characters that are a complete token on their own.
my %PunctuationTokenOf = (
  0x003B => SEMICOLON_TOKEN, # ;
  0x007B => LBRACE_TOKEN,    # {
  0x007D => RBRACE_TOKEN,    # }
  0x0028 => LPAREN_TOKEN,    # (
  0x0029 => RPAREN_TOKEN,    # )
  0x005B => LBRACKET_TOKEN,  # [
  0x005D => RBRACKET_TOKEN,  # ]
);

## Characters that absorb white space preceding them.
my %CombinatorTokenOf = (
  0x002B => PLUS_TOKEN,    # +
  0x003E => GREATER_TOKEN, # >
  0x002C => COMMA_TOKEN,   # ,
  0x007E => TILDE_TOKEN,   # ~
);

## Characters that form a two-character token when followed by |=|.
my %MatchTokenOf = (
  0x007C => DASHMATCH_TOKEN,      # |
  0x005E => PREFIXMATCH_TOKEN,    # ^
  0x0024 => SUFFIXMATCH_TOKEN,    # $
  0x002A => SUBSTRINGMATCH_TOKEN, # *
);

## URI-ish token type -> its "invalid" counterpart (idempotent).
my %UriInvalidTypeOf = (
  URI_TOKEN, URI_INVALID_TOKEN,
  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
);

## ------ Private character-class helpers ------

## True if the code point is |nmstart| without escapes: A-Z, a-z, |_|,
## or non-ASCII.
sub _is_nmstart {
  my $c = shift;
  return ((0x0041 <= $c and $c <= 0x005A) or # A..Z
          (0x0061 <= $c and $c <= 0x007A) or # a..z
          $c == 0x005F or                    # _
          $c > 0x007F);                      # nonascii
} # _is_nmstart

## True if the code point is |nmchar| without escapes: |nmstart|, 0-9,
## or |-|.
sub _is_nmchar {
  my $c = shift;
  return (_is_nmstart($c) or
          (0x0030 <= $c and $c <= 0x0039) or # 0..9
          $c == 0x002D);                     # -
} # _is_nmchar

## Returns the value (0..15) of a hexadecimal digit code point, or
## |undef| if the code point is not a hexadecimal digit.  Always call
## in scalar context.
sub _hex_value {
  my $c = shift;
  return $c - 0x0030 if 0x0030 <= $c and $c <= 0x0039;       # 0..9
  return $c - 0x0041 + 0xA if 0x0041 <= $c and $c <= 0x0046; # A..F
  return $c - 0x0061 + 0xA if 0x0061 <= $c and $c <= 0x0066; # a..f
  return;
} # _hex_value

## ------ Public interface ------

## Creates a tokenizer.  The default |get_char| always reports the end
## of input (-1); replace it before calling |init|.  |onerror| is stored
## but never invoked by this file.
sub new ($) {
  my $class = shift;
  return bless {
    token => [],
    get_char => sub { -1 },
    onerror => sub { },
  }, $class;
} # new

## Resets the state machine and reads the first character of the input.
sub init ($) {
  my $self = shift;
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $self->{get_char}->();
  #$self->{t} = {type => token-type, value => value, number => number};
} # init

## Returns the next token (a hash reference).  Tokens queued by an
## earlier call are returned first.  At the end of input an |EOF_TOKEN|
## is returned.  Dies if |$self->{state}| is not a known state.
##
## Conventions used below: "reprocess"/"reconsume" means that the
## current character |$self->{c}| is deliberately NOT advanced, so that
## it is examined again (by the next state or by the next call).
sub get_next_token ($) {
  my $self = shift;
  if (@{$self->{token}}) {
    return shift @{$self->{token}};
  }

  my $char; # Code point accumulated from |unicode| in |escape|.
  my $i; # |$i + 1|th character in |unicode| in |escape|.
  my $q;
      ## NOTE:
      ##   0: in |ident|.
      ##   1: in |URI| outside of |string|.
      ##   0x0022: in |string1| or |invalid1|.
      ##   0x0027: in |string2| or |invalid2|.

  A: {
    if ($self->{state} == BEFORE_TOKEN_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|
        $self->{t} = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
        ## Either the start of |UNICODE-RANGE| (|U+...|) or an ordinary
        ## identifier beginning with |U|.
        $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x002B) { # +
          $self->{c} = $self->{get_char}->();
          if (defined(_hex_value($self->{c})) or
              $self->{c} == 0x003F) { # ?
            $self->{t}->{value} .= '+' . chr $self->{c};
            $self->{t}->{type} = UNICODE_RANGE_TOKEN;
            $self->{c} = $self->{get_char}->();
            ## Up to five more hexadecimal digits or |?|.
            C: for (2..6) {
              if (defined(_hex_value($self->{c})) or
                  $self->{c} == 0x003F) { # ?
                $self->{t}->{value} .= chr $self->{c};
                $self->{c} = $self->{get_char}->();
              } else {
                last C;
              }
            } # C

            if ($self->{c} == 0x002D) { # -
              $self->{c} = $self->{get_char}->();
              if (defined(_hex_value($self->{c}))) {
                $self->{t}->{value} .= '-' . chr $self->{c};
                $self->{c} = $self->{get_char}->();
                ## Up to five more hexadecimal digits.
                C: for (2..6) {
                  if (defined(_hex_value($self->{c}))) {
                    $self->{t}->{value} .= chr $self->{c};
                    $self->{c} = $self->{get_char}->();
                  } else {
                    last C;
                  }
                } # C

                #
              } else {
                ## The |-| does not belong to the range; it starts a
                ## new token.
                my $token = $self->{t};
                $self->{t} = {type => IDENT_TOKEN, value => '-'};
                $self->{state} = BEFORE_NMSTART_STATE;
                # reprocess
                return $token;
                #redo A;
              }
            }

            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return $self->{t};
            #redo A;
          } else {
            ## |U| followed by a |+| that does not start a range.
            unshift @{$self->{token}}, {type => PLUS_TOKEN};
            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return $self->{t};
            #redo A;
          }
        } else {
          $self->{state} = NAME_STATE;
          # reprocess
          redo A;
        }
      } elsif (_is_nmstart($self->{c})) {
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $self->{t} = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = AFTER_AT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $self->{t} = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $self->{t} = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
        ## NOTE: |num|.
        $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
        $self->{state} = NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        ## NOTE: |num|.  The implied integer part is |0|.
        $self->{t} = {type => NUMBER_TOKEN, value => '0'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002F) { # /
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x002A) { # *
          ## Skip a comment.  No token is emitted for a closed comment.
          C: {
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002A) { # *
              D: {
                $self->{c} = $self->{get_char}->();
                if ($self->{c} == 0x002F) { # /
                  #
                } elsif ($self->{c} == 0x002A) { # *
                  redo D;
                } else {
                  redo C;
                }
              } # D
            } elsif ($self->{c} == -1) {
              # stay in the state
              # reprocess
              return {type => COMMENT_INVALID_TOKEN};
              #redo A;
            } else {
              redo C;
            }
          } # C

          # stay in the state.
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          # stay in the state.
          # reprocess
          return {type => DELIM_TOKEN, value => '/'};
          #redo A;
        }
      } elsif ($self->{c} == 0x003C) { # <
        ## NOTE: |CDO| (|<!--|)
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x0021) { # !
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x002D) { # -
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002D) { # -
              $self->{state} = BEFORE_TOKEN_STATE;
              $self->{c} = $self->{get_char}->();
              return {type => CDO_TOKEN};
              #redo A;
            } else {
              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
              ## NOTE: |-| in |ident| in |IDENT|
              $self->{t} = {type => IDENT_TOKEN, value => '-'};
              $self->{state} = BEFORE_NMSTART_STATE;
              #reprocess
              return {type => DELIM_TOKEN, value => '<'};
              #redo A;
            }
          } else {
            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
            $self->{state} = BEFORE_TOKEN_STATE;
            #reprocess
            return {type => DELIM_TOKEN, value => '<'};
            #redo A;
          }
        } else {
          $self->{state} = BEFORE_TOKEN_STATE;
          #reprocess
          return {type => DELIM_TOKEN, value => '<'};
          #redo A;
        }
      } elsif (my $punctuation = $PunctuationTokenOf{$self->{c}}) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $punctuation};
        # redo A;
      } elsif ($IsWhiteSpace{$self->{c}}) {
        ## A run of white space is one |S| token, unless it is
        ## immediately followed by |+|, |>|, |,|, or |~|, in which case
        ## only that token is returned.
        W: {
          $self->{c} = $self->{get_char}->();
          if ($IsWhiteSpace{$self->{c}}) {
            redo W;
          } elsif (my $combinator = $CombinatorTokenOf{$self->{c}}) {
            # stay in the state
            $self->{c} = $self->{get_char}->();
            return {type => $combinator};
            #redo A;
          } else {
            # stay in the state
            # reprocess
            return {type => S_TOKEN};
            #redo A;
          }
        } # W
      } elsif (my $match = $MatchTokenOf{$self->{c}}) {
        my $c = $self->{c};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $match};
          #redo A;
        } else {
          # stay in the state
          # reprocess
          return {type => DELIM_TOKEN, value => chr $c};
          #redo A;
        }
      } elsif ($self->{c} == 0x007E) { # ~
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => INCLUDES_TOKEN};
          #redo A;
        } else {
          # stay in the state
          # reprocess
          return {type => TILDE_TOKEN};
          #redo A;
        }
      } elsif (my $single = $CombinatorTokenOf{$self->{c}}) { # + > ,
        ## NOTE: |~| never reaches here; it is handled just above.
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $single};
        #redo A;
      } elsif ($self->{c} == -1) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => EOF_TOKEN};
        #redo A;
      } else {
        # stay in the state
        $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      }
    } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
      ## |FUNCTION|)
      if (_is_nmstart($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{t}->{type} = DIMENSION_TOKEN
            if $self->{t}->{type} == NUMBER_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: 12-\X, 12-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D and # -
               $self->{t}->{type} == IDENT_TOKEN) {
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
          #redo A;
        } else {
          ## NOTE: |-|, |-|, $self->{c}
          ## The first |-| is returned as a delimiter; |$self->{t}|
          ## (still |-|) now stands for the second one.
          #$self->{t} = {type => IDENT_TOKEN, value => '-'};
          # stay in the state
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
          #redo A;
        }
      } else {
        if ($self->{t}->{type} == NUMBER_TOKEN) {
          ## NOTE: |-| after |NUMBER|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          ## NOTE(review): here the number ends up in |value|, while the
          ## other |NUMBER| paths leave it in |number| - confirm which
          ## one callers expect.
          $self->{t}->{value} = $self->{t}->{number};
          delete $self->{t}->{number};
          return $self->{t};
        } else {
          ## NOTE: |-| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess: the character after |-| starts the next token
          # and must not be discarded.
          return {type => DELIM_TOKEN, value => '-'};
        }
      }
    } elsif ($self->{state} == AFTER_AT_STATE) {
      if (_is_nmstart($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $self->{t}->{value} .= '-';
        $self->{state} = AFTER_AT_HYPHEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
      if (_is_nmstart($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          ## |@| followed by |-->|.
          unshift @{$self->{token}}, {type => CDC_TOKEN};
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => DELIM_TOKEN, value => '@'};
          #redo A;
        } else {
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{t} = {type => IDENT_TOKEN, value => '-'};
          $self->{state} = BEFORE_NMSTART_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '@'};
          #redo A;
        }
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: @-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_NUMBER_STATE) {
      ## NOTE: At this point the numeric text is in |number| and
      ## |value| is empty; |value| collects the unit, if any.
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $self->{t}->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart($self->{c})) {
        ## NOTE: |nmstart| in |ident|.
        $self->{t}->{value} = chr $self->{c};
        $self->{t}->{type} = DIMENSION_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $self->{t}->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0025) { # %
        $self->{t}->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $self->{t};
        #redo A;
      }
    } elsif ($self->{state} == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if (_is_nmchar($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess: the character after |#| starts the next token and
        # must not be discarded.
        return {type => DELIM_TOKEN, value => '#'};
        #redo A;
      }
    } elsif ($self->{state} == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if (_is_nmchar($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0028 and # (
               $self->{t}->{type} == IDENT_TOKEN) { # (
        my $func_name = $self->{t}->{value};
        $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
        if ($func_name eq 'url' or $func_name eq 'url-prefix') {
          if ($self->{t}->{has_escape}) {
            ## TODO: warn
          }
          $self->{t}->{type}
              = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
          $self->{t}->{value} = '';
          $self->{state} = URI_BEFORE_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          $self->{t}->{type} = FUNCTION_TOKEN;
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $self->{t};
          #redo A;
        }
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $self->{t};
        #redo A;
      }
    } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
      ## Skip white space after |url(|.
      while ($IsWhiteSpace{$self->{c}}) {
        $self->{c} = $self->{get_char}->();
      }
      if ($self->{c} == -1) {
        $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
        ## TODO: Should we consider matches of "(" and ")"?
        $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_UNQUOTED_STATE) {
      if ($IsWhiteSpace{$self->{c}}) {
        $self->{state} = URI_AFTER_WSP_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } elsif ($self->{c} < 0x0020 or
               $self->{c} == 0x0022 or # "
               $self->{c} == 0x0027 or # '
               $self->{c} == 0x0028) { # C0 or (
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
      if ($IsWhiteSpace{$self->{c}}) {
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
        #redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## Anything but |)| after the white space makes the URI invalid.
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
      ## The character immediately after |\|.
      $self->{t}->{has_escape} = 1;
      my $digit = _hex_value($self->{c});
      if (defined $digit) {
        ## NOTE: second character of |unicode| in |escape|.
        $char = $digit;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000C) { # \f
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $self->{t};
          # reconsume
          #redo A;
        } elsif ($q == 1) {
          ## NOTE: In |escape| in |URI|.
          $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
          $self->{t}->{value} .= chr $self->{c};
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## Note: In |nl| in ... in |string| or |ident|.
          $self->{t}->{value} .= chr $self->{c};
          $self->{state} = STRING_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000D) { # \r
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $self->{t};
          # reconsume
          #redo A;
        } elsif ($q == 1) {
          $self->{t}->{type} = $UriInvalidTypeOf{$self->{t}->{type}};
          $self->{t}->{value} .= "\x0D\x0A";
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## Note: In |nl| in ... in |string| or |ident|.
          ## The newline has already been recorded as CR LF, so an LF
          ## directly after the CR is only skipped.  This must not go
          ## through |ESCAPE_BEFORE_LF_STATE|: that state appends
          ## |chr $char|, and no code point has been collected here.
          $self->{t}->{value} .= "\x0D\x0A";
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x000A) { # \n
            $self->{c} = $self->{get_char}->();
          }
          $self->{state} = STRING_STATE;
          redo A;
        }
      } else {
        ## NOTE: second character of |escape|.
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|.
      my $digit = _hex_value($self->{c});
      if (defined $digit) {
        $char = $char * 0x10 + $digit;
        ## After the sixth hexadecimal digit only the optional
        ## terminating white space may follow.
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0020 or # SP
               $self->{c} == 0x000A or # \n
               $self->{c} == 0x0009 or # \t
               $self->{c} == 0x000C) { # \f
        ## One white space character terminates the escape.
        $self->{t}->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: eighth character of |unicode| in |escape|.
      if ($self->{c} == 0x0020 or # SP
          $self->{c} == 0x000A or # \n
          $self->{c} == 0x0009 or # \t
          $self->{c} == 0x000C) { # \f
        $self->{t}->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        ## Only an LF may still belong to the escape after the CR.
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
      if ($self->{c} == 0x000A) { # \n
        $self->{t}->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE :
            $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == STRING_STATE) {
      ## NOTE: A character in |string$Q| in |string| in |STRING|, or
      ## a character in |invalid$Q| in |invalid| in |INVALID|,
      ## where |$Q = $q == 0x0022 ? 1 : 2|.
      ## Or, in |URI|.
      if ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == $q) { # " | '
        if ($self->{t}->{type} == STRING_TOKEN) {
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $self->{t};
          #redo A;
        } else {
          ## A quoted string inside |url(...)|; |)| is still required.
          $self->{state} = URI_AFTER_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000D or # \r
               $self->{c} == 0x000C or # \f
               $self->{c} == -1) {
        ## Unescaped newline or end of input: unterminated string.
        $self->{t}->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $self->{t};
        #redo A;
      } else {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        $self->{state} = NUMBER_DOT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## Move the digits to |number|; |value| will hold the unit.
        $self->{t}->{number} = $self->{t}->{value};
        $self->{t}->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $self->{t}->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## The |.| is not part of the number: emit the number now and
        ## queue the |.| as a delimiter.
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
        $self->{t}->{number} = $self->{t}->{value};
        $self->{t}->{value} = '';
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $self->{t};
        #redo A;
      }
    } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning of |num|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $self->{t}->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess: the character after |.| starts the next token and
        # must not be discarded.
        return {type => DELIM_TOKEN, value => '.'};
        #redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{number} = $self->{t}->{value};
        $self->{t}->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A
} # get_next_token

1;
1011 # $Date: 2007/09/08 03:25:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24