/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (show annotations) (download)
Sat Sep 8 03:25:05 2007 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +166 -100 lines
++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 03:25:00 -0000
	* Tokenizer.pm: |UNICODE-RANGE| is implemented.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::CSS::Tokenizer;
use strict;

## CSS tokenizer implemented as a character-driven state machine.
##
## Usage (as far as this file shows):
##
##   my $tokenizer = Whatpm::CSS::Tokenizer->new;
##   $tokenizer->{get_char} = sub { ... };  # returns a code point, or -1 at EOF
##   $tokenizer->init;
##   my $token = $tokenizer->get_next_token;   # repeat until |EOF_TOKEN|
##
## A token is a hash reference |{type => ..., value => ..., number => ...}|;
## |value| and |number| are only present for token types that carry them.

## ------ Tokenizer states ------

sub BEFORE_TOKEN_STATE () { 0 }
sub BEFORE_NMSTART_STATE () { 1 }
sub NAME_STATE () { 2 }
sub ESCAPE_OPEN_STATE () { 3 }
sub STRING_STATE () { 4 }
sub HASH_OPEN_STATE () { 5 }
sub NUMBER_STATE () { 6 }
sub NUMBER_FRACTION_STATE () { 7 }
sub AFTER_NUMBER_STATE () { 8 }
sub URI_BEFORE_WSP_STATE () { 9 }
sub ESCAPE_STATE () { 10 }
sub ESCAPE_BEFORE_LF_STATE () { 11 }
sub ESCAPE_BEFORE_NL_STATE () { 12 }
sub NUMBER_DOT_STATE () { 13 }
sub NUMBER_DOT_NUMBER_STATE () { 14 }
sub DELIM_STATE () { 15 }
sub URI_UNQUOTED_STATE () { 16 }
sub URI_AFTER_WSP_STATE () { 17 }
sub AFTER_AT_STATE () { 18 }
sub AFTER_AT_HYPHEN_STATE () { 19 }

## ------ Token types ------
## NOTE: The numeric values are indices into |@TokenName|.

sub IDENT_TOKEN () { 1 }
sub ATKEYWORD_TOKEN () { 2 }
sub HASH_TOKEN () { 3 }
sub FUNCTION_TOKEN () { 4 }
sub URI_TOKEN () { 5 }
sub URI_INVALID_TOKEN () { 6 }
sub URI_PREFIX_TOKEN () { 7 }
sub URI_PREFIX_INVALID_TOKEN () { 8 }
sub STRING_TOKEN () { 9 }
sub INVALID_TOKEN () { 10 }
sub NUMBER_TOKEN () { 11 }
sub DIMENSION_TOKEN () { 12 }
sub PERCENTAGE_TOKEN () { 13 }
sub UNICODE_RANGE_TOKEN () { 14 }
sub UNICODE_RANGE_INVALID_TOKEN () { 15 }
sub DELIM_TOKEN () { 16 }
sub PLUS_TOKEN () { 17 }
sub GREATER_TOKEN () { 18 }
sub COMMA_TOKEN () { 19 }
sub TILDE_TOKEN () { 20 }
sub DASHMATCH_TOKEN () { 21 }
sub PREFIXMATCH_TOKEN () { 22 }
sub SUFFIXMATCH_TOKEN () { 23 }
sub SUBSTRINGMATCH_TOKEN () { 24 }
sub INCLUDES_TOKEN () { 25 }
sub SEMICOLON_TOKEN () { 26 }
sub LBRACE_TOKEN () { 27 }
sub RBRACE_TOKEN () { 28 }
sub LPAREN_TOKEN () { 29 }
sub RPAREN_TOKEN () { 30 }
sub LBRACKET_TOKEN () { 31 }
sub RBRACKET_TOKEN () { 32 }
sub S_TOKEN () { 33 }
sub CDO_TOKEN () { 34 }
sub CDC_TOKEN () { 35 }
sub COMMENT_TOKEN () { 36 }
sub COMMENT_INVALID_TOKEN () { 37 }
sub EOF_TOKEN () { 38 }

## Human-readable token names, indexed by token type (index 0 is unused).
our @TokenName = qw(
  0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH
  PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
  LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
  COMMENT_INVALID EOF
);

## ------ Lookup tables ------
## NOTE: These are built once instead of once per input character.

## White space characters: SP, \t, \r, \n, \f.
my %IS_WHITE_SPACE
    = map { ($_ => 1) } (0x0020, 0x0009, 0x000D, 0x000A, 0x000C);

## Single-character tokens that carry no value.
my %PUNCTUATION_TYPE_OF = (
  0x003B => SEMICOLON_TOKEN, # ;
  0x007B => LBRACE_TOKEN,    # {
  0x007D => RBRACE_TOKEN,    # }
  0x0028 => LPAREN_TOKEN,    # (
  0x0029 => RPAREN_TOKEN,    # )
  0x005B => LBRACKET_TOKEN,  # [
  0x005D => RBRACKET_TOKEN,  # ]
);

## Tokens that absorb any white space preceding them.
my %COMBINATOR_TYPE_OF = (
  0x002B => PLUS_TOKEN,    # +
  0x003E => GREATER_TOKEN, # >
  0x002C => COMMA_TOKEN,   # ,
  0x007E => TILDE_TOKEN,   # ~
);

## Characters that form a two-character match token when followed by |=|.
my %MATCH_TYPE_OF = (
  0x007C => DASHMATCH_TOKEN,      # |
  0x005E => PREFIXMATCH_TOKEN,    # ^
  0x0024 => SUFFIXMATCH_TOKEN,    # $
  0x002A => SUBSTRINGMATCH_TOKEN, # *
);

## Maps a |URI| / |URI_PREFIX| token type to its invalid counterpart.
## NOTE: Plain commas, not |=>|, so that the constants are evaluated.
my %URI_INVALID_TYPE_OF = (
  URI_TOKEN,                URI_INVALID_TOKEN,
  URI_INVALID_TOKEN,        URI_INVALID_TOKEN,
  URI_PREFIX_TOKEN,         URI_PREFIX_INVALID_TOKEN,
  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
);

## Printable characters that make an unquoted |URI| invalid.
my %IS_BAD_UNQUOTED_URI_CHAR
    = map { ($_ => 1) } (0x0022, 0x0027, 0x0028); # " ' (

## ------ Private helpers (plain functions, not methods) ------

## True if the code point can start a name (|nmstart| without escapes).
sub _is_nmstart {
  my $c = $_[0];
  return ((0x0041 <= $c and $c <= 0x005A) or # A..Z
          (0x0061 <= $c and $c <= 0x007A) or # a..z
          $c == 0x005F or                    # _
          $c > 0x007F);                      # nonascii
} # _is_nmstart

## True if the code point is a decimal digit.
sub _is_digit {
  my $c = $_[0];
  return (0x0030 <= $c and $c <= 0x0039); # 0..9
} # _is_digit

## True if the code point can continue a name (|nmchar| without escapes).
sub _is_nmchar {
  my $c = $_[0];
  return (_is_nmstart ($c) or _is_digit ($c) or $c == 0x002D); # -
} # _is_nmchar

## Returns the numeric value (0..15) of a hexadecimal digit code point,
## or |undef| if the code point is not a hexadecimal digit.
sub _hex_digit_value {
  my $c = $_[0];
  return $c - 0x0030       if 0x0030 <= $c and $c <= 0x0039; # 0..9
  return $c - 0x0041 + 0xA if 0x0041 <= $c and $c <= 0x0046; # A..F
  return $c - 0x0061 + 0xA if 0x0061 <= $c and $c <= 0x0066; # a..f
  return;
} # _hex_digit_value

## True if the code point is a hexadecimal digit.
sub _is_hex_digit {
  return defined scalar _hex_digit_value ($_[0]);
} # _is_hex_digit

## Returns the state to resume after an escape, given the escape context
## |$q| (0: name, 1: unquoted URI, otherwise: quoted string).
sub _state_after_escape {
  my $q = $_[0];
  return NAME_STATE if $q == 0;
  return URI_UNQUOTED_STATE if $q == 1;
  return STRING_STATE;
} # _state_after_escape

## Turns a |URI| / |URI_PREFIX| token into its invalid variant, in place.
sub _mark_uri_invalid {
  my $token = $_[0];
  $token->{type} = $URI_INVALID_TYPE_OF{$token->{type}};
  return;
} # _mark_uri_invalid

## ------ Public methods ------

## Creates a tokenizer.  The default |get_char| always reports end of
## input (-1) and the default |onerror| does nothing; callers are
## expected to replace |$self->{get_char}| before calling |init|.
sub new ($) {
  my $class = shift;
  my $self = bless {
    token => [],
    get_char => sub { -1 },
    onerror => sub { },
  }, $class;
  return $self;
} # new

## Resets the state machine and reads the first input character.
sub init ($) {
  my $self = shift;
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $self->{get_char}->();
  #$self->{t} = {type => token-type, value => value, number => number};
} # init

## Returns the next token as a hash reference.  Tokens queued in
## |$self->{token}| by an earlier call are returned first.  At the end
## of input a token of type |EOF_TOKEN| is returned.
##
## |$self->{c}| always holds the next unprocessed code point (-1 at EOF);
## "reprocess" / "reconsume" below means that it is deliberately left
## untouched so that the next state (or the next call) sees it again.
sub get_next_token ($) {
  my $self = shift;
  if (@{$self->{token}}) {
    return shift @{$self->{token}};
  }

  my $char; # Code point accumulated from |unicode| in |escape|, if any.
  my $i; # |$i + 1|th character in |unicode| in |escape|.
  my $q;
      ## NOTE:
      ##   0: in |ident|.
      ##   1: in |URI| outside of |string|.
      ##   0x0022: in |string1| or |invalid1|.
      ##   0x0027: in |string2| or |invalid2|.

  A: {
    if ($self->{state} == BEFORE_TOKEN_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|
        $self->{t} = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
        ## NOTE: Either the start of |UNICODE-RANGE| or of a name.
        $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x002B) { # +
          $self->{c} = $self->{get_char}->();
          if (_is_hex_digit ($self->{c}) or $self->{c} == 0x003F) { # ?
            $self->{t}->{value} .= '+' . chr $self->{c};
            $self->{t}->{type} = UNICODE_RANGE_TOKEN;
            $self->{c} = $self->{get_char}->();
            ## Up to five more hexadecimal digits or |?|.
            for (2..6) {
              last unless _is_hex_digit ($self->{c}) or
                  $self->{c} == 0x003F; # ?
              $self->{t}->{value} .= chr $self->{c};
              $self->{c} = $self->{get_char}->();
            }

            if ($self->{c} == 0x002D) { # -
              $self->{c} = $self->{get_char}->();
              if (_is_hex_digit ($self->{c})) {
                $self->{t}->{value} .= '-' . chr $self->{c};
                $self->{c} = $self->{get_char}->();
                ## Up to five more hexadecimal digits.
                for (2..6) {
                  last unless _is_hex_digit ($self->{c});
                  $self->{t}->{value} .= chr $self->{c};
                  $self->{c} = $self->{get_char}->();
                }
                #
              } else {
                ## The |-| is not part of the range; it starts a new token.
                my $token = $self->{t};
                $self->{t} = {type => IDENT_TOKEN, value => '-'};
                $self->{state} = BEFORE_NMSTART_STATE;
                # reprocess
                return $token;
              }
            }

            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return $self->{t};
          } else {
            ## |U| followed by a |+| that does not start a range.
            unshift @{$self->{token}}, {type => PLUS_TOKEN};
            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return $self->{t};
          }
        } else {
          $self->{state} = NAME_STATE;
          # reprocess
          redo A;
        }
      } elsif (_is_nmstart ($self->{c})) {
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $self->{t} = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = AFTER_AT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $self->{t} = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $self->{t} = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_digit ($self->{c})) { # 0..9
        ## NOTE: |num|.
        $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
        $self->{state} = NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        ## NOTE: |num|.
        $self->{t} = {type => NUMBER_TOKEN, value => '0'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002F) { # /
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x002A) { # *
          ## Skip the comment; no token is emitted for a closed comment.
          C: {
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002A) { # *
              D: {
                $self->{c} = $self->{get_char}->();
                if ($self->{c} == 0x002F) { # /
                  #
                } elsif ($self->{c} == 0x002A) { # *
                  redo D;
                } else {
                  redo C;
                }
              } # D
            } elsif ($self->{c} == -1) {
              # stay in the state
              # reprocess
              return {type => COMMENT_INVALID_TOKEN};
            } else {
              redo C;
            }
          } # C

          # stay in the state.
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          # stay in the state.
          # reprocess
          ## NOTE: The type is |DELIM_TOKEN|; |DELIM_STATE| is a state
          ## constant whose value collides with another token type.
          return {type => DELIM_TOKEN, value => '/'};
        }
      } elsif ($self->{c} == 0x003C) { # <
        ## NOTE: |CDO|
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x0021) { # !
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x002D) { # -
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002D) { # -
              $self->{state} = BEFORE_TOKEN_STATE;
              $self->{c} = $self->{get_char}->();
              return {type => CDO_TOKEN};
            } else {
              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
              ## NOTE: |-| in |ident| in |IDENT|
              $self->{t} = {type => IDENT_TOKEN, value => '-'};
              $self->{state} = BEFORE_NMSTART_STATE;
              #reprocess
              return {type => DELIM_TOKEN, value => '<'};
            }
          } else {
            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
            $self->{state} = BEFORE_TOKEN_STATE;
            #reprocess
            return {type => DELIM_TOKEN, value => '<'};
          }
        } else {
          $self->{state} = BEFORE_TOKEN_STATE;
          #reprocess
          return {type => DELIM_TOKEN, value => '<'};
        }
      } elsif (my $punctuation_type = $PUNCTUATION_TYPE_OF{$self->{c}}) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $punctuation_type};
      } elsif ($IS_WHITE_SPACE{$self->{c}}) {
        ## Collapse a run of white space.
        do {
          $self->{c} = $self->{get_char}->();
        } while $IS_WHITE_SPACE{$self->{c}};

        if (my $combinator_type = $COMBINATOR_TYPE_OF{$self->{c}}) {
          ## White space before |+|, |>|, |,|, or |~| is absorbed.
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $combinator_type};
        } else {
          # stay in the state
          # reprocess
          return {type => S_TOKEN};
        }
      } elsif (my $match_type = $MATCH_TYPE_OF{$self->{c}}) {
        my $c = $self->{c};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $match_type};
        } else {
          # stay in the state
          # reprocess
          return {type => DELIM_TOKEN, value => chr $c};
        }
      } elsif ($self->{c} == 0x002B) { # +
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => PLUS_TOKEN};
      } elsif ($self->{c} == 0x003E) { # >
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => GREATER_TOKEN};
      } elsif ($self->{c} == 0x002C) { # ,
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => COMMA_TOKEN};
      } elsif ($self->{c} == 0x007E) { # ~
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => INCLUDES_TOKEN};
        } else {
          # stay in the state
          # reprocess
          return {type => TILDE_TOKEN};
        }
      } elsif ($self->{c} == -1) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => EOF_TOKEN};
      } else {
        # stay in the state
        $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      }
    } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
      ## |FUNCTION|)
      if (_is_nmstart ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{t}->{type} = DIMENSION_TOKEN
            if $self->{t}->{type} == NUMBER_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: 12-\X, 12-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D and # -
               $self->{t}->{type} == IDENT_TOKEN) {
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
        } else {
          ## NOTE: |-|, |-|, $self->{c}
          #$self->{t} = {type => IDENT_TOKEN, value => '-'};
          # stay in the state
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
        }
      } else {
        if ($self->{t}->{type} == NUMBER_TOKEN) {
          ## NOTE: |-| after |NUMBER|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          ## NOTE: On this path the number text is returned in |value|
          ## and the |number| member is removed.
          $self->{t}->{value} = $self->{t}->{number};
          delete $self->{t}->{number};
          return $self->{t};
        } else {
          ## NOTE: |-| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          ## NOTE: The current character has not been tokenized yet, so
          ## it must not be skipped here.
          return {type => DELIM_TOKEN, value => '-'};
        }
      }
    } elsif ($self->{state} == AFTER_AT_STATE) {
      if (_is_nmstart ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $self->{t}->{value} .= '-';
        $self->{state} = AFTER_AT_HYPHEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
      if (_is_nmstart ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          unshift @{$self->{token}}, {type => CDC_TOKEN};
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => DELIM_TOKEN, value => '@'};
        } else {
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{t} = {type => IDENT_TOKEN, value => '-'};
          $self->{state} = BEFORE_NMSTART_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '@'};
        }
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: @-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_NUMBER_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $self->{t}->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart ($self->{c})) {
        ## NOTE: |nmstart| in |ident|.
        $self->{t}->{value} = chr $self->{c};
        $self->{t}->{type} = DIMENSION_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $self->{t}->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0025) { # %
        $self->{t}->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $self->{t};
      }
    } elsif ($self->{state} == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if (_is_nmchar ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        ## NOTE: The character after |#| has not been tokenized yet, so
        ## it must not be skipped here.
        return {type => DELIM_TOKEN, value => '#'};
      }
    } elsif ($self->{state} == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if (_is_nmchar ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0028 and # (
               $self->{t}->{type} == IDENT_TOKEN) {
        my $func_name = $self->{t}->{value};
        $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
        if ($func_name eq 'url' or $func_name eq 'url-prefix') {
          if ($self->{t}->{has_escape}) {
            ## TODO: warn
          }
          $self->{t}->{type}
              = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
          $self->{t}->{value} = '';
          $self->{state} = URI_BEFORE_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          $self->{t}->{type} = FUNCTION_TOKEN;
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $self->{t};
        }
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $self->{t};
      }
    } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
      ## Skip white space between |(| and the URI itself.
      while ($IS_WHITE_SPACE{$self->{c}}) {
        $self->{c} = $self->{get_char}->();
      }
      if ($self->{c} == -1) {
        _mark_uri_invalid ($self->{t});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
        ## TODO: Should we consider matches of "(" and ")"?
        _mark_uri_invalid ($self->{t});
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_UNQUOTED_STATE) {
      if ($IS_WHITE_SPACE{$self->{c}}) {
        $self->{state} = URI_AFTER_WSP_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        _mark_uri_invalid ($self->{t});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } elsif ($self->{c} < 0x0020 or
               $IS_BAD_UNQUOTED_URI_CHAR{$self->{c}}) { # C0, ", ', or (
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        _mark_uri_invalid ($self->{t});
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
      if ($IS_WHITE_SPACE{$self->{c}}) {
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        _mark_uri_invalid ($self->{t});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $self->{t};
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        _mark_uri_invalid ($self->{t});
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
      ## NOTE: The character immediately following |\|.
      $self->{t}->{has_escape} = 1;
      my $digit = _hex_digit_value ($self->{c});
      if (defined $digit) {
        ## NOTE: second character of |unicode| in |escape|.
        $char = $digit;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000C) { # \f
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $self->{t};
          # reconsume
        } elsif ($q == 1) {
          ## NOTE: In |escape| in |URI|.
          _mark_uri_invalid ($self->{t});
          $self->{t}->{value} .= chr $self->{c};
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## Note: In |nl| in ... in |string| or |ident|.
          $self->{t}->{value} .= chr $self->{c};
          $self->{state} = STRING_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000D) { # \r
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $self->{t};
          # reconsume
        } elsif ($q == 1) {
          _mark_uri_invalid ($self->{t});
          $self->{t}->{value} .= "\x0D\x0A";
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## Note: In |nl| in ... in |string| or |ident|.
          $self->{t}->{value} .= "\x0D\x0A";
          ## NOTE: There is no pending escaped code point on this path;
          ## |ESCAPE_BEFORE_LF_STATE| must only swallow the optional |\n|.
          $char = undef;
          $self->{state} = ESCAPE_BEFORE_LF_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } else {
        ## NOTE: second character of |escape|.
        $self->{t}->{value} .= chr $self->{c};
        $self->{state} = _state_after_escape ($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|.
      my $digit = _hex_digit_value ($self->{c});
      if (defined $digit) {
        $char = $char * 0x10 + $digit;
        ## After the sixth digit only the optional terminator may follow.
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0020 or # SP
               $self->{c} == 0x000A or # \n
               $self->{c} == 0x0009 or # \t
               $self->{c} == 0x000C) { # \f
        $self->{t}->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: eighth character of |unicode| in |escape|.
      if ($self->{c} == 0x0020 or # SP
          $self->{c} == 0x000A or # \n
          $self->{c} == 0x0009 or # \t
          $self->{c} == 0x000C) { # \f
        $self->{t}->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        ## NOTE: Only a directly following |\n| may still belong to the
        ## escape, which is what |ESCAPE_BEFORE_LF_STATE| checks.
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape|, or |\n| in
      ## |\r\n| directly after |\| in a string (then |$char| is undefined).
      $self->{t}->{value} .= chr $char if defined $char;
      $self->{state} = _state_after_escape ($q);
      if ($self->{c} == 0x000A) { # \n
        $self->{c} = $self->{get_char}->();
      } else {
        # reconsume
      }
      redo A;
    } elsif ($self->{state} == STRING_STATE) {
      ## NOTE: A character in |string$Q| in |string| in |STRING|, or
      ## a character in |invalid$Q| in |invalid| in |INVALID|,
      ## where |$Q = $q == 0x0022 ? 1 : 2|.
      ## Or, in |URI|.
      if ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == $q) { # " | '
        if ($self->{t}->{type} == STRING_TOKEN) {
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $self->{t};
        } else {
          ## Closing quote of a quoted string inside |URI|.
          $self->{state} = URI_AFTER_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000D or # \r
               $self->{c} == 0x000C or # \f
               $self->{c} == -1) {
        $self->{t}->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $self->{t};
      } else {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (_is_digit ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        $self->{state} = NUMBER_DOT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## The digits move to |number|; |value| will hold the unit, if any.
        $self->{t}->{number} = $self->{t}->{value};
        $self->{t}->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (_is_digit ($self->{c})) {
        $self->{t}->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: The |.| is not part of the number; queue it as |DELIM|.
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
        $self->{t}->{number} = $self->{t}->{value};
        $self->{t}->{value} = '';
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $self->{t};
      }
    } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning of |num|.
      if (_is_digit ($self->{c})) {
        $self->{t}->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        ## NOTE: The character after |.| has not been tokenized yet, so
        ## it must not be skipped here.
        return {type => DELIM_TOKEN, value => '.'};
      }
    } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (_is_digit ($self->{c})) {
        $self->{t}->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{t}->{number} = $self->{t}->{value};
        $self->{t}->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A
} # get_next_token

1;
# $Date: 2007/09/08 02:58:24 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24