/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations) (download)
Sat Sep 8 02:58:24 2007 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.3: +38 -2 lines
++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 02:58:20 -0000
	* Tokenizer.pm: |COMMENT| is implemented.
	A bug in the treatment of |@-->| is fixed.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::CSS::Tokenizer;
2 use strict;
3
## Tokenizer states.  |$self->{state}| always holds one of these values;
## |get_next_token| dispatches on it.
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
  URI_UNQUOTED_STATE      => 16,
  URI_AFTER_WSP_STATE     => 17,
  AFTER_AT_STATE          => 18,
  AFTER_AT_HYPHEN_STATE   => 19,
};
24
## Token types.  The |type| member of every token returned by
## |get_next_token| is one of these values.  The numbering starts at 1
## and matches the indices of |@TokenName|.
use constant {
  IDENT_TOKEN                 => 1,
  ATKEYWORD_TOKEN             => 2,
  HASH_TOKEN                  => 3,
  FUNCTION_TOKEN              => 4,
  URI_TOKEN                   => 5,
  URI_INVALID_TOKEN           => 6,
  URI_PREFIX_TOKEN            => 7,
  URI_PREFIX_INVALID_TOKEN    => 8,
  STRING_TOKEN                => 9,
  INVALID_TOKEN               => 10,
  NUMBER_TOKEN                => 11,
  DIMENSION_TOKEN             => 12,
  PERCENTAGE_TOKEN            => 13,
  UNICODE_RANGE_TOKEN         => 14,
  UNICODE_RANGE_INVALID_TOKEN => 15,
  DELIM_TOKEN                 => 16,
  PLUS_TOKEN                  => 17,
  GREATER_TOKEN               => 18,
  COMMA_TOKEN                 => 19,
  TILDE_TOKEN                 => 20,
  DASHMATCH_TOKEN             => 21,
  PREFIXMATCH_TOKEN           => 22,
  SUFFIXMATCH_TOKEN           => 23,
  SUBSTRINGMATCH_TOKEN        => 24,
  INCLUDES_TOKEN              => 25,
  SEMICOLON_TOKEN             => 26,
  LBRACE_TOKEN                => 27,
  RBRACE_TOKEN                => 28,
  LPAREN_TOKEN                => 29,
  RPAREN_TOKEN                => 30,
  LBRACKET_TOKEN              => 31,
  RBRACKET_TOKEN              => 32,
  S_TOKEN                     => 33,
  CDO_TOKEN                   => 34,
  CDC_TOKEN                   => 35,
  COMMENT_TOKEN               => 36,
  COMMENT_INVALID_TOKEN       => 37,
  EOF_TOKEN                   => 38,
};
63
## Human-readable token names, indexed by token type constant (the
## |*_TOKEN| value).  Index 0 is a placeholder because the token type
## numbering starts at 1.
our @TokenName = (
  '0',                                                     #  0
  'IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION',                #  1 ..  4
  'URI', 'URI_INVALID', 'URI_PREFIX', 'URI_PREFIX_INVALID', #  5 ..  8
  'STRING', 'INVALID',                                     #  9 .. 10
  'NUMBER', 'DIMENSION', 'PERCENTAGE',                     # 11 .. 13
  'UNICODE_RANGE', 'UNICODE_RANGE_INVALID',                # 14 .. 15
  'DELIM', 'PLUS', 'GREATER', 'COMMA', 'TILDE',            # 16 .. 20
  'DASHMATCH', 'PREFIXMATCH', 'SUFFIXMATCH',               # 21 .. 23
  'SUBSTRINGMATCH', 'INCLUDES',                            # 24 .. 25
  'SEMICOLON', 'LBRACE', 'RBRACE', 'LPAREN', 'RPAREN',     # 26 .. 30
  'LBRACKET', 'RBRACKET',                                  # 31 .. 32
  'S', 'CDO', 'CDC',                                       # 33 .. 35
  'COMMENT', 'COMMENT_INVALID', 'EOF',                     # 36 .. 38
);
72
## Constructor.  Creates a tokenizer object with an empty queue of
## pending tokens, a default |get_char| callback that always reports
## end of input (-1), and a no-op |onerror| callback.  Callers are
## expected to replace |get_char| (and optionally |onerror|) before
## invoking |init|.
sub new ($) {
  my $class = shift;
  my %member = (
    token    => [],
    get_char => sub { -1 },
    onerror  => sub { },
  );
  return bless \%member, $class;
} # new
78
## Resets the tokenizer to the initial state and primes the
## one-character lookahead |$self->{c}| by invoking the |get_char|
## callback once.
sub init ($) {
  my ($tokenizer) = @_;
  $tokenizer->{state} = BEFORE_TOKEN_STATE;
  return $tokenizer->{c} = $tokenizer->{get_char}->();
} # init
84
## ------ Private helpers for |get_next_token| ------
##
## All of them take a code point as returned by the |get_char| callback
## (an integer, or -1 at the end of the input).

## True iff |$c| is SP, \t, \r, \n, or \f.
sub _is_wsp {
  my $c = shift;
  return ($c == 0x0020 or $c == 0x0009 or $c == 0x000D or
          $c == 0x000A or $c == 0x000C);
} # _is_wsp

## True iff |$c| is an ASCII digit.
sub _is_digit {
  my $c = shift;
  return (0x0030 <= $c and $c <= 0x0039);
} # _is_digit

## True iff |$c| is an unescaped |nmstart|: A..Z, a..z, |_|, or nonascii.
sub _is_nmstart {
  my $c = shift;
  return ((0x0041 <= $c and $c <= 0x005A) or # A..Z
          (0x0061 <= $c and $c <= 0x007A) or # a..z
          $c == 0x005F or # _
          $c > 0x007F); # nonascii
} # _is_nmstart

## True iff |$c| is an unescaped |nmchar|: |nmstart|, 0..9, or |-|.
sub _is_nmchar {
  my $c = shift;
  return (_is_nmstart($c) or _is_digit($c) or $c == 0x002D);
} # _is_nmchar

## Returns the numeric value (0..15) of the hexadecimal digit |$c|, or
## |undef| if |$c| is not a hexadecimal digit.
sub _hex_digit_value {
  my $c = shift;
  return $c - 0x0030 if 0x0030 <= $c and $c <= 0x0039; # 0..9
  return $c - 0x0041 + 0xA if 0x0041 <= $c and $c <= 0x0046; # A..F
  return $c - 0x0061 + 0xA if 0x0061 <= $c and $c <= 0x0066; # a..f
  return;
} # _hex_digit_value

## Maps a |URI| / |URI_PREFIX| token type (valid or already invalid) to
## the corresponding |*_INVALID| token type.  Returns |undef| for any
## other token type.
sub _invalid_uri_type {
  my $type = shift;
  my $map = {
    URI_TOKEN, URI_INVALID_TOKEN,
    URI_INVALID_TOKEN, URI_INVALID_TOKEN,
    URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
    URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
  };
  return $map->{$type};
} # _invalid_uri_type

## Returns the state to go back to once an |escape| has been processed.
## |$q| is the escape context: 0 = in |ident|, 1 = in |URI| outside of
## |string|, anything else (a quote code point) = in |string|.
sub _state_after_escape {
  my $q = shift;
  return $q == 0 ? NAME_STATE :
         $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
} # _state_after_escape

## Returns the next token of the input as a hash reference.
##
## If tokens have been queued in |$self->{token}| by an earlier call,
## the oldest queued token is returned without reading any input.
## Otherwise characters are pulled from the |get_char| callback (with
## |$self->{c}| as the one-character lookahead) until a token is
## complete.
##
## Token members set by this method:
##   type       - One of the |*_TOKEN| constants.
##   value      - Name, string, URI or delimiter text, where applicable.
##   number     - Text of the number, for tokens that began as |NUMBER|
##                (|value| then holds the unit text, possibly empty).
##   has_escape - True if an |escape| was seen while building the token.
##
## Comments are skipped; an unterminated comment yields
## |COMMENT_INVALID_TOKEN|.  At the end of input |EOF_TOKEN| is returned.
## Dies if |$self->{state}| holds an unknown state.
sub get_next_token ($) {
  my $self = shift;
  if (@{$self->{token}}) {
    return shift @{$self->{token}};
  }

  ## A partially built token can outlive a call only when the previous
  ## call returned while leaving the tokenizer in |BEFORE_NMSTART_STATE|
  ## (e.g. for |<!-x|, |--x|, or |@--x|).  Anything else found here is
  ## stale (e.g. |init| was invoked in between) and is discarded.
  my $current_token = delete $self->{pending_token};
  $current_token = undef unless $self->{state} == BEFORE_NMSTART_STATE;

  my $char; # Code point accumulated from |unicode| in |escape|.
  my $i; # |$i + 1|th character in |unicode| in |escape|.
  my $q;
      ## NOTE:
      ##   0: in |ident|.
      ##   1: in |URI| outside of |string|.
      ##   0x0022: in |string1| or |invalid1|.
      ##   0x0027: in |string2| or |invalid2|.

  A: {
    if ($self->{state} == BEFORE_TOKEN_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart($self->{c})) {
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $current_token = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = AFTER_AT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $current_token = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $current_token = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_digit($self->{c})) { # 0..9
        ## NOTE: |num|.
        $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};
        $self->{state} = NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        ## NOTE: |num|.
        $current_token = {type => NUMBER_TOKEN, value => '0'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002F) { # /
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x002A) { # *
          ## NOTE: In a comment.  Block |C| scans for |*|; block |D|
          ## checks whether that |*| is followed by |/|.
          C: {
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002A) { # *
              D: {
                $self->{c} = $self->{get_char}->();
                if ($self->{c} == 0x002F) { # /
                  # End of the comment.
                } elsif ($self->{c} == 0x002A) { # *
                  redo D;
                } else {
                  redo C;
                }
              } # D
            } elsif ($self->{c} == -1) {
              # stay in the state
              # reprocess
              return {type => COMMENT_INVALID_TOKEN};
              #redo A;
            } else {
              redo C;
            }
          } # C

          # stay in the state.
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          # stay in the state.
          # reprocess
          return {type => DELIM_TOKEN, value => '/'};
          #redo A;
        }
      } elsif ($self->{c} == 0x003C) { # <
        ## NOTE: |CDO|
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x0021) { # !
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x002D) { # -
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002D) { # -
              $self->{state} = BEFORE_TOKEN_STATE;
              $self->{c} = $self->{get_char}->();
              return {type => CDO_TOKEN};
              #redo A;
            } else {
              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
              ## NOTE: |-| in |ident| in |IDENT|.  The token has to
              ## survive until the next call that reads input.
              $self->{pending_token} = {type => IDENT_TOKEN, value => '-'};
              $self->{state} = BEFORE_NMSTART_STATE;
              #reprocess
              return {type => DELIM_TOKEN, value => '<'};
              #redo A;
            }
          } else {
            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
            $self->{state} = BEFORE_TOKEN_STATE;
            #reprocess
            return {type => DELIM_TOKEN, value => '<'};
            #redo A;
          }
        } else {
          $self->{state} = BEFORE_TOKEN_STATE;
          #reprocess
          return {type => DELIM_TOKEN, value => '<'};
          #redo A;
        }
      } elsif (my $t = {
                 0x003B => SEMICOLON_TOKEN, # ;
                 0x007B => LBRACE_TOKEN, # {
                 0x007D => RBRACE_TOKEN, # }
                 0x0028 => LPAREN_TOKEN, # (
                 0x0029 => RPAREN_TOKEN, # )
                 0x005B => LBRACKET_TOKEN, # [
                 0x005D => RBRACKET_TOKEN, # ]
               }->{$self->{c}}) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $t};
        # redo A;
      } elsif (_is_wsp($self->{c})) {
        ## NOTE: A run of white space directly followed by |+|, |>|,
        ## |,|, or |~| is folded into that token; otherwise it is |S|.
        W: {
          $self->{c} = $self->{get_char}->();
          if (_is_wsp($self->{c})) {
            redo W;
          } elsif (my $v = {
                     0x002B => PLUS_TOKEN, # +
                     0x003E => GREATER_TOKEN, # >
                     0x002C => COMMA_TOKEN, # ,
                     0x007E => TILDE_TOKEN, # ~
                   }->{$self->{c}}) {
            # stay in the state
            $self->{c} = $self->{get_char}->();
            return {type => $v};
            #redo A;
          } else {
            # stay in the state
            # reprocess
            return {type => S_TOKEN};
            #redo A;
          }
        } # W
      } elsif (my $v = {
                 0x007C => DASHMATCH_TOKEN, # |
                 0x005E => PREFIXMATCH_TOKEN, # ^
                 0x0024 => SUFFIXMATCH_TOKEN, # $
                 0x002A => SUBSTRINGMATCH_TOKEN, # *
               }->{$self->{c}}) {
        my $c = $self->{c};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $v};
          #redo A;
        } else {
          # stay in the state
          # reprocess
          return {type => DELIM_TOKEN, value => chr $c};
          #redo A;
        }
      } elsif ($self->{c} == 0x002B) { # +
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => PLUS_TOKEN};
        #redo A;
      } elsif ($self->{c} == 0x003E) { # >
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => GREATER_TOKEN};
        #redo A;
      } elsif ($self->{c} == 0x002C) { # ,
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => COMMA_TOKEN};
        #redo A;
      } elsif ($self->{c} == 0x007E) { # ~
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => INCLUDES_TOKEN};
          #redo A;
        } else {
          # stay in the state
          # reprocess
          return {type => TILDE_TOKEN};
          #redo A;
        }
      } elsif ($self->{c} == -1) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => EOF_TOKEN};
        #redo A;
      } else {
        # stay in the state
        $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      }
    } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
      ## |FUNCTION|)
      if (_is_nmstart($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $current_token->{type} = DIMENSION_TOKEN
            if $current_token->{type} == NUMBER_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: 12-\X, 12-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D and # -
               $current_token->{type} == IDENT_TOKEN) {
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
          #redo A;
        } else {
          ## NOTE: |-|, |-|, $self->{c}.  The first |-| is a |DELIM|;
          ## the second one may still start an |ident|, so it is kept
          ## for the next call.
          $self->{pending_token} = {type => IDENT_TOKEN, value => '-'};
          # stay in the state
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
          #redo A;
        }
      } else {
        if ($current_token->{type} == NUMBER_TOKEN) {
          ## NOTE: |-| after |NUMBER|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          ## NOTE(review): Unlike other |NUMBER| tokens, this one
          ## carries the number text in |value| and has no |number|
          ## member.  Kept as is; confirm what consumers expect.
          $current_token->{value} = $current_token->{number};
          delete $current_token->{number};
          return $current_token;
        } else {
          ## NOTE: |-| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess: the current character follows the |-| and has
          # not been tokenized yet.
          return {type => DELIM_TOKEN, value => '-'};
        }
      }
    } elsif ($self->{state} == AFTER_AT_STATE) {
      if (_is_nmstart($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $current_token->{value} .= '-';
        $self->{state} = AFTER_AT_HYPHEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
      if (_is_nmstart($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          unshift @{$self->{token}}, {type => CDC_TOKEN};
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => DELIM_TOKEN, value => '@'};
          #redo A;
        } else {
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          ## NOTE: The second |-| may start an |ident|; keep it for
          ## the next call that reads input.
          $self->{pending_token} = {type => IDENT_TOKEN, value => '-'};
          $self->{state} = BEFORE_NMSTART_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '@'};
          #redo A;
        }
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: @-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_NUMBER_STATE) {
      ## NOTE: At this point |number| holds the text of the number and
      ## |value| is being reused for the unit.
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $current_token->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart($self->{c})) {
        ## NOTE: |nmstart| in |ident|.
        $current_token->{value} = chr $self->{c};
        $current_token->{type} = DIMENSION_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0025) { # %
        $current_token->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $current_token;
        #redo A;
      }
    } elsif ($self->{state} == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if (_is_nmchar($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess: the current character follows the |#| and has not
        # been tokenized yet.
        return {type => DELIM_TOKEN, value => '#'};
        #redo A;
      }
    } elsif ($self->{state} == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if (_is_nmchar($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0028 and # (
               $current_token->{type} == IDENT_TOKEN) {
        my $func_name = $current_token->{value};
        $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
        if ($func_name eq 'url' or $func_name eq 'url-prefix') {
          if ($current_token->{has_escape}) {
            ## TODO: warn
          }
          $current_token->{type}
              = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
          $current_token->{value} = '';
          $self->{state} = URI_BEFORE_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          $current_token->{type} = FUNCTION_TOKEN;
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $current_token;
          #redo A;
        }
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $current_token;
        #redo A;
      }
    } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
      ## NOTE: Skip white space between |url(| and the URI itself.
      while (_is_wsp($self->{c})) {
        $self->{c} = $self->{get_char}->();
      }
      if ($self->{c} == -1) {
        $current_token->{type} = _invalid_uri_type($current_token->{type});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
        ## TODO: Should we consider matches of "(" and ")"?
        $current_token->{type} = _invalid_uri_type($current_token->{type});
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_UNQUOTED_STATE) {
      if (_is_wsp($self->{c})) {
        $self->{state} = URI_AFTER_WSP_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        $current_token->{type} = _invalid_uri_type($current_token->{type});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } elsif ($self->{c} < 0x0020 or
               $self->{c} == 0x0022 or # "
               $self->{c} == 0x0027 or # '
               $self->{c} == 0x0028) { # (
        ## NOTE: C0, |"|, |'|, or |(|.
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        $current_token->{type} = _invalid_uri_type($current_token->{type});
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $self->{c};
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
      if (_is_wsp($self->{c})) {
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        $current_token->{type} = _invalid_uri_type($current_token->{type});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
        #redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: Anything but |)| after the URI makes the token invalid.
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        $current_token->{type} = _invalid_uri_type($current_token->{type});
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
      ## NOTE: The character immediately following |\|.
      $current_token->{has_escape} = 1;
      my $digit = _hex_digit_value($self->{c});
      if (defined $digit) {
        ## NOTE: second character of |unicode| in |escape|.
        $char = $digit;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000C) { # \f
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $current_token;
          # reconsume
          #redo A;
        } elsif ($q == 1) {
          ## NOTE: In |escape| in |URI|.
          $current_token->{type} = _invalid_uri_type($current_token->{type});
          $current_token->{value} .= chr $self->{c};
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## NOTE: In |nl| in ... in |string| or |ident|.
          $current_token->{value} .= chr $self->{c};
          $self->{state} = STRING_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000D) { # \r
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $current_token;
          # reconsume
          #redo A;
        } elsif ($q == 1) {
          $current_token->{type} = _invalid_uri_type($current_token->{type});
          $current_token->{value} .= "\x0D\x0A";
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## NOTE: In |nl| in ... in |string| or |ident|.
          $current_token->{value} .= "\x0D\x0A";
          ## No |unicode| escape is in progress here, so make sure
          ## |ESCAPE_BEFORE_LF_STATE| does not append a character left
          ## over from an earlier escape in the same token.
          $char = undef;
          $self->{state} = ESCAPE_BEFORE_LF_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } else {
        ## NOTE: second character of |escape|.
        ## NOTE(review): EOF (-1) also ends up here and is passed to
        ## |chr| -- confirm the intended handling.
        $current_token->{value} .= chr $self->{c};
        $self->{state} = _state_after_escape($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|.
      my $digit = _hex_digit_value($self->{c});
      if (defined $digit) {
        $char = $char * 0x10 + $digit;
        ## After the sixth hexadecimal digit only the optional
        ## trailing white space may follow.
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0020 or # SP
               $self->{c} == 0x000A or # \n
               $self->{c} == 0x0009 or # \t
               $self->{c} == 0x000C) { # \f
        ## NOTE: The white space terminating the escape is consumed.
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape($q);
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: The character following the sixth hexadecimal digit of
      ## |unicode| in |escape|.
      if ($self->{c} == 0x0020 or # SP
          $self->{c} == 0x000A or # \n
          $self->{c} == 0x0009 or # \t
          $self->{c} == 0x000C) { # \f
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        ## NOTE: Only one optional |\n| may follow the |\r|.
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape($q);
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape|, or |\n| in
      ## |\r\n| directly after |\| in a string (|$char| is undefined
      ## in the latter case and the newline has already been appended).
      $current_token->{value} .= chr $char if defined $char;
      $self->{state} = _state_after_escape($q);
      if ($self->{c} == 0x000A) { # \n
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == STRING_STATE) {
      ## NOTE: A character in |string$Q| in |string| in |STRING|, or
      ## a character in |invalid$Q| in |invalid| in |INVALID|,
      ## where |$Q = $q == 0x0022 ? 1 : 2|.
      ## Or, in |URI|.
      if ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == $q) { # " | '
        if ($current_token->{type} == STRING_TOKEN) {
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $current_token;
          #redo A;
        } else {
          ## NOTE: End of the quoted string in |URI|.
          $self->{state} = URI_AFTER_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000D or # \r
               $self->{c} == 0x000C or # \f
               $self->{c} == -1) {
        $current_token->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $current_token;
        #redo A;
      } else {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (_is_digit($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        $self->{state} = NUMBER_DOT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: The number is complete; from now on |value| is used
        ## for the unit, if any.
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (_is_digit($self->{c})) {
        $current_token->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: The |.| is not part of the number.
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $current_token;
        #redo A;
      }
    } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning of |num|.
      if (_is_digit($self->{c})) {
        $current_token->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess: the current character follows the |.| and has not
        # been tokenized yet.
        return {type => DELIM_TOKEN, value => '.'};
        #redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (_is_digit($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A

  ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|

} # get_next_token
944
945 1;
946 # $Date: 2007/09/08 02:40:47 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24