/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.16 - (hide annotations) (download)
Wed Oct 17 10:46:26 2007 UTC (17 years ago) by wakaba
Branch: MAIN
Changes since 1.15: +11 -1 lines
++ whatpm/Whatpm/ChangeLog	17 Oct 2007 10:45:53 -0000
	* Makefile (clean): New rule.

	* NanoDOM.pm (public_id, system_id): New attributes.

2007-10-17  Wakaba  <wakaba@suika.fam.cx>

## Package preamble for the Whatpm CSS tokenizer module.
1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
## $VERSION is derived from the CVS |Revision| keyword: every run of
## digits in the keyword string is extracted, the first one is printed
## with "%d." and each remaining one with "%02d" (so "1.5" gives "1.05").
3 wakaba 1.16 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
## Inherit from Exporter so that the names listed in @EXPORT_OK can be
## imported by callers on request.
5 wakaba 1.14 require Exporter;
6     push our @ISA, 'Exporter';
7    
## Tokenizer state identifiers.  Each one is a constant sub (empty
## prototype) returning a distinct integer in the range 0..19.  |init|
## stores BEFORE_TOKEN_STATE in |$self->{state}|, and |get_next_token|
## compares |$self->{state}| against these values to decide how the
## current character |$self->{c}| is handled.
8 wakaba 1.2 sub BEFORE_TOKEN_STATE () { 0 }
9     sub BEFORE_NMSTART_STATE () { 1 }
10     sub NAME_STATE () { 2 }
11     sub ESCAPE_OPEN_STATE () { 3 }
12     sub STRING_STATE () { 4 }
13     sub HASH_OPEN_STATE () { 5 }
14     sub NUMBER_STATE () { 6 }
15     sub NUMBER_FRACTION_STATE () { 7 }
16     sub AFTER_NUMBER_STATE () { 8 }
17     sub URI_BEFORE_WSP_STATE () { 9 }
18     sub ESCAPE_STATE () { 10 }
19     sub ESCAPE_BEFORE_LF_STATE () { 11 }
20     sub ESCAPE_BEFORE_NL_STATE () { 12 }
21     sub NUMBER_DOT_STATE () { 13 }
22     sub NUMBER_DOT_NUMBER_STATE () { 14 }
23     sub DELIM_STATE () { 15 }
24 wakaba 1.3 sub URI_UNQUOTED_STATE () { 16 }
25     sub URI_AFTER_WSP_STATE () { 17 }
26     sub AFTER_AT_STATE () { 18 }
27     sub AFTER_AT_HYPHEN_STATE () { 19 }
28 wakaba 1.2
## Token type codes.  |get_next_token| puts one of these values in the
## |type| member of every token hash it returns.  Each value is also the
## position of the corresponding name in @TokenName.  The values run
## from 1 to 45; 15 is not assigned to any token type.
29     sub IDENT_TOKEN () { 1 }
30     sub ATKEYWORD_TOKEN () { 2 }
31     sub HASH_TOKEN () { 3 }
32     sub FUNCTION_TOKEN () { 4 }
33     sub URI_TOKEN () { 5 }
34     sub URI_INVALID_TOKEN () { 6 }
35     sub URI_PREFIX_TOKEN () { 7 }
36     sub URI_PREFIX_INVALID_TOKEN () { 8 }
37     sub STRING_TOKEN () { 9 }
38     sub INVALID_TOKEN () { 10 }
39     sub NUMBER_TOKEN () { 11 }
40     sub DIMENSION_TOKEN () { 12 }
41     sub PERCENTAGE_TOKEN () { 13 }
42     sub UNICODE_RANGE_TOKEN () { 14 }
43     sub DELIM_TOKEN () { 16 }
44     sub PLUS_TOKEN () { 17 }
45     sub GREATER_TOKEN () { 18 }
46     sub COMMA_TOKEN () { 19 }
47     sub TILDE_TOKEN () { 20 }
48     sub DASHMATCH_TOKEN () { 21 }
49     sub PREFIXMATCH_TOKEN () { 22 }
50     sub SUFFIXMATCH_TOKEN () { 23 }
51     sub SUBSTRINGMATCH_TOKEN () { 24 }
52     sub INCLUDES_TOKEN () { 25 }
53     sub SEMICOLON_TOKEN () { 26 }
54     sub LBRACE_TOKEN () { 27 }
55     sub RBRACE_TOKEN () { 28 }
56     sub LPAREN_TOKEN () { 29 }
57     sub RPAREN_TOKEN () { 30 }
58     sub LBRACKET_TOKEN () { 31 }
59     sub RBRACKET_TOKEN () { 32 }
60     sub S_TOKEN () { 33 }
61     sub CDO_TOKEN () { 34 }
62     sub CDC_TOKEN () { 35 }
63     sub COMMENT_TOKEN () { 36 }
64     sub COMMENT_INVALID_TOKEN () { 37 }
65     sub EOF_TOKEN () { 38 }
## Single-character tokens added in revision 1.13.
66 wakaba 1.13 sub MINUS_TOKEN () { 39 }
67     sub STAR_TOKEN () { 40 }
68     sub VBAR_TOKEN () { 41 }
69     sub DOT_TOKEN () { 42 }
70     sub COLON_TOKEN () { 43 }
71     sub MATCH_TOKEN () { 44 }
72     sub EXCLAMATION_TOKEN () { 45 }
73 wakaba 1.2
## Printable token names, positioned so that $TokenName[FOO_TOKEN] is
## 'FOO' for every *_TOKEN constant.  The two '0' entries are placeholders
## for indexes 0 and 15, which no token type uses.
74     our @TokenName = qw(
75 wakaba 1.3 0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
76 wakaba 1.2 STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
77 wakaba 1.6 0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78 wakaba 1.2 PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79     LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80 wakaba 1.13 COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81 wakaba 1.2 );
82    
## Names that callers may import on request: every *_TOKEN constant.
## The *_STATE constants are not listed and so are not exportable.
83 wakaba 1.14 our @EXPORT_OK = qw(
84     IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85     URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86     STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87     UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88     TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89     SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90     RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91     S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92     MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93     EXCLAMATION_TOKEN
94     );
95    
## Export tag |:token| stands for a copy of the whole @EXPORT_OK list, so
## |use Whatpm::CSS::Tokenizer qw(:token)| imports every token constant.
96     our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97    
## Constructor.  Blesses a new hash into the invocant (taken with |shift|)
## and returns it.  Initial members:
##   token    - empty array ref; |get_next_token| returns tokens queued
##              here before it reads any further input.
##   get_char - code ref called with no arguments; its result is stored
##              in |$self->{c}| and compared with code point numbers.
##              The default always returns -1, which |get_next_token|
##              treats as end of input.  Presumably the caller replaces
##              it with a real character source -- confirm against users.
##   onerror  - code ref; the default does nothing.
## NOTE: the |($)| prototype is ignored when |new| is called as a method.
98 wakaba 1.1 sub new ($) {
99 wakaba 1.2 my $self = bless {token => [], get_char => sub { -1 },
100     onerror => sub { }}, shift;
101 wakaba 1.1 return $self;
102     } # new
103    
## Puts the tokenizer in its start state (BEFORE_TOKEN_STATE) and reads
## the first character from |get_char| into |$self->{c}|.
## NOTE(review): the pending-token queue |$self->{token}| is not cleared
## here -- confirm whether |init| is ever called on a used tokenizer.
104     sub init ($) {
105     my $self = shift;
106     $self->{state} = BEFORE_TOKEN_STATE;
107     $self->{c} = $self->{get_char}->();
## Shape of the token under construction, which is kept in |$self->{t}|:
108 wakaba 1.5 #$self->{t} = {type => token-type, value => value, number => number};
109 wakaba 1.1 } # init
110    
111     sub get_next_token ($) {
112     my $self = shift;
113     if (@{$self->{token}}) {
114     return shift @{$self->{token}};
115     }
116    
117     my $char;
118     my $num; # |{num}|, if any.
119     my $i; # |$i + 1|th character in |unicode| in |escape|.
120 wakaba 1.3 my $q;
121     ## NOTE:
122     ## 0: in |ident|.
123     ## 1: in |URI| outside of |string|.
124     ## 0x0022: in |string1| or |invalid1|.
125     ## 0x0027: in |string2| or |invalid2|.
126 wakaba 1.1
127     A: {
128     if ($self->{state} == BEFORE_TOKEN_STATE) {
129     if ($self->{c} == 0x002D) { # -
130     ## NOTE: |-| in |ident| in |IDENT|
131 wakaba 1.7 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
132 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
133     $self->{c} = $self->{get_char}->();
134     redo A;
135 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136     $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
137     $self->{c} = $self->{get_char}->();
138     if ($self->{c} == 0x002B) { # +
139     $self->{c} = $self->{get_char}->();
140     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
141     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
142     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
143     $self->{c} == 0x003F) { # ?
144 wakaba 1.12 $self->{t}->{value} = chr $self->{c};
145 wakaba 1.5 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
146     $self->{c} = $self->{get_char}->();
147     C: for (2..6) {
148     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
149     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
150     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
151     $self->{c} == 0x003F) { # ?
152     $self->{t}->{value} .= chr $self->{c};
153     $self->{c} = $self->{get_char}->();
154     } else {
155     last C;
156     }
157     } # C
158    
159     if ($self->{c} == 0x002D) { # -
160     $self->{c} = $self->{get_char}->();
161     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
162     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
163     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
164     $self->{t}->{value} .= '-' . chr $self->{c};
165     $self->{c} = $self->{get_char}->();
166     C: for (2..6) {
167     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
168     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
169     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
170     $self->{t}->{value} .= chr $self->{c};
171     $self->{c} = $self->{get_char}->();
172     } else {
173     last C;
174     }
175     } # C
176    
177     #
178     } else {
179     my $token = $self->{t};
180     $self->{t} = {type => IDENT_TOKEN, value => '-'};
181     $self->{state} = BEFORE_NMSTART_STATE;
182     # reprocess
183     return $token;
184     #redo A;
185     }
186     }
187    
188     $self->{state} = BEFORE_TOKEN_STATE;
189     # reprocess
190     return $self->{t};
191     #redo A;
192     } else {
193     unshift @{$self->{token}}, {type => PLUS_TOKEN};
194     $self->{state} = BEFORE_TOKEN_STATE;
195     # reprocess
196     return $self->{t};
197     #redo A;
198     }
199     } else {
200     $self->{state} = NAME_STATE;
201     # reprocess
202     redo A;
203     }
204 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
205     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
206 wakaba 1.1 $self->{c} == 0x005F or # _
207     $self->{c} > 0x007F) { # nonascii
208     ## NOTE: |nmstart| in |ident| in |IDENT|
209 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
210 wakaba 1.1 $self->{state} = NAME_STATE;
211     $self->{c} = $self->{get_char}->();
212     redo A;
213     } elsif ($self->{c} == 0x005C) { # \
214     ## NOTE: |nmstart| in |ident| in |IDENT|
215 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => ''};
216 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
217     $self->{c} = $self->{get_char}->();
218     redo A;
219     } elsif ($self->{c} == 0x0040) { # @
220     ## NOTE: |@| in |ATKEYWORD|
221 wakaba 1.5 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
222 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
223 wakaba 1.1 $self->{c} = $self->{get_char}->();
224     redo A;
225 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
226 wakaba 1.5 $self->{t} = {type => STRING_TOKEN, value => ''};
227 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
228 wakaba 1.1 $self->{c} = $self->{get_char}->();
229     redo A;
230     } elsif ($self->{c} == 0x0023) { # #
231     ## NOTE: |#| in |HASH|.
232 wakaba 1.5 $self->{t} = {type => HASH_TOKEN, value => ''};
233 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
234     $self->{c} = $self->{get_char}->();
235     redo A;
236     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
237     ## NOTE: |num|.
238 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
239 wakaba 1.1 $self->{state} = NUMBER_STATE;
240     $self->{c} = $self->{get_char}->();
241     redo A;
242     } elsif ($self->{c} == 0x002E) { # .
243     ## NOTE: |num|.
244 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
245 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
246     $self->{c} = $self->{get_char}->();
247     redo A;
248 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
249     $self->{c} = $self->{get_char}->();
250     if ($self->{c} == 0x002A) { # *
251     C: {
252     $self->{c} = $self->{get_char}->();
253     if ($self->{c} == 0x002A) { # *
254     D: {
255     $self->{c} = $self->{get_char}->();
256     if ($self->{c} == 0x002F) { # /
257     #
258     } elsif ($self->{c} == 0x002A) { # *
259     redo D;
260     } else {
261     redo C;
262     }
263     } # D
264     } elsif ($self->{c} == -1) {
265     # stay in the state
266     # reprocess
267     return {type => COMMENT_INVALID_TOKEN};
268     #redo A;
269     } else {
270     redo C;
271     }
272     } # C
273    
274     # stay in the state.
275     $self->{c} = $self->{get_char}->();
276     redo A;
277     } else {
278     # stay in the state.
279     # reprocess
280 wakaba 1.9 return {type => DELIM_TOKEN, value => '/'};
281 wakaba 1.4 #redo A;
282     }
283 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
284     ## NOTE: |CDO|
285     $self->{c} = $self->{get_char}->();
286     if ($self->{c} == 0x0021) { # !
287     $self->{c} = $self->{get_char}->();
288 wakaba 1.9 if ($self->{c} == 0x002D) { # -
289 wakaba 1.1 $self->{c} = $self->{get_char}->();
290 wakaba 1.9 if ($self->{c} == 0x002D) { # -
291 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
292     $self->{c} = $self->{get_char}->();
293     return {type => CDO_TOKEN};
294     #redo A;
295     } else {
296 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
297 wakaba 1.1 ## NOTE: |-| in |ident| in |IDENT|
298 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
299 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
300     #reprocess
301     return {type => DELIM_TOKEN, value => '<'};
302     #redo A;
303     }
304     } else {
305 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
306 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
307     #reprocess
308     return {type => DELIM_TOKEN, value => '<'};
309     #redo A;
310     }
311     } else {
312     $self->{state} = BEFORE_TOKEN_STATE;
313     #reprocess
314     return {type => DELIM_TOKEN, value => '<'};
315     #redo A;
316     }
317 wakaba 1.2 } elsif (my $t = {
318 wakaba 1.13 0x0021 => EXCLAMATION_TOKEN, # !
319     0x002D => MINUS_TOKEN, # -
320     0x002E => DOT_TOKEN, # .
321     0x003A => COLON_TOKEN, # :
322     0x003B => SEMICOLON_TOKEN, # ;
323     0x003D => MATCH_TOKEN, # =
324     0x007B => LBRACE_TOKEN, # {
325     0x007D => RBRACE_TOKEN, # }
326     0x0028 => LPAREN_TOKEN, # (
327     0x0029 => RPAREN_TOKEN, # )
328     0x005B => LBRACKET_TOKEN, # [
329     0x005D => RBRACKET_TOKEN, # ]
330 wakaba 1.1 }->{$self->{c}}) {
331     # stay in the state
332     $self->{c} = $self->{get_char}->();
333 wakaba 1.2 return {type => $t};
334 wakaba 1.1 # redo A;
335     } elsif ({
336     0x0020 => 1, # SP
337     0x0009 => 1, # \t
338     0x000D => 1, # \r
339     0x000A => 1, # \n
340     0x000C => 1, # \f
341     }->{$self->{c}}) {
342     W: {
343     $self->{c} = $self->{get_char}->();
344     if ({
345     0x0020 => 1, # SP
346     0x0009 => 1, # \t
347     0x000D => 1, # \r
348     0x000A => 1, # \n
349     0x000C => 1, # \f
350     }->{$self->{c}}) {
351     redo W;
352     } elsif (my $v = {
353     0x002B => PLUS_TOKEN, # +
354     0x003E => GREATER_TOKEN, # >
355     0x002C => COMMA_TOKEN, # ,
356     0x007E => TILDE_TOKEN, # ~
357     }->{$self->{c}}) {
358     # stay in the state
359     $self->{c} = $self->{get_char}->();
360     return {type => $v};
361     #redo A;
362     } else {
363     # stay in the state
364     # reprocess
365     return {type => S_TOKEN};
366     #redo A;
367     }
368     } # W
369     } elsif (my $v = {
370     0x007C => DASHMATCH_TOKEN, # |
371     0x005E => PREFIXMATCH_TOKEN, # ^
372     0x0024 => SUFFIXMATCH_TOKEN, # $
373     0x002A => SUBSTRINGMATCH_TOKEN, # *
374     }->{$self->{c}}) {
375 wakaba 1.2 my $c = $self->{c};
376 wakaba 1.1 $self->{c} = $self->{get_char}->();
377     if ($self->{c} == 0x003D) { # =
378     # stay in the state
379     $self->{c} = $self->{get_char}->();
380     return {type => $v};
381     #redo A;
382 wakaba 1.13 } elsif ($v = {
383     0x002A => STAR_TOKEN, # *
384     0x007C => VBAR_TOKEN, # |
385     }->{$c}) {
386     # stay in the state.
387     # reprocess
388     return {type => $v};
389     #redo A;
390 wakaba 1.1 } else {
391     # stay in the state
392     # reprocess
393 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
394 wakaba 1.1 #redo A;
395     }
396     } elsif ($self->{c} == 0x002B) { # +
397     # stay in the state
398     $self->{c} = $self->{get_char}->();
399     return {type => PLUS_TOKEN};
400     #redo A;
401     } elsif ($self->{c} == 0x003E) { # >
402     # stay in the state
403     $self->{c} = $self->{get_char}->();
404     return {type => GREATER_TOKEN};
405     #redo A;
406     } elsif ($self->{c} == 0x002C) { # ,
407     # stay in the state
408     $self->{c} = $self->{get_char}->();
409     return {type => COMMA_TOKEN};
410     #redo A;
411     } elsif ($self->{c} == 0x007E) { # ~
412     $self->{c} = $self->{get_char}->();
413     if ($self->{c} == 0x003D) { # =
414     # stay in the state
415     $self->{c} = $self->{get_char}->();
416     return {type => INCLUDES_TOKEN};
417     #redo A;
418     } else {
419     # stay in the state
420     # reprocess
421     return {type => TILDE_TOKEN};
422     #redo A;
423     }
424     } elsif ($self->{c} == -1) {
425     # stay in the state
426     $self->{c} = $self->{get_char}->();
427     return {type => EOF_TOKEN};
428     #redo A;
429     } else {
430     # stay in the state
431 wakaba 1.5 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
432 wakaba 1.1 $self->{c} = $self->{get_char}->();
433 wakaba 1.5 return $self->{t};
434 wakaba 1.1 #redo A;
435     }
436     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
437 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
438     ## |FUNCTION|)
439 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
440     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
441 wakaba 1.1 $self->{c} == 0x005F or # _
442     $self->{c} > 0x007F) { # nonascii
443 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
444     $self->{t}->{type} = DIMENSION_TOKEN
445     if $self->{t}->{type} == NUMBER_TOKEN;
446 wakaba 1.1 $self->{state} = NAME_STATE;
447     $self->{c} = $self->{get_char}->();
448     redo A;
449     } elsif ($self->{c} == 0x005C) { # \
450     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
451     $self->{c} = $self->{get_char}->();
452     redo A;
453 wakaba 1.10 } elsif ($self->{c} == 0x002D) { # -
454     if ($self->{t}->{type} == IDENT_TOKEN) {
455     $self->{c} = $self->{get_char}->();
456     if ($self->{c} == 0x003E) { # >
457     $self->{state} = BEFORE_TOKEN_STATE;
458     $self->{c} = $self->{get_char}->();
459     return {type => CDC_TOKEN};
460     #redo A;
461     } else {
462     ## NOTE: |-|, |-|, $self->{c}
463     #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464     # stay in the state
465     # reconsume
466 wakaba 1.13 return {type => MINUS_TOKEN};
467 wakaba 1.10 #redo A;
468     }
469     } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
470 wakaba 1.1 $self->{c} = $self->{get_char}->();
471 wakaba 1.10 if ($self->{c} == 0x003E) { # >
472     unshift @{$self->{token}}, {type => CDC_TOKEN};
473     $self->{t}->{type} = NUMBER_TOKEN;
474     $self->{t}->{value} = '';
475     $self->{state} = BEFORE_TOKEN_STATE;
476     $self->{c} = $self->{get_char}->();
477     return $self->{t};
478     #redo A;
479     } else {
480     ## NOTE: |-|, |-|, $self->{c}
481     my $t = $self->{t};
482     $t->{type} = NUMBER_TOKEN;
483     $t->{value} = '';
484     $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
486 wakaba 1.10 # stay in the state
487     # reconsume
488     return $t;
489     #redo A;
490     }
491 wakaba 1.1 } else {
492 wakaba 1.10 #
493 wakaba 1.1 }
494     } else {
495 wakaba 1.10 #
496     }
497    
498     if ($self->{t}->{type} == DIMENSION_TOKEN) {
499     ## NOTE: |-| after |NUMBER|.
500 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
501 wakaba 1.10 $self->{state} = BEFORE_TOKEN_STATE;
502     # reprocess
503     $self->{t}->{type} = NUMBER_TOKEN;
504     $self->{t}->{value} = '';
505     return $self->{t};
506     } else {
507     ## NOTE: |-| not followed by |nmstart|.
508     $self->{state} = BEFORE_TOKEN_STATE;
509     # reprocess
510 wakaba 1.13 return {type => MINUS_TOKEN};
511 wakaba 1.1 }
512 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
513     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515     $self->{c} == 0x005F or # _
516     $self->{c} > 0x007F) { # nonascii
517 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
518 wakaba 1.3 $self->{state} = NAME_STATE;
519     $self->{c} = $self->{get_char}->();
520     redo A;
521     } elsif ($self->{c} == 0x002D) { # -
522 wakaba 1.5 $self->{t}->{value} .= '-';
523 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
524     $self->{c} = $self->{get_char}->();
525     redo A;
526     } elsif ($self->{c} == 0x005C) { # \
527     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
528     $self->{c} = $self->{get_char}->();
529     redo A;
530     } else {
531     $self->{state} = BEFORE_TOKEN_STATE;
532     # reprocess
533     return {type => DELIM_TOKEN, value => '@'};
534     }
535     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
536     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
537     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
538     $self->{c} == 0x005F or # _
539     $self->{c} > 0x007F) { # nonascii
540 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
541 wakaba 1.3 $self->{state} = NAME_STATE;
542     $self->{c} = $self->{get_char}->();
543     redo A;
544     } elsif ($self->{c} == 0x002D) { # -
545     $self->{c} = $self->{get_char}->();
546     if ($self->{c} == 0x003E) { # >
547 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
548 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
549     $self->{c} = $self->{get_char}->();
550 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
551 wakaba 1.3 #redo A;
552     } else {
553 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
554 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
555 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
556     # reprocess
557     return {type => DELIM_TOKEN, value => '@'};
558     #redo A;
559     }
560     } elsif ($self->{c} == 0x005C) { # \
561     ## TODO: @-\{nl}
562     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
563     $self->{c} = $self->{get_char}->();
564     redo A;
565     } else {
566 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
567 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
568     # reprocess
569     return {type => DELIM_TOKEN, value => '@'};
570     }
571 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
572     if ($self->{c} == 0x002D) { # -
573     ## NOTE: |-| in |ident|.
574 wakaba 1.10 $self->{t}->{hyphen} = 1;
575 wakaba 1.5 $self->{t}->{value} = '-';
576 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
577 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
578     $self->{c} = $self->{get_char}->();
579     redo A;
580 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
581     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
582 wakaba 1.1 $self->{c} == 0x005F or # _
583     $self->{c} > 0x007F) { # nonascii
584     ## NOTE: |nmstart| in |ident|.
585 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
586     $self->{t}->{type} = DIMENSION_TOKEN;
587 wakaba 1.1 $self->{state} = NAME_STATE;
588     $self->{c} = $self->{get_char}->();
589     redo A;
590     } elsif ($self->{c} == 0x005C) { # \
591     ## NOTE: |nmstart| in |ident| in |IDENT|
592 wakaba 1.5 $self->{t}->{value} = '';
593 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
594 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
595     $self->{c} = $self->{get_char}->();
596     redo A;
597     } elsif ($self->{c} == 0x0025) { # %
598 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
599 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
600     $self->{c} = $self->{get_char}->();
601 wakaba 1.5 return $self->{t};
602 wakaba 1.1 #redo A;
603     } else {
604     $self->{state} = BEFORE_TOKEN_STATE;
605     # reprocess
606 wakaba 1.5 return $self->{t};
607 wakaba 1.1 #redo A;
608     }
609     } elsif ($self->{state} == HASH_OPEN_STATE) {
610     ## NOTE: The first |nmchar| in |name| in |HASH|.
611 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
612     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
613     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
614 wakaba 1.1 $self->{c} == 0x002D or # -
615     $self->{c} == 0x005F or # _
616     $self->{c} > 0x007F) { # nonascii
617 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
618 wakaba 1.1 $self->{state} = NAME_STATE;
619     $self->{c} = $self->{get_char}->();
620     redo A;
621     } elsif ($self->{c} == 0x005C) { # \
622     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
623     $self->{c} = $self->{get_char}->();
624     redo A;
625     } else {
626     $self->{state} = BEFORE_TOKEN_STATE;
627 wakaba 1.9 # reprocess
628 wakaba 1.1 return {type => DELIM_TOKEN, value => '#'};
629     #redo A;
630     }
631     } elsif ($self->{state} == NAME_STATE) {
632     ## NOTE: |nmchar| in (|ident| or |name|).
633 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
634     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
635     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
636 wakaba 1.1 $self->{c} == 0x005F or # _
637     $self->{c} == 0x002D or # -
638     $self->{c} > 0x007F) { # nonascii
639 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
640 wakaba 1.1 # stay in the state
641     $self->{c} = $self->{get_char}->();
642     redo A;
643     } elsif ($self->{c} == 0x005C) { # \
644 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
645 wakaba 1.1 $self->{c} = $self->{get_char}->();
646     redo A;
647     } elsif ($self->{c} == 0x0028 and # (
648 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
649     my $func_name = $self->{t}->{value};
650 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
651     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
652 wakaba 1.5 if ($self->{t}->{has_escape}) {
653 wakaba 1.3 ## TODO: warn
654     }
655 wakaba 1.5 $self->{t}->{type}
656 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
657 wakaba 1.5 $self->{t}->{value} = '';
658 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
659     $self->{c} = $self->{get_char}->();
660     redo A;
661     } else {
662 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
663 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
664     $self->{c} = $self->{get_char}->();
665 wakaba 1.5 return $self->{t};
666 wakaba 1.1 #redo A;
667     }
668     } else {
669     $self->{state} = BEFORE_TOKEN_STATE;
670     # reconsume
671 wakaba 1.5 return $self->{t};
672 wakaba 1.1 #redo A;
673     }
674 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
675     while ({
676     0x0020 => 1, # SP
677     0x0009 => 1, # \t
678     0x000D => 1, # \r
679     0x000A => 1, # \n
680     0x000C => 1, # \f
681     }->{$self->{c}}) {
682     $self->{c} = $self->{get_char}->();
683     }
684     if ($self->{c} == -1) {
685 wakaba 1.5 $self->{t}->{type} = {
686 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
687     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690 wakaba 1.5 }->{$self->{t}->{type}};
691 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
692     $self->{c} = $self->{get_char}->();
693 wakaba 1.5 return $self->{t};
694 wakaba 1.3 #redo A;
695     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
696     ## TODO: Should we consider matches of "(" and ")"?
697 wakaba 1.5 $self->{t}->{type} = {
698 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
699     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
700     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
701     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
702 wakaba 1.5 }->{$self->{t}->{type}};
703 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
704     $self->{c} = $self->{get_char}->();
705     redo A;
706     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
707     $self->{state} = STRING_STATE; $q = $self->{c};
708     $self->{c} = $self->{get_char}->();
709     redo A;
710     } elsif ($self->{c} == 0x0029) { # )
711     $self->{state} = BEFORE_TOKEN_STATE;
712     $self->{c} = $self->{get_char}->();
713 wakaba 1.5 return $self->{t};
714 wakaba 1.3 #redo A;
715     } elsif ($self->{c} == 0x005C) { # \
716     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
717     $self->{c} = $self->{get_char}->();
718     redo A;
719     } else {
720 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
721 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
722     $self->{c} = $self->{get_char}->();
723     redo A;
724     }
725     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
726     if ({
727     0x0020 => 1, # SP
728     0x0009 => 1, # \t
729     0x000D => 1, # \r
730     0x000A => 1, # \n
731     0x000C => 1, # \f
732     }->{$self->{c}}) {
733     $self->{state} = URI_AFTER_WSP_STATE;
734     $self->{c} = $self->{get_char}->();
735     redo A;
736     } elsif ($self->{c} == -1) {
737 wakaba 1.5 $self->{t}->{type} = {
738 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
739     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
740     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
741     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
742 wakaba 1.5 }->{$self->{t}->{type}};
743 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
744     $self->{c} = $self->{get_char}->();
745 wakaba 1.5 return $self->{t};
746 wakaba 1.3 #redo A;
747     } elsif ($self->{c} < 0x0020 or {
748     0x0022 => 1, # "
749     0x0027 => 1, # '
750     0x0028 => 1, # (
751     }->{$self->{c}}) { # C0 or (
752     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
753 wakaba 1.5 $self->{t}->{type} = {
754 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
755     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
756     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
757     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
758 wakaba 1.5 }->{$self->{t}->{type}};
759 wakaba 1.3 # stay in the state.
760     $self->{c} = $self->{get_char}->();
761     redo A;
762     } elsif ($self->{c} == 0x0029) { # )
763     $self->{state} = BEFORE_TOKEN_STATE;
764     $self->{c} = $self->{get_char}->();
765 wakaba 1.5 return $self->{t};
766 wakaba 1.3 #redo A;
767     } elsif ($self->{c} == 0x005C) { # \
768     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
769     $self->{c} = $self->{get_char}->();
770     redo A;
771     } else {
772 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
773 wakaba 1.3 # stay in the state.
774     $self->{c} = $self->{get_char}->();
775     redo A;
776     }
777     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
778     if ({
779     0x0020 => 1, # SP
780     0x0009 => 1, # \t
781     0x000D => 1, # \r
782     0x000A => 1, # \n
783     0x000C => 1, # \f
784     }->{$self->{c}}) {
785     # stay in the state.
786     $self->{c} = $self->{get_char}->();
787     redo A;
788     } elsif ($self->{c} == -1) {
789 wakaba 1.5 $self->{t}->{type} = {
790 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
791     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
792     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
793     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
794 wakaba 1.5 }->{$self->{t}->{type}};
795 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
796     $self->{c} = $self->{get_char}->();
797 wakaba 1.5 return $self->{t};
798 wakaba 1.3 #redo A;
799     } elsif ($self->{c} == 0x0029) { # )
800     $self->{state} = BEFORE_TOKEN_STATE;
801     $self->{c} = $self->{get_char}->();
802 wakaba 1.5 return $self->{t};
803 wakaba 1.3 #redo A;
804     } elsif ($self->{c} == 0x005C) { # \
805     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
806     $self->{c} = $self->{get_char}->();
807     redo A;
808     } else {
809     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
810 wakaba 1.5 $self->{t}->{type} = {
811 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
812     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815 wakaba 1.5 }->{$self->{t}->{type}};
816 wakaba 1.3 # stay in the state.
817     $self->{c} = $self->{get_char}->();
818     redo A;
819     }
820 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
821 wakaba 1.5 $self->{t}->{has_escape} = 1;
822 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
823     ## NOTE: second character of |unicode| in |escape|.
824     $char = $self->{c} - 0x0030;
825     $self->{state} = ESCAPE_STATE; $i = 2;
826     $self->{c} = $self->{get_char}->();
827     redo A;
828     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
829     ## NOTE: second character of |unicode| in |escape|.
830     $char = $self->{c} - 0x0041 + 0xA;
831     $self->{state} = ESCAPE_STATE; $i = 2;
832     $self->{c} = $self->{get_char}->();
833     redo A;
834 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
835 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
836 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
837 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
838     $self->{c} = $self->{get_char}->();
839     redo A;
840     } elsif ($self->{c} == 0x000A or # \n
841     $self->{c} == 0x000C) { # \f
842     if ($q == 0) {
843 wakaba 1.7 #
844 wakaba 1.3 } elsif ($q == 1) {
845     ## NOTE: In |escape| in |URI|.
846 wakaba 1.5 $self->{t}->{type} = {
847 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
848     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
849     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
850     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
851 wakaba 1.5 }->{$self->{t}->{type}};
852     $self->{t}->{value} .= chr $self->{c};
853 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
854     $self->{c} = $self->{get_char}->();
855     redo A;
856 wakaba 1.1 } else {
857     ## Note: In |nl| in ... in |string| or |ident|.
858     $self->{state} = STRING_STATE;
859     $self->{c} = $self->{get_char}->();
860     redo A;
861     }
862     } elsif ($self->{c} == 0x000D) { # \r
863     if ($q == 0) {
864 wakaba 1.7 #
865 wakaba 1.3 } elsif ($q == 1) {
866 wakaba 1.7 ## NOTE: In |escape| in |URI|.
867 wakaba 1.5 $self->{t}->{type} = {
868 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
869     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872 wakaba 1.5 }->{$self->{t}->{type}};
873 wakaba 1.8 $self->{state} = ESCAPE_BEFORE_LF_STATE;
874 wakaba 1.3 $self->{c} = $self->{get_char}->();
875     redo A;
876 wakaba 1.1 } else {
877     ## Note: In |nl| in ... in |string| or |ident|.
878     $self->{state} = ESCAPE_BEFORE_LF_STATE;
879     $self->{c} = $self->{get_char}->();
880     redo A;
881     }
882 wakaba 1.7 } elsif ($self->{c} == -1) {
883     #
884 wakaba 1.1 } else {
885     ## NOTE: second character of |escape|.
886 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
887 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
888     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
889 wakaba 1.1 $self->{c} = $self->{get_char}->();
890     redo A;
891     }
892 wakaba 1.7
893     if ($q == 0) {
894 wakaba 1.10 if ($self->{t}->{type} == DIMENSION_TOKEN) {
895     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
896     $self->{state} = BEFORE_TOKEN_STATE;
897     # reprocess
898     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
900 wakaba 1.10 $self->{t}->{type} = NUMBER_TOKEN;
901     $self->{t}->{value} = '';
902     return $self->{t};
903     #redo A;
904     } elsif (length $self->{t}->{value}) {
905     $self->{state} = BEFORE_TOKEN_STATE;
906     # reprocess
907     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908     return $self->{t};
909     #redo A;
910     } else {
911     $self->{state} = BEFORE_TOKEN_STATE;
912     # reprocess
913     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914     $self->{t}->{type} = NUMBER_TOKEN;
915     $self->{t}->{value} = '';
916     return $self->{t};
917     #redo A;
918     }
919 wakaba 1.7 } else {
920 wakaba 1.10 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
921     $self->{state} = BEFORE_TOKEN_STATE;
922     # reprocess
923     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924 wakaba 1.13 return {type => MINUS_TOKEN};
925 wakaba 1.10 #redo A;
926     } elsif (length $self->{t}->{value}) {
927     $self->{state} = BEFORE_TOKEN_STATE;
928     # reprocess
929     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
930     return $self->{t};
931     #redo A;
932     } else {
933     $self->{state} = BEFORE_TOKEN_STATE;
934     # reprocess
935     return {type => DELIM_TOKEN, value => '\\'};
936     #redo A;
937     }
938 wakaba 1.7 }
939 wakaba 1.8 } elsif ($q == 1) {
940     $self->{state} = URI_UNQUOTED_STATE;
941 wakaba 1.7 $self->{c} = $self->{get_char}->();
942     redo A;
943 wakaba 1.8 } else {
944     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
945     $self->{t}->{type} = {
946     STRING_TOKEN, INVALID_TOKEN,
947     URI_TOKEN, URI_INVALID_TOKEN,
948     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
949     }->{$self->{t}->{type}} || $self->{t}->{type};
950     $self->{state} = BEFORE_TOKEN_STATE;
951     # reprocess
952     return $self->{t};
953     #redo A;
954 wakaba 1.7 }
955 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
956     ## NOTE: third..seventh character of |unicode| in |escape|.
957     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
958     $char = $char * 0x10 + $self->{c} - 0x0030;
959     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
960     $self->{c} = $self->{get_char}->();
961     redo A;
962     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
963     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
964     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
965     $self->{c} = $self->{get_char}->();
966     redo A;
967 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
968 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
969 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
970     $self->{c} = $self->{get_char}->();
971     redo A;
972     } elsif ($self->{c} == 0x0020 or # SP
973     $self->{c} == 0x000A or # \n
974     $self->{c} == 0x0009 or # \t
975     $self->{c} == 0x000C) { # \f
976 wakaba 1.5 $self->{t}->{value} .= chr $char;
977 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
978     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
979 wakaba 1.1 $self->{c} = $self->{get_char}->();
980     redo A;
981     } elsif ($self->{c} == 0x000D) { # \r
982     $self->{state} = ESCAPE_BEFORE_LF_STATE;
983     $self->{c} = $self->{get_char}->();
984     redo A;
985     } else {
986 wakaba 1.5 $self->{t}->{value} .= chr $char;
987 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
988     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989 wakaba 1.1 # reconsume
990     redo A;
991     }
992     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
993     ## NOTE: eighth character of |unicode| in |escape|.
994     if ($self->{c} == 0x0020 or # SP
995     $self->{c} == 0x000A or # \n
996     $self->{c} == 0x0009 or # \t
997     $self->{c} == 0x000C) { # \f
998 wakaba 1.5 $self->{t}->{value} .= chr $char;
999 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1000     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1001 wakaba 1.1 $self->{c} = $self->{get_char}->();
1002     redo A;
1003     } elsif ($self->{c} == 0x000D) { # \r
1004     $self->{state} = ESCAPE_BEFORE_NL_STATE;
1005     $self->{c} = $self->{get_char}->();
1006     redo A;
1007     } else {
1008 wakaba 1.5 $self->{t}->{value} .= chr $char;
1009 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1010     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1011 wakaba 1.1 # reconsume
1012     redo A;
1013     }
1014     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015 wakaba 1.15 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016 wakaba 1.1 if ($self->{c} == 0x000A) { # \n
1017 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1018     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019 wakaba 1.1 $self->{c} = $self->{get_char}->();
1020     redo A;
1021     } else {
1022 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1023     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1024 wakaba 1.8 # reprocess
1025 wakaba 1.1 redo A;
1026     }
1027     } elsif ($self->{state} == STRING_STATE) {
1028     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1029     ## a character in |invalid$Q| in |invalid| in |INVALID|,
1030     ## where |$Q = $q == 0x0022 ? 1 : 2|.
1031 wakaba 1.3 ## Or, in |URI|.
1032 wakaba 1.1 if ($self->{c} == 0x005C) { # \
1033     $self->{state} = ESCAPE_OPEN_STATE;
1034     $self->{c} = $self->{get_char}->();
1035     redo A;
1036     } elsif ($self->{c} == $q) { # " | '
1037 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
1038 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
1039     $self->{c} = $self->{get_char}->();
1040 wakaba 1.5 return $self->{t};
1041 wakaba 1.3 #redo A;
1042     } else {
1043     $self->{state} = URI_AFTER_WSP_STATE;
1044     $self->{c} = $self->{get_char}->();
1045     redo A;
1046     }
1047 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
1048     $self->{c} == 0x000D or # \r
1049     $self->{c} == 0x000C or # \f
1050     $self->{c} == -1) {
1051 wakaba 1.11 $self->{t}->{type} = {
1052     STRING_TOKEN, INVALID_TOKEN,
1053     INVALID_TOKEN, INVALID_TOKEN,
1054     URI_TOKEN, URI_INVALID_TOKEN,
1055     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1056     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1057     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1058     }->{$self->{t}->{type}};
1059 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1060     # reconsume
1061 wakaba 1.5 return $self->{t};
1062 wakaba 1.1 #redo A;
1063     } else {
1064 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1065 wakaba 1.1 # stay in the state
1066     $self->{c} = $self->{get_char}->();
1067     redo A;
1068     }
1069     } elsif ($self->{state} == NUMBER_STATE) {
1070     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1071     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1072 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1073 wakaba 1.1 # stay in the state
1074     $self->{c} = $self->{get_char}->();
1075     redo A;
1076     } elsif ($self->{c} == 0x002E) { # .
1077     $self->{state} = NUMBER_DOT_STATE;
1078     $self->{c} = $self->{get_char}->();
1079     redo A;
1080     } else {
1081 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1082     $self->{t}->{value} = '';
1083 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1084     # reprocess
1085 wakaba 1.2 redo A;
1086 wakaba 1.1 }
1087     } elsif ($self->{state} == NUMBER_DOT_STATE) {
1088     ## NOTE: The character immediately following |.| in |num|.
1089     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1090 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1091 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1092     $self->{c} = $self->{get_char}->();
1093     redo A;
1094     } else {
1095 wakaba 1.13 unshift @{$self->{token}}, {type => DOT_TOKEN};
1096 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1097     $self->{t}->{value} = '';
1098 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1099     # reprocess
1100 wakaba 1.5 return $self->{t};
1101 wakaba 1.1 #redo A;
1102     }
1103     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1104     ## NOTE: The character immediately following |.| at the beginning of |num|.
1105     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1106 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1107 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1108     $self->{c} = $self->{get_char}->();
1109     redo A;
1110     } else {
1111     $self->{state} = BEFORE_TOKEN_STATE;
1112 wakaba 1.9 # reprocess
1113 wakaba 1.13 return {type => DOT_TOKEN};
1114 wakaba 1.1 #redo A;
1115     }
1116     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1117     ## NOTE: |[0-9]| in |num| after |.|.
1118     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1119 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1120 wakaba 1.1 # stay in the state
1121     $self->{c} = $self->{get_char}->();
1122     redo A;
1123     } else {
1124 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1125     $self->{t}->{value} = '';
1126 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1127     # reprocess
1128 wakaba 1.2 redo A;
1129 wakaba 1.1 }
1130     } else {
1131     die "$0: Unknown state |$self->{state}|";
1132     }
1133     } # A
1134     } # get_next_token
1135    
1136 wakaba 1.16 =head1 LICENSE
1137    
1138     Copyright 2007 Wakaba <w@suika.fam.cx>
1139    
1140     This library is free software; you can redistribute it
1141     and/or modify it under the same terms as Perl itself.
1142    
1143     =cut
1144    
1145 wakaba 1.1 1;
1146 wakaba 1.16 # $Date: 2007/09/30 12:03:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24