/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download)
Tue Oct 14 14:57:52 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +16 -2 lines
++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:56:52 -0000
	* cdata-1.dat: Tests on CDATA section outside of the root element
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:57:28 -0000
	* Tokenizer.pm.src: Parse error if CDATA section is not closed or
	is placed outside of the root element.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.6 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124    
125     my $charref_map = {
126     0x0D => 0x000A,
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if state machine in the spec
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
182 wakaba 1.1 #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
194    
195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196     $self->{line_prev} = $self->{line};
197     $self->{column_prev} = $self->{column};
198     $self->{column}++;
199     $self->{nc}
200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201     } else {
202     $self->{set_nc}->($self);
203     }
204    
205     $self->{token} = [];
206     # $self->{escape}
207     } # _initialize_tokenizer
208    
209     ## A token has:
210     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212     ## ->{name} (DOCTYPE_TOKEN)
213     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214     ## ->{pubid} (DOCTYPE_TOKEN)
215     ## ->{sysid} (DOCTYPE_TOKEN)
216     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218     ## ->{name}
219     ## ->{value}
220     ## ->{has_reference} == 1 or 0
221     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222     ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
223     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224     ## while the token is pushed back to the stack.
225    
226     ## Emitted token MUST immediately be handled by the tree construction state.
227    
228     ## Before each step, UA MAY check to see if either one of the scripts in
229     ## "list of scripts that will execute as soon as possible" or the first
230     ## script in the "list of scripts that will execute asynchronously",
231     ## has completed loading. If one has, then it MUST be executed
232     ## and removed from the list.
233    
234     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235     ## (This requirement was dropped from HTML5 spec, unfortunately.)
236    
237     my $is_space = {
238     0x0009 => 1, # CHARACTER TABULATION (HT)
239     0x000A => 1, # LINE FEED (LF)
240     #0x000B => 0, # LINE TABULATION (VT)
241     0x000C => 1, # FORM FEED (FF)
242     #0x000D => 1, # CARRIAGE RETURN (CR)
243     0x0020 => 1, # SPACE (SP)
244     };
245    
246     sub _get_next_token ($) {
247     my $self = shift;
248    
249     if ($self->{self_closing}) {
250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251     ## NOTE: The |self_closing| flag is only set by start tag token.
252     ## In addition, when a start tag token is emitted, it is always set to
253     ## |ct|.
254     delete $self->{self_closing};
255     }
256    
257     if (@{$self->{token}}) {
258     $self->{self_closing} = $self->{token}->[0]->{self_closing};
259     return shift @{$self->{token}};
260     }
261    
262     A: {
263     if ($self->{state} == PCDATA_STATE) {
264     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265    
266     if ($self->{nc} == 0x0026) { # &
267    
268     ## NOTE: In the spec, the tokenizer is switched to the
269     ## "entity data state". In this implementation, the tokenizer
270     ## is switched to the |ENTITY_STATE|, which is an implementation
271     ## of the "consume a character reference" algorithm.
272     $self->{entity_add} = -1;
273     $self->{prev_state} = DATA_STATE;
274     $self->{state} = ENTITY_STATE;
275    
276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277     $self->{line_prev} = $self->{line};
278     $self->{column_prev} = $self->{column};
279     $self->{column}++;
280     $self->{nc}
281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282     } else {
283     $self->{set_nc}->($self);
284     }
285    
286     redo A;
287     } elsif ($self->{nc} == 0x003C) { # <
288    
289     $self->{state} = TAG_OPEN_STATE;
290    
291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292     $self->{line_prev} = $self->{line};
293     $self->{column_prev} = $self->{column};
294     $self->{column}++;
295     $self->{nc}
296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297     } else {
298     $self->{set_nc}->($self);
299     }
300    
301     redo A;
302     } elsif ($self->{nc} == -1) {
303    
304     return ({type => END_OF_FILE_TOKEN,
305     line => $self->{line}, column => $self->{column}});
306     last A; ## TODO: ok?
307     } else {
308    
309     #
310     }
311    
312     # Anything else
313     my $token = {type => CHARACTER_TOKEN,
314     data => chr $self->{nc},
315     line => $self->{line}, column => $self->{column},
316     };
317     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318    
319     ## Stay in the state.
320    
321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322     $self->{line_prev} = $self->{line};
323     $self->{column_prev} = $self->{column};
324     $self->{column}++;
325     $self->{nc}
326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327     } else {
328     $self->{set_nc}->($self);
329     }
330    
331     return ($token);
332     redo A;
333     } elsif ($self->{state} == DATA_STATE) {
334     $self->{s_kwd} = '' unless defined $self->{s_kwd};
335     if ($self->{nc} == 0x0026) { # &
336     $self->{s_kwd} = '';
337     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338     not $self->{escape}) {
339    
340     ## NOTE: In the spec, the tokenizer is switched to the
341     ## "entity data state". In this implementation, the tokenizer
342     ## is switched to the |ENTITY_STATE|, which is an implementation
343     ## of the "consume a character reference" algorithm.
344     $self->{entity_add} = -1;
345     $self->{prev_state} = DATA_STATE;
346     $self->{state} = ENTITY_STATE;
347    
348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349     $self->{line_prev} = $self->{line};
350     $self->{column_prev} = $self->{column};
351     $self->{column}++;
352     $self->{nc}
353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354     } else {
355     $self->{set_nc}->($self);
356     }
357    
358     redo A;
359     } else {
360    
361     #
362     }
363     } elsif ($self->{nc} == 0x002D) { # -
364     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
366 wakaba 1.1
367     $self->{escape} = 1; # unless $self->{escape};
368     $self->{s_kwd} = '--';
369     #
370 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
371 wakaba 1.1
372     $self->{s_kwd} = '--';
373     #
374 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
375    
376     $self->{s_kwd} .= '-';
377     #
378 wakaba 1.1 } else {
379    
380 wakaba 1.5 $self->{s_kwd} = '-';
381 wakaba 1.1 #
382     }
383     }
384    
385     #
386     } elsif ($self->{nc} == 0x0021) { # !
387     if (length $self->{s_kwd}) {
388    
389     $self->{s_kwd} .= '!';
390     #
391     } else {
392    
393     #$self->{s_kwd} = '';
394     #
395     }
396     #
397     } elsif ($self->{nc} == 0x003C) { # <
398     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
399     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
400     not $self->{escape})) {
401    
402     $self->{state} = TAG_OPEN_STATE;
403    
404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
405     $self->{line_prev} = $self->{line};
406     $self->{column_prev} = $self->{column};
407     $self->{column}++;
408     $self->{nc}
409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
410     } else {
411     $self->{set_nc}->($self);
412     }
413    
414     redo A;
415     } else {
416    
417     $self->{s_kwd} = '';
418     #
419     }
420     } elsif ($self->{nc} == 0x003E) { # >
421     if ($self->{escape} and
422     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
423     if ($self->{s_kwd} eq '--') {
424    
425     delete $self->{escape};
426 wakaba 1.5 #
427 wakaba 1.1 } else {
428    
429 wakaba 1.5 #
430 wakaba 1.1 }
431 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
432    
433     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
434     line => $self->{line_prev},
435     column => $self->{column_prev} - 1);
436     #
437 wakaba 1.1 } else {
438    
439 wakaba 1.5 #
440 wakaba 1.1 }
441    
442     $self->{s_kwd} = '';
443     #
444 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
445     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
446    
447     $self->{s_kwd} .= ']';
448     } elsif ($self->{s_kwd} eq ']]') {
449    
450     #
451     } else {
452    
453     $self->{s_kwd} = '';
454     }
455     #
456 wakaba 1.1 } elsif ($self->{nc} == -1) {
457    
458     $self->{s_kwd} = '';
459     return ({type => END_OF_FILE_TOKEN,
460     line => $self->{line}, column => $self->{column}});
461     last A; ## TODO: ok?
462     } else {
463    
464     $self->{s_kwd} = '';
465     #
466     }
467    
468     # Anything else
469     my $token = {type => CHARACTER_TOKEN,
470     data => chr $self->{nc},
471     line => $self->{line}, column => $self->{column},
472     };
473 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
474 wakaba 1.1 length $token->{data})) {
475     $self->{s_kwd} = '';
476     }
477    
478     ## Stay in the data state.
479 wakaba 1.5 if (not $self->{is_xml} and
480     $self->{content_model} == PCDATA_CONTENT_MODEL) {
481 wakaba 1.1
482     $self->{state} = PCDATA_STATE;
483     } else {
484    
485     ## Stay in the state.
486     }
487    
488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
489     $self->{line_prev} = $self->{line};
490     $self->{column_prev} = $self->{column};
491     $self->{column}++;
492     $self->{nc}
493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
494     } else {
495     $self->{set_nc}->($self);
496     }
497    
498     return ($token);
499     redo A;
500     } elsif ($self->{state} == TAG_OPEN_STATE) {
501     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
502     if ($self->{nc} == 0x002F) { # /
503    
504    
505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
506     $self->{line_prev} = $self->{line};
507     $self->{column_prev} = $self->{column};
508     $self->{column}++;
509     $self->{nc}
510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
511     } else {
512     $self->{set_nc}->($self);
513     }
514    
515     $self->{state} = CLOSE_TAG_OPEN_STATE;
516     redo A;
517     } elsif ($self->{nc} == 0x0021) { # !
518    
519     $self->{s_kwd} = '<' unless $self->{escape};
520     #
521     } else {
522    
523     #
524     }
525    
526     ## reconsume
527     $self->{state} = DATA_STATE;
528 wakaba 1.5 $self->{s_kwd} = '';
529 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
530     line => $self->{line_prev},
531     column => $self->{column_prev},
532     });
533     redo A;
534     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
535     if ($self->{nc} == 0x0021) { # !
536    
537     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
538    
539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
540     $self->{line_prev} = $self->{line};
541     $self->{column_prev} = $self->{column};
542     $self->{column}++;
543     $self->{nc}
544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
545     } else {
546     $self->{set_nc}->($self);
547     }
548    
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551    
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553    
554     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
555     $self->{line_prev} = $self->{line};
556     $self->{column_prev} = $self->{column};
557     $self->{column}++;
558     $self->{nc}
559     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
560     } else {
561     $self->{set_nc}->($self);
562     }
563    
564     redo A;
565     } elsif (0x0041 <= $self->{nc} and
566     $self->{nc} <= 0x005A) { # A..Z
567    
568     $self->{ct}
569     = {type => START_TAG_TOKEN,
570 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
571 wakaba 1.1 line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574    
575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
576     $self->{line_prev} = $self->{line};
577     $self->{column_prev} = $self->{column};
578     $self->{column}++;
579     $self->{nc}
580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
581     } else {
582     $self->{set_nc}->($self);
583     }
584    
585     redo A;
586     } elsif (0x0061 <= $self->{nc} and
587     $self->{nc} <= 0x007A) { # a..z
588    
589     $self->{ct} = {type => START_TAG_TOKEN,
590     tag_name => chr ($self->{nc}),
591     line => $self->{line_prev},
592     column => $self->{column_prev}};
593     $self->{state} = TAG_NAME_STATE;
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     redo A;
606     } elsif ($self->{nc} == 0x003E) { # >
607    
608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
609     line => $self->{line_prev},
610     column => $self->{column_prev});
611     $self->{state} = DATA_STATE;
612 wakaba 1.5 $self->{s_kwd} = '';
613 wakaba 1.1
614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
615     $self->{line_prev} = $self->{line};
616     $self->{column_prev} = $self->{column};
617     $self->{column}++;
618     $self->{nc}
619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
620     } else {
621     $self->{set_nc}->($self);
622     }
623    
624    
625     return ({type => CHARACTER_TOKEN, data => '<>',
626     line => $self->{line_prev},
627     column => $self->{column_prev},
628     });
629    
630     redo A;
631     } elsif ($self->{nc} == 0x003F) { # ?
632    
633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
634     line => $self->{line_prev},
635     column => $self->{column_prev});
636     $self->{state} = BOGUS_COMMENT_STATE;
637     $self->{ct} = {type => COMMENT_TOKEN, data => '',
638     line => $self->{line_prev},
639     column => $self->{column_prev},
640     };
641     ## $self->{nc} is intentionally left as is
642     redo A;
643     } else {
644    
645     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
646     line => $self->{line_prev},
647     column => $self->{column_prev});
648     $self->{state} = DATA_STATE;
649 wakaba 1.5 $self->{s_kwd} = '';
650 wakaba 1.1 ## reconsume
651    
652     return ({type => CHARACTER_TOKEN, data => '<',
653     line => $self->{line_prev},
654     column => $self->{column_prev},
655     });
656    
657     redo A;
658     }
659     } else {
660     die "$0: $self->{content_model} in tag open";
661     }
662     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
663     ## NOTE: The "close tag open state" in the spec is implemented as
664     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
665    
666     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
667     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
668     if (defined $self->{last_stag_name}) {
669     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
670     $self->{s_kwd} = '';
671     ## Reconsume.
672     redo A;
673     } else {
674     ## No start tag token has ever been emitted
675     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
676    
677     $self->{state} = DATA_STATE;
678 wakaba 1.5 $self->{s_kwd} = '';
679 wakaba 1.1 ## Reconsume.
680     return ({type => CHARACTER_TOKEN, data => '</',
681     line => $l, column => $c,
682     });
683     redo A;
684     }
685     }
686    
687     if (0x0041 <= $self->{nc} and
688     $self->{nc} <= 0x005A) { # A..Z
689    
690     $self->{ct}
691     = {type => END_TAG_TOKEN,
692 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
693 wakaba 1.1 line => $l, column => $c};
694     $self->{state} = TAG_NAME_STATE;
695    
696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
697     $self->{line_prev} = $self->{line};
698     $self->{column_prev} = $self->{column};
699     $self->{column}++;
700     $self->{nc}
701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
702     } else {
703     $self->{set_nc}->($self);
704     }
705    
706     redo A;
707     } elsif (0x0061 <= $self->{nc} and
708     $self->{nc} <= 0x007A) { # a..z
709    
710     $self->{ct} = {type => END_TAG_TOKEN,
711     tag_name => chr ($self->{nc}),
712     line => $l, column => $c};
713     $self->{state} = TAG_NAME_STATE;
714    
715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
716     $self->{line_prev} = $self->{line};
717     $self->{column_prev} = $self->{column};
718     $self->{column}++;
719     $self->{nc}
720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
721     } else {
722     $self->{set_nc}->($self);
723     }
724    
725     redo A;
726     } elsif ($self->{nc} == 0x003E) { # >
727    
728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
729     line => $self->{line_prev}, ## "<" in "</>"
730     column => $self->{column_prev} - 1);
731     $self->{state} = DATA_STATE;
732 wakaba 1.5 $self->{s_kwd} = '';
733 wakaba 1.1
734     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
735     $self->{line_prev} = $self->{line};
736     $self->{column_prev} = $self->{column};
737     $self->{column}++;
738     $self->{nc}
739     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
740     } else {
741     $self->{set_nc}->($self);
742     }
743    
744     redo A;
745     } elsif ($self->{nc} == -1) {
746    
747     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
748 wakaba 1.5 $self->{s_kwd} = '';
749 wakaba 1.1 $self->{state} = DATA_STATE;
750     # reconsume
751    
752     return ({type => CHARACTER_TOKEN, data => '</',
753     line => $l, column => $c,
754     });
755    
756     redo A;
757     } else {
758    
759     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
760     $self->{state} = BOGUS_COMMENT_STATE;
761     $self->{ct} = {type => COMMENT_TOKEN, data => '',
762     line => $self->{line_prev}, # "<" of "</"
763     column => $self->{column_prev} - 1,
764     };
765     ## NOTE: $self->{nc} is intentionally left as is.
766     ## Although the "anything else" case of the spec not explicitly
767     ## states that the next input character is to be reconsumed,
768     ## it will be included to the |data| of the comment token
769     ## generated from the bogus end tag, as defined in the
770     ## "bogus comment state" entry.
771     redo A;
772     }
773     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
774     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
775     if (length $ch) {
776     my $CH = $ch;
777     $ch =~ tr/a-z/A-Z/;
778     my $nch = chr $self->{nc};
779     if ($nch eq $ch or $nch eq $CH) {
780    
781     ## Stay in the state.
782     $self->{s_kwd} .= $nch;
783    
784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
785     $self->{line_prev} = $self->{line};
786     $self->{column_prev} = $self->{column};
787     $self->{column}++;
788     $self->{nc}
789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
790     } else {
791     $self->{set_nc}->($self);
792     }
793    
794     redo A;
795     } else {
796    
797     $self->{state} = DATA_STATE;
798 wakaba 1.5 $self->{s_kwd} = '';
799 wakaba 1.1 ## Reconsume.
800     return ({type => CHARACTER_TOKEN,
801     data => '</' . $self->{s_kwd},
802     line => $self->{line_prev},
803     column => $self->{column_prev} - 1 - length $self->{s_kwd},
804     });
805     redo A;
806     }
807     } else { # after "<{tag-name}"
808     unless ($is_space->{$self->{nc}} or
809     {
810     0x003E => 1, # >
811     0x002F => 1, # /
812     -1 => 1, # EOF
813     }->{$self->{nc}}) {
814    
815     ## Reconsume.
816     $self->{state} = DATA_STATE;
817 wakaba 1.5 $self->{s_kwd} = '';
818 wakaba 1.1 return ({type => CHARACTER_TOKEN,
819     data => '</' . $self->{s_kwd},
820     line => $self->{line_prev},
821     column => $self->{column_prev} - 1 - length $self->{s_kwd},
822     });
823     redo A;
824     } else {
825    
826     $self->{ct}
827     = {type => END_TAG_TOKEN,
828     tag_name => $self->{last_stag_name},
829     line => $self->{line_prev},
830     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
831     $self->{state} = TAG_NAME_STATE;
832     ## Reconsume.
833     redo A;
834     }
835     }
836     } elsif ($self->{state} == TAG_NAME_STATE) {
837     if ($is_space->{$self->{nc}}) {
838    
839     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
840    
841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
842     $self->{line_prev} = $self->{line};
843     $self->{column_prev} = $self->{column};
844     $self->{column}++;
845     $self->{nc}
846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
847     } else {
848     $self->{set_nc}->($self);
849     }
850    
851     redo A;
852     } elsif ($self->{nc} == 0x003E) { # >
853     if ($self->{ct}->{type} == START_TAG_TOKEN) {
854    
855     $self->{last_stag_name} = $self->{ct}->{tag_name};
856     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
857     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
858     #if ($self->{ct}->{attributes}) {
859     # ## NOTE: This should never be reached.
860     # !!! cp (36);
861     # !!! parse-error (type => 'end tag attribute');
862     #} else {
863    
864     #}
865     } else {
866     die "$0: $self->{ct}->{type}: Unknown token type";
867     }
868     $self->{state} = DATA_STATE;
869 wakaba 1.5 $self->{s_kwd} = '';
870 wakaba 1.1
871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
872     $self->{line_prev} = $self->{line};
873     $self->{column_prev} = $self->{column};
874     $self->{column}++;
875     $self->{nc}
876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
877     } else {
878     $self->{set_nc}->($self);
879     }
880    
881    
882     return ($self->{ct}); # start tag or end tag
883    
884     redo A;
885     } elsif (0x0041 <= $self->{nc} and
886     $self->{nc} <= 0x005A) { # A..Z
887    
888 wakaba 1.4 $self->{ct}->{tag_name}
889     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
890 wakaba 1.1 # start tag or end tag
891     ## Stay in this state
892    
893     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
894     $self->{line_prev} = $self->{line};
895     $self->{column_prev} = $self->{column};
896     $self->{column}++;
897     $self->{nc}
898     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
899     } else {
900     $self->{set_nc}->($self);
901     }
902    
903     redo A;
904     } elsif ($self->{nc} == -1) {
905     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
906     if ($self->{ct}->{type} == START_TAG_TOKEN) {
907    
908     $self->{last_stag_name} = $self->{ct}->{tag_name};
909     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
910     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
911     #if ($self->{ct}->{attributes}) {
912     # ## NOTE: This state should never be reached.
913     # !!! cp (40);
914     # !!! parse-error (type => 'end tag attribute');
915     #} else {
916    
917     #}
918     } else {
919     die "$0: $self->{ct}->{type}: Unknown token type";
920     }
921     $self->{state} = DATA_STATE;
922 wakaba 1.5 $self->{s_kwd} = '';
923 wakaba 1.1 # reconsume
924    
925     return ($self->{ct}); # start tag or end tag
926    
927     redo A;
928     } elsif ($self->{nc} == 0x002F) { # /
929    
930     $self->{state} = SELF_CLOSING_START_TAG_STATE;
931    
932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933     $self->{line_prev} = $self->{line};
934     $self->{column_prev} = $self->{column};
935     $self->{column}++;
936     $self->{nc}
937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938     } else {
939     $self->{set_nc}->($self);
940     }
941    
942     redo A;
943     } else {
944    
945     $self->{ct}->{tag_name} .= chr $self->{nc};
946     # start tag or end tag
947     ## Stay in the state
948    
949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
950     $self->{line_prev} = $self->{line};
951     $self->{column_prev} = $self->{column};
952     $self->{column}++;
953     $self->{nc}
954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
955     } else {
956     $self->{set_nc}->($self);
957     }
958    
959     redo A;
960     }
961     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
962     if ($is_space->{$self->{nc}}) {
963    
964     ## Stay in the state
965    
966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
967     $self->{line_prev} = $self->{line};
968     $self->{column_prev} = $self->{column};
969     $self->{column}++;
970     $self->{nc}
971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
972     } else {
973     $self->{set_nc}->($self);
974     }
975    
976     redo A;
977     } elsif ($self->{nc} == 0x003E) { # >
978     if ($self->{ct}->{type} == START_TAG_TOKEN) {
979    
980     $self->{last_stag_name} = $self->{ct}->{tag_name};
981     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
982     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
983     if ($self->{ct}->{attributes}) {
984    
985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
986     } else {
987    
988     }
989     } else {
990     die "$0: $self->{ct}->{type}: Unknown token type";
991     }
992     $self->{state} = DATA_STATE;
993 wakaba 1.5 $self->{s_kwd} = '';
994 wakaba 1.1
995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996     $self->{line_prev} = $self->{line};
997     $self->{column_prev} = $self->{column};
998     $self->{column}++;
999     $self->{nc}
1000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001     } else {
1002     $self->{set_nc}->($self);
1003     }
1004    
1005    
1006     return ($self->{ct}); # start tag or end tag
1007    
1008     redo A;
1009     } elsif (0x0041 <= $self->{nc} and
1010     $self->{nc} <= 0x005A) { # A..Z
1011    
1012     $self->{ca}
1013 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1014 wakaba 1.1 value => '',
1015     line => $self->{line}, column => $self->{column}};
1016     $self->{state} = ATTRIBUTE_NAME_STATE;
1017    
1018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1019     $self->{line_prev} = $self->{line};
1020     $self->{column_prev} = $self->{column};
1021     $self->{column}++;
1022     $self->{nc}
1023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1024     } else {
1025     $self->{set_nc}->($self);
1026     }
1027    
1028     redo A;
1029     } elsif ($self->{nc} == 0x002F) { # /
1030    
1031     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1032    
1033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034     $self->{line_prev} = $self->{line};
1035     $self->{column_prev} = $self->{column};
1036     $self->{column}++;
1037     $self->{nc}
1038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1039     } else {
1040     $self->{set_nc}->($self);
1041     }
1042    
1043     redo A;
1044     } elsif ($self->{nc} == -1) {
1045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1046     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1047    
1048     $self->{last_stag_name} = $self->{ct}->{tag_name};
1049     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1050     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1051     if ($self->{ct}->{attributes}) {
1052    
1053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1054     } else {
1055    
1056     }
1057     } else {
1058     die "$0: $self->{ct}->{type}: Unknown token type";
1059     }
1060     $self->{state} = DATA_STATE;
1061 wakaba 1.5 $self->{s_kwd} = '';
1062 wakaba 1.1 # reconsume
1063    
1064     return ($self->{ct}); # start tag or end tag
1065    
1066     redo A;
1067     } else {
1068     if ({
1069     0x0022 => 1, # "
1070     0x0027 => 1, # '
1071     0x003D => 1, # =
1072     }->{$self->{nc}}) {
1073    
1074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1075     } else {
1076    
1077     }
1078     $self->{ca}
1079     = {name => chr ($self->{nc}),
1080     value => '',
1081     line => $self->{line}, column => $self->{column}};
1082     $self->{state} = ATTRIBUTE_NAME_STATE;
1083    
1084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1085     $self->{line_prev} = $self->{line};
1086     $self->{column_prev} = $self->{column};
1087     $self->{column}++;
1088     $self->{nc}
1089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1090     } else {
1091     $self->{set_nc}->($self);
1092     }
1093    
1094     redo A;
1095     }
1096     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1097     my $before_leave = sub {
1098     if (exists $self->{ct}->{attributes} # start tag or end tag
1099     ->{$self->{ca}->{name}}) { # MUST
1100    
1101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1102     ## Discard $self->{ca} # MUST
1103     } else {
1104    
1105     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1106     = $self->{ca};
1107     }
1108     }; # $before_leave
1109    
1110     if ($is_space->{$self->{nc}}) {
1111    
1112     $before_leave->();
1113     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1114    
1115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1116     $self->{line_prev} = $self->{line};
1117     $self->{column_prev} = $self->{column};
1118     $self->{column}++;
1119     $self->{nc}
1120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1121     } else {
1122     $self->{set_nc}->($self);
1123     }
1124    
1125     redo A;
1126     } elsif ($self->{nc} == 0x003D) { # =
1127    
1128     $before_leave->();
1129     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1130    
1131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1132     $self->{line_prev} = $self->{line};
1133     $self->{column_prev} = $self->{column};
1134     $self->{column}++;
1135     $self->{nc}
1136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1137     } else {
1138     $self->{set_nc}->($self);
1139     }
1140    
1141     redo A;
1142     } elsif ($self->{nc} == 0x003E) { # >
1143     $before_leave->();
1144     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1145    
1146     $self->{last_stag_name} = $self->{ct}->{tag_name};
1147     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1148    
1149     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1150     if ($self->{ct}->{attributes}) {
1151     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1152     }
1153     } else {
1154     die "$0: $self->{ct}->{type}: Unknown token type";
1155     }
1156     $self->{state} = DATA_STATE;
1157 wakaba 1.5 $self->{s_kwd} = '';
1158 wakaba 1.1
1159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160     $self->{line_prev} = $self->{line};
1161     $self->{column_prev} = $self->{column};
1162     $self->{column}++;
1163     $self->{nc}
1164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1165     } else {
1166     $self->{set_nc}->($self);
1167     }
1168    
1169    
1170     return ($self->{ct}); # start tag or end tag
1171    
1172     redo A;
1173     } elsif (0x0041 <= $self->{nc} and
1174     $self->{nc} <= 0x005A) { # A..Z
1175    
1176 wakaba 1.4 $self->{ca}->{name}
1177     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1178 wakaba 1.1 ## Stay in the state
1179    
1180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1181     $self->{line_prev} = $self->{line};
1182     $self->{column_prev} = $self->{column};
1183     $self->{column}++;
1184     $self->{nc}
1185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1186     } else {
1187     $self->{set_nc}->($self);
1188     }
1189    
1190     redo A;
1191     } elsif ($self->{nc} == 0x002F) { # /
1192    
1193     $before_leave->();
1194     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1195    
1196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1197     $self->{line_prev} = $self->{line};
1198     $self->{column_prev} = $self->{column};
1199     $self->{column}++;
1200     $self->{nc}
1201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1202     } else {
1203     $self->{set_nc}->($self);
1204     }
1205    
1206     redo A;
1207     } elsif ($self->{nc} == -1) {
1208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1209     $before_leave->();
1210     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211    
1212     $self->{last_stag_name} = $self->{ct}->{tag_name};
1213     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215     if ($self->{ct}->{attributes}) {
1216    
1217     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1218     } else {
1219     ## NOTE: This state should never be reached.
1220    
1221     }
1222     } else {
1223     die "$0: $self->{ct}->{type}: Unknown token type";
1224     }
1225     $self->{state} = DATA_STATE;
1226 wakaba 1.5 $self->{s_kwd} = '';
1227 wakaba 1.1 # reconsume
1228    
1229     return ($self->{ct}); # start tag or end tag
1230    
1231     redo A;
1232     } else {
1233     if ($self->{nc} == 0x0022 or # "
1234     $self->{nc} == 0x0027) { # '
1235    
1236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1237     } else {
1238    
1239     }
1240     $self->{ca}->{name} .= chr ($self->{nc});
1241     ## Stay in the state
1242    
1243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1244     $self->{line_prev} = $self->{line};
1245     $self->{column_prev} = $self->{column};
1246     $self->{column}++;
1247     $self->{nc}
1248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1249     } else {
1250     $self->{set_nc}->($self);
1251     }
1252    
1253     redo A;
1254     }
1255     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1256     if ($is_space->{$self->{nc}}) {
1257    
1258     ## Stay in the state
1259    
1260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1261     $self->{line_prev} = $self->{line};
1262     $self->{column_prev} = $self->{column};
1263     $self->{column}++;
1264     $self->{nc}
1265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1266     } else {
1267     $self->{set_nc}->($self);
1268     }
1269    
1270     redo A;
1271     } elsif ($self->{nc} == 0x003D) { # =
1272    
1273     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1274    
1275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1276     $self->{line_prev} = $self->{line};
1277     $self->{column_prev} = $self->{column};
1278     $self->{column}++;
1279     $self->{nc}
1280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1281     } else {
1282     $self->{set_nc}->($self);
1283     }
1284    
1285     redo A;
1286     } elsif ($self->{nc} == 0x003E) { # >
1287     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1288    
1289     $self->{last_stag_name} = $self->{ct}->{tag_name};
1290     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1291     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1292     if ($self->{ct}->{attributes}) {
1293    
1294     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1295     } else {
1296     ## NOTE: This state should never be reached.
1297    
1298     }
1299     } else {
1300     die "$0: $self->{ct}->{type}: Unknown token type";
1301     }
1302     $self->{state} = DATA_STATE;
1303 wakaba 1.5 $self->{s_kwd} = '';
1304 wakaba 1.1
1305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1306     $self->{line_prev} = $self->{line};
1307     $self->{column_prev} = $self->{column};
1308     $self->{column}++;
1309     $self->{nc}
1310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1311     } else {
1312     $self->{set_nc}->($self);
1313     }
1314    
1315    
1316     return ($self->{ct}); # start tag or end tag
1317    
1318     redo A;
1319     } elsif (0x0041 <= $self->{nc} and
1320     $self->{nc} <= 0x005A) { # A..Z
1321    
1322     $self->{ca}
1323 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1324 wakaba 1.1 value => '',
1325     line => $self->{line}, column => $self->{column}};
1326     $self->{state} = ATTRIBUTE_NAME_STATE;
1327    
1328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1329     $self->{line_prev} = $self->{line};
1330     $self->{column_prev} = $self->{column};
1331     $self->{column}++;
1332     $self->{nc}
1333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1334     } else {
1335     $self->{set_nc}->($self);
1336     }
1337    
1338     redo A;
1339     } elsif ($self->{nc} == 0x002F) { # /
1340    
1341     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1342    
1343     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1344     $self->{line_prev} = $self->{line};
1345     $self->{column_prev} = $self->{column};
1346     $self->{column}++;
1347     $self->{nc}
1348     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1349     } else {
1350     $self->{set_nc}->($self);
1351     }
1352    
1353     redo A;
1354     } elsif ($self->{nc} == -1) {
1355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1356     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1357    
1358     $self->{last_stag_name} = $self->{ct}->{tag_name};
1359     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1360     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1361     if ($self->{ct}->{attributes}) {
1362    
1363     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1364     } else {
1365     ## NOTE: This state should never be reached.
1366    
1367     }
1368     } else {
1369     die "$0: $self->{ct}->{type}: Unknown token type";
1370     }
1371 wakaba 1.5 $self->{s_kwd} = '';
1372 wakaba 1.1 $self->{state} = DATA_STATE;
1373     # reconsume
1374    
1375     return ($self->{ct}); # start tag or end tag
1376    
1377     redo A;
1378     } else {
1379     if ($self->{nc} == 0x0022 or # "
1380     $self->{nc} == 0x0027) { # '
1381    
1382     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1383     } else {
1384    
1385     }
1386     $self->{ca}
1387     = {name => chr ($self->{nc}),
1388     value => '',
1389     line => $self->{line}, column => $self->{column}};
1390     $self->{state} = ATTRIBUTE_NAME_STATE;
1391    
1392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1393     $self->{line_prev} = $self->{line};
1394     $self->{column_prev} = $self->{column};
1395     $self->{column}++;
1396     $self->{nc}
1397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1398     } else {
1399     $self->{set_nc}->($self);
1400     }
1401    
1402     redo A;
1403     }
1404     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1405     if ($is_space->{$self->{nc}}) {
1406    
1407     ## Stay in the state
1408    
1409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1410     $self->{line_prev} = $self->{line};
1411     $self->{column_prev} = $self->{column};
1412     $self->{column}++;
1413     $self->{nc}
1414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1415     } else {
1416     $self->{set_nc}->($self);
1417     }
1418    
1419     redo A;
1420     } elsif ($self->{nc} == 0x0022) { # "
1421    
1422     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1423    
1424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1425     $self->{line_prev} = $self->{line};
1426     $self->{column_prev} = $self->{column};
1427     $self->{column}++;
1428     $self->{nc}
1429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1430     } else {
1431     $self->{set_nc}->($self);
1432     }
1433    
1434     redo A;
1435     } elsif ($self->{nc} == 0x0026) { # &
1436    
1437     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1438     ## reconsume
1439     redo A;
1440     } elsif ($self->{nc} == 0x0027) { # '
1441    
1442     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     } elsif ($self->{nc} == 0x003E) { # >
1456     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1457     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1458    
1459     $self->{last_stag_name} = $self->{ct}->{tag_name};
1460     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1461     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1462     if ($self->{ct}->{attributes}) {
1463    
1464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1465     } else {
1466     ## NOTE: This state should never be reached.
1467    
1468     }
1469     } else {
1470     die "$0: $self->{ct}->{type}: Unknown token type";
1471     }
1472     $self->{state} = DATA_STATE;
1473 wakaba 1.5 $self->{s_kwd} = '';
1474 wakaba 1.1
1475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1476     $self->{line_prev} = $self->{line};
1477     $self->{column_prev} = $self->{column};
1478     $self->{column}++;
1479     $self->{nc}
1480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1481     } else {
1482     $self->{set_nc}->($self);
1483     }
1484    
1485    
1486     return ($self->{ct}); # start tag or end tag
1487    
1488     redo A;
1489     } elsif ($self->{nc} == -1) {
1490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1491     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1492    
1493     $self->{last_stag_name} = $self->{ct}->{tag_name};
1494     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1495     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1496     if ($self->{ct}->{attributes}) {
1497    
1498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1499     } else {
1500     ## NOTE: This state should never be reached.
1501    
1502     }
1503     } else {
1504     die "$0: $self->{ct}->{type}: Unknown token type";
1505     }
1506     $self->{state} = DATA_STATE;
1507 wakaba 1.5 $self->{s_kwd} = '';
1508 wakaba 1.1 ## reconsume
1509    
1510     return ($self->{ct}); # start tag or end tag
1511    
1512     redo A;
1513     } else {
1514     if ($self->{nc} == 0x003D) { # =
1515    
1516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1517     } else {
1518    
1519     }
1520     $self->{ca}->{value} .= chr ($self->{nc});
1521     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1522    
1523     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524     $self->{line_prev} = $self->{line};
1525     $self->{column_prev} = $self->{column};
1526     $self->{column}++;
1527     $self->{nc}
1528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529     } else {
1530     $self->{set_nc}->($self);
1531     }
1532    
1533     redo A;
1534     }
1535     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1536     if ($self->{nc} == 0x0022) { # "
1537    
1538     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1539    
1540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1541     $self->{line_prev} = $self->{line};
1542     $self->{column_prev} = $self->{column};
1543     $self->{column}++;
1544     $self->{nc}
1545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1546     } else {
1547     $self->{set_nc}->($self);
1548     }
1549    
1550     redo A;
1551     } elsif ($self->{nc} == 0x0026) { # &
1552    
1553     ## NOTE: In the spec, the tokenizer is switched to the
1554     ## "entity in attribute value state". In this implementation, the
1555     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1556     ## implementation of the "consume a character reference" algorithm.
1557     $self->{prev_state} = $self->{state};
1558     $self->{entity_add} = 0x0022; # "
1559     $self->{state} = ENTITY_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589     $self->{state} = DATA_STATE;
1590 wakaba 1.5 $self->{s_kwd} = '';
1591 wakaba 1.1 ## reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597    
1598     $self->{ca}->{value} .= chr ($self->{nc});
1599     $self->{read_until}->($self->{ca}->{value},
1600     q["&],
1601     length $self->{ca}->{value});
1602    
1603     ## Stay in the state
1604    
1605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1606     $self->{line_prev} = $self->{line};
1607     $self->{column_prev} = $self->{column};
1608     $self->{column}++;
1609     $self->{nc}
1610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1611     } else {
1612     $self->{set_nc}->($self);
1613     }
1614    
1615     redo A;
1616     }
1617     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1618     if ($self->{nc} == 0x0027) { # '
1619    
1620     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1621    
1622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623     $self->{line_prev} = $self->{line};
1624     $self->{column_prev} = $self->{column};
1625     $self->{column}++;
1626     $self->{nc}
1627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628     } else {
1629     $self->{set_nc}->($self);
1630     }
1631    
1632     redo A;
1633     } elsif ($self->{nc} == 0x0026) { # &
1634    
1635     ## NOTE: In the spec, the tokenizer is switched to the
1636     ## "entity in attribute value state". In this implementation, the
1637     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1638     ## implementation of the "consume a character reference" algorithm.
1639     $self->{entity_add} = 0x0027; # '
1640     $self->{prev_state} = $self->{state};
1641     $self->{state} = ENTITY_STATE;
1642    
1643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644     $self->{line_prev} = $self->{line};
1645     $self->{column_prev} = $self->{column};
1646     $self->{column}++;
1647     $self->{nc}
1648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649     } else {
1650     $self->{set_nc}->($self);
1651     }
1652    
1653     redo A;
1654     } elsif ($self->{nc} == -1) {
1655     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1656     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1657    
1658     $self->{last_stag_name} = $self->{ct}->{tag_name};
1659     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1661     if ($self->{ct}->{attributes}) {
1662    
1663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1664     } else {
1665     ## NOTE: This state should never be reached.
1666    
1667     }
1668     } else {
1669     die "$0: $self->{ct}->{type}: Unknown token type";
1670     }
1671     $self->{state} = DATA_STATE;
1672 wakaba 1.5 $self->{s_kwd} = '';
1673 wakaba 1.1 ## reconsume
1674    
1675     return ($self->{ct}); # start tag or end tag
1676    
1677     redo A;
1678     } else {
1679    
1680     $self->{ca}->{value} .= chr ($self->{nc});
1681     $self->{read_until}->($self->{ca}->{value},
1682     q['&],
1683     length $self->{ca}->{value});
1684    
1685     ## Stay in the state
1686    
1687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688     $self->{line_prev} = $self->{line};
1689     $self->{column_prev} = $self->{column};
1690     $self->{column}++;
1691     $self->{nc}
1692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693     } else {
1694     $self->{set_nc}->($self);
1695     }
1696    
1697     redo A;
1698     }
1699     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1700     if ($is_space->{$self->{nc}}) {
1701    
1702     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1703    
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714     redo A;
1715     } elsif ($self->{nc} == 0x0026) { # &
1716    
1717     ## NOTE: In the spec, the tokenizer is switched to the
1718     ## "entity in attribute value state". In this implementation, the
1719     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1720     ## implementation of the "consume a character reference" algorithm.
1721     $self->{entity_add} = -1;
1722     $self->{prev_state} = $self->{state};
1723     $self->{state} = ENTITY_STATE;
1724    
1725     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1726     $self->{line_prev} = $self->{line};
1727     $self->{column_prev} = $self->{column};
1728     $self->{column}++;
1729     $self->{nc}
1730     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1731     } else {
1732     $self->{set_nc}->($self);
1733     }
1734    
1735     redo A;
1736     } elsif ($self->{nc} == 0x003E) { # >
1737     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1738    
1739     $self->{last_stag_name} = $self->{ct}->{tag_name};
1740     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1741     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1742     if ($self->{ct}->{attributes}) {
1743    
1744     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1745     } else {
1746     ## NOTE: This state should never be reached.
1747    
1748     }
1749     } else {
1750     die "$0: $self->{ct}->{type}: Unknown token type";
1751     }
1752     $self->{state} = DATA_STATE;
1753 wakaba 1.5 $self->{s_kwd} = '';
1754 wakaba 1.1
1755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1756     $self->{line_prev} = $self->{line};
1757     $self->{column_prev} = $self->{column};
1758     $self->{column}++;
1759     $self->{nc}
1760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1761     } else {
1762     $self->{set_nc}->($self);
1763     }
1764    
1765    
1766     return ($self->{ct}); # start tag or end tag
1767    
1768     redo A;
1769     } elsif ($self->{nc} == -1) {
1770     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1771     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1772    
1773     $self->{last_stag_name} = $self->{ct}->{tag_name};
1774     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1775     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1776     if ($self->{ct}->{attributes}) {
1777    
1778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1779     } else {
1780     ## NOTE: This state should never be reached.
1781    
1782     }
1783     } else {
1784     die "$0: $self->{ct}->{type}: Unknown token type";
1785     }
1786     $self->{state} = DATA_STATE;
1787 wakaba 1.5 $self->{s_kwd} = '';
1788 wakaba 1.1 ## reconsume
1789    
1790     return ($self->{ct}); # start tag or end tag
1791    
1792     redo A;
1793     } else {
1794     if ({
1795     0x0022 => 1, # "
1796     0x0027 => 1, # '
1797     0x003D => 1, # =
1798     }->{$self->{nc}}) {
1799    
1800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1801     } else {
1802    
1803     }
1804     $self->{ca}->{value} .= chr ($self->{nc});
1805     $self->{read_until}->($self->{ca}->{value},
1806     q["'=& >],
1807     length $self->{ca}->{value});
1808    
1809     ## Stay in the state
1810    
1811     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1812     $self->{line_prev} = $self->{line};
1813     $self->{column_prev} = $self->{column};
1814     $self->{column}++;
1815     $self->{nc}
1816     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1817     } else {
1818     $self->{set_nc}->($self);
1819     }
1820    
1821     redo A;
1822     }
1823     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1824     if ($is_space->{$self->{nc}}) {
1825    
1826     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1827    
1828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1829     $self->{line_prev} = $self->{line};
1830     $self->{column_prev} = $self->{column};
1831     $self->{column}++;
1832     $self->{nc}
1833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1834     } else {
1835     $self->{set_nc}->($self);
1836     }
1837    
1838     redo A;
1839     } elsif ($self->{nc} == 0x003E) { # >
1840     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1841    
1842     $self->{last_stag_name} = $self->{ct}->{tag_name};
1843     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1844     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1845     if ($self->{ct}->{attributes}) {
1846    
1847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1848     } else {
1849     ## NOTE: This state should never be reached.
1850    
1851     }
1852     } else {
1853     die "$0: $self->{ct}->{type}: Unknown token type";
1854     }
1855     $self->{state} = DATA_STATE;
1856 wakaba 1.5 $self->{s_kwd} = '';
1857 wakaba 1.1
1858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1859     $self->{line_prev} = $self->{line};
1860     $self->{column_prev} = $self->{column};
1861     $self->{column}++;
1862     $self->{nc}
1863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1864     } else {
1865     $self->{set_nc}->($self);
1866     }
1867    
1868    
1869     return ($self->{ct}); # start tag or end tag
1870    
1871     redo A;
1872     } elsif ($self->{nc} == 0x002F) { # /
1873    
1874     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1875    
1876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1877     $self->{line_prev} = $self->{line};
1878     $self->{column_prev} = $self->{column};
1879     $self->{column}++;
1880     $self->{nc}
1881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1882     } else {
1883     $self->{set_nc}->($self);
1884     }
1885    
1886     redo A;
1887     } elsif ($self->{nc} == -1) {
1888     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1889     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1890    
1891     $self->{last_stag_name} = $self->{ct}->{tag_name};
1892     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1893     if ($self->{ct}->{attributes}) {
1894    
1895     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1896     } else {
1897     ## NOTE: This state should never be reached.
1898    
1899     }
1900     } else {
1901     die "$0: $self->{ct}->{type}: Unknown token type";
1902     }
1903     $self->{state} = DATA_STATE;
1904 wakaba 1.5 $self->{s_kwd} = '';
1905 wakaba 1.1 ## Reconsume.
1906     return ($self->{ct}); # start tag or end tag
1907     redo A;
1908     } else {
1909    
1910     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1911     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1912     ## reconsume
1913     redo A;
1914     }
1915     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1916     if ($self->{nc} == 0x003E) { # >
1917     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1918    
1919     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1920     ## TODO: Different type than slash in start tag
1921     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1922     if ($self->{ct}->{attributes}) {
1923    
1924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1925     } else {
1926    
1927     }
1928     ## TODO: Test |<title></title/>|
1929     } else {
1930    
1931     $self->{self_closing} = 1;
1932     }
1933    
1934     $self->{state} = DATA_STATE;
1935 wakaba 1.5 $self->{s_kwd} = '';
1936 wakaba 1.1
1937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938     $self->{line_prev} = $self->{line};
1939     $self->{column_prev} = $self->{column};
1940     $self->{column}++;
1941     $self->{nc}
1942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943     } else {
1944     $self->{set_nc}->($self);
1945     }
1946    
1947    
1948     return ($self->{ct}); # start tag or end tag
1949    
1950     redo A;
1951     } elsif ($self->{nc} == -1) {
1952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1953     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1954    
1955     $self->{last_stag_name} = $self->{ct}->{tag_name};
1956     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1957     if ($self->{ct}->{attributes}) {
1958    
1959     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1960     } else {
1961     ## NOTE: This state should never be reached.
1962    
1963     }
1964     } else {
1965     die "$0: $self->{ct}->{type}: Unknown token type";
1966     }
1967     $self->{state} = DATA_STATE;
1968 wakaba 1.5 $self->{s_kwd} = '';
1969 wakaba 1.1 ## Reconsume.
1970     return ($self->{ct}); # start tag or end tag
1971     redo A;
1972     } else {
1973    
1974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1975     ## TODO: This error type is wrong.
1976     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1977     ## Reconsume.
1978     redo A;
1979     }
1980     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1981     ## (only happen if PCDATA state)
1982    
1983     ## NOTE: Unlike spec's "bogus comment state", this implementation
1984     ## consumes characters one-by-one basis.
1985    
1986     if ($self->{nc} == 0x003E) { # >
1987    
1988     $self->{state} = DATA_STATE;
1989 wakaba 1.5 $self->{s_kwd} = '';
1990 wakaba 1.1
1991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1992     $self->{line_prev} = $self->{line};
1993     $self->{column_prev} = $self->{column};
1994     $self->{column}++;
1995     $self->{nc}
1996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1997     } else {
1998     $self->{set_nc}->($self);
1999     }
2000    
2001    
2002     return ($self->{ct}); # comment
2003     redo A;
2004     } elsif ($self->{nc} == -1) {
2005    
2006     $self->{state} = DATA_STATE;
2007 wakaba 1.5 $self->{s_kwd} = '';
2008 wakaba 1.1 ## reconsume
2009    
2010     return ($self->{ct}); # comment
2011     redo A;
2012     } else {
2013    
2014     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2015     $self->{read_until}->($self->{ct}->{data},
2016     q[>],
2017     length $self->{ct}->{data});
2018    
2019     ## Stay in the state.
2020    
2021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2022     $self->{line_prev} = $self->{line};
2023     $self->{column_prev} = $self->{column};
2024     $self->{column}++;
2025     $self->{nc}
2026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2027     } else {
2028     $self->{set_nc}->($self);
2029     }
2030    
2031     redo A;
2032     }
2033     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2034     ## (only happen if PCDATA state)
2035    
2036     if ($self->{nc} == 0x002D) { # -
2037    
2038     $self->{state} = MD_HYPHEN_STATE;
2039    
2040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2041     $self->{line_prev} = $self->{line};
2042     $self->{column_prev} = $self->{column};
2043     $self->{column}++;
2044     $self->{nc}
2045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2046     } else {
2047     $self->{set_nc}->($self);
2048     }
2049    
2050     redo A;
2051     } elsif ($self->{nc} == 0x0044 or # D
2052     $self->{nc} == 0x0064) { # d
2053     ## ASCII case-insensitive.
2054    
2055     $self->{state} = MD_DOCTYPE_STATE;
2056     $self->{s_kwd} = chr $self->{nc};
2057    
2058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2059     $self->{line_prev} = $self->{line};
2060     $self->{column_prev} = $self->{column};
2061     $self->{column}++;
2062     $self->{nc}
2063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2064     } else {
2065     $self->{set_nc}->($self);
2066     }
2067    
2068     redo A;
2069 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2070     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2071     $self->{is_xml}) and
2072 wakaba 1.1 $self->{nc} == 0x005B) { # [
2073    
2074     $self->{state} = MD_CDATA_STATE;
2075     $self->{s_kwd} = '[';
2076    
2077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2078     $self->{line_prev} = $self->{line};
2079     $self->{column_prev} = $self->{column};
2080     $self->{column}++;
2081     $self->{nc}
2082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2083     } else {
2084     $self->{set_nc}->($self);
2085     }
2086    
2087     redo A;
2088     } else {
2089    
2090     }
2091    
2092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2093     line => $self->{line_prev},
2094     column => $self->{column_prev} - 1);
2095     ## Reconsume.
2096     $self->{state} = BOGUS_COMMENT_STATE;
2097     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2098     line => $self->{line_prev},
2099     column => $self->{column_prev} - 1,
2100     };
2101     redo A;
2102     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2103     if ($self->{nc} == 0x002D) { # -
2104    
2105     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2106     line => $self->{line_prev},
2107     column => $self->{column_prev} - 2,
2108     };
2109     $self->{state} = COMMENT_START_STATE;
2110    
2111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2112     $self->{line_prev} = $self->{line};
2113     $self->{column_prev} = $self->{column};
2114     $self->{column}++;
2115     $self->{nc}
2116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2117     } else {
2118     $self->{set_nc}->($self);
2119     }
2120    
2121     redo A;
2122     } else {
2123    
2124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2125     line => $self->{line_prev},
2126     column => $self->{column_prev} - 2);
2127     $self->{state} = BOGUS_COMMENT_STATE;
2128     ## Reconsume.
2129     $self->{ct} = {type => COMMENT_TOKEN,
2130     data => '-',
2131     line => $self->{line_prev},
2132     column => $self->{column_prev} - 2,
2133     };
2134     redo A;
2135     }
2136     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2137     ## ASCII case-insensitive.
2138     if ($self->{nc} == [
2139     undef,
2140     0x004F, # O
2141     0x0043, # C
2142     0x0054, # T
2143     0x0059, # Y
2144     0x0050, # P
2145     ]->[length $self->{s_kwd}] or
2146     $self->{nc} == [
2147     undef,
2148     0x006F, # o
2149     0x0063, # c
2150     0x0074, # t
2151     0x0079, # y
2152     0x0070, # p
2153     ]->[length $self->{s_kwd}]) {
2154    
2155     ## Stay in the state.
2156     $self->{s_kwd} .= chr $self->{nc};
2157    
2158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159     $self->{line_prev} = $self->{line};
2160     $self->{column_prev} = $self->{column};
2161     $self->{column}++;
2162     $self->{nc}
2163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164     } else {
2165     $self->{set_nc}->($self);
2166     }
2167    
2168     redo A;
2169     } elsif ((length $self->{s_kwd}) == 6 and
2170     ($self->{nc} == 0x0045 or # E
2171     $self->{nc} == 0x0065)) { # e
2172    
2173     $self->{state} = DOCTYPE_STATE;
2174     $self->{ct} = {type => DOCTYPE_TOKEN,
2175     quirks => 1,
2176     line => $self->{line_prev},
2177     column => $self->{column_prev} - 7,
2178     };
2179    
2180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2181     $self->{line_prev} = $self->{line};
2182     $self->{column_prev} = $self->{column};
2183     $self->{column}++;
2184     $self->{nc}
2185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2186     } else {
2187     $self->{set_nc}->($self);
2188     }
2189    
2190     redo A;
2191     } else {
2192    
2193     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2194     line => $self->{line_prev},
2195     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2196     $self->{state} = BOGUS_COMMENT_STATE;
2197     ## Reconsume.
2198     $self->{ct} = {type => COMMENT_TOKEN,
2199     data => $self->{s_kwd},
2200     line => $self->{line_prev},
2201     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2202     };
2203     redo A;
2204     }
2205     } elsif ($self->{state} == MD_CDATA_STATE) {
2206     if ($self->{nc} == {
2207     '[' => 0x0043, # C
2208     '[C' => 0x0044, # D
2209     '[CD' => 0x0041, # A
2210     '[CDA' => 0x0054, # T
2211     '[CDAT' => 0x0041, # A
2212     }->{$self->{s_kwd}}) {
2213    
2214     ## Stay in the state.
2215     $self->{s_kwd} .= chr $self->{nc};
2216    
2217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218     $self->{line_prev} = $self->{line};
2219     $self->{column_prev} = $self->{column};
2220     $self->{column}++;
2221     $self->{nc}
2222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223     } else {
2224     $self->{set_nc}->($self);
2225     }
2226    
2227     redo A;
2228     } elsif ($self->{s_kwd} eq '[CDATA' and
2229     $self->{nc} == 0x005B) { # [
2230    
2231 wakaba 1.6
2232     if ($self->{is_xml} and
2233     not $self->{tainted} and
2234     @{$self->{open_elements} or []} == 0) {
2235     $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2236     line => $self->{line_prev},
2237     column => $self->{column_prev} - 7);
2238     $self->{tainted} = 1;
2239     }
2240    
2241 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2242     data => '',
2243     line => $self->{line_prev},
2244     column => $self->{column_prev} - 7};
2245     $self->{state} = CDATA_SECTION_STATE;
2246    
2247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2248     $self->{line_prev} = $self->{line};
2249     $self->{column_prev} = $self->{column};
2250     $self->{column}++;
2251     $self->{nc}
2252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2253     } else {
2254     $self->{set_nc}->($self);
2255     }
2256    
2257     redo A;
2258     } else {
2259    
2260     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2261     line => $self->{line_prev},
2262     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2263     $self->{state} = BOGUS_COMMENT_STATE;
2264     ## Reconsume.
2265     $self->{ct} = {type => COMMENT_TOKEN,
2266     data => $self->{s_kwd},
2267     line => $self->{line_prev},
2268     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2269     };
2270     redo A;
2271     }
2272     } elsif ($self->{state} == COMMENT_START_STATE) {
2273     if ($self->{nc} == 0x002D) { # -
2274    
2275     $self->{state} = COMMENT_START_DASH_STATE;
2276    
2277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278     $self->{line_prev} = $self->{line};
2279     $self->{column_prev} = $self->{column};
2280     $self->{column}++;
2281     $self->{nc}
2282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2283     } else {
2284     $self->{set_nc}->($self);
2285     }
2286    
2287     redo A;
2288     } elsif ($self->{nc} == 0x003E) { # >
2289    
2290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2291     $self->{state} = DATA_STATE;
2292 wakaba 1.5 $self->{s_kwd} = '';
2293 wakaba 1.1
2294     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295     $self->{line_prev} = $self->{line};
2296     $self->{column_prev} = $self->{column};
2297     $self->{column}++;
2298     $self->{nc}
2299     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300     } else {
2301     $self->{set_nc}->($self);
2302     }
2303    
2304    
2305     return ($self->{ct}); # comment
2306    
2307     redo A;
2308     } elsif ($self->{nc} == -1) {
2309    
2310     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2311     $self->{state} = DATA_STATE;
2312 wakaba 1.5 $self->{s_kwd} = '';
2313 wakaba 1.1 ## reconsume
2314    
2315     return ($self->{ct}); # comment
2316    
2317     redo A;
2318     } else {
2319    
2320     $self->{ct}->{data} # comment
2321     .= chr ($self->{nc});
2322     $self->{state} = COMMENT_STATE;
2323    
2324     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2325     $self->{line_prev} = $self->{line};
2326     $self->{column_prev} = $self->{column};
2327     $self->{column}++;
2328     $self->{nc}
2329     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2330     } else {
2331     $self->{set_nc}->($self);
2332     }
2333    
2334     redo A;
2335     }
2336     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2337     if ($self->{nc} == 0x002D) { # -
2338    
2339     $self->{state} = COMMENT_END_STATE;
2340    
2341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2342     $self->{line_prev} = $self->{line};
2343     $self->{column_prev} = $self->{column};
2344     $self->{column}++;
2345     $self->{nc}
2346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2347     } else {
2348     $self->{set_nc}->($self);
2349     }
2350    
2351     redo A;
2352     } elsif ($self->{nc} == 0x003E) { # >
2353    
2354     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2355     $self->{state} = DATA_STATE;
2356 wakaba 1.5 $self->{s_kwd} = '';
2357 wakaba 1.1
2358     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2359     $self->{line_prev} = $self->{line};
2360     $self->{column_prev} = $self->{column};
2361     $self->{column}++;
2362     $self->{nc}
2363     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2364     } else {
2365     $self->{set_nc}->($self);
2366     }
2367    
2368    
2369     return ($self->{ct}); # comment
2370    
2371     redo A;
2372     } elsif ($self->{nc} == -1) {
2373    
2374     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2375     $self->{state} = DATA_STATE;
2376 wakaba 1.5 $self->{s_kwd} = '';
2377 wakaba 1.1 ## reconsume
2378    
2379     return ($self->{ct}); # comment
2380    
2381     redo A;
2382     } else {
2383    
2384     $self->{ct}->{data} # comment
2385     .= '-' . chr ($self->{nc});
2386     $self->{state} = COMMENT_STATE;
2387    
2388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2389     $self->{line_prev} = $self->{line};
2390     $self->{column_prev} = $self->{column};
2391     $self->{column}++;
2392     $self->{nc}
2393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2394     } else {
2395     $self->{set_nc}->($self);
2396     }
2397    
2398     redo A;
2399     }
2400     } elsif ($self->{state} == COMMENT_STATE) {
2401     if ($self->{nc} == 0x002D) { # -
2402    
2403     $self->{state} = COMMENT_END_DASH_STATE;
2404    
2405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2406     $self->{line_prev} = $self->{line};
2407     $self->{column_prev} = $self->{column};
2408     $self->{column}++;
2409     $self->{nc}
2410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2411     } else {
2412     $self->{set_nc}->($self);
2413     }
2414    
2415     redo A;
2416     } elsif ($self->{nc} == -1) {
2417    
2418     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2419     $self->{state} = DATA_STATE;
2420 wakaba 1.5 $self->{s_kwd} = '';
2421 wakaba 1.1 ## reconsume
2422    
2423     return ($self->{ct}); # comment
2424    
2425     redo A;
2426     } else {
2427    
2428     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2429     $self->{read_until}->($self->{ct}->{data},
2430     q[-],
2431     length $self->{ct}->{data});
2432    
2433     ## Stay in the state
2434    
2435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2436     $self->{line_prev} = $self->{line};
2437     $self->{column_prev} = $self->{column};
2438     $self->{column}++;
2439     $self->{nc}
2440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2441     } else {
2442     $self->{set_nc}->($self);
2443     }
2444    
2445     redo A;
2446     }
2447     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2448     if ($self->{nc} == 0x002D) { # -
2449    
2450     $self->{state} = COMMENT_END_STATE;
2451    
2452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2453     $self->{line_prev} = $self->{line};
2454     $self->{column_prev} = $self->{column};
2455     $self->{column}++;
2456     $self->{nc}
2457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2458     } else {
2459     $self->{set_nc}->($self);
2460     }
2461    
2462     redo A;
2463     } elsif ($self->{nc} == -1) {
2464    
2465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2466 wakaba 1.5 $self->{s_kwd} = '';
2467 wakaba 1.1 $self->{state} = DATA_STATE;
2468 wakaba 1.5 $self->{s_kwd} = '';
2469 wakaba 1.1 ## reconsume
2470    
2471     return ($self->{ct}); # comment
2472    
2473     redo A;
2474     } else {
2475    
2476     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2477     $self->{state} = COMMENT_STATE;
2478    
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     }
2491     } elsif ($self->{state} == COMMENT_END_STATE) {
2492     if ($self->{nc} == 0x003E) { # >
2493    
2494     $self->{state} = DATA_STATE;
2495 wakaba 1.5 $self->{s_kwd} = '';
2496 wakaba 1.1
2497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2498     $self->{line_prev} = $self->{line};
2499     $self->{column_prev} = $self->{column};
2500     $self->{column}++;
2501     $self->{nc}
2502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2503     } else {
2504     $self->{set_nc}->($self);
2505     }
2506    
2507    
2508     return ($self->{ct}); # comment
2509    
2510     redo A;
2511     } elsif ($self->{nc} == 0x002D) { # -
2512    
2513     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2514     line => $self->{line_prev},
2515     column => $self->{column_prev});
2516     $self->{ct}->{data} .= '-'; # comment
2517     ## Stay in the state
2518    
2519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2520     $self->{line_prev} = $self->{line};
2521     $self->{column_prev} = $self->{column};
2522     $self->{column}++;
2523     $self->{nc}
2524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2525     } else {
2526     $self->{set_nc}->($self);
2527     }
2528    
2529     redo A;
2530     } elsif ($self->{nc} == -1) {
2531    
2532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2533     $self->{state} = DATA_STATE;
2534 wakaba 1.5 $self->{s_kwd} = '';
2535 wakaba 1.1 ## reconsume
2536    
2537     return ($self->{ct}); # comment
2538    
2539     redo A;
2540     } else {
2541    
2542     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2543     line => $self->{line_prev},
2544     column => $self->{column_prev});
2545     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2546     $self->{state} = COMMENT_STATE;
2547    
2548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2549     $self->{line_prev} = $self->{line};
2550     $self->{column_prev} = $self->{column};
2551     $self->{column}++;
2552     $self->{nc}
2553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2554     } else {
2555     $self->{set_nc}->($self);
2556     }
2557    
2558     redo A;
2559     }
2560     } elsif ($self->{state} == DOCTYPE_STATE) {
2561     if ($is_space->{$self->{nc}}) {
2562    
2563     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2564    
2565     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2566     $self->{line_prev} = $self->{line};
2567     $self->{column_prev} = $self->{column};
2568     $self->{column}++;
2569     $self->{nc}
2570     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2571     } else {
2572     $self->{set_nc}->($self);
2573     }
2574    
2575     redo A;
2576     } else {
2577    
2578     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2579     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2580     ## reconsume
2581     redo A;
2582     }
2583     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2584     if ($is_space->{$self->{nc}}) {
2585    
2586     ## Stay in the state
2587    
2588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2589     $self->{line_prev} = $self->{line};
2590     $self->{column_prev} = $self->{column};
2591     $self->{column}++;
2592     $self->{nc}
2593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2594     } else {
2595     $self->{set_nc}->($self);
2596     }
2597    
2598     redo A;
2599     } elsif ($self->{nc} == 0x003E) { # >
2600    
2601     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2602     $self->{state} = DATA_STATE;
2603 wakaba 1.5 $self->{s_kwd} = '';
2604 wakaba 1.1
2605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606     $self->{line_prev} = $self->{line};
2607     $self->{column_prev} = $self->{column};
2608     $self->{column}++;
2609     $self->{nc}
2610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2611     } else {
2612     $self->{set_nc}->($self);
2613     }
2614    
2615    
2616     return ($self->{ct}); # DOCTYPE (quirks)
2617    
2618     redo A;
2619     } elsif ($self->{nc} == -1) {
2620    
2621     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2622     $self->{state} = DATA_STATE;
2623 wakaba 1.5 $self->{s_kwd} = '';
2624 wakaba 1.1 ## reconsume
2625    
2626     return ($self->{ct}); # DOCTYPE (quirks)
2627    
2628     redo A;
2629     } else {
2630    
2631     $self->{ct}->{name} = chr $self->{nc};
2632     delete $self->{ct}->{quirks};
2633     $self->{state} = DOCTYPE_NAME_STATE;
2634    
2635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2636     $self->{line_prev} = $self->{line};
2637     $self->{column_prev} = $self->{column};
2638     $self->{column}++;
2639     $self->{nc}
2640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2641     } else {
2642     $self->{set_nc}->($self);
2643     }
2644    
2645     redo A;
2646     }
2647     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2648     ## ISSUE: Redundant "First," in the spec.
2649     if ($is_space->{$self->{nc}}) {
2650    
2651     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2652    
2653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2654     $self->{line_prev} = $self->{line};
2655     $self->{column_prev} = $self->{column};
2656     $self->{column}++;
2657     $self->{nc}
2658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2659     } else {
2660     $self->{set_nc}->($self);
2661     }
2662    
2663     redo A;
2664     } elsif ($self->{nc} == 0x003E) { # >
2665    
2666     $self->{state} = DATA_STATE;
2667 wakaba 1.5 $self->{s_kwd} = '';
2668 wakaba 1.1
2669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2670     $self->{line_prev} = $self->{line};
2671     $self->{column_prev} = $self->{column};
2672     $self->{column}++;
2673     $self->{nc}
2674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2675     } else {
2676     $self->{set_nc}->($self);
2677     }
2678    
2679    
2680     return ($self->{ct}); # DOCTYPE
2681    
2682     redo A;
2683     } elsif ($self->{nc} == -1) {
2684    
2685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2686     $self->{state} = DATA_STATE;
2687 wakaba 1.5 $self->{s_kwd} = '';
2688 wakaba 1.1 ## reconsume
2689    
2690     $self->{ct}->{quirks} = 1;
2691     return ($self->{ct}); # DOCTYPE
2692    
2693     redo A;
2694     } else {
2695    
2696     $self->{ct}->{name}
2697     .= chr ($self->{nc}); # DOCTYPE
2698     ## Stay in the state
2699    
2700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2701     $self->{line_prev} = $self->{line};
2702     $self->{column_prev} = $self->{column};
2703     $self->{column}++;
2704     $self->{nc}
2705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2706     } else {
2707     $self->{set_nc}->($self);
2708     }
2709    
2710     redo A;
2711     }
2712     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2713     if ($is_space->{$self->{nc}}) {
2714    
2715     ## Stay in the state
2716    
2717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2718     $self->{line_prev} = $self->{line};
2719     $self->{column_prev} = $self->{column};
2720     $self->{column}++;
2721     $self->{nc}
2722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2723     } else {
2724     $self->{set_nc}->($self);
2725     }
2726    
2727     redo A;
2728     } elsif ($self->{nc} == 0x003E) { # >
2729    
2730     $self->{state} = DATA_STATE;
2731 wakaba 1.5 $self->{s_kwd} = '';
2732 wakaba 1.1
2733     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2734     $self->{line_prev} = $self->{line};
2735     $self->{column_prev} = $self->{column};
2736     $self->{column}++;
2737     $self->{nc}
2738     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2739     } else {
2740     $self->{set_nc}->($self);
2741     }
2742    
2743    
2744     return ($self->{ct}); # DOCTYPE
2745    
2746     redo A;
2747     } elsif ($self->{nc} == -1) {
2748    
2749     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2750     $self->{state} = DATA_STATE;
2751 wakaba 1.5 $self->{s_kwd} = '';
2752 wakaba 1.1 ## reconsume
2753    
2754     $self->{ct}->{quirks} = 1;
2755     return ($self->{ct}); # DOCTYPE
2756    
2757     redo A;
2758     } elsif ($self->{nc} == 0x0050 or # P
2759     $self->{nc} == 0x0070) { # p
2760     $self->{state} = PUBLIC_STATE;
2761     $self->{s_kwd} = chr $self->{nc};
2762    
2763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764     $self->{line_prev} = $self->{line};
2765     $self->{column_prev} = $self->{column};
2766     $self->{column}++;
2767     $self->{nc}
2768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2769     } else {
2770     $self->{set_nc}->($self);
2771     }
2772    
2773     redo A;
2774     } elsif ($self->{nc} == 0x0053 or # S
2775     $self->{nc} == 0x0073) { # s
2776     $self->{state} = SYSTEM_STATE;
2777     $self->{s_kwd} = chr $self->{nc};
2778    
2779     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2780     $self->{line_prev} = $self->{line};
2781     $self->{column_prev} = $self->{column};
2782     $self->{column}++;
2783     $self->{nc}
2784     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2785     } else {
2786     $self->{set_nc}->($self);
2787     }
2788    
2789     redo A;
2790     } else {
2791    
2792     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2793     $self->{ct}->{quirks} = 1;
2794    
2795     $self->{state} = BOGUS_DOCTYPE_STATE;
2796    
2797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2798     $self->{line_prev} = $self->{line};
2799     $self->{column_prev} = $self->{column};
2800     $self->{column}++;
2801     $self->{nc}
2802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2803     } else {
2804     $self->{set_nc}->($self);
2805     }
2806    
2807     redo A;
2808     }
2809     } elsif ($self->{state} == PUBLIC_STATE) {
2810     ## ASCII case-insensitive
2811     if ($self->{nc} == [
2812     undef,
2813     0x0055, # U
2814     0x0042, # B
2815     0x004C, # L
2816     0x0049, # I
2817     ]->[length $self->{s_kwd}] or
2818     $self->{nc} == [
2819     undef,
2820     0x0075, # u
2821     0x0062, # b
2822     0x006C, # l
2823     0x0069, # i
2824     ]->[length $self->{s_kwd}]) {
2825    
2826     ## Stay in the state.
2827     $self->{s_kwd} .= chr $self->{nc};
2828    
2829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2830     $self->{line_prev} = $self->{line};
2831     $self->{column_prev} = $self->{column};
2832     $self->{column}++;
2833     $self->{nc}
2834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2835     } else {
2836     $self->{set_nc}->($self);
2837     }
2838    
2839     redo A;
2840     } elsif ((length $self->{s_kwd}) == 5 and
2841     ($self->{nc} == 0x0043 or # C
2842     $self->{nc} == 0x0063)) { # c
2843    
2844     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2845    
2846     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2847     $self->{line_prev} = $self->{line};
2848     $self->{column_prev} = $self->{column};
2849     $self->{column}++;
2850     $self->{nc}
2851     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2852     } else {
2853     $self->{set_nc}->($self);
2854     }
2855    
2856     redo A;
2857     } else {
2858    
2859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2860     line => $self->{line_prev},
2861     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2862     $self->{ct}->{quirks} = 1;
2863    
2864     $self->{state} = BOGUS_DOCTYPE_STATE;
2865     ## Reconsume.
2866     redo A;
2867     }
2868     } elsif ($self->{state} == SYSTEM_STATE) {
2869     ## ASCII case-insensitive
2870     if ($self->{nc} == [
2871     undef,
2872     0x0059, # Y
2873     0x0053, # S
2874     0x0054, # T
2875     0x0045, # E
2876     ]->[length $self->{s_kwd}] or
2877     $self->{nc} == [
2878     undef,
2879     0x0079, # y
2880     0x0073, # s
2881     0x0074, # t
2882     0x0065, # e
2883     ]->[length $self->{s_kwd}]) {
2884    
2885     ## Stay in the state.
2886     $self->{s_kwd} .= chr $self->{nc};
2887    
2888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2889     $self->{line_prev} = $self->{line};
2890     $self->{column_prev} = $self->{column};
2891     $self->{column}++;
2892     $self->{nc}
2893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2894     } else {
2895     $self->{set_nc}->($self);
2896     }
2897    
2898     redo A;
2899     } elsif ((length $self->{s_kwd}) == 5 and
2900     ($self->{nc} == 0x004D or # M
2901     $self->{nc} == 0x006D)) { # m
2902    
2903     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2904    
2905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2906     $self->{line_prev} = $self->{line};
2907     $self->{column_prev} = $self->{column};
2908     $self->{column}++;
2909     $self->{nc}
2910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2911     } else {
2912     $self->{set_nc}->($self);
2913     }
2914    
2915     redo A;
2916     } else {
2917    
2918     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2919     line => $self->{line_prev},
2920     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2921     $self->{ct}->{quirks} = 1;
2922    
2923     $self->{state} = BOGUS_DOCTYPE_STATE;
2924     ## Reconsume.
2925     redo A;
2926     }
2927     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2928     if ($is_space->{$self->{nc}}) {
2929    
2930     ## Stay in the state
2931    
2932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2933     $self->{line_prev} = $self->{line};
2934     $self->{column_prev} = $self->{column};
2935     $self->{column}++;
2936     $self->{nc}
2937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2938     } else {
2939     $self->{set_nc}->($self);
2940     }
2941    
2942     redo A;
2943     } elsif ($self->{nc} eq 0x0022) { # "
2944    
2945     $self->{ct}->{pubid} = ''; # DOCTYPE
2946     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2947    
2948     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2949     $self->{line_prev} = $self->{line};
2950     $self->{column_prev} = $self->{column};
2951     $self->{column}++;
2952     $self->{nc}
2953     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2954     } else {
2955     $self->{set_nc}->($self);
2956     }
2957    
2958     redo A;
2959     } elsif ($self->{nc} eq 0x0027) { # '
2960    
2961     $self->{ct}->{pubid} = ''; # DOCTYPE
2962     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2963    
2964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2965     $self->{line_prev} = $self->{line};
2966     $self->{column_prev} = $self->{column};
2967     $self->{column}++;
2968     $self->{nc}
2969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2970     } else {
2971     $self->{set_nc}->($self);
2972     }
2973    
2974     redo A;
2975     } elsif ($self->{nc} eq 0x003E) { # >
2976    
2977     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2978    
2979     $self->{state} = DATA_STATE;
2980 wakaba 1.5 $self->{s_kwd} = '';
2981 wakaba 1.1
2982     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2983     $self->{line_prev} = $self->{line};
2984     $self->{column_prev} = $self->{column};
2985     $self->{column}++;
2986     $self->{nc}
2987     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2988     } else {
2989     $self->{set_nc}->($self);
2990     }
2991    
2992    
2993     $self->{ct}->{quirks} = 1;
2994     return ($self->{ct}); # DOCTYPE
2995    
2996     redo A;
2997     } elsif ($self->{nc} == -1) {
2998    
2999     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3000    
3001     $self->{state} = DATA_STATE;
3002 wakaba 1.5 $self->{s_kwd} = '';
3003 wakaba 1.1 ## reconsume
3004    
3005     $self->{ct}->{quirks} = 1;
3006     return ($self->{ct}); # DOCTYPE
3007    
3008     redo A;
3009     } else {
3010    
3011     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3012     $self->{ct}->{quirks} = 1;
3013    
3014     $self->{state} = BOGUS_DOCTYPE_STATE;
3015    
3016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3017     $self->{line_prev} = $self->{line};
3018     $self->{column_prev} = $self->{column};
3019     $self->{column}++;
3020     $self->{nc}
3021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3022     } else {
3023     $self->{set_nc}->($self);
3024     }
3025    
3026     redo A;
3027     }
3028     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3029     if ($self->{nc} == 0x0022) { # "
3030    
3031     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3032    
3033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3034     $self->{line_prev} = $self->{line};
3035     $self->{column_prev} = $self->{column};
3036     $self->{column}++;
3037     $self->{nc}
3038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3039     } else {
3040     $self->{set_nc}->($self);
3041     }
3042    
3043     redo A;
3044     } elsif ($self->{nc} == 0x003E) { # >
3045    
3046     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3047    
3048     $self->{state} = DATA_STATE;
3049 wakaba 1.5 $self->{s_kwd} = '';
3050 wakaba 1.1
3051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3052     $self->{line_prev} = $self->{line};
3053     $self->{column_prev} = $self->{column};
3054     $self->{column}++;
3055     $self->{nc}
3056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3057     } else {
3058     $self->{set_nc}->($self);
3059     }
3060    
3061    
3062     $self->{ct}->{quirks} = 1;
3063     return ($self->{ct}); # DOCTYPE
3064    
3065     redo A;
3066     } elsif ($self->{nc} == -1) {
3067    
3068     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3069    
3070     $self->{state} = DATA_STATE;
3071 wakaba 1.5 $self->{s_kwd} = '';
3072 wakaba 1.1 ## reconsume
3073    
3074     $self->{ct}->{quirks} = 1;
3075     return ($self->{ct}); # DOCTYPE
3076    
3077     redo A;
3078     } else {
3079    
3080     $self->{ct}->{pubid} # DOCTYPE
3081     .= chr $self->{nc};
3082     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3083     length $self->{ct}->{pubid});
3084    
3085     ## Stay in the state
3086    
3087     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3088     $self->{line_prev} = $self->{line};
3089     $self->{column_prev} = $self->{column};
3090     $self->{column}++;
3091     $self->{nc}
3092     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3093     } else {
3094     $self->{set_nc}->($self);
3095     }
3096    
3097     redo A;
3098     }
3099     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3100     if ($self->{nc} == 0x0027) { # '
3101    
3102     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3103    
3104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105     $self->{line_prev} = $self->{line};
3106     $self->{column_prev} = $self->{column};
3107     $self->{column}++;
3108     $self->{nc}
3109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110     } else {
3111     $self->{set_nc}->($self);
3112     }
3113    
3114     redo A;
3115     } elsif ($self->{nc} == 0x003E) { # >
3116    
3117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3118    
3119     $self->{state} = DATA_STATE;
3120 wakaba 1.5 $self->{s_kwd} = '';
3121 wakaba 1.1
3122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123     $self->{line_prev} = $self->{line};
3124     $self->{column_prev} = $self->{column};
3125     $self->{column}++;
3126     $self->{nc}
3127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128     } else {
3129     $self->{set_nc}->($self);
3130     }
3131    
3132    
3133     $self->{ct}->{quirks} = 1;
3134     return ($self->{ct}); # DOCTYPE
3135    
3136     redo A;
3137     } elsif ($self->{nc} == -1) {
3138    
3139     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3140    
3141     $self->{state} = DATA_STATE;
3142 wakaba 1.5 $self->{s_kwd} = '';
3143 wakaba 1.1 ## reconsume
3144    
3145     $self->{ct}->{quirks} = 1;
3146     return ($self->{ct}); # DOCTYPE
3147    
3148     redo A;
3149     } else {
3150    
3151     $self->{ct}->{pubid} # DOCTYPE
3152     .= chr $self->{nc};
3153     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3154     length $self->{ct}->{pubid});
3155    
3156     ## Stay in the state
3157    
3158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3159     $self->{line_prev} = $self->{line};
3160     $self->{column_prev} = $self->{column};
3161     $self->{column}++;
3162     $self->{nc}
3163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3164     } else {
3165     $self->{set_nc}->($self);
3166     }
3167    
3168     redo A;
3169     }
3170     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3171     if ($is_space->{$self->{nc}}) {
3172    
3173     ## Stay in the state
3174    
3175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3176     $self->{line_prev} = $self->{line};
3177     $self->{column_prev} = $self->{column};
3178     $self->{column}++;
3179     $self->{nc}
3180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3181     } else {
3182     $self->{set_nc}->($self);
3183     }
3184    
3185     redo A;
3186     } elsif ($self->{nc} == 0x0022) { # "
3187    
3188     $self->{ct}->{sysid} = ''; # DOCTYPE
3189     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3190    
3191     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3192     $self->{line_prev} = $self->{line};
3193     $self->{column_prev} = $self->{column};
3194     $self->{column}++;
3195     $self->{nc}
3196     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3197     } else {
3198     $self->{set_nc}->($self);
3199     }
3200    
3201     redo A;
3202     } elsif ($self->{nc} == 0x0027) { # '
3203    
3204     $self->{ct}->{sysid} = ''; # DOCTYPE
3205     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3206    
3207     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3208     $self->{line_prev} = $self->{line};
3209     $self->{column_prev} = $self->{column};
3210     $self->{column}++;
3211     $self->{nc}
3212     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3213     } else {
3214     $self->{set_nc}->($self);
3215     }
3216    
3217     redo A;
3218     } elsif ($self->{nc} == 0x003E) { # >
3219    
3220     $self->{state} = DATA_STATE;
3221 wakaba 1.5 $self->{s_kwd} = '';
3222 wakaba 1.1
3223     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3224     $self->{line_prev} = $self->{line};
3225     $self->{column_prev} = $self->{column};
3226     $self->{column}++;
3227     $self->{nc}
3228     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3229     } else {
3230     $self->{set_nc}->($self);
3231     }
3232    
3233    
3234     return ($self->{ct}); # DOCTYPE
3235    
3236     redo A;
3237     } elsif ($self->{nc} == -1) {
3238    
3239     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3240    
3241     $self->{state} = DATA_STATE;
3242 wakaba 1.5 $self->{s_kwd} = '';
3243 wakaba 1.1 ## reconsume
3244    
3245     $self->{ct}->{quirks} = 1;
3246     return ($self->{ct}); # DOCTYPE
3247    
3248     redo A;
3249     } else {
3250    
3251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3252     $self->{ct}->{quirks} = 1;
3253    
3254     $self->{state} = BOGUS_DOCTYPE_STATE;
3255    
3256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3257     $self->{line_prev} = $self->{line};
3258     $self->{column_prev} = $self->{column};
3259     $self->{column}++;
3260     $self->{nc}
3261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3262     } else {
3263     $self->{set_nc}->($self);
3264     }
3265    
3266     redo A;
3267     }
3268     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3269     if ($is_space->{$self->{nc}}) {
3270    
3271     ## Stay in the state
3272    
3273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3274     $self->{line_prev} = $self->{line};
3275     $self->{column_prev} = $self->{column};
3276     $self->{column}++;
3277     $self->{nc}
3278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3279     } else {
3280     $self->{set_nc}->($self);
3281     }
3282    
3283     redo A;
3284     } elsif ($self->{nc} == 0x0022) { # "
3285    
3286     $self->{ct}->{sysid} = ''; # DOCTYPE
3287     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3288    
3289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3290     $self->{line_prev} = $self->{line};
3291     $self->{column_prev} = $self->{column};
3292     $self->{column}++;
3293     $self->{nc}
3294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3295     } else {
3296     $self->{set_nc}->($self);
3297     }
3298    
3299     redo A;
3300     } elsif ($self->{nc} == 0x0027) { # '
3301    
3302     $self->{ct}->{sysid} = ''; # DOCTYPE
3303     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3304    
3305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3306     $self->{line_prev} = $self->{line};
3307     $self->{column_prev} = $self->{column};
3308     $self->{column}++;
3309     $self->{nc}
3310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3311     } else {
3312     $self->{set_nc}->($self);
3313     }
3314    
3315     redo A;
3316     } elsif ($self->{nc} == 0x003E) { # >
3317    
3318     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3319     $self->{state} = DATA_STATE;
3320 wakaba 1.5 $self->{s_kwd} = '';
3321 wakaba 1.1
3322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3323     $self->{line_prev} = $self->{line};
3324     $self->{column_prev} = $self->{column};
3325     $self->{column}++;
3326     $self->{nc}
3327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3328     } else {
3329     $self->{set_nc}->($self);
3330     }
3331    
3332    
3333     $self->{ct}->{quirks} = 1;
3334     return ($self->{ct}); # DOCTYPE
3335    
3336     redo A;
3337     } elsif ($self->{nc} == -1) {
3338    
3339     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3340    
3341     $self->{state} = DATA_STATE;
3342 wakaba 1.5 $self->{s_kwd} = '';
3343 wakaba 1.1 ## reconsume
3344    
3345     $self->{ct}->{quirks} = 1;
3346     return ($self->{ct}); # DOCTYPE
3347    
3348     redo A;
3349     } else {
3350    
3351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3352     $self->{ct}->{quirks} = 1;
3353    
3354     $self->{state} = BOGUS_DOCTYPE_STATE;
3355    
3356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3357     $self->{line_prev} = $self->{line};
3358     $self->{column_prev} = $self->{column};
3359     $self->{column}++;
3360     $self->{nc}
3361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3362     } else {
3363     $self->{set_nc}->($self);
3364     }
3365    
3366     redo A;
3367     }
3368     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3369     if ($self->{nc} == 0x0022) { # "
3370    
3371     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3372    
3373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3374     $self->{line_prev} = $self->{line};
3375     $self->{column_prev} = $self->{column};
3376     $self->{column}++;
3377     $self->{nc}
3378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3379     } else {
3380     $self->{set_nc}->($self);
3381     }
3382    
3383     redo A;
3384     } elsif ($self->{nc} == 0x003E) { # >
3385    
3386     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3387    
3388     $self->{state} = DATA_STATE;
3389 wakaba 1.5 $self->{s_kwd} = '';
3390 wakaba 1.1
3391     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3392     $self->{line_prev} = $self->{line};
3393     $self->{column_prev} = $self->{column};
3394     $self->{column}++;
3395     $self->{nc}
3396     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3397     } else {
3398     $self->{set_nc}->($self);
3399     }
3400    
3401    
3402     $self->{ct}->{quirks} = 1;
3403     return ($self->{ct}); # DOCTYPE
3404    
3405     redo A;
3406     } elsif ($self->{nc} == -1) {
3407    
3408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3409    
3410     $self->{state} = DATA_STATE;
3411 wakaba 1.5 $self->{s_kwd} = '';
3412 wakaba 1.1 ## reconsume
3413    
3414     $self->{ct}->{quirks} = 1;
3415     return ($self->{ct}); # DOCTYPE
3416    
3417     redo A;
3418     } else {
3419    
3420     $self->{ct}->{sysid} # DOCTYPE
3421     .= chr $self->{nc};
3422     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3423     length $self->{ct}->{sysid});
3424    
3425     ## Stay in the state
3426    
3427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428     $self->{line_prev} = $self->{line};
3429     $self->{column_prev} = $self->{column};
3430     $self->{column}++;
3431     $self->{nc}
3432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3433     } else {
3434     $self->{set_nc}->($self);
3435     }
3436    
3437     redo A;
3438     }
3439     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3440     if ($self->{nc} == 0x0027) { # '
3441    
3442     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3443    
3444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3445     $self->{line_prev} = $self->{line};
3446     $self->{column_prev} = $self->{column};
3447     $self->{column}++;
3448     $self->{nc}
3449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3450     } else {
3451     $self->{set_nc}->($self);
3452     }
3453    
3454     redo A;
3455     } elsif ($self->{nc} == 0x003E) { # >
3456    
3457     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3458    
3459     $self->{state} = DATA_STATE;
3460 wakaba 1.5 $self->{s_kwd} = '';
3461 wakaba 1.1
3462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3463     $self->{line_prev} = $self->{line};
3464     $self->{column_prev} = $self->{column};
3465     $self->{column}++;
3466     $self->{nc}
3467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3468     } else {
3469     $self->{set_nc}->($self);
3470     }
3471    
3472    
3473     $self->{ct}->{quirks} = 1;
3474     return ($self->{ct}); # DOCTYPE
3475    
3476     redo A;
3477     } elsif ($self->{nc} == -1) {
3478    
3479     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3480    
3481     $self->{state} = DATA_STATE;
3482 wakaba 1.5 $self->{s_kwd} = '';
3483 wakaba 1.1 ## reconsume
3484    
3485     $self->{ct}->{quirks} = 1;
3486     return ($self->{ct}); # DOCTYPE
3487    
3488     redo A;
3489     } else {
3490    
3491     $self->{ct}->{sysid} # DOCTYPE
3492     .= chr $self->{nc};
3493     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3494     length $self->{ct}->{sysid});
3495    
3496     ## Stay in the state
3497    
3498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3499     $self->{line_prev} = $self->{line};
3500     $self->{column_prev} = $self->{column};
3501     $self->{column}++;
3502     $self->{nc}
3503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3504     } else {
3505     $self->{set_nc}->($self);
3506     }
3507    
3508     redo A;
3509     }
3510     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3511     if ($is_space->{$self->{nc}}) {
3512    
3513     ## Stay in the state
3514    
3515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3516     $self->{line_prev} = $self->{line};
3517     $self->{column_prev} = $self->{column};
3518     $self->{column}++;
3519     $self->{nc}
3520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3521     } else {
3522     $self->{set_nc}->($self);
3523     }
3524    
3525     redo A;
3526     } elsif ($self->{nc} == 0x003E) { # >
3527    
3528     $self->{state} = DATA_STATE;
3529 wakaba 1.5 $self->{s_kwd} = '';
3530 wakaba 1.1
3531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3532     $self->{line_prev} = $self->{line};
3533     $self->{column_prev} = $self->{column};
3534     $self->{column}++;
3535     $self->{nc}
3536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3537     } else {
3538     $self->{set_nc}->($self);
3539     }
3540    
3541    
3542     return ($self->{ct}); # DOCTYPE
3543    
3544     redo A;
3545     } elsif ($self->{nc} == -1) {
3546    
3547     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3548     $self->{state} = DATA_STATE;
3549 wakaba 1.5 $self->{s_kwd} = '';
3550 wakaba 1.1 ## reconsume
3551    
3552     $self->{ct}->{quirks} = 1;
3553     return ($self->{ct}); # DOCTYPE
3554    
3555     redo A;
3556     } else {
3557    
3558     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3559     #$self->{ct}->{quirks} = 1;
3560    
3561     $self->{state} = BOGUS_DOCTYPE_STATE;
3562    
3563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3564     $self->{line_prev} = $self->{line};
3565     $self->{column_prev} = $self->{column};
3566     $self->{column}++;
3567     $self->{nc}
3568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3569     } else {
3570     $self->{set_nc}->($self);
3571     }
3572    
3573     redo A;
3574     }
3575     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3576     if ($self->{nc} == 0x003E) { # >
3577    
3578     $self->{state} = DATA_STATE;
3579 wakaba 1.5 $self->{s_kwd} = '';
3580 wakaba 1.1
3581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3582     $self->{line_prev} = $self->{line};
3583     $self->{column_prev} = $self->{column};
3584     $self->{column}++;
3585     $self->{nc}
3586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3587     } else {
3588     $self->{set_nc}->($self);
3589     }
3590    
3591    
3592     return ($self->{ct}); # DOCTYPE
3593    
3594     redo A;
3595     } elsif ($self->{nc} == -1) {
3596    
3597     $self->{state} = DATA_STATE;
3598 wakaba 1.5 $self->{s_kwd} = '';
3599 wakaba 1.1 ## reconsume
3600    
3601     return ($self->{ct}); # DOCTYPE
3602    
3603     redo A;
3604     } else {
3605    
3606     my $s = '';
3607     $self->{read_until}->($s, q[>], 0);
3608    
3609     ## Stay in the state
3610    
3611     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3612     $self->{line_prev} = $self->{line};
3613     $self->{column_prev} = $self->{column};
3614     $self->{column}++;
3615     $self->{nc}
3616     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3617     } else {
3618     $self->{set_nc}->($self);
3619     }
3620    
3621     redo A;
3622     }
3623     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3624     ## NOTE: "CDATA section state" in the state is jointly implemented
3625     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3626     ## and |CDATA_SECTION_MSE2_STATE|.
3627    
3628     if ($self->{nc} == 0x005D) { # ]
3629    
3630     $self->{state} = CDATA_SECTION_MSE1_STATE;
3631    
3632     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3633     $self->{line_prev} = $self->{line};
3634     $self->{column_prev} = $self->{column};
3635     $self->{column}++;
3636     $self->{nc}
3637     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3638     } else {
3639     $self->{set_nc}->($self);
3640     }
3641    
3642     redo A;
3643     } elsif ($self->{nc} == -1) {
3644 wakaba 1.6 if ($self->{is_xml}) {
3645     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3646     }
3647    
3648 wakaba 1.1 $self->{state} = DATA_STATE;
3649 wakaba 1.5 $self->{s_kwd} = '';
3650 wakaba 1.1
3651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3652     $self->{line_prev} = $self->{line};
3653     $self->{column_prev} = $self->{column};
3654     $self->{column}++;
3655     $self->{nc}
3656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3657     } else {
3658     $self->{set_nc}->($self);
3659     }
3660    
3661     if (length $self->{ct}->{data}) { # character
3662    
3663     return ($self->{ct}); # character
3664     } else {
3665    
3666     ## No token to emit. $self->{ct} is discarded.
3667     }
3668     redo A;
3669     } else {
3670    
3671     $self->{ct}->{data} .= chr $self->{nc};
3672     $self->{read_until}->($self->{ct}->{data},
3673     q<]>,
3674     length $self->{ct}->{data});
3675    
3676     ## Stay in the state.
3677    
3678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3679     $self->{line_prev} = $self->{line};
3680     $self->{column_prev} = $self->{column};
3681     $self->{column}++;
3682     $self->{nc}
3683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3684     } else {
3685     $self->{set_nc}->($self);
3686     }
3687    
3688     redo A;
3689     }
3690    
3691     ## ISSUE: "text tokens" in spec.
3692     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3693     if ($self->{nc} == 0x005D) { # ]
3694    
3695     $self->{state} = CDATA_SECTION_MSE2_STATE;
3696    
3697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3698     $self->{line_prev} = $self->{line};
3699     $self->{column_prev} = $self->{column};
3700     $self->{column}++;
3701     $self->{nc}
3702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3703     } else {
3704     $self->{set_nc}->($self);
3705     }
3706    
3707     redo A;
3708     } else {
3709    
3710     $self->{ct}->{data} .= ']';
3711     $self->{state} = CDATA_SECTION_STATE;
3712     ## Reconsume.
3713     redo A;
3714     }
3715     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3716     if ($self->{nc} == 0x003E) { # >
3717     $self->{state} = DATA_STATE;
3718 wakaba 1.5 $self->{s_kwd} = '';
3719 wakaba 1.1
3720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3721     $self->{line_prev} = $self->{line};
3722     $self->{column_prev} = $self->{column};
3723     $self->{column}++;
3724     $self->{nc}
3725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3726     } else {
3727     $self->{set_nc}->($self);
3728     }
3729    
3730     if (length $self->{ct}->{data}) { # character
3731    
3732     return ($self->{ct}); # character
3733     } else {
3734    
3735     ## No token to emit. $self->{ct} is discarded.
3736     }
3737     redo A;
3738     } elsif ($self->{nc} == 0x005D) { # ]
3739     # character
3740     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3741     ## Stay in the state.
3742    
3743     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3744     $self->{line_prev} = $self->{line};
3745     $self->{column_prev} = $self->{column};
3746     $self->{column}++;
3747     $self->{nc}
3748     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3749     } else {
3750     $self->{set_nc}->($self);
3751     }
3752    
3753     redo A;
3754     } else {
3755    
3756     $self->{ct}->{data} .= ']]'; # character
3757     $self->{state} = CDATA_SECTION_STATE;
3758     ## Reconsume.
3759     redo A;
3760     }
3761     } elsif ($self->{state} == ENTITY_STATE) {
3762     if ($is_space->{$self->{nc}} or
3763     {
3764     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3765     $self->{entity_add} => 1,
3766     }->{$self->{nc}}) {
3767    
3768     ## Don't consume
3769     ## No error
3770     ## Return nothing.
3771     #
3772     } elsif ($self->{nc} == 0x0023) { # #
3773    
3774     $self->{state} = ENTITY_HASH_STATE;
3775     $self->{s_kwd} = '#';
3776    
3777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3778     $self->{line_prev} = $self->{line};
3779     $self->{column_prev} = $self->{column};
3780     $self->{column}++;
3781     $self->{nc}
3782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3783     } else {
3784     $self->{set_nc}->($self);
3785     }
3786    
3787     redo A;
3788     } elsif ((0x0041 <= $self->{nc} and
3789     $self->{nc} <= 0x005A) or # A..Z
3790     (0x0061 <= $self->{nc} and
3791     $self->{nc} <= 0x007A)) { # a..z
3792    
3793     require Whatpm::_NamedEntityList;
3794     $self->{state} = ENTITY_NAME_STATE;
3795     $self->{s_kwd} = chr $self->{nc};
3796     $self->{entity__value} = $self->{s_kwd};
3797     $self->{entity__match} = 0;
3798    
3799     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3800     $self->{line_prev} = $self->{line};
3801     $self->{column_prev} = $self->{column};
3802     $self->{column}++;
3803     $self->{nc}
3804     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3805     } else {
3806     $self->{set_nc}->($self);
3807     }
3808    
3809     redo A;
3810     } else {
3811    
3812     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3813     ## Return nothing.
3814     #
3815     }
3816    
3817     ## NOTE: No character is consumed by the "consume a character
3818     ## reference" algorithm. In other word, there is an "&" character
3819     ## that does not introduce a character reference, which would be
3820     ## appended to the parent element or the attribute value in later
3821     ## process of the tokenizer.
3822    
3823     if ($self->{prev_state} == DATA_STATE) {
3824    
3825     $self->{state} = $self->{prev_state};
3826 wakaba 1.5 $self->{s_kwd} = '';
3827 wakaba 1.1 ## Reconsume.
3828     return ({type => CHARACTER_TOKEN, data => '&',
3829     line => $self->{line_prev},
3830     column => $self->{column_prev},
3831     });
3832     redo A;
3833     } else {
3834    
3835     $self->{ca}->{value} .= '&';
3836     $self->{state} = $self->{prev_state};
3837 wakaba 1.5 $self->{s_kwd} = '';
3838 wakaba 1.1 ## Reconsume.
3839     redo A;
3840     }
3841     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3842     if ($self->{nc} == 0x0078 or # x
3843     $self->{nc} == 0x0058) { # X
3844    
3845     $self->{state} = HEXREF_X_STATE;
3846     $self->{s_kwd} .= chr $self->{nc};
3847    
3848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849     $self->{line_prev} = $self->{line};
3850     $self->{column_prev} = $self->{column};
3851     $self->{column}++;
3852     $self->{nc}
3853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854     } else {
3855     $self->{set_nc}->($self);
3856     }
3857    
3858     redo A;
3859     } elsif (0x0030 <= $self->{nc} and
3860     $self->{nc} <= 0x0039) { # 0..9
3861    
3862     $self->{state} = NCR_NUM_STATE;
3863     $self->{s_kwd} = $self->{nc} - 0x0030;
3864    
3865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3866     $self->{line_prev} = $self->{line};
3867     $self->{column_prev} = $self->{column};
3868     $self->{column}++;
3869     $self->{nc}
3870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3871     } else {
3872     $self->{set_nc}->($self);
3873     }
3874    
3875     redo A;
3876     } else {
3877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3878     line => $self->{line_prev},
3879     column => $self->{column_prev} - 1);
3880    
3881     ## NOTE: According to the spec algorithm, nothing is returned,
3882     ## and then "&#" is appended to the parent element or the attribute
3883     ## value in the later processing.
3884    
3885     if ($self->{prev_state} == DATA_STATE) {
3886    
3887     $self->{state} = $self->{prev_state};
3888 wakaba 1.5 $self->{s_kwd} = '';
3889 wakaba 1.1 ## Reconsume.
3890     return ({type => CHARACTER_TOKEN,
3891     data => '&#',
3892     line => $self->{line_prev},
3893     column => $self->{column_prev} - 1,
3894     });
3895     redo A;
3896     } else {
3897    
3898     $self->{ca}->{value} .= '&#';
3899     $self->{state} = $self->{prev_state};
3900 wakaba 1.5 $self->{s_kwd} = '';
3901 wakaba 1.1 ## Reconsume.
3902     redo A;
3903     }
3904     }
3905     } elsif ($self->{state} == NCR_NUM_STATE) {
3906     if (0x0030 <= $self->{nc} and
3907     $self->{nc} <= 0x0039) { # 0..9
3908    
3909     $self->{s_kwd} *= 10;
3910     $self->{s_kwd} += $self->{nc} - 0x0030;
3911    
3912     ## Stay in the state.
3913    
3914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915     $self->{line_prev} = $self->{line};
3916     $self->{column_prev} = $self->{column};
3917     $self->{column}++;
3918     $self->{nc}
3919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920     } else {
3921     $self->{set_nc}->($self);
3922     }
3923    
3924     redo A;
3925     } elsif ($self->{nc} == 0x003B) { # ;
3926    
3927    
3928     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3929     $self->{line_prev} = $self->{line};
3930     $self->{column_prev} = $self->{column};
3931     $self->{column}++;
3932     $self->{nc}
3933     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3934     } else {
3935     $self->{set_nc}->($self);
3936     }
3937    
3938     #
3939     } else {
3940    
3941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3942     ## Reconsume.
3943     #
3944     }
3945    
3946     my $code = $self->{s_kwd};
3947     my $l = $self->{line_prev};
3948     my $c = $self->{column_prev};
3949     if ($charref_map->{$code}) {
3950    
3951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3952     text => (sprintf 'U+%04X', $code),
3953     line => $l, column => $c);
3954     $code = $charref_map->{$code};
3955     } elsif ($code > 0x10FFFF) {
3956    
3957     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3958     text => (sprintf 'U-%08X', $code),
3959     line => $l, column => $c);
3960     $code = 0xFFFD;
3961     }
3962    
3963     if ($self->{prev_state} == DATA_STATE) {
3964    
3965     $self->{state} = $self->{prev_state};
3966 wakaba 1.5 $self->{s_kwd} = '';
3967 wakaba 1.1 ## Reconsume.
3968     return ({type => CHARACTER_TOKEN, data => chr $code,
3969     line => $l, column => $c,
3970     });
3971     redo A;
3972     } else {
3973    
3974     $self->{ca}->{value} .= chr $code;
3975     $self->{ca}->{has_reference} = 1;
3976     $self->{state} = $self->{prev_state};
3977 wakaba 1.5 $self->{s_kwd} = '';
3978 wakaba 1.1 ## Reconsume.
3979     redo A;
3980     }
3981     } elsif ($self->{state} == HEXREF_X_STATE) {
3982     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3983     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3984     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3985     # 0..9, A..F, a..f
3986    
3987     $self->{state} = HEXREF_HEX_STATE;
3988     $self->{s_kwd} = 0;
3989     ## Reconsume.
3990     redo A;
3991     } else {
3992     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3993     line => $self->{line_prev},
3994     column => $self->{column_prev} - 2);
3995    
3996     ## NOTE: According to the spec algorithm, nothing is returned,
3997     ## and then "&#" followed by "X" or "x" is appended to the parent
3998     ## element or the attribute value in the later processing.
3999    
4000     if ($self->{prev_state} == DATA_STATE) {
4001    
4002     $self->{state} = $self->{prev_state};
4003 wakaba 1.5 $self->{s_kwd} = '';
4004 wakaba 1.1 ## Reconsume.
4005     return ({type => CHARACTER_TOKEN,
4006     data => '&' . $self->{s_kwd},
4007     line => $self->{line_prev},
4008     column => $self->{column_prev} - length $self->{s_kwd},
4009     });
4010     redo A;
4011     } else {
4012    
4013     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4014     $self->{state} = $self->{prev_state};
4015 wakaba 1.5 $self->{s_kwd} = '';
4016 wakaba 1.1 ## Reconsume.
4017     redo A;
4018     }
4019     }
4020     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4021     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4022     # 0..9
4023    
4024     $self->{s_kwd} *= 0x10;
4025     $self->{s_kwd} += $self->{nc} - 0x0030;
4026     ## Stay in the state.
4027    
4028     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4029     $self->{line_prev} = $self->{line};
4030     $self->{column_prev} = $self->{column};
4031     $self->{column}++;
4032     $self->{nc}
4033     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4034     } else {
4035     $self->{set_nc}->($self);
4036     }
4037    
4038     redo A;
4039     } elsif (0x0061 <= $self->{nc} and
4040     $self->{nc} <= 0x0066) { # a..f
4041    
4042     $self->{s_kwd} *= 0x10;
4043     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4044     ## Stay in the state.
4045    
4046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4047     $self->{line_prev} = $self->{line};
4048     $self->{column_prev} = $self->{column};
4049     $self->{column}++;
4050     $self->{nc}
4051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4052     } else {
4053     $self->{set_nc}->($self);
4054     }
4055    
4056     redo A;
4057     } elsif (0x0041 <= $self->{nc} and
4058     $self->{nc} <= 0x0046) { # A..F
4059    
4060     $self->{s_kwd} *= 0x10;
4061     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4062     ## Stay in the state.
4063    
4064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4065     $self->{line_prev} = $self->{line};
4066     $self->{column_prev} = $self->{column};
4067     $self->{column}++;
4068     $self->{nc}
4069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4070     } else {
4071     $self->{set_nc}->($self);
4072     }
4073    
4074     redo A;
4075     } elsif ($self->{nc} == 0x003B) { # ;
4076    
4077    
4078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4079     $self->{line_prev} = $self->{line};
4080     $self->{column_prev} = $self->{column};
4081     $self->{column}++;
4082     $self->{nc}
4083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4084     } else {
4085     $self->{set_nc}->($self);
4086     }
4087    
4088     #
4089     } else {
4090    
4091     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4092     line => $self->{line},
4093     column => $self->{column});
4094     ## Reconsume.
4095     #
4096     }
4097    
4098     my $code = $self->{s_kwd};
4099     my $l = $self->{line_prev};
4100     my $c = $self->{column_prev};
4101     if ($charref_map->{$code}) {
4102    
4103     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4104     text => (sprintf 'U+%04X', $code),
4105     line => $l, column => $c);
4106     $code = $charref_map->{$code};
4107     } elsif ($code > 0x10FFFF) {
4108    
4109     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4110     text => (sprintf 'U-%08X', $code),
4111     line => $l, column => $c);
4112     $code = 0xFFFD;
4113     }
4114    
4115     if ($self->{prev_state} == DATA_STATE) {
4116    
4117     $self->{state} = $self->{prev_state};
4118 wakaba 1.5 $self->{s_kwd} = '';
4119 wakaba 1.1 ## Reconsume.
4120     return ({type => CHARACTER_TOKEN, data => chr $code,
4121     line => $l, column => $c,
4122     });
4123     redo A;
4124     } else {
4125    
4126     $self->{ca}->{value} .= chr $code;
4127     $self->{ca}->{has_reference} = 1;
4128     $self->{state} = $self->{prev_state};
4129 wakaba 1.5 $self->{s_kwd} = '';
4130 wakaba 1.1 ## Reconsume.
4131     redo A;
4132     }
4133     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4134     if (length $self->{s_kwd} < 30 and
4135     ## NOTE: Some number greater than the maximum length of entity name
4136     ((0x0041 <= $self->{nc} and # a
4137     $self->{nc} <= 0x005A) or # x
4138     (0x0061 <= $self->{nc} and # a
4139     $self->{nc} <= 0x007A) or # z
4140     (0x0030 <= $self->{nc} and # 0
4141     $self->{nc} <= 0x0039) or # 9
4142     $self->{nc} == 0x003B)) { # ;
4143     our $EntityChar;
4144     $self->{s_kwd} .= chr $self->{nc};
4145     if (defined $EntityChar->{$self->{s_kwd}}) {
4146     if ($self->{nc} == 0x003B) { # ;
4147    
4148     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4149     $self->{entity__match} = 1;
4150    
4151     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4152     $self->{line_prev} = $self->{line};
4153     $self->{column_prev} = $self->{column};
4154     $self->{column}++;
4155     $self->{nc}
4156     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4157     } else {
4158     $self->{set_nc}->($self);
4159     }
4160    
4161     #
4162     } else {
4163    
4164     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4165     $self->{entity__match} = -1;
4166     ## Stay in the state.
4167    
4168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4169     $self->{line_prev} = $self->{line};
4170     $self->{column_prev} = $self->{column};
4171     $self->{column}++;
4172     $self->{nc}
4173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4174     } else {
4175     $self->{set_nc}->($self);
4176     }
4177    
4178     redo A;
4179     }
4180     } else {
4181    
4182     $self->{entity__value} .= chr $self->{nc};
4183     $self->{entity__match} *= 2;
4184     ## Stay in the state.
4185    
4186     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4187     $self->{line_prev} = $self->{line};
4188     $self->{column_prev} = $self->{column};
4189     $self->{column}++;
4190     $self->{nc}
4191     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4192     } else {
4193     $self->{set_nc}->($self);
4194     }
4195    
4196     redo A;
4197     }
4198     }
4199    
4200     my $data;
4201     my $has_ref;
4202     if ($self->{entity__match} > 0) {
4203    
4204     $data = $self->{entity__value};
4205     $has_ref = 1;
4206     #
4207     } elsif ($self->{entity__match} < 0) {
4208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4209     if ($self->{prev_state} != DATA_STATE and # in attribute
4210     $self->{entity__match} < -1) {
4211    
4212     $data = '&' . $self->{s_kwd};
4213     #
4214     } else {
4215    
4216     $data = $self->{entity__value};
4217     $has_ref = 1;
4218     #
4219     }
4220     } else {
4221    
4222     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4223     line => $self->{line_prev},
4224     column => $self->{column_prev} - length $self->{s_kwd});
4225     $data = '&' . $self->{s_kwd};
4226     #
4227     }
4228    
4229     ## NOTE: In these cases, when a character reference is found,
4230     ## it is consumed and a character token is returned, or, otherwise,
4231     ## nothing is consumed and returned, according to the spec algorithm.
4232     ## In this implementation, anything that has been examined by the
4233     ## tokenizer is appended to the parent element or the attribute value
4234     ## as string, either literal string when no character reference or
4235     ## entity-replaced string otherwise, in this stage, since any characters
4236     ## that would not be consumed are appended in the data state or in an
4237     ## appropriate attribute value state anyway.
4238    
4239     if ($self->{prev_state} == DATA_STATE) {
4240    
4241     $self->{state} = $self->{prev_state};
4242 wakaba 1.5 $self->{s_kwd} = '';
4243 wakaba 1.1 ## Reconsume.
4244     return ({type => CHARACTER_TOKEN,
4245     data => $data,
4246     line => $self->{line_prev},
4247     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4248     });
4249     redo A;
4250     } else {
4251    
4252     $self->{ca}->{value} .= $data;
4253     $self->{ca}->{has_reference} = 1 if $has_ref;
4254     $self->{state} = $self->{prev_state};
4255 wakaba 1.5 $self->{s_kwd} = '';
4256 wakaba 1.1 ## Reconsume.
4257     redo A;
4258     }
4259     } else {
4260     die "$0: $self->{state}: Unknown state";
4261     }
4262     } # A
4263    
4264     die "$0: _get_next_token: unexpected case";
4265     } # _get_next_token
4266    
4267     1;
4268 wakaba 1.6 ## $Date: 2008/10/14 14:38:59 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24