/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download) (as text)
Wed Oct 15 08:05:47 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.8: +13 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	15 Oct 2008 08:04:32 -0000
	* XML-Parser.t: "xml/ns-elements-1.dat" added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 08:05:44 -0000
	* ns-elements-1.dat: New test data file.

	* ns-attrs-1.dat: New test data added.

	* elements-1.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 08:03:32 -0000
	* Tokenizer.pm.src: XML tag name start character support for start
	tags.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 08:04:01 -0000
	* Parser.pm.src: Bug fixes for the handling of ":" in the element
	type names and attribute names.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.9 our $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
      ## Exporter is set up inside |BEGIN| so that the export lists already
      ## exist at compile time, when |Whatpm::HTML| imports |:token| from
      ## its own |BEGIN| block.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
      ## Token type constants that may be imported by name.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
       ## The |:token| tag imports all of them at once.  The list repeats
       ## |@EXPORT_OK| verbatim, so the two have to be edited together.
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
       ## These are the values stored in the |type| field of a token.  Each
       ## one is a sub with an empty prototype that returns a literal.
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
              ## The |CM_*| values are single bits.  The |*_CONTENT_MODEL|
              ## values are the bit combinations assigned to
              ## |$self->{content_model}|, which the tokenizer tests with |&|
              ## against the |CM_*| bits.
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
       ## These are the values of |$self->{state}| that |_get_next_token|
       ## dispatches on.  The numbers 1 and 12 belong to the two states that
       ## are commented out below; no active constant uses them.
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
               ## (processing instruction states; |TAG_OPEN_STATE| switches to
               ## |PI_STATE| on "<?" only when |$self->{is_xml}| is true)
118     sub PI_STATE () { 51 }
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
        ## NOTE(review): Both literals below denote the same number (1 << 11);
        ## the second merely uses a |_| digit separator.  Confirm against
        ## Whatpm::HTML that the two constants are meant to be equal.
127    
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129     sub FOREIGN_EL () { 0b1_00000000000 }
130    
131     ## Character reference mappings
        ## |$charref_map| maps a code point number to the code point number
        ## that replaces it.  The lookup itself is not in this part of the
        ## file; presumably it is applied to the value of a numeric character
        ## reference (TODO confirm).
132    
133     my $charref_map = {
134     0x0D => 0x000A,
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
        ## Everything listed below is replaced by U+FFFD.  The control range
        ## skips 0x09, 0x0A and 0x0C, which get no entry, and 0x0D, which is
        ## mapped to 0x0A above.
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run to U+FDEF; U+FDE0..U+FDEF are not covered here)
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if they used the state machine in the spec.
178    
179     sub _initialize_tokenizer ($) {
        ## Resets the tokenizer fields of the parser object (the only
        ## argument) to their start-of-input values and then fetches the
        ## first input character.  There is no explicit |return|.
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
197     delete $self->{self_closing}; # flag checked at the top of |_get_next_token|
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character (-1 is also what |_get_next_token| treats as EOF)
201     #$self->{next_nc}
        ## NOTE(review): |!!!...| lines are presumably macros expanded when
        ## this .src file is processed; the expansion is not visible here.
202     !!!next-input-character;
203     $self->{token} = []; # queue of pending tokens; |_get_next_token| shifts from it first
204     # $self->{escape}
205     } # _initialize_tokenizer
206    
207     ## A token has:
208     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
210     ## ->{name} (DOCTYPE_TOKEN)
211     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212     ## ->{pubid} (DOCTYPE_TOKEN)
213     ## ->{sysid} (DOCTYPE_TOKEN)
214     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
215     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
216     ## ->{name}
217     ## ->{value}
218     ## ->{has_reference} == 1 or 0
219     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
220 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
221 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
223     ## while the token is pushed back to the stack.
224    
225     ## Emitted token MUST immediately be handled by the tree construction state.
226    
227     ## Before each step, UA MAY check to see if either one of the scripts in
228     ## "list of scripts that will execute as soon as possible" or the first
229     ## script in the "list of scripts that will execute asynchronously",
230     ## has completed loading. If one has, then it MUST be executed
231     ## and removed from the list.
232    
233     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
234     ## (This requirement was dropped from HTML5 spec, unfortunately.)
235    
236     my $is_space = {
        ## Set of code points that the tokenizer treats as white space.  It is
        ## looked up with the numeric next input character, as in
        ## |$is_space->{$self->{nc}}|.  VT and CR are deliberately left out
        ## (their entries are commented out).
        ## NOTE(review): CR is presumably never seen here because newlines are
        ## normalized before tokenization; confirm.
237     0x0009 => 1, # CHARACTER TABULATION (HT)
238     0x000A => 1, # LINE FEED (LF)
239     #0x000B => 0, # LINE TABULATION (VT)
240     0x000C => 1, # FORM FEED (FF)
241     #0x000D => 1, # CARRIAGE RETURN (CR)
242     0x0020 => 1, # SPACE (SP)
243     };
244    
245     sub _get_next_token ($) {
246     my $self = shift;
247    
248     if ($self->{self_closing}) {
249     !!!parse-error (type => 'nestc', token => $self->{ct});
250     ## NOTE: The |self_closing| flag is only set by start tag token.
251     ## In addition, when a start tag token is emitted, it is always set to
252     ## |ct|.
253     delete $self->{self_closing};
254     }
255    
256     if (@{$self->{token}}) {
257     $self->{self_closing} = $self->{token}->[0]->{self_closing};
258     return shift @{$self->{token}};
259     }
260    
261     A: {
262     if ($self->{state} == PCDATA_STATE) {
263     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
264    
265     if ($self->{nc} == 0x0026) { # &
266     !!!cp (0.1);
267     ## NOTE: In the spec, the tokenizer is switched to the
268     ## "entity data state". In this implementation, the tokenizer
269     ## is switched to the |ENTITY_STATE|, which is an implementation
270     ## of the "consume a character reference" algorithm.
271     $self->{entity_add} = -1;
272     $self->{prev_state} = DATA_STATE;
273     $self->{state} = ENTITY_STATE;
274     !!!next-input-character;
275     redo A;
276     } elsif ($self->{nc} == 0x003C) { # <
277     !!!cp (0.2);
278     $self->{state} = TAG_OPEN_STATE;
279     !!!next-input-character;
280     redo A;
281     } elsif ($self->{nc} == -1) {
282     !!!cp (0.3);
283     !!!emit ({type => END_OF_FILE_TOKEN,
284     line => $self->{line}, column => $self->{column}});
285     last A; ## TODO: ok?
286     } else {
287     !!!cp (0.4);
288     #
289     }
290    
291     # Anything else
292     my $token = {type => CHARACTER_TOKEN,
293     data => chr $self->{nc},
294     line => $self->{line}, column => $self->{column},
295     };
296     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
297    
298     ## Stay in the state.
299     !!!next-input-character;
300     !!!emit ($token);
301     redo A;
302     } elsif ($self->{state} == DATA_STATE) {
303     $self->{s_kwd} = '' unless defined $self->{s_kwd};
304     if ($self->{nc} == 0x0026) { # &
305     $self->{s_kwd} = '';
306     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
307     not $self->{escape}) {
308     !!!cp (1);
309     ## NOTE: In the spec, the tokenizer is switched to the
310     ## "entity data state". In this implementation, the tokenizer
311     ## is switched to the |ENTITY_STATE|, which is an implementation
312     ## of the "consume a character reference" algorithm.
313     $self->{entity_add} = -1;
314     $self->{prev_state} = DATA_STATE;
315     $self->{state} = ENTITY_STATE;
316     !!!next-input-character;
317     redo A;
318     } else {
319     !!!cp (2);
320     #
321     }
322     } elsif ($self->{nc} == 0x002D) { # -
323     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
324 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
325 wakaba 1.1 !!!cp (3);
326     $self->{escape} = 1; # unless $self->{escape};
327     $self->{s_kwd} = '--';
328     #
329 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
330 wakaba 1.1 !!!cp (4);
331     $self->{s_kwd} = '--';
332     #
333 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
334     !!!cp (4.1);
335     $self->{s_kwd} .= '-';
336     #
337 wakaba 1.1 } else {
338     !!!cp (5);
339 wakaba 1.5 $self->{s_kwd} = '-';
340 wakaba 1.1 #
341     }
342     }
343    
344     #
345     } elsif ($self->{nc} == 0x0021) { # !
346     if (length $self->{s_kwd}) {
347     !!!cp (5.1);
348     $self->{s_kwd} .= '!';
349     #
350     } else {
351     !!!cp (5.2);
352     #$self->{s_kwd} = '';
353     #
354     }
355     #
356     } elsif ($self->{nc} == 0x003C) { # <
357     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
358     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
359     not $self->{escape})) {
360     !!!cp (6);
361     $self->{state} = TAG_OPEN_STATE;
362     !!!next-input-character;
363     redo A;
364     } else {
365     !!!cp (7);
366     $self->{s_kwd} = '';
367     #
368     }
369     } elsif ($self->{nc} == 0x003E) { # >
370     if ($self->{escape} and
371     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
372     if ($self->{s_kwd} eq '--') {
373     !!!cp (8);
374     delete $self->{escape};
375 wakaba 1.5 #
376 wakaba 1.1 } else {
377     !!!cp (9);
378 wakaba 1.5 #
379 wakaba 1.1 }
380 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
381     !!!cp (9.1);
382     !!!parse-error (type => 'unmatched mse', ## TODO: type
383     line => $self->{line_prev},
384     column => $self->{column_prev} - 1);
385     #
386 wakaba 1.1 } else {
387     !!!cp (10);
388 wakaba 1.5 #
389 wakaba 1.1 }
390    
391     $self->{s_kwd} = '';
392     #
393 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
394     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
395     !!!cp (10.1);
396     $self->{s_kwd} .= ']';
397     } elsif ($self->{s_kwd} eq ']]') {
398     !!!cp (10.2);
399     #
400     } else {
401     !!!cp (10.3);
402     $self->{s_kwd} = '';
403     }
404     #
405 wakaba 1.1 } elsif ($self->{nc} == -1) {
406     !!!cp (11);
407     $self->{s_kwd} = '';
408     !!!emit ({type => END_OF_FILE_TOKEN,
409     line => $self->{line}, column => $self->{column}});
410     last A; ## TODO: ok?
411     } else {
412     !!!cp (12);
413     $self->{s_kwd} = '';
414     #
415     }
416    
417     # Anything else
418     my $token = {type => CHARACTER_TOKEN,
419     data => chr $self->{nc},
420     line => $self->{line}, column => $self->{column},
421     };
422 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
423 wakaba 1.1 length $token->{data})) {
424     $self->{s_kwd} = '';
425     }
426    
427     ## Stay in the data state.
428 wakaba 1.5 if (not $self->{is_xml} and
429     $self->{content_model} == PCDATA_CONTENT_MODEL) {
430 wakaba 1.1 !!!cp (13);
431     $self->{state} = PCDATA_STATE;
432     } else {
433     !!!cp (14);
434     ## Stay in the state.
435     }
436     !!!next-input-character;
437     !!!emit ($token);
438     redo A;
439     } elsif ($self->{state} == TAG_OPEN_STATE) {
440     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
441     if ($self->{nc} == 0x002F) { # /
442     !!!cp (15);
443     !!!next-input-character;
444     $self->{state} = CLOSE_TAG_OPEN_STATE;
445     redo A;
446     } elsif ($self->{nc} == 0x0021) { # !
447     !!!cp (15.1);
448     $self->{s_kwd} = '<' unless $self->{escape};
449     #
450     } else {
451     !!!cp (16);
452     #
453     }
454    
455     ## reconsume
456     $self->{state} = DATA_STATE;
457 wakaba 1.5 $self->{s_kwd} = '';
458 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN, data => '<',
459     line => $self->{line_prev},
460     column => $self->{column_prev},
461     });
462     redo A;
463     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
464     if ($self->{nc} == 0x0021) { # !
465     !!!cp (17);
466     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
467     !!!next-input-character;
468     redo A;
469     } elsif ($self->{nc} == 0x002F) { # /
470     !!!cp (18);
471     $self->{state} = CLOSE_TAG_OPEN_STATE;
472     !!!next-input-character;
473     redo A;
474     } elsif (0x0041 <= $self->{nc} and
475     $self->{nc} <= 0x005A) { # A..Z
476     !!!cp (19);
477     $self->{ct}
478     = {type => START_TAG_TOKEN,
479 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
480 wakaba 1.1 line => $self->{line_prev},
481     column => $self->{column_prev}};
482     $self->{state} = TAG_NAME_STATE;
483     !!!next-input-character;
484     redo A;
485     } elsif (0x0061 <= $self->{nc} and
486     $self->{nc} <= 0x007A) { # a..z
487     !!!cp (20);
488     $self->{ct} = {type => START_TAG_TOKEN,
489     tag_name => chr ($self->{nc}),
490     line => $self->{line_prev},
491     column => $self->{column_prev}};
492     $self->{state} = TAG_NAME_STATE;
493     !!!next-input-character;
494     redo A;
495     } elsif ($self->{nc} == 0x003E) { # >
496     !!!cp (21);
497     !!!parse-error (type => 'empty start tag',
498     line => $self->{line_prev},
499     column => $self->{column_prev});
500     $self->{state} = DATA_STATE;
501 wakaba 1.5 $self->{s_kwd} = '';
502 wakaba 1.1 !!!next-input-character;
503    
504     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
505     line => $self->{line_prev},
506     column => $self->{column_prev},
507     });
508    
509     redo A;
510     } elsif ($self->{nc} == 0x003F) { # ?
511 wakaba 1.8 if ($self->{is_xml}) {
512     !!!cp (22.1);
513     $self->{state} = PI_STATE;
514     !!!next-input-character;
515     redo A;
516     } else {
517     !!!cp (22);
518     !!!parse-error (type => 'pio',
519     line => $self->{line_prev},
520     column => $self->{column_prev});
521     $self->{state} = BOGUS_COMMENT_STATE;
522     $self->{ct} = {type => COMMENT_TOKEN, data => '',
523     line => $self->{line_prev},
524     column => $self->{column_prev},
525     };
526     ## $self->{nc} is intentionally left as is
527     redo A;
528     }
529 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
530 wakaba 1.1 !!!cp (23);
531     !!!parse-error (type => 'bare stago',
532     line => $self->{line_prev},
533     column => $self->{column_prev});
534     $self->{state} = DATA_STATE;
535 wakaba 1.5 $self->{s_kwd} = '';
536 wakaba 1.1 ## reconsume
537    
538     !!!emit ({type => CHARACTER_TOKEN, data => '<',
539     line => $self->{line_prev},
540     column => $self->{column_prev},
541     });
542    
543     redo A;
544 wakaba 1.9 } else {
545     ## XML5: "<:" is a parse error.
546     !!!cp (23.1);
547     $self->{ct} = {type => START_TAG_TOKEN,
548     tag_name => chr ($self->{nc}),
549     line => $self->{line_prev},
550     column => $self->{column_prev}};
551     $self->{state} = TAG_NAME_STATE;
552     !!!next-input-character;
553     redo A;
554 wakaba 1.1 }
555     } else {
556     die "$0: $self->{content_model} in tag open";
557     }
558     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
559     ## NOTE: The "close tag open state" in the spec is implemented as
560     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
561    
562     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
563     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
564     if (defined $self->{last_stag_name}) {
565     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
566     $self->{s_kwd} = '';
567     ## Reconsume.
568     redo A;
569     } else {
570     ## No start tag token has ever been emitted
571     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
572     !!!cp (28);
573     $self->{state} = DATA_STATE;
574 wakaba 1.5 $self->{s_kwd} = '';
575 wakaba 1.1 ## Reconsume.
576     !!!emit ({type => CHARACTER_TOKEN, data => '</',
577     line => $l, column => $c,
578     });
579     redo A;
580     }
581     }
582    
583     if (0x0041 <= $self->{nc} and
584     $self->{nc} <= 0x005A) { # A..Z
585     !!!cp (29);
586     $self->{ct}
587     = {type => END_TAG_TOKEN,
588 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
589 wakaba 1.1 line => $l, column => $c};
590     $self->{state} = TAG_NAME_STATE;
591     !!!next-input-character;
592     redo A;
593     } elsif (0x0061 <= $self->{nc} and
594     $self->{nc} <= 0x007A) { # a..z
595     !!!cp (30);
596     $self->{ct} = {type => END_TAG_TOKEN,
597     tag_name => chr ($self->{nc}),
598     line => $l, column => $c};
599     $self->{state} = TAG_NAME_STATE;
600     !!!next-input-character;
601     redo A;
602     } elsif ($self->{nc} == 0x003E) { # >
603     !!!cp (31);
604     !!!parse-error (type => 'empty end tag',
605     line => $self->{line_prev}, ## "<" in "</>"
606     column => $self->{column_prev} - 1);
607     $self->{state} = DATA_STATE;
608 wakaba 1.5 $self->{s_kwd} = '';
609 wakaba 1.1 !!!next-input-character;
610     redo A;
611     } elsif ($self->{nc} == -1) {
612     !!!cp (32);
613     !!!parse-error (type => 'bare etago');
614 wakaba 1.5 $self->{s_kwd} = '';
615 wakaba 1.1 $self->{state} = DATA_STATE;
616     # reconsume
617    
618     !!!emit ({type => CHARACTER_TOKEN, data => '</',
619     line => $l, column => $c,
620     });
621    
622     redo A;
623     } else {
624     !!!cp (33);
625     !!!parse-error (type => 'bogus end tag');
626     $self->{state} = BOGUS_COMMENT_STATE;
627     $self->{ct} = {type => COMMENT_TOKEN, data => '',
628     line => $self->{line_prev}, # "<" of "</"
629     column => $self->{column_prev} - 1,
630     };
631     ## NOTE: $self->{nc} is intentionally left as is.
632     ## Although the "anything else" case of the spec not explicitly
633     ## states that the next input character is to be reconsumed,
634     ## it will be included to the |data| of the comment token
635     ## generated from the bogus end tag, as defined in the
636     ## "bogus comment state" entry.
637     redo A;
638     }
639     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
640     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
641     if (length $ch) {
642     my $CH = $ch;
643     $ch =~ tr/a-z/A-Z/;
644     my $nch = chr $self->{nc};
645     if ($nch eq $ch or $nch eq $CH) {
646     !!!cp (24);
647     ## Stay in the state.
648     $self->{s_kwd} .= $nch;
649     !!!next-input-character;
650     redo A;
651     } else {
652     !!!cp (25);
653     $self->{state} = DATA_STATE;
654 wakaba 1.5 $self->{s_kwd} = '';
655 wakaba 1.1 ## Reconsume.
656     !!!emit ({type => CHARACTER_TOKEN,
657     data => '</' . $self->{s_kwd},
658     line => $self->{line_prev},
659     column => $self->{column_prev} - 1 - length $self->{s_kwd},
660     });
661     redo A;
662     }
663     } else { # after "<{tag-name}"
664     unless ($is_space->{$self->{nc}} or
665     {
666     0x003E => 1, # >
667     0x002F => 1, # /
668     -1 => 1, # EOF
669     }->{$self->{nc}}) {
670     !!!cp (26);
671     ## Reconsume.
672     $self->{state} = DATA_STATE;
673 wakaba 1.5 $self->{s_kwd} = '';
674 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
675     data => '</' . $self->{s_kwd},
676     line => $self->{line_prev},
677     column => $self->{column_prev} - 1 - length $self->{s_kwd},
678     });
679     redo A;
680     } else {
681     !!!cp (27);
682     $self->{ct}
683     = {type => END_TAG_TOKEN,
684     tag_name => $self->{last_stag_name},
685     line => $self->{line_prev},
686     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
687     $self->{state} = TAG_NAME_STATE;
688     ## Reconsume.
689     redo A;
690     }
691     }
692     } elsif ($self->{state} == TAG_NAME_STATE) {
693     if ($is_space->{$self->{nc}}) {
694     !!!cp (34);
695     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
696     !!!next-input-character;
697     redo A;
698     } elsif ($self->{nc} == 0x003E) { # >
699     if ($self->{ct}->{type} == START_TAG_TOKEN) {
700     !!!cp (35);
701     $self->{last_stag_name} = $self->{ct}->{tag_name};
702     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
703     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
704     #if ($self->{ct}->{attributes}) {
705     # ## NOTE: This should never be reached.
706     # !!! cp (36);
707     # !!! parse-error (type => 'end tag attribute');
708     #} else {
709     !!!cp (37);
710     #}
711     } else {
712     die "$0: $self->{ct}->{type}: Unknown token type";
713     }
714     $self->{state} = DATA_STATE;
715 wakaba 1.5 $self->{s_kwd} = '';
716 wakaba 1.1 !!!next-input-character;
717    
718     !!!emit ($self->{ct}); # start tag or end tag
719    
720     redo A;
721     } elsif (0x0041 <= $self->{nc} and
722     $self->{nc} <= 0x005A) { # A..Z
723     !!!cp (38);
724 wakaba 1.4 $self->{ct}->{tag_name}
725     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
726 wakaba 1.1 # start tag or end tag
727     ## Stay in this state
728     !!!next-input-character;
729     redo A;
730     } elsif ($self->{nc} == -1) {
731     !!!parse-error (type => 'unclosed tag');
732     if ($self->{ct}->{type} == START_TAG_TOKEN) {
733     !!!cp (39);
734     $self->{last_stag_name} = $self->{ct}->{tag_name};
735     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
736     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
737     #if ($self->{ct}->{attributes}) {
738     # ## NOTE: This state should never be reached.
739     # !!! cp (40);
740     # !!! parse-error (type => 'end tag attribute');
741     #} else {
742     !!!cp (41);
743     #}
744     } else {
745     die "$0: $self->{ct}->{type}: Unknown token type";
746     }
747     $self->{state} = DATA_STATE;
748 wakaba 1.5 $self->{s_kwd} = '';
749 wakaba 1.1 # reconsume
750    
751     !!!emit ($self->{ct}); # start tag or end tag
752    
753     redo A;
754     } elsif ($self->{nc} == 0x002F) { # /
755     !!!cp (42);
756     $self->{state} = SELF_CLOSING_START_TAG_STATE;
757     !!!next-input-character;
758     redo A;
759     } else {
760     !!!cp (44);
761     $self->{ct}->{tag_name} .= chr $self->{nc};
762     # start tag or end tag
763     ## Stay in the state
764     !!!next-input-character;
765     redo A;
766     }
767     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
768     if ($is_space->{$self->{nc}}) {
769     !!!cp (45);
770     ## Stay in the state
771     !!!next-input-character;
772     redo A;
773     } elsif ($self->{nc} == 0x003E) { # >
774     if ($self->{ct}->{type} == START_TAG_TOKEN) {
775     !!!cp (46);
776     $self->{last_stag_name} = $self->{ct}->{tag_name};
777     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
778     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
779     if ($self->{ct}->{attributes}) {
780     !!!cp (47);
781     !!!parse-error (type => 'end tag attribute');
782     } else {
783     !!!cp (48);
784     }
785     } else {
786     die "$0: $self->{ct}->{type}: Unknown token type";
787     }
788     $self->{state} = DATA_STATE;
789 wakaba 1.5 $self->{s_kwd} = '';
790 wakaba 1.1 !!!next-input-character;
791    
792     !!!emit ($self->{ct}); # start tag or end tag
793    
794     redo A;
795     } elsif (0x0041 <= $self->{nc} and
796     $self->{nc} <= 0x005A) { # A..Z
797     !!!cp (49);
798     $self->{ca}
799 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
800 wakaba 1.1 value => '',
801     line => $self->{line}, column => $self->{column}};
802     $self->{state} = ATTRIBUTE_NAME_STATE;
803     !!!next-input-character;
804     redo A;
805     } elsif ($self->{nc} == 0x002F) { # /
806     !!!cp (50);
807     $self->{state} = SELF_CLOSING_START_TAG_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == -1) {
811     !!!parse-error (type => 'unclosed tag');
812     if ($self->{ct}->{type} == START_TAG_TOKEN) {
813     !!!cp (52);
814     $self->{last_stag_name} = $self->{ct}->{tag_name};
815     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
816     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
817     if ($self->{ct}->{attributes}) {
818     !!!cp (53);
819     !!!parse-error (type => 'end tag attribute');
820     } else {
821     !!!cp (54);
822     }
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 # reconsume
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } else {
834     if ({
835     0x0022 => 1, # "
836     0x0027 => 1, # '
837     0x003D => 1, # =
838     }->{$self->{nc}}) {
839     !!!cp (55);
840     !!!parse-error (type => 'bad attribute name');
841     } else {
842     !!!cp (56);
843     }
844     $self->{ca}
845     = {name => chr ($self->{nc}),
846     value => '',
847     line => $self->{line}, column => $self->{column}};
848     $self->{state} = ATTRIBUTE_NAME_STATE;
849     !!!next-input-character;
850     redo A;
851     }
852     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
853     my $before_leave = sub {
854     if (exists $self->{ct}->{attributes} # start tag or end tag
855     ->{$self->{ca}->{name}}) { # MUST
856     !!!cp (57);
857     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
858     ## Discard $self->{ca} # MUST
859     } else {
860     !!!cp (58);
861     $self->{ct}->{attributes}->{$self->{ca}->{name}}
862     = $self->{ca};
863     }
864     }; # $before_leave
865    
866     if ($is_space->{$self->{nc}}) {
867     !!!cp (59);
868     $before_leave->();
869     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
870     !!!next-input-character;
871     redo A;
872     } elsif ($self->{nc} == 0x003D) { # =
873     !!!cp (60);
874     $before_leave->();
875     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
876     !!!next-input-character;
877     redo A;
878     } elsif ($self->{nc} == 0x003E) { # >
879     $before_leave->();
880     if ($self->{ct}->{type} == START_TAG_TOKEN) {
881     !!!cp (61);
882     $self->{last_stag_name} = $self->{ct}->{tag_name};
883     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
884     !!!cp (62);
885     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
886     if ($self->{ct}->{attributes}) {
887     !!!parse-error (type => 'end tag attribute');
888     }
889     } else {
890     die "$0: $self->{ct}->{type}: Unknown token type";
891     }
892     $self->{state} = DATA_STATE;
893 wakaba 1.5 $self->{s_kwd} = '';
894 wakaba 1.1 !!!next-input-character;
895    
896     !!!emit ($self->{ct}); # start tag or end tag
897    
898     redo A;
899     } elsif (0x0041 <= $self->{nc} and
900     $self->{nc} <= 0x005A) { # A..Z
901     !!!cp (63);
902 wakaba 1.4 $self->{ca}->{name}
903     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
904 wakaba 1.1 ## Stay in the state
905     !!!next-input-character;
906     redo A;
907     } elsif ($self->{nc} == 0x002F) { # /
908     !!!cp (64);
909     $before_leave->();
910     $self->{state} = SELF_CLOSING_START_TAG_STATE;
911     !!!next-input-character;
912     redo A;
913     } elsif ($self->{nc} == -1) {
914     !!!parse-error (type => 'unclosed tag');
915     $before_leave->();
916     if ($self->{ct}->{type} == START_TAG_TOKEN) {
917     !!!cp (66);
918     $self->{last_stag_name} = $self->{ct}->{tag_name};
919     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
920     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
921     if ($self->{ct}->{attributes}) {
922     !!!cp (67);
923     !!!parse-error (type => 'end tag attribute');
924     } else {
925     ## NOTE: This state should never be reached.
926     !!!cp (68);
927     }
928     } else {
929     die "$0: $self->{ct}->{type}: Unknown token type";
930     }
931     $self->{state} = DATA_STATE;
932 wakaba 1.5 $self->{s_kwd} = '';
933 wakaba 1.1 # reconsume
934    
935     !!!emit ($self->{ct}); # start tag or end tag
936    
937     redo A;
938     } else {
939     if ($self->{nc} == 0x0022 or # "
940     $self->{nc} == 0x0027) { # '
941     !!!cp (69);
942     !!!parse-error (type => 'bad attribute name');
943     } else {
944     !!!cp (70);
945     }
946     $self->{ca}->{name} .= chr ($self->{nc});
947     ## Stay in the state
948     !!!next-input-character;
949     redo A;
950     }
951     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
952     if ($is_space->{$self->{nc}}) {
953     !!!cp (71);
954     ## Stay in the state
955     !!!next-input-character;
956     redo A;
957     } elsif ($self->{nc} == 0x003D) { # =
958     !!!cp (72);
959     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
960     !!!next-input-character;
961     redo A;
962     } elsif ($self->{nc} == 0x003E) { # >
963     if ($self->{ct}->{type} == START_TAG_TOKEN) {
964     !!!cp (73);
965     $self->{last_stag_name} = $self->{ct}->{tag_name};
966     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
967     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
968     if ($self->{ct}->{attributes}) {
969     !!!cp (74);
970     !!!parse-error (type => 'end tag attribute');
971     } else {
972     ## NOTE: This state should never be reached.
973     !!!cp (75);
974     }
975     } else {
976     die "$0: $self->{ct}->{type}: Unknown token type";
977     }
978     $self->{state} = DATA_STATE;
979 wakaba 1.5 $self->{s_kwd} = '';
980 wakaba 1.1 !!!next-input-character;
981    
982     !!!emit ($self->{ct}); # start tag or end tag
983    
984     redo A;
985     } elsif (0x0041 <= $self->{nc} and
986     $self->{nc} <= 0x005A) { # A..Z
987     !!!cp (76);
988     $self->{ca}
989 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
990 wakaba 1.1 value => '',
991     line => $self->{line}, column => $self->{column}};
992     $self->{state} = ATTRIBUTE_NAME_STATE;
993     !!!next-input-character;
994     redo A;
995     } elsif ($self->{nc} == 0x002F) { # /
996     !!!cp (77);
997     $self->{state} = SELF_CLOSING_START_TAG_STATE;
998     !!!next-input-character;
999     redo A;
1000     } elsif ($self->{nc} == -1) {
1001     !!!parse-error (type => 'unclosed tag');
1002     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1003     !!!cp (79);
1004     $self->{last_stag_name} = $self->{ct}->{tag_name};
1005     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1006     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1007     if ($self->{ct}->{attributes}) {
1008     !!!cp (80);
1009     !!!parse-error (type => 'end tag attribute');
1010     } else {
1011     ## NOTE: This state should never be reached.
1012     !!!cp (81);
1013     }
1014     } else {
1015     die "$0: $self->{ct}->{type}: Unknown token type";
1016     }
1017 wakaba 1.5 $self->{s_kwd} = '';
1018 wakaba 1.1 $self->{state} = DATA_STATE;
1019     # reconsume
1020    
1021     !!!emit ($self->{ct}); # start tag or end tag
1022    
1023     redo A;
1024     } else {
1025     if ($self->{nc} == 0x0022 or # "
1026     $self->{nc} == 0x0027) { # '
1027     !!!cp (78);
1028     !!!parse-error (type => 'bad attribute name');
1029     } else {
1030     !!!cp (82);
1031     }
1032     $self->{ca}
1033     = {name => chr ($self->{nc}),
1034     value => '',
1035     line => $self->{line}, column => $self->{column}};
1036     $self->{state} = ATTRIBUTE_NAME_STATE;
1037     !!!next-input-character;
1038     redo A;
1039     }
1040     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1041     if ($is_space->{$self->{nc}}) {
1042     !!!cp (83);
1043     ## Stay in the state
1044     !!!next-input-character;
1045     redo A;
1046     } elsif ($self->{nc} == 0x0022) { # "
1047     !!!cp (84);
1048     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1049     !!!next-input-character;
1050     redo A;
1051     } elsif ($self->{nc} == 0x0026) { # &
1052     !!!cp (85);
1053     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1054     ## reconsume
1055     redo A;
1056     } elsif ($self->{nc} == 0x0027) { # '
1057     !!!cp (86);
1058     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1059     !!!next-input-character;
1060     redo A;
1061     } elsif ($self->{nc} == 0x003E) { # >
1062     !!!parse-error (type => 'empty unquoted attribute value');
1063     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1064     !!!cp (87);
1065     $self->{last_stag_name} = $self->{ct}->{tag_name};
1066     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1067     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1068     if ($self->{ct}->{attributes}) {
1069     !!!cp (88);
1070     !!!parse-error (type => 'end tag attribute');
1071     } else {
1072     ## NOTE: This state should never be reached.
1073     !!!cp (89);
1074     }
1075     } else {
1076     die "$0: $self->{ct}->{type}: Unknown token type";
1077     }
1078     $self->{state} = DATA_STATE;
1079 wakaba 1.5 $self->{s_kwd} = '';
1080 wakaba 1.1 !!!next-input-character;
1081    
1082     !!!emit ($self->{ct}); # start tag or end tag
1083    
1084     redo A;
1085     } elsif ($self->{nc} == -1) {
1086     !!!parse-error (type => 'unclosed tag');
1087     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1088     !!!cp (90);
1089     $self->{last_stag_name} = $self->{ct}->{tag_name};
1090     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1091     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1092     if ($self->{ct}->{attributes}) {
1093     !!!cp (91);
1094     !!!parse-error (type => 'end tag attribute');
1095     } else {
1096     ## NOTE: This state should never be reached.
1097     !!!cp (92);
1098     }
1099     } else {
1100     die "$0: $self->{ct}->{type}: Unknown token type";
1101     }
1102     $self->{state} = DATA_STATE;
1103 wakaba 1.5 $self->{s_kwd} = '';
1104 wakaba 1.1 ## reconsume
1105    
1106     !!!emit ($self->{ct}); # start tag or end tag
1107    
1108     redo A;
1109     } else {
1110     if ($self->{nc} == 0x003D) { # =
1111     !!!cp (93);
1112     !!!parse-error (type => 'bad attribute value');
1113     } else {
1114     !!!cp (94);
1115     }
1116     $self->{ca}->{value} .= chr ($self->{nc});
1117     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1118     !!!next-input-character;
1119     redo A;
1120     }
1121     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1122     if ($self->{nc} == 0x0022) { # "
1123     !!!cp (95);
1124     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1125     !!!next-input-character;
1126     redo A;
1127     } elsif ($self->{nc} == 0x0026) { # &
1128     !!!cp (96);
1129     ## NOTE: In the spec, the tokenizer is switched to the
1130     ## "entity in attribute value state". In this implementation, the
1131     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1132     ## implementation of the "consume a character reference" algorithm.
1133     $self->{prev_state} = $self->{state};
1134     $self->{entity_add} = 0x0022; # "
1135     $self->{state} = ENTITY_STATE;
1136     !!!next-input-character;
1137     redo A;
1138     } elsif ($self->{nc} == -1) {
1139     !!!parse-error (type => 'unclosed attribute value');
1140     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1141     !!!cp (97);
1142     $self->{last_stag_name} = $self->{ct}->{tag_name};
1143     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1144     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1145     if ($self->{ct}->{attributes}) {
1146     !!!cp (98);
1147     !!!parse-error (type => 'end tag attribute');
1148     } else {
1149     ## NOTE: This state should never be reached.
1150     !!!cp (99);
1151     }
1152     } else {
1153     die "$0: $self->{ct}->{type}: Unknown token type";
1154     }
1155     $self->{state} = DATA_STATE;
1156 wakaba 1.5 $self->{s_kwd} = '';
1157 wakaba 1.1 ## reconsume
1158    
1159     !!!emit ($self->{ct}); # start tag or end tag
1160    
1161     redo A;
1162     } else {
1163     !!!cp (100);
1164     $self->{ca}->{value} .= chr ($self->{nc});
1165     $self->{read_until}->($self->{ca}->{value},
1166     q["&],
1167     length $self->{ca}->{value});
1168    
1169     ## Stay in the state
1170     !!!next-input-character;
1171     redo A;
1172     }
1173     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1174     if ($self->{nc} == 0x0027) { # '
1175     !!!cp (101);
1176     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1177     !!!next-input-character;
1178     redo A;
1179     } elsif ($self->{nc} == 0x0026) { # &
1180     !!!cp (102);
1181     ## NOTE: In the spec, the tokenizer is switched to the
1182     ## "entity in attribute value state". In this implementation, the
1183     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1184     ## implementation of the "consume a character reference" algorithm.
1185     $self->{entity_add} = 0x0027; # '
1186     $self->{prev_state} = $self->{state};
1187     $self->{state} = ENTITY_STATE;
1188     !!!next-input-character;
1189     redo A;
1190     } elsif ($self->{nc} == -1) {
1191     !!!parse-error (type => 'unclosed attribute value');
1192     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1193     !!!cp (103);
1194     $self->{last_stag_name} = $self->{ct}->{tag_name};
1195     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1196     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1197     if ($self->{ct}->{attributes}) {
1198     !!!cp (104);
1199     !!!parse-error (type => 'end tag attribute');
1200     } else {
1201     ## NOTE: This state should never be reached.
1202     !!!cp (105);
1203     }
1204     } else {
1205     die "$0: $self->{ct}->{type}: Unknown token type";
1206     }
1207     $self->{state} = DATA_STATE;
1208 wakaba 1.5 $self->{s_kwd} = '';
1209 wakaba 1.1 ## reconsume
1210    
1211     !!!emit ($self->{ct}); # start tag or end tag
1212    
1213     redo A;
1214     } else {
1215     !!!cp (106);
1216     $self->{ca}->{value} .= chr ($self->{nc});
1217     $self->{read_until}->($self->{ca}->{value},
1218     q['&],
1219     length $self->{ca}->{value});
1220    
1221     ## Stay in the state
1222     !!!next-input-character;
1223     redo A;
1224     }
1225     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1226     if ($is_space->{$self->{nc}}) {
1227     !!!cp (107);
1228     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1229     !!!next-input-character;
1230     redo A;
1231     } elsif ($self->{nc} == 0x0026) { # &
1232     !!!cp (108);
1233     ## NOTE: In the spec, the tokenizer is switched to the
1234     ## "entity in attribute value state". In this implementation, the
1235     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1236     ## implementation of the "consume a character reference" algorithm.
1237     $self->{entity_add} = -1;
1238     $self->{prev_state} = $self->{state};
1239     $self->{state} = ENTITY_STATE;
1240     !!!next-input-character;
1241     redo A;
1242     } elsif ($self->{nc} == 0x003E) { # >
1243     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244     !!!cp (109);
1245     $self->{last_stag_name} = $self->{ct}->{tag_name};
1246     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1248     if ($self->{ct}->{attributes}) {
1249     !!!cp (110);
1250     !!!parse-error (type => 'end tag attribute');
1251     } else {
1252     ## NOTE: This state should never be reached.
1253     !!!cp (111);
1254     }
1255     } else {
1256     die "$0: $self->{ct}->{type}: Unknown token type";
1257     }
1258     $self->{state} = DATA_STATE;
1259 wakaba 1.5 $self->{s_kwd} = '';
1260 wakaba 1.1 !!!next-input-character;
1261    
1262     !!!emit ($self->{ct}); # start tag or end tag
1263    
1264     redo A;
1265     } elsif ($self->{nc} == -1) {
1266     !!!parse-error (type => 'unclosed tag');
1267     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1268     !!!cp (112);
1269     $self->{last_stag_name} = $self->{ct}->{tag_name};
1270     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1271     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1272     if ($self->{ct}->{attributes}) {
1273     !!!cp (113);
1274     !!!parse-error (type => 'end tag attribute');
1275     } else {
1276     ## NOTE: This state should never be reached.
1277     !!!cp (114);
1278     }
1279     } else {
1280     die "$0: $self->{ct}->{type}: Unknown token type";
1281     }
1282     $self->{state} = DATA_STATE;
1283 wakaba 1.5 $self->{s_kwd} = '';
1284 wakaba 1.1 ## reconsume
1285    
1286     !!!emit ($self->{ct}); # start tag or end tag
1287    
1288     redo A;
1289     } else {
1290     if ({
1291     0x0022 => 1, # "
1292     0x0027 => 1, # '
1293     0x003D => 1, # =
1294     }->{$self->{nc}}) {
1295     !!!cp (115);
1296     !!!parse-error (type => 'bad attribute value');
1297     } else {
1298     !!!cp (116);
1299     }
1300     $self->{ca}->{value} .= chr ($self->{nc});
1301     $self->{read_until}->($self->{ca}->{value},
1302     q["'=& >],
1303     length $self->{ca}->{value});
1304    
1305     ## Stay in the state
1306     !!!next-input-character;
1307     redo A;
1308     }
1309     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1310     if ($is_space->{$self->{nc}}) {
1311     !!!cp (118);
1312     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1313     !!!next-input-character;
1314     redo A;
1315     } elsif ($self->{nc} == 0x003E) { # >
1316     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1317     !!!cp (119);
1318     $self->{last_stag_name} = $self->{ct}->{tag_name};
1319     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1320     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1321     if ($self->{ct}->{attributes}) {
1322     !!!cp (120);
1323     !!!parse-error (type => 'end tag attribute');
1324     } else {
1325     ## NOTE: This state should never be reached.
1326     !!!cp (121);
1327     }
1328     } else {
1329     die "$0: $self->{ct}->{type}: Unknown token type";
1330     }
1331     $self->{state} = DATA_STATE;
1332 wakaba 1.5 $self->{s_kwd} = '';
1333 wakaba 1.1 !!!next-input-character;
1334    
1335     !!!emit ($self->{ct}); # start tag or end tag
1336    
1337     redo A;
1338     } elsif ($self->{nc} == 0x002F) { # /
1339     !!!cp (122);
1340     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1341     !!!next-input-character;
1342     redo A;
1343     } elsif ($self->{nc} == -1) {
1344     !!!parse-error (type => 'unclosed tag');
1345     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1346     !!!cp (122.3);
1347     $self->{last_stag_name} = $self->{ct}->{tag_name};
1348     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1349     if ($self->{ct}->{attributes}) {
1350     !!!cp (122.1);
1351     !!!parse-error (type => 'end tag attribute');
1352     } else {
1353     ## NOTE: This state should never be reached.
1354     !!!cp (122.2);
1355     }
1356     } else {
1357     die "$0: $self->{ct}->{type}: Unknown token type";
1358     }
1359     $self->{state} = DATA_STATE;
1360 wakaba 1.5 $self->{s_kwd} = '';
1361 wakaba 1.1 ## Reconsume.
1362     !!!emit ($self->{ct}); # start tag or end tag
1363     redo A;
1364     } else {
1365     !!!cp ('124.1');
1366     !!!parse-error (type => 'no space between attributes');
1367     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1368     ## reconsume
1369     redo A;
1370     }
1371     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1372     if ($self->{nc} == 0x003E) { # >
1373     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1374     !!!cp ('124.2');
1375     !!!parse-error (type => 'nestc', token => $self->{ct});
1376     ## TODO: Different type than slash in start tag
1377     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1378     if ($self->{ct}->{attributes}) {
1379     !!!cp ('124.4');
1380     !!!parse-error (type => 'end tag attribute');
1381     } else {
1382     !!!cp ('124.5');
1383     }
1384     ## TODO: Test |<title></title/>|
1385     } else {
1386     !!!cp ('124.3');
1387     $self->{self_closing} = 1;
1388     }
1389    
1390     $self->{state} = DATA_STATE;
1391 wakaba 1.5 $self->{s_kwd} = '';
1392 wakaba 1.1 !!!next-input-character;
1393    
1394     !!!emit ($self->{ct}); # start tag or end tag
1395    
1396     redo A;
1397     } elsif ($self->{nc} == -1) {
1398     !!!parse-error (type => 'unclosed tag');
1399     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1400     !!!cp (124.7);
1401     $self->{last_stag_name} = $self->{ct}->{tag_name};
1402     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1403     if ($self->{ct}->{attributes}) {
1404     !!!cp (124.5);
1405     !!!parse-error (type => 'end tag attribute');
1406     } else {
1407     ## NOTE: This state should never be reached.
1408     !!!cp (124.6);
1409     }
1410     } else {
1411     die "$0: $self->{ct}->{type}: Unknown token type";
1412     }
1413     $self->{state} = DATA_STATE;
1414 wakaba 1.5 $self->{s_kwd} = '';
1415 wakaba 1.1 ## Reconsume.
1416     !!!emit ($self->{ct}); # start tag or end tag
1417     redo A;
1418     } else {
1419     !!!cp ('124.4');
1420     !!!parse-error (type => 'nestc');
1421     ## TODO: This error type is wrong.
1422     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1423     ## Reconsume.
1424     redo A;
1425     }
1426     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1427     ## (only happens in the PCDATA state)
1428    
1429     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1430     ## consumes characters on a one-by-one basis.
1431    
1432     if ($self->{nc} == 0x003E) { # >
1433     !!!cp (124);
1434     $self->{state} = DATA_STATE;
1435 wakaba 1.5 $self->{s_kwd} = '';
1436 wakaba 1.1 !!!next-input-character;
1437    
1438     !!!emit ($self->{ct}); # comment
1439     redo A;
1440     } elsif ($self->{nc} == -1) {
1441     !!!cp (125);
1442     $self->{state} = DATA_STATE;
1443 wakaba 1.5 $self->{s_kwd} = '';
1444 wakaba 1.1 ## reconsume
1445    
1446     !!!emit ($self->{ct}); # comment
1447     redo A;
1448     } else {
1449     !!!cp (126);
1450     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1451     $self->{read_until}->($self->{ct}->{data},
1452     q[>],
1453     length $self->{ct}->{data});
1454    
1455     ## Stay in the state.
1456     !!!next-input-character;
1457     redo A;
1458     }
1459     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1460     ## (only happens in the PCDATA state)
1461    
1462     if ($self->{nc} == 0x002D) { # -
1463     !!!cp (133);
1464     $self->{state} = MD_HYPHEN_STATE;
1465     !!!next-input-character;
1466     redo A;
1467     } elsif ($self->{nc} == 0x0044 or # D
1468     $self->{nc} == 0x0064) { # d
1469     ## ASCII case-insensitive.
1470     !!!cp (130);
1471     $self->{state} = MD_DOCTYPE_STATE;
1472     $self->{s_kwd} = chr $self->{nc};
1473     !!!next-input-character;
1474     redo A;
1475 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1476     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1477     $self->{is_xml}) and
1478 wakaba 1.1 $self->{nc} == 0x005B) { # [
1479     !!!cp (135.4);
1480     $self->{state} = MD_CDATA_STATE;
1481     $self->{s_kwd} = '[';
1482     !!!next-input-character;
1483     redo A;
1484     } else {
1485     !!!cp (136);
1486     }
1487    
1488     !!!parse-error (type => 'bogus comment',
1489     line => $self->{line_prev},
1490     column => $self->{column_prev} - 1);
1491     ## Reconsume.
1492     $self->{state} = BOGUS_COMMENT_STATE;
1493     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1494     line => $self->{line_prev},
1495     column => $self->{column_prev} - 1,
1496     };
1497     redo A;
1498     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1499     if ($self->{nc} == 0x002D) { # -
1500     !!!cp (127);
1501     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1502     line => $self->{line_prev},
1503     column => $self->{column_prev} - 2,
1504     };
1505     $self->{state} = COMMENT_START_STATE;
1506     !!!next-input-character;
1507     redo A;
1508     } else {
1509     !!!cp (128);
1510     !!!parse-error (type => 'bogus comment',
1511     line => $self->{line_prev},
1512     column => $self->{column_prev} - 2);
1513     $self->{state} = BOGUS_COMMENT_STATE;
1514     ## Reconsume.
1515     $self->{ct} = {type => COMMENT_TOKEN,
1516     data => '-',
1517     line => $self->{line_prev},
1518     column => $self->{column_prev} - 2,
1519     };
1520     redo A;
1521     }
1522     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1523     ## ASCII case-insensitive.
1524     if ($self->{nc} == [
1525     undef,
1526     0x004F, # O
1527     0x0043, # C
1528     0x0054, # T
1529     0x0059, # Y
1530     0x0050, # P
1531     ]->[length $self->{s_kwd}] or
1532     $self->{nc} == [
1533     undef,
1534     0x006F, # o
1535     0x0063, # c
1536     0x0074, # t
1537     0x0079, # y
1538     0x0070, # p
1539     ]->[length $self->{s_kwd}]) {
1540     !!!cp (131);
1541     ## Stay in the state.
1542     $self->{s_kwd} .= chr $self->{nc};
1543     !!!next-input-character;
1544     redo A;
1545     } elsif ((length $self->{s_kwd}) == 6 and
1546     ($self->{nc} == 0x0045 or # E
1547     $self->{nc} == 0x0065)) { # e
1548     !!!cp (129);
1549     $self->{state} = DOCTYPE_STATE;
1550     $self->{ct} = {type => DOCTYPE_TOKEN,
1551     quirks => 1,
1552     line => $self->{line_prev},
1553     column => $self->{column_prev} - 7,
1554     };
1555     !!!next-input-character;
1556     redo A;
1557     } else {
1558     !!!cp (132);
1559     !!!parse-error (type => 'bogus comment',
1560     line => $self->{line_prev},
1561     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1562     $self->{state} = BOGUS_COMMENT_STATE;
1563     ## Reconsume.
1564     $self->{ct} = {type => COMMENT_TOKEN,
1565     data => $self->{s_kwd},
1566     line => $self->{line_prev},
1567     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1568     };
1569     redo A;
1570     }
1571     } elsif ($self->{state} == MD_CDATA_STATE) {
1572     if ($self->{nc} == {
1573     '[' => 0x0043, # C
1574     '[C' => 0x0044, # D
1575     '[CD' => 0x0041, # A
1576     '[CDA' => 0x0054, # T
1577     '[CDAT' => 0x0041, # A
1578     }->{$self->{s_kwd}}) {
1579     !!!cp (135.1);
1580     ## Stay in the state.
1581     $self->{s_kwd} .= chr $self->{nc};
1582     !!!next-input-character;
1583     redo A;
1584     } elsif ($self->{s_kwd} eq '[CDATA' and
1585     $self->{nc} == 0x005B) { # [
1586 wakaba 1.6 if ($self->{is_xml} and
1587     not $self->{tainted} and
1588     @{$self->{open_elements} or []} == 0) {
1589 wakaba 1.8 !!!cp (135.2);
1590 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1591     line => $self->{line_prev},
1592     column => $self->{column_prev} - 7);
1593     $self->{tainted} = 1;
1594 wakaba 1.8 } else {
1595     !!!cp (135.21);
1596 wakaba 1.6 }
1597    
1598 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1599     data => '',
1600     line => $self->{line_prev},
1601     column => $self->{column_prev} - 7};
1602     $self->{state} = CDATA_SECTION_STATE;
1603     !!!next-input-character;
1604     redo A;
1605     } else {
1606     !!!cp (135.3);
1607     !!!parse-error (type => 'bogus comment',
1608     line => $self->{line_prev},
1609     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1610     $self->{state} = BOGUS_COMMENT_STATE;
1611     ## Reconsume.
1612     $self->{ct} = {type => COMMENT_TOKEN,
1613     data => $self->{s_kwd},
1614     line => $self->{line_prev},
1615     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1616     };
1617     redo A;
1618     }
1619     } elsif ($self->{state} == COMMENT_START_STATE) {
1620     if ($self->{nc} == 0x002D) { # -
1621     !!!cp (137);
1622     $self->{state} = COMMENT_START_DASH_STATE;
1623     !!!next-input-character;
1624     redo A;
1625     } elsif ($self->{nc} == 0x003E) { # >
1626     !!!cp (138);
1627     !!!parse-error (type => 'bogus comment');
1628     $self->{state} = DATA_STATE;
1629 wakaba 1.5 $self->{s_kwd} = '';
1630 wakaba 1.1 !!!next-input-character;
1631    
1632     !!!emit ($self->{ct}); # comment
1633    
1634     redo A;
1635     } elsif ($self->{nc} == -1) {
1636     !!!cp (139);
1637     !!!parse-error (type => 'unclosed comment');
1638     $self->{state} = DATA_STATE;
1639 wakaba 1.5 $self->{s_kwd} = '';
1640 wakaba 1.1 ## reconsume
1641    
1642     !!!emit ($self->{ct}); # comment
1643    
1644     redo A;
1645     } else {
1646     !!!cp (140);
1647     $self->{ct}->{data} # comment
1648     .= chr ($self->{nc});
1649     $self->{state} = COMMENT_STATE;
1650     !!!next-input-character;
1651     redo A;
1652     }
1653     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1654     if ($self->{nc} == 0x002D) { # -
1655     !!!cp (141);
1656     $self->{state} = COMMENT_END_STATE;
1657     !!!next-input-character;
1658     redo A;
1659     } elsif ($self->{nc} == 0x003E) { # >
1660     !!!cp (142);
1661     !!!parse-error (type => 'bogus comment');
1662     $self->{state} = DATA_STATE;
1663 wakaba 1.5 $self->{s_kwd} = '';
1664 wakaba 1.1 !!!next-input-character;
1665    
1666     !!!emit ($self->{ct}); # comment
1667    
1668     redo A;
1669     } elsif ($self->{nc} == -1) {
1670     !!!cp (143);
1671     !!!parse-error (type => 'unclosed comment');
1672     $self->{state} = DATA_STATE;
1673 wakaba 1.5 $self->{s_kwd} = '';
1674 wakaba 1.1 ## reconsume
1675    
1676     !!!emit ($self->{ct}); # comment
1677    
1678     redo A;
1679     } else {
1680     !!!cp (144);
1681     $self->{ct}->{data} # comment
1682     .= '-' . chr ($self->{nc});
1683     $self->{state} = COMMENT_STATE;
1684     !!!next-input-character;
1685     redo A;
1686     }
1687     } elsif ($self->{state} == COMMENT_STATE) {
1688     if ($self->{nc} == 0x002D) { # -
1689     !!!cp (145);
1690     $self->{state} = COMMENT_END_DASH_STATE;
1691     !!!next-input-character;
1692     redo A;
1693     } elsif ($self->{nc} == -1) {
1694     !!!cp (146);
1695     !!!parse-error (type => 'unclosed comment');
1696     $self->{state} = DATA_STATE;
1697 wakaba 1.5 $self->{s_kwd} = '';
1698 wakaba 1.1 ## reconsume
1699    
1700     !!!emit ($self->{ct}); # comment
1701    
1702     redo A;
1703     } else {
1704     !!!cp (147);
1705     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1706     $self->{read_until}->($self->{ct}->{data},
1707     q[-],
1708     length $self->{ct}->{data});
1709    
1710     ## Stay in the state
1711     !!!next-input-character;
1712     redo A;
1713     }
1714     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1715     if ($self->{nc} == 0x002D) { # -
1716     !!!cp (148);
1717     $self->{state} = COMMENT_END_STATE;
1718     !!!next-input-character;
1719     redo A;
1720     } elsif ($self->{nc} == -1) {
1721     !!!cp (149);
1722     !!!parse-error (type => 'unclosed comment');
1723 wakaba 1.5 $self->{s_kwd} = '';
1724 wakaba 1.1 $self->{state} = DATA_STATE;
1725 wakaba 1.5 $self->{s_kwd} = '';
1726 wakaba 1.1 ## reconsume
1727    
1728     !!!emit ($self->{ct}); # comment
1729    
1730     redo A;
1731     } else {
1732     !!!cp (150);
1733     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1734     $self->{state} = COMMENT_STATE;
1735     !!!next-input-character;
1736     redo A;
1737     }
1738     } elsif ($self->{state} == COMMENT_END_STATE) {
1739     if ($self->{nc} == 0x003E) { # >
1740     !!!cp (151);
1741     $self->{state} = DATA_STATE;
1742 wakaba 1.5 $self->{s_kwd} = '';
1743 wakaba 1.1 !!!next-input-character;
1744    
1745     !!!emit ($self->{ct}); # comment
1746    
1747     redo A;
1748     } elsif ($self->{nc} == 0x002D) { # -
1749     !!!cp (152);
1750     !!!parse-error (type => 'dash in comment',
1751     line => $self->{line_prev},
1752     column => $self->{column_prev});
1753     $self->{ct}->{data} .= '-'; # comment
1754     ## Stay in the state
1755     !!!next-input-character;
1756     redo A;
1757     } elsif ($self->{nc} == -1) {
1758     !!!cp (153);
1759     !!!parse-error (type => 'unclosed comment');
1760     $self->{state} = DATA_STATE;
1761 wakaba 1.5 $self->{s_kwd} = '';
1762 wakaba 1.1 ## reconsume
1763    
1764     !!!emit ($self->{ct}); # comment
1765    
1766     redo A;
1767     } else {
1768     !!!cp (154);
1769     !!!parse-error (type => 'dash in comment',
1770     line => $self->{line_prev},
1771     column => $self->{column_prev});
1772     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1773     $self->{state} = COMMENT_STATE;
1774     !!!next-input-character;
1775     redo A;
1776     }
1777     } elsif ($self->{state} == DOCTYPE_STATE) {
1778     if ($is_space->{$self->{nc}}) {
1779     !!!cp (155);
1780     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1781     !!!next-input-character;
1782     redo A;
1783     } else {
1784     !!!cp (156);
1785     !!!parse-error (type => 'no space before DOCTYPE name');
1786     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1787     ## reconsume
1788     redo A;
1789     }
1790     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1791     if ($is_space->{$self->{nc}}) {
1792     !!!cp (157);
1793     ## Stay in the state
1794     !!!next-input-character;
1795     redo A;
1796     } elsif ($self->{nc} == 0x003E) { # >
1797     !!!cp (158);
1798     !!!parse-error (type => 'no DOCTYPE name');
1799     $self->{state} = DATA_STATE;
1800 wakaba 1.5 $self->{s_kwd} = '';
1801 wakaba 1.1 !!!next-input-character;
1802    
1803     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1804    
1805     redo A;
1806     } elsif ($self->{nc} == -1) {
1807     !!!cp (159);
1808     !!!parse-error (type => 'no DOCTYPE name');
1809     $self->{state} = DATA_STATE;
1810 wakaba 1.5 $self->{s_kwd} = '';
1811 wakaba 1.1 ## reconsume
1812    
1813     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1814    
1815     redo A;
1816     } else {
1817     !!!cp (160);
1818     $self->{ct}->{name} = chr $self->{nc};
1819     delete $self->{ct}->{quirks};
1820     $self->{state} = DOCTYPE_NAME_STATE;
1821     !!!next-input-character;
1822     redo A;
1823     }
1824     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1825     ## ISSUE: Redundant "First," in the spec.
1826     if ($is_space->{$self->{nc}}) {
1827     !!!cp (161);
1828     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1829     !!!next-input-character;
1830     redo A;
1831     } elsif ($self->{nc} == 0x003E) { # >
1832     !!!cp (162);
1833     $self->{state} = DATA_STATE;
1834 wakaba 1.5 $self->{s_kwd} = '';
1835 wakaba 1.1 !!!next-input-character;
1836    
1837     !!!emit ($self->{ct}); # DOCTYPE
1838    
1839     redo A;
1840     } elsif ($self->{nc} == -1) {
1841     !!!cp (163);
1842     !!!parse-error (type => 'unclosed DOCTYPE');
1843     $self->{state} = DATA_STATE;
1844 wakaba 1.5 $self->{s_kwd} = '';
1845 wakaba 1.1 ## reconsume
1846    
1847     $self->{ct}->{quirks} = 1;
1848     !!!emit ($self->{ct}); # DOCTYPE
1849    
1850     redo A;
1851     } else {
1852     !!!cp (164);
1853     $self->{ct}->{name}
1854     .= chr ($self->{nc}); # DOCTYPE
1855     ## Stay in the state
1856     !!!next-input-character;
1857     redo A;
1858     }
1859     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1860     if ($is_space->{$self->{nc}}) {
1861     !!!cp (165);
1862     ## Stay in the state
1863     !!!next-input-character;
1864     redo A;
1865     } elsif ($self->{nc} == 0x003E) { # >
1866     !!!cp (166);
1867     $self->{state} = DATA_STATE;
1868 wakaba 1.5 $self->{s_kwd} = '';
1869 wakaba 1.1 !!!next-input-character;
1870    
1871     !!!emit ($self->{ct}); # DOCTYPE
1872    
1873     redo A;
1874     } elsif ($self->{nc} == -1) {
1875     !!!cp (167);
1876     !!!parse-error (type => 'unclosed DOCTYPE');
1877     $self->{state} = DATA_STATE;
1878 wakaba 1.5 $self->{s_kwd} = '';
1879 wakaba 1.1 ## reconsume
1880    
1881     $self->{ct}->{quirks} = 1;
1882     !!!emit ($self->{ct}); # DOCTYPE
1883    
1884     redo A;
1885     } elsif ($self->{nc} == 0x0050 or # P
1886     $self->{nc} == 0x0070) { # p
1887     $self->{state} = PUBLIC_STATE;
1888     $self->{s_kwd} = chr $self->{nc};
1889     !!!next-input-character;
1890     redo A;
1891     } elsif ($self->{nc} == 0x0053 or # S
1892     $self->{nc} == 0x0073) { # s
1893     $self->{state} = SYSTEM_STATE;
1894     $self->{s_kwd} = chr $self->{nc};
1895     !!!next-input-character;
1896     redo A;
1897     } else {
1898     !!!cp (180);
1899     !!!parse-error (type => 'string after DOCTYPE name');
1900     $self->{ct}->{quirks} = 1;
1901    
1902     $self->{state} = BOGUS_DOCTYPE_STATE;
1903     !!!next-input-character;
1904     redo A;
1905     }
1906     } elsif ($self->{state} == PUBLIC_STATE) {
1907     ## ASCII case-insensitive
1908     if ($self->{nc} == [
1909     undef,
1910     0x0055, # U
1911     0x0042, # B
1912     0x004C, # L
1913     0x0049, # I
1914     ]->[length $self->{s_kwd}] or
1915     $self->{nc} == [
1916     undef,
1917     0x0075, # u
1918     0x0062, # b
1919     0x006C, # l
1920     0x0069, # i
1921     ]->[length $self->{s_kwd}]) {
1922     !!!cp (175);
1923     ## Stay in the state.
1924     $self->{s_kwd} .= chr $self->{nc};
1925     !!!next-input-character;
1926     redo A;
1927     } elsif ((length $self->{s_kwd}) == 5 and
1928     ($self->{nc} == 0x0043 or # C
1929     $self->{nc} == 0x0063)) { # c
1930     !!!cp (168);
1931     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1932     !!!next-input-character;
1933     redo A;
1934     } else {
1935     !!!cp (169);
1936     !!!parse-error (type => 'string after DOCTYPE name',
1937     line => $self->{line_prev},
1938     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1939     $self->{ct}->{quirks} = 1;
1940    
1941     $self->{state} = BOGUS_DOCTYPE_STATE;
1942     ## Reconsume.
1943     redo A;
1944     }
1945     } elsif ($self->{state} == SYSTEM_STATE) {
1946     ## ASCII case-insensitive
1947     if ($self->{nc} == [
1948     undef,
1949     0x0059, # Y
1950     0x0053, # S
1951     0x0054, # T
1952     0x0045, # E
1953     ]->[length $self->{s_kwd}] or
1954     $self->{nc} == [
1955     undef,
1956     0x0079, # y
1957     0x0073, # s
1958     0x0074, # t
1959     0x0065, # e
1960     ]->[length $self->{s_kwd}]) {
1961     !!!cp (170);
1962     ## Stay in the state.
1963     $self->{s_kwd} .= chr $self->{nc};
1964     !!!next-input-character;
1965     redo A;
1966     } elsif ((length $self->{s_kwd}) == 5 and
1967     ($self->{nc} == 0x004D or # M
1968     $self->{nc} == 0x006D)) { # m
1969     !!!cp (171);
1970     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1971     !!!next-input-character;
1972     redo A;
1973     } else {
1974     !!!cp (172);
1975     !!!parse-error (type => 'string after DOCTYPE name',
1976     line => $self->{line_prev},
1977     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1978     $self->{ct}->{quirks} = 1;
1979    
1980     $self->{state} = BOGUS_DOCTYPE_STATE;
1981     ## Reconsume.
1982     redo A;
1983     }
} elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
  ## The "PUBLIC" keyword has been read; skip white space and look for
  ## the opening quotation mark of the public identifier.
  ##
  ## NOTE: |$self->{nc}| is compared as a number (|==|) here, like in
  ## the other states of this chain; it is tested against -1 for the
  ## end of input and passed to |chr| elsewhere.
  if ($is_space->{$self->{nc}}) {
    !!!cp (181);
    ## Stay in the state
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x0022) { # "
    !!!cp (182);
    $self->{ct}->{pubid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x0027) { # '
    !!!cp (183);
    $self->{ct}->{pubid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x003E) { # >
    ## The DOCTYPE is closed without any public identifier.
    !!!cp (184);
    !!!parse-error (type => 'no PUBLIC literal');

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    !!!next-input-character;

    $self->{ct}->{quirks} = 1;
    !!!emit ($self->{ct}); # DOCTYPE

    redo A;
  } elsif ($self->{nc} == -1) {
    ## End of input inside the DOCTYPE.
    !!!cp (185);
    !!!parse-error (type => 'unclosed DOCTYPE');

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    $self->{ct}->{quirks} = 1;
    !!!emit ($self->{ct}); # DOCTYPE

    redo A;
  } else {
    ## Anything else: the rest of the DOCTYPE is skipped by the bogus
    ## DOCTYPE state.
    !!!cp (186);
    !!!parse-error (type => 'string after PUBLIC');
    $self->{ct}->{quirks} = 1;

    $self->{state} = BOGUS_DOCTYPE_STATE;
    !!!next-input-character;
    redo A;
  }
2035     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2036     if ($self->{nc} == 0x0022) { # "
2037     !!!cp (187);
2038     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2039     !!!next-input-character;
2040     redo A;
2041     } elsif ($self->{nc} == 0x003E) { # >
2042     !!!cp (188);
2043     !!!parse-error (type => 'unclosed PUBLIC literal');
2044    
2045     $self->{state} = DATA_STATE;
2046 wakaba 1.5 $self->{s_kwd} = '';
2047 wakaba 1.1 !!!next-input-character;
2048    
2049     $self->{ct}->{quirks} = 1;
2050     !!!emit ($self->{ct}); # DOCTYPE
2051    
2052     redo A;
2053     } elsif ($self->{nc} == -1) {
2054     !!!cp (189);
2055     !!!parse-error (type => 'unclosed PUBLIC literal');
2056    
2057     $self->{state} = DATA_STATE;
2058 wakaba 1.5 $self->{s_kwd} = '';
2059 wakaba 1.1 ## reconsume
2060    
2061     $self->{ct}->{quirks} = 1;
2062     !!!emit ($self->{ct}); # DOCTYPE
2063    
2064     redo A;
2065     } else {
2066     !!!cp (190);
2067     $self->{ct}->{pubid} # DOCTYPE
2068     .= chr $self->{nc};
2069     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2070     length $self->{ct}->{pubid});
2071    
2072     ## Stay in the state
2073     !!!next-input-character;
2074     redo A;
2075     }
2076     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2077     if ($self->{nc} == 0x0027) { # '
2078     !!!cp (191);
2079     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2080     !!!next-input-character;
2081     redo A;
2082     } elsif ($self->{nc} == 0x003E) { # >
2083     !!!cp (192);
2084     !!!parse-error (type => 'unclosed PUBLIC literal');
2085    
2086     $self->{state} = DATA_STATE;
2087 wakaba 1.5 $self->{s_kwd} = '';
2088 wakaba 1.1 !!!next-input-character;
2089    
2090     $self->{ct}->{quirks} = 1;
2091     !!!emit ($self->{ct}); # DOCTYPE
2092    
2093     redo A;
2094     } elsif ($self->{nc} == -1) {
2095     !!!cp (193);
2096     !!!parse-error (type => 'unclosed PUBLIC literal');
2097    
2098     $self->{state} = DATA_STATE;
2099 wakaba 1.5 $self->{s_kwd} = '';
2100 wakaba 1.1 ## reconsume
2101    
2102     $self->{ct}->{quirks} = 1;
2103     !!!emit ($self->{ct}); # DOCTYPE
2104    
2105     redo A;
2106     } else {
2107     !!!cp (194);
2108     $self->{ct}->{pubid} # DOCTYPE
2109     .= chr $self->{nc};
2110     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2111     length $self->{ct}->{pubid});
2112    
2113     ## Stay in the state
2114     !!!next-input-character;
2115     redo A;
2116     }
2117     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2118     if ($is_space->{$self->{nc}}) {
2119     !!!cp (195);
2120     ## Stay in the state
2121     !!!next-input-character;
2122     redo A;
2123     } elsif ($self->{nc} == 0x0022) { # "
2124     !!!cp (196);
2125     $self->{ct}->{sysid} = ''; # DOCTYPE
2126     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2127     !!!next-input-character;
2128     redo A;
2129     } elsif ($self->{nc} == 0x0027) { # '
2130     !!!cp (197);
2131     $self->{ct}->{sysid} = ''; # DOCTYPE
2132     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2133     !!!next-input-character;
2134     redo A;
2135     } elsif ($self->{nc} == 0x003E) { # >
2136     !!!cp (198);
2137     $self->{state} = DATA_STATE;
2138 wakaba 1.5 $self->{s_kwd} = '';
2139 wakaba 1.1 !!!next-input-character;
2140    
2141     !!!emit ($self->{ct}); # DOCTYPE
2142    
2143     redo A;
2144     } elsif ($self->{nc} == -1) {
2145     !!!cp (199);
2146     !!!parse-error (type => 'unclosed DOCTYPE');
2147    
2148     $self->{state} = DATA_STATE;
2149 wakaba 1.5 $self->{s_kwd} = '';
2150 wakaba 1.1 ## reconsume
2151    
2152     $self->{ct}->{quirks} = 1;
2153     !!!emit ($self->{ct}); # DOCTYPE
2154    
2155     redo A;
2156     } else {
2157     !!!cp (200);
2158     !!!parse-error (type => 'string after PUBLIC literal');
2159     $self->{ct}->{quirks} = 1;
2160    
2161     $self->{state} = BOGUS_DOCTYPE_STATE;
2162     !!!next-input-character;
2163     redo A;
2164     }
2165     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2166     if ($is_space->{$self->{nc}}) {
2167     !!!cp (201);
2168     ## Stay in the state
2169     !!!next-input-character;
2170     redo A;
2171     } elsif ($self->{nc} == 0x0022) { # "
2172     !!!cp (202);
2173     $self->{ct}->{sysid} = ''; # DOCTYPE
2174     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2175     !!!next-input-character;
2176     redo A;
2177     } elsif ($self->{nc} == 0x0027) { # '
2178     !!!cp (203);
2179     $self->{ct}->{sysid} = ''; # DOCTYPE
2180     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2181     !!!next-input-character;
2182     redo A;
2183     } elsif ($self->{nc} == 0x003E) { # >
2184     !!!cp (204);
2185     !!!parse-error (type => 'no SYSTEM literal');
2186     $self->{state} = DATA_STATE;
2187 wakaba 1.5 $self->{s_kwd} = '';
2188 wakaba 1.1 !!!next-input-character;
2189    
2190     $self->{ct}->{quirks} = 1;
2191     !!!emit ($self->{ct}); # DOCTYPE
2192    
2193     redo A;
2194     } elsif ($self->{nc} == -1) {
2195     !!!cp (205);
2196     !!!parse-error (type => 'unclosed DOCTYPE');
2197    
2198     $self->{state} = DATA_STATE;
2199 wakaba 1.5 $self->{s_kwd} = '';
2200 wakaba 1.1 ## reconsume
2201    
2202     $self->{ct}->{quirks} = 1;
2203     !!!emit ($self->{ct}); # DOCTYPE
2204    
2205     redo A;
2206     } else {
2207     !!!cp (206);
2208     !!!parse-error (type => 'string after SYSTEM');
2209     $self->{ct}->{quirks} = 1;
2210    
2211     $self->{state} = BOGUS_DOCTYPE_STATE;
2212     !!!next-input-character;
2213     redo A;
2214     }
2215     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2216     if ($self->{nc} == 0x0022) { # "
2217     !!!cp (207);
2218     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2219     !!!next-input-character;
2220     redo A;
2221     } elsif ($self->{nc} == 0x003E) { # >
2222     !!!cp (208);
2223     !!!parse-error (type => 'unclosed SYSTEM literal');
2224    
2225     $self->{state} = DATA_STATE;
2226 wakaba 1.5 $self->{s_kwd} = '';
2227 wakaba 1.1 !!!next-input-character;
2228    
2229     $self->{ct}->{quirks} = 1;
2230     !!!emit ($self->{ct}); # DOCTYPE
2231    
2232     redo A;
2233     } elsif ($self->{nc} == -1) {
2234     !!!cp (209);
2235     !!!parse-error (type => 'unclosed SYSTEM literal');
2236    
2237     $self->{state} = DATA_STATE;
2238 wakaba 1.5 $self->{s_kwd} = '';
2239 wakaba 1.1 ## reconsume
2240    
2241     $self->{ct}->{quirks} = 1;
2242     !!!emit ($self->{ct}); # DOCTYPE
2243    
2244     redo A;
2245     } else {
2246     !!!cp (210);
2247     $self->{ct}->{sysid} # DOCTYPE
2248     .= chr $self->{nc};
2249     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2250     length $self->{ct}->{sysid});
2251    
2252     ## Stay in the state
2253     !!!next-input-character;
2254     redo A;
2255     }
2256     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2257     if ($self->{nc} == 0x0027) { # '
2258     !!!cp (211);
2259     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2260     !!!next-input-character;
2261     redo A;
2262     } elsif ($self->{nc} == 0x003E) { # >
2263     !!!cp (212);
2264     !!!parse-error (type => 'unclosed SYSTEM literal');
2265    
2266     $self->{state} = DATA_STATE;
2267 wakaba 1.5 $self->{s_kwd} = '';
2268 wakaba 1.1 !!!next-input-character;
2269    
2270     $self->{ct}->{quirks} = 1;
2271     !!!emit ($self->{ct}); # DOCTYPE
2272    
2273     redo A;
2274     } elsif ($self->{nc} == -1) {
2275     !!!cp (213);
2276     !!!parse-error (type => 'unclosed SYSTEM literal');
2277    
2278     $self->{state} = DATA_STATE;
2279 wakaba 1.5 $self->{s_kwd} = '';
2280 wakaba 1.1 ## reconsume
2281    
2282     $self->{ct}->{quirks} = 1;
2283     !!!emit ($self->{ct}); # DOCTYPE
2284    
2285     redo A;
2286     } else {
2287     !!!cp (214);
2288     $self->{ct}->{sysid} # DOCTYPE
2289     .= chr $self->{nc};
2290     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2291     length $self->{ct}->{sysid});
2292    
2293     ## Stay in the state
2294     !!!next-input-character;
2295     redo A;
2296     }
2297     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2298     if ($is_space->{$self->{nc}}) {
2299     !!!cp (215);
2300     ## Stay in the state
2301     !!!next-input-character;
2302     redo A;
2303     } elsif ($self->{nc} == 0x003E) { # >
2304     !!!cp (216);
2305     $self->{state} = DATA_STATE;
2306 wakaba 1.5 $self->{s_kwd} = '';
2307 wakaba 1.1 !!!next-input-character;
2308    
2309     !!!emit ($self->{ct}); # DOCTYPE
2310    
2311     redo A;
2312     } elsif ($self->{nc} == -1) {
2313     !!!cp (217);
2314     !!!parse-error (type => 'unclosed DOCTYPE');
2315     $self->{state} = DATA_STATE;
2316 wakaba 1.5 $self->{s_kwd} = '';
2317 wakaba 1.1 ## reconsume
2318    
2319     $self->{ct}->{quirks} = 1;
2320     !!!emit ($self->{ct}); # DOCTYPE
2321    
2322     redo A;
2323     } else {
2324     !!!cp (218);
2325     !!!parse-error (type => 'string after SYSTEM literal');
2326     #$self->{ct}->{quirks} = 1;
2327    
2328     $self->{state} = BOGUS_DOCTYPE_STATE;
2329     !!!next-input-character;
2330     redo A;
2331     }
2332     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2333     if ($self->{nc} == 0x003E) { # >
2334     !!!cp (219);
2335     $self->{state} = DATA_STATE;
2336 wakaba 1.5 $self->{s_kwd} = '';
2337 wakaba 1.1 !!!next-input-character;
2338    
2339     !!!emit ($self->{ct}); # DOCTYPE
2340    
2341     redo A;
2342     } elsif ($self->{nc} == -1) {
2343     !!!cp (220);
2344     $self->{state} = DATA_STATE;
2345 wakaba 1.5 $self->{s_kwd} = '';
2346 wakaba 1.1 ## reconsume
2347    
2348     !!!emit ($self->{ct}); # DOCTYPE
2349    
2350     redo A;
2351     } else {
2352     !!!cp (221);
2353     my $s = '';
2354     $self->{read_until}->($s, q[>], 0);
2355    
2356     ## Stay in the state
2357     !!!next-input-character;
2358     redo A;
2359     }
2360     } elsif ($self->{state} == CDATA_SECTION_STATE) {
1361     ## NOTE: "CDATA section state" in the spec is jointly implemented
2362     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2363     ## and |CDATA_SECTION_MSE2_STATE|.
2364    
2365     if ($self->{nc} == 0x005D) { # ]
2366     !!!cp (221.1);
2367     $self->{state} = CDATA_SECTION_MSE1_STATE;
2368     !!!next-input-character;
2369     redo A;
2370     } elsif ($self->{nc} == -1) {
2371 wakaba 1.6 if ($self->{is_xml}) {
2372 wakaba 1.8 !!!cp (221.11);
2373 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2374 wakaba 1.8 } else {
2375     !!!cp (221.12);
2376 wakaba 1.6 }
2377    
2378 wakaba 1.1 $self->{state} = DATA_STATE;
2379 wakaba 1.5 $self->{s_kwd} = '';
2380 wakaba 1.1 !!!next-input-character;
2381     if (length $self->{ct}->{data}) { # character
2382     !!!cp (221.2);
2383     !!!emit ($self->{ct}); # character
2384     } else {
2385     !!!cp (221.3);
2386     ## No token to emit. $self->{ct} is discarded.
2387     }
2388     redo A;
2389     } else {
2390     !!!cp (221.4);
2391     $self->{ct}->{data} .= chr $self->{nc};
2392     $self->{read_until}->($self->{ct}->{data},
2393     q<]>,
2394     length $self->{ct}->{data});
2395    
2396     ## Stay in the state.
2397     !!!next-input-character;
2398     redo A;
2399     }
2400    
2401     ## ISSUE: "text tokens" in spec.
2402     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2403     if ($self->{nc} == 0x005D) { # ]
2404     !!!cp (221.5);
2405     $self->{state} = CDATA_SECTION_MSE2_STATE;
2406     !!!next-input-character;
2407     redo A;
2408     } else {
2409     !!!cp (221.6);
2410     $self->{ct}->{data} .= ']';
2411     $self->{state} = CDATA_SECTION_STATE;
2412     ## Reconsume.
2413     redo A;
2414     }
2415     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2416     if ($self->{nc} == 0x003E) { # >
2417     $self->{state} = DATA_STATE;
2418 wakaba 1.5 $self->{s_kwd} = '';
2419 wakaba 1.1 !!!next-input-character;
2420     if (length $self->{ct}->{data}) { # character
2421     !!!cp (221.7);
2422     !!!emit ($self->{ct}); # character
2423     } else {
2424     !!!cp (221.8);
2425     ## No token to emit. $self->{ct} is discarded.
2426     }
2427     redo A;
2428     } elsif ($self->{nc} == 0x005D) { # ]
2429     !!!cp (221.9); # character
2430     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2431     ## Stay in the state.
2432     !!!next-input-character;
2433     redo A;
2434     } else {
2435     !!!cp (221.11);
2436     $self->{ct}->{data} .= ']]'; # character
2437     $self->{state} = CDATA_SECTION_STATE;
2438     ## Reconsume.
2439     redo A;
2440     }
2441     } elsif ($self->{state} == ENTITY_STATE) {
2442     if ($is_space->{$self->{nc}} or
2443     {
2444     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2445     $self->{entity_add} => 1,
2446     }->{$self->{nc}}) {
2447     !!!cp (1001);
2448     ## Don't consume
2449     ## No error
2450     ## Return nothing.
2451     #
2452     } elsif ($self->{nc} == 0x0023) { # #
2453     !!!cp (999);
2454     $self->{state} = ENTITY_HASH_STATE;
2455     $self->{s_kwd} = '#';
2456     !!!next-input-character;
2457     redo A;
2458     } elsif ((0x0041 <= $self->{nc} and
2459     $self->{nc} <= 0x005A) or # A..Z
2460     (0x0061 <= $self->{nc} and
2461     $self->{nc} <= 0x007A)) { # a..z
2462     !!!cp (998);
2463     require Whatpm::_NamedEntityList;
2464     $self->{state} = ENTITY_NAME_STATE;
2465     $self->{s_kwd} = chr $self->{nc};
2466     $self->{entity__value} = $self->{s_kwd};
2467     $self->{entity__match} = 0;
2468     !!!next-input-character;
2469     redo A;
2470     } else {
2471     !!!cp (1027);
2472     !!!parse-error (type => 'bare ero');
2473     ## Return nothing.
2474     #
2475     }
2476    
2477     ## NOTE: No character is consumed by the "consume a character
2478     ## reference" algorithm. In other words, there is an "&" character
2479     ## that does not introduce a character reference, which would be
2480     ## appended to the parent element or the attribute value in later
2481     ## process of the tokenizer.
2482    
2483     if ($self->{prev_state} == DATA_STATE) {
2484     !!!cp (997);
2485     $self->{state} = $self->{prev_state};
2486 wakaba 1.5 $self->{s_kwd} = '';
2487 wakaba 1.1 ## Reconsume.
2488     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2489     line => $self->{line_prev},
2490     column => $self->{column_prev},
2491     });
2492     redo A;
2493     } else {
2494     !!!cp (996);
2495     $self->{ca}->{value} .= '&';
2496     $self->{state} = $self->{prev_state};
2497 wakaba 1.5 $self->{s_kwd} = '';
2498 wakaba 1.1 ## Reconsume.
2499     redo A;
2500     }
2501     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2502     if ($self->{nc} == 0x0078 or # x
2503     $self->{nc} == 0x0058) { # X
2504     !!!cp (995);
2505     $self->{state} = HEXREF_X_STATE;
2506     $self->{s_kwd} .= chr $self->{nc};
2507     !!!next-input-character;
2508     redo A;
2509     } elsif (0x0030 <= $self->{nc} and
2510     $self->{nc} <= 0x0039) { # 0..9
2511     !!!cp (994);
2512     $self->{state} = NCR_NUM_STATE;
2513     $self->{s_kwd} = $self->{nc} - 0x0030;
2514     !!!next-input-character;
2515     redo A;
2516     } else {
2517     !!!parse-error (type => 'bare nero',
2518     line => $self->{line_prev},
2519     column => $self->{column_prev} - 1);
2520    
2521     ## NOTE: According to the spec algorithm, nothing is returned,
2522     ## and then "&#" is appended to the parent element or the attribute
2523     ## value in the later processing.
2524    
2525     if ($self->{prev_state} == DATA_STATE) {
2526     !!!cp (1019);
2527     $self->{state} = $self->{prev_state};
2528 wakaba 1.5 $self->{s_kwd} = '';
2529 wakaba 1.1 ## Reconsume.
2530     !!!emit ({type => CHARACTER_TOKEN,
2531     data => '&#',
2532     line => $self->{line_prev},
2533     column => $self->{column_prev} - 1,
2534     });
2535     redo A;
2536     } else {
2537     !!!cp (993);
2538     $self->{ca}->{value} .= '&#';
2539     $self->{state} = $self->{prev_state};
2540 wakaba 1.5 $self->{s_kwd} = '';
2541 wakaba 1.1 ## Reconsume.
2542     redo A;
2543     }
2544     }
2545     } elsif ($self->{state} == NCR_NUM_STATE) {
2546     if (0x0030 <= $self->{nc} and
2547     $self->{nc} <= 0x0039) { # 0..9
2548     !!!cp (1012);
2549     $self->{s_kwd} *= 10;
2550     $self->{s_kwd} += $self->{nc} - 0x0030;
2551    
2552     ## Stay in the state.
2553     !!!next-input-character;
2554     redo A;
2555     } elsif ($self->{nc} == 0x003B) { # ;
2556     !!!cp (1013);
2557     !!!next-input-character;
2558     #
2559     } else {
2560     !!!cp (1014);
2561     !!!parse-error (type => 'no refc');
2562     ## Reconsume.
2563     #
2564     }
2565    
2566     my $code = $self->{s_kwd};
2567     my $l = $self->{line_prev};
2568     my $c = $self->{column_prev};
2569     if ($charref_map->{$code}) {
2570     !!!cp (1015);
2571     !!!parse-error (type => 'invalid character reference',
2572     text => (sprintf 'U+%04X', $code),
2573     line => $l, column => $c);
2574     $code = $charref_map->{$code};
2575     } elsif ($code > 0x10FFFF) {
2576     !!!cp (1016);
2577     !!!parse-error (type => 'invalid character reference',
2578     text => (sprintf 'U-%08X', $code),
2579     line => $l, column => $c);
2580     $code = 0xFFFD;
2581     }
2582    
2583     if ($self->{prev_state} == DATA_STATE) {
2584     !!!cp (992);
2585     $self->{state} = $self->{prev_state};
2586 wakaba 1.5 $self->{s_kwd} = '';
2587 wakaba 1.1 ## Reconsume.
2588     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2589 wakaba 1.7 has_reference => 1,
2590 wakaba 1.1 line => $l, column => $c,
2591     });
2592     redo A;
2593     } else {
2594     !!!cp (991);
2595     $self->{ca}->{value} .= chr $code;
2596     $self->{ca}->{has_reference} = 1;
2597     $self->{state} = $self->{prev_state};
2598 wakaba 1.5 $self->{s_kwd} = '';
2599 wakaba 1.1 ## Reconsume.
2600     redo A;
2601     }
} elsif ($self->{state} == HEXREF_X_STATE) {
  ## "&#x" or "&#X" has been read.  |$self->{s_kwd}| holds the text
  ## after the "&" ("#" followed by the "x" or "X" that was read).
  if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
      (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
      (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
    # 0..9, A..F, a..f
    !!!cp (990);
    $self->{state} = HEXREF_HEX_STATE;
    ## From here on |s_kwd| accumulates the numeric value.
    $self->{s_kwd} = 0;
    ## Reconsume.
    redo A;
  } else {
    !!!parse-error (type => 'bare hcro',
                    line => $self->{line_prev},
                    column => $self->{column_prev} - 2);

    ## NOTE: According to the spec algorithm, nothing is returned,
    ## and then "&#" followed by "X" or "x" is appended to the parent
    ## element or the attribute value in the later processing.

    ## Take the literal text and its start column out of |s_kwd|
    ## *before* the buffer is reset below; reading it afterwards would
    ## yield a bare "&" and a zero column offset.
    my $literal = '&' . $self->{s_kwd};
    my $literal_column = $self->{column_prev} - length $self->{s_kwd};

    if ($self->{prev_state} == DATA_STATE) {
      !!!cp (1005);
      $self->{state} = $self->{prev_state};
      $self->{s_kwd} = '';
      ## Reconsume.
      !!!emit ({type => CHARACTER_TOKEN,
                data => $literal,
                line => $self->{line_prev},
                column => $literal_column,
               });
      redo A;
    } else {
      !!!cp (989);
      $self->{ca}->{value} .= $literal;
      $self->{state} = $self->{prev_state};
      $self->{s_kwd} = '';
      ## Reconsume.
      redo A;
    }
  }
2641     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2642     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2643     # 0..9
2644     !!!cp (1002);
2645     $self->{s_kwd} *= 0x10;
2646     $self->{s_kwd} += $self->{nc} - 0x0030;
2647     ## Stay in the state.
2648     !!!next-input-character;
2649     redo A;
2650     } elsif (0x0061 <= $self->{nc} and
2651     $self->{nc} <= 0x0066) { # a..f
2652     !!!cp (1003);
2653     $self->{s_kwd} *= 0x10;
2654     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2655     ## Stay in the state.
2656     !!!next-input-character;
2657     redo A;
2658     } elsif (0x0041 <= $self->{nc} and
2659     $self->{nc} <= 0x0046) { # A..F
2660     !!!cp (1004);
2661     $self->{s_kwd} *= 0x10;
2662     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2663     ## Stay in the state.
2664     !!!next-input-character;
2665     redo A;
2666     } elsif ($self->{nc} == 0x003B) { # ;
2667     !!!cp (1006);
2668     !!!next-input-character;
2669     #
2670     } else {
2671     !!!cp (1007);
2672     !!!parse-error (type => 'no refc',
2673     line => $self->{line},
2674     column => $self->{column});
2675     ## Reconsume.
2676     #
2677     }
2678    
2679     my $code = $self->{s_kwd};
2680     my $l = $self->{line_prev};
2681     my $c = $self->{column_prev};
2682     if ($charref_map->{$code}) {
2683     !!!cp (1008);
2684     !!!parse-error (type => 'invalid character reference',
2685     text => (sprintf 'U+%04X', $code),
2686     line => $l, column => $c);
2687     $code = $charref_map->{$code};
2688     } elsif ($code > 0x10FFFF) {
2689     !!!cp (1009);
2690     !!!parse-error (type => 'invalid character reference',
2691     text => (sprintf 'U-%08X', $code),
2692     line => $l, column => $c);
2693     $code = 0xFFFD;
2694     }
2695    
2696     if ($self->{prev_state} == DATA_STATE) {
2697     !!!cp (988);
2698     $self->{state} = $self->{prev_state};
2699 wakaba 1.5 $self->{s_kwd} = '';
2700 wakaba 1.1 ## Reconsume.
2701     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2702 wakaba 1.7 has_reference => 1,
2703 wakaba 1.1 line => $l, column => $c,
2704     });
2705     redo A;
2706     } else {
2707     !!!cp (987);
2708     $self->{ca}->{value} .= chr $code;
2709     $self->{ca}->{has_reference} = 1;
2710     $self->{state} = $self->{prev_state};
2711 wakaba 1.5 $self->{s_kwd} = '';
2712 wakaba 1.1 ## Reconsume.
2713     redo A;
2714     }
} elsif ($self->{state} == ENTITY_NAME_STATE) {
  ## |$self->{s_kwd}| holds the name characters read so far (without
  ## the "&").  |$self->{entity__match}| is 0 while no entity name has
  ## matched, 1 for a match terminated by ";", -1 for a match without
  ## ";", and is doubled for every character read that does not
  ## produce a match (so it is less than -1 when other characters
  ## follow an unterminated match).
  if (length $self->{s_kwd} < 30 and
      ## NOTE: Some number greater than the maximum length of entity name
      ((0x0041 <= $self->{nc} and # A
        $self->{nc} <= 0x005A) or # Z
       (0x0061 <= $self->{nc} and # a
        $self->{nc} <= 0x007A) or # z
       (0x0030 <= $self->{nc} and # 0
        $self->{nc} <= 0x0039) or # 9
       $self->{nc} == 0x003B)) { # ;
    our $EntityChar;
    $self->{s_kwd} .= chr $self->{nc};
    if (defined $EntityChar->{$self->{s_kwd}}) {
      if ($self->{nc} == 0x003B) { # ;
        !!!cp (1020);
        $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
        $self->{entity__match} = 1;
        !!!next-input-character;
        ## Fall through to the result handling below.
        #
      } else {
        !!!cp (1021);
        $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
        $self->{entity__match} = -1;
        ## Stay in the state.
        !!!next-input-character;
        redo A;
      }
    } else {
      !!!cp (1022);
      $self->{entity__value} .= chr $self->{nc};
      $self->{entity__match} *= 2;
      ## Stay in the state.
      !!!next-input-character;
      redo A;
    }
  }

  ## The name is complete (terminated by ";", by a character that
  ## cannot be part of a name, or by the length limit).
  my $data;
  my $has_ref;
  if ($self->{entity__match} > 0) {
    !!!cp (1023);
    $data = $self->{entity__value};
    $has_ref = 1;
    #
  } elsif ($self->{entity__match} < 0) {
    !!!parse-error (type => 'no refc');
    if ($self->{prev_state} != DATA_STATE and # in attribute
        $self->{entity__match} < -1) {
      !!!cp (1024);
      $data = '&' . $self->{s_kwd};
      #
    } else {
      !!!cp (1025);
      $data = $self->{entity__value};
      $has_ref = 1;
      #
    }
  } else {
    !!!cp (1026);
    !!!parse-error (type => 'bare ero',
                    line => $self->{line_prev},
                    column => $self->{column_prev} - length $self->{s_kwd});
    $data = '&' . $self->{s_kwd};
    #
  }

  ## NOTE: In these cases, when a character reference is found,
  ## it is consumed and a character token is returned, or, otherwise,
  ## nothing is consumed and returned, according to the spec algorithm.
  ## In this implementation, anything that has been examined by the
  ## tokenizer is appended to the parent element or the attribute value
  ## as string, either literal string when no character reference or
  ## entity-replaced string otherwise, in this stage, since any characters
  ## that would not be consumed are appended in the data state or in an
  ## appropriate attribute value state anyway.

  if ($self->{prev_state} == DATA_STATE) {
    !!!cp (986);
    ## The token column depends on the length of the text read, so it
    ## has to be computed before |s_kwd| is reset.
    my $token_column = $self->{column_prev} + 1 - length $self->{s_kwd};
    $self->{state} = $self->{prev_state};
    $self->{s_kwd} = '';
    ## Reconsume.
    !!!emit ({type => CHARACTER_TOKEN,
              data => $data,
              has_reference => $has_ref,
              line => $self->{line_prev},
              column => $token_column,
             });
    redo A;
  } else {
    !!!cp (985);
    $self->{ca}->{value} .= $data;
    $self->{ca}->{has_reference} = 1 if $has_ref;
    $self->{state} = $self->{prev_state};
    $self->{s_kwd} = '';
    ## Reconsume.
    redo A;
  }
2812 wakaba 1.8
2813     ## XML-only states
2814    
2815     } elsif ($self->{state} == PI_STATE) {
2816     if ($is_space->{$self->{nc}} or
2817     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2818     $self->{nc} == -1) {
2819     !!!parse-error (type => 'bare pio', ## TODO: type
2820     line => $self->{line_prev},
2821     column => $self->{column_prev}
2822     - 1 * ($self->{nc} != -1));
2823     $self->{state} = BOGUS_COMMENT_STATE;
2824     ## Reconsume.
2825     $self->{ct} = {type => COMMENT_TOKEN,
2826     data => '?',
2827     line => $self->{line_prev},
2828     column => $self->{column_prev}
2829     - 1 * ($self->{nc} != -1),
2830     };
2831     redo A;
2832     } else {
2833     $self->{ct} = {type => PI_TOKEN,
2834     target => chr $self->{nc},
2835     data => '',
2836     line => $self->{line_prev},
2837     column => $self->{column_prev} - 1,
2838     };
2839     $self->{state} = PI_TARGET_STATE;
2840     !!!next-input-character;
2841     redo A;
2842     }
2843     } elsif ($self->{state} == PI_TARGET_STATE) {
2844     if ($is_space->{$self->{nc}}) {
2845     $self->{state} = PI_TARGET_AFTER_STATE;
2846     !!!next-input-character;
2847     redo A;
2848     } elsif ($self->{nc} == -1) {
2849     !!!parse-error (type => 'no pic'); ## TODO: type
2850     $self->{state} = DATA_STATE;
2851     $self->{s_kwd} = '';
2852     ## Reconsume.
2853     !!!emit ($self->{ct}); # pi
2854     redo A;
2855     } elsif ($self->{nc} == 0x003F) { # ?
2856     $self->{state} = PI_AFTER_STATE;
2857     !!!next-input-character;
2858     redo A;
2859     } else {
2860     ## XML5: typo ("tag name" -> "target")
2861     $self->{ct}->{target} .= chr $self->{nc}; # pi
2862     !!!next-input-character;
2863     redo A;
2864     }
2865     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
2866     if ($is_space->{$self->{nc}}) {
2867     ## Stay in the state.
2868     !!!next-input-character;
2869     redo A;
2870     } else {
2871     $self->{state} = PI_DATA_STATE;
2872     ## Reprocess.
2873     redo A;
2874     }
2875     } elsif ($self->{state} == PI_DATA_STATE) {
2876     if ($self->{nc} == 0x003F) { # ?
2877     $self->{state} = PI_DATA_AFTER_STATE;
2878     !!!next-input-character;
2879     redo A;
2880     } elsif ($self->{nc} == -1) {
2881     !!!parse-error (type => 'no pic'); ## TODO: type
2882     $self->{state} = DATA_STATE;
2883     $self->{s_kwd} = '';
2884     ## Reprocess.
2885     !!!emit ($self->{ct}); # pi
2886     redo A;
2887     } else {
2888     $self->{ct}->{data} .= chr $self->{nc}; # pi
2889     $self->{read_until}->($self->{ct}->{data}, q[?],
2890     length $self->{ct}->{data});
2891     ## Stay in the state.
2892     !!!next-input-character;
2893     ## Reprocess.
2894     redo A;
2895     }
2896     } elsif ($self->{state} == PI_AFTER_STATE) {
2897     if ($self->{nc} == 0x003E) { # >
2898     $self->{state} = DATA_STATE;
2899     $self->{s_kwd} = '';
2900     !!!next-input-character;
2901     !!!emit ($self->{ct}); # pi
2902     redo A;
2903     } elsif ($self->{nc} == 0x003F) { # ?
2904     !!!parse-error (type => 'no s after target', ## TODO: type
2905     line => $self->{line_prev},
2906     column => $self->{column_prev}); ## XML5: no error
2907     $self->{ct}->{data} .= '?';
2908     $self->{state} = PI_DATA_AFTER_STATE;
2909     !!!next-input-character;
2910     redo A;
2911     } else {
2912     !!!parse-error (type => 'no s after target', ## TODO: type
2913     line => $self->{line_prev},
2914     column => $self->{column_prev}
2915     + 1 * ($self->{nc} == -1)); ## XML5: no error
2916     $self->{ct}->{data} .= '?'; ## XML5: not appended
2917     $self->{state} = PI_DATA_STATE;
2918     ## Reprocess.
2919     redo A;
2920     }
2921     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
2922     ## XML5: Same as "pi after state" in XML5
2923     if ($self->{nc} == 0x003E) { # >
2924     $self->{state} = DATA_STATE;
2925     $self->{s_kwd} = '';
2926     !!!next-input-character;
2927     !!!emit ($self->{ct}); # pi
2928     redo A;
2929     } elsif ($self->{nc} == 0x003F) { # ?
2930     $self->{ct}->{data} .= '?';
2931     ## Stay in the state.
2932     !!!next-input-character;
2933     redo A;
2934     } else {
2935     $self->{ct}->{data} .= '?'; ## XML5: not appended
2936     $self->{state} = PI_DATA_STATE;
2937     ## Reprocess.
2938     redo A;
2939     }
2940    
2941 wakaba 1.1 } else {
2942     die "$0: $self->{state}: Unknown state";
2943     }
2944     } # A
2945    
2946     die "$0: _get_next_token: unexpected case";
2947     } # _get_next_token
2948    
2949     1;
2950 wakaba 1.9 ## $Date: 2008/10/15 04:38:22 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24