/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log


Revision 1.13 - (hide annotations) (download) (as text)
Thu Oct 16 03:39:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.12: +187 -60 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	16 Oct 2008 03:39:39 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/pis-2.dat" and "xml/comments-2.dat" are added.

++ whatpm/t/xml/ChangeLog	16 Oct 2008 03:39:53 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* doctypes-2.dat: New test added.

	* comments-2.dat, pis-2.dat: New test data files.

++ whatpm/Whatpm/HTML/ChangeLog	16 Oct 2008 03:36:51 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token type END_OF_DOCTYPE_TOKEN added.
	New states DOCTYPE_TAG_STATE and
	BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE are added.  (Bogus
	string after the internal subset, which was handled by the state
	BOGUS_DOCTYPE_STATE, is now handled by the new state.)  Support
	for comments, bogus comments, and processing instructions in the
	internal subset.  If there is the internal subset, then emit the
	doctype token before the internal subset (with its
	$token->{has_internal_subset} flag set) and an
	END_OF_DOCTYPE_TOKEN after the internal subset.

++ whatpm/Whatpm/XML/ChangeLog	16 Oct 2008 03:39:19 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Insertion mode IN_SUBSET_IM added.  In the
	"initial" insertion mode, if the DOCTYPE token's "has internal
	subset" flag is set, then switch to the "in subset" insertion
	mode.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
      ## The version string is derived from the CVS Revision keyword: every
      ## run of digits is extracted, the first group is printed as-is
      ## followed by a dot, and each remaining group is zero-padded to two
      ## digits (e.g. "1.12").
3 wakaba 1.13 our $VERSION=do{my @r=(q$Revision: 1.12 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
      ## Exporter is set up inside a BEGIN block so that the export lists
      ## exist at compile time.  Every token type constant can be imported
      ## individually (@EXPORT_OK) or all together through the ":token" tag
      ## (%EXPORT_TAGS).  The two lists must be kept in sync by hand.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.2 );
20    
21     our %EXPORT_TAGS = (
22     token => [qw(
23     DOCTYPE_TOKEN
24     COMMENT_TOKEN
25     START_TAG_TOKEN
26     END_TAG_TOKEN
27     END_OF_FILE_TOKEN
28     CHARACTER_TOKEN
29     PI_TOKEN
30     ABORT_TOKEN
31 wakaba 1.13 END_OF_DOCTYPE_TOKEN
32 wakaba 1.2 )],
33     );
34     }
35    
36 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37    
38 wakaba 1.2 ## Token types
      ##
      ## Each constant is a distinct integer that is stored in the |type|
      ## field of a token hash.  The subs are declared with an empty
      ## prototype so that perl can treat them as inlinable constants.
      ## These names are the ones listed in @EXPORT_OK and in the ":token"
      ## export tag.
39    
40 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
42     sub START_TAG_TOKEN () { 3 }
43     sub END_TAG_TOKEN () { 4 }
44     sub END_OF_FILE_TOKEN () { 5 }
45     sub CHARACTER_TOKEN () { 6 }
46 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
47     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48 wakaba 1.13 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49 wakaba 1.12
50     ## XML5: XML5 has "empty tag token". In this implementation, it is
51     ## represented as a start tag token with $self->{self_closing} flag
52     ## set to true.
53    
54     ## XML5: XML5 has "short end tag token". In this implementation, it
55     ## is represented as an end tag token whose $token->{tag_name} is set
56     ## to an empty string.
57 wakaba 1.1
58     package Whatpm::HTML;
      ## NOTE: From this point on, everything is compiled into the
      ## |Whatpm::HTML| package rather than |Whatpm::HTML::Tokenizer|.  The
      ## token type constants defined above are imported here at compile
      ## time through the ":token" export tag.
59    
60 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
61    
62 wakaba 1.1 ## Content model flags
      ##
      ## The three |CM_*| values are single bits; the four
      ## |*_CONTENT_MODEL| values are the combinations of those bits that
      ## the tokenizer stores in |$self->{content_model}|:
      ##   PLAINTEXT: no bit set
      ##   CDATA:     limited "<" markup only
      ##   RCDATA:    "&" markup and limited "<" markup
      ##   PCDATA:    "&" markup and any "<" markup
63    
64     sub CM_ENTITY () { 0b001 } # & markup in data
65     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
66     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
67    
68     sub PLAINTEXT_CONTENT_MODEL () { 0 }
69     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
70     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
71     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
72    
73     ## Tokenizer states
      ##
      ## Each constant is the integer stored in |$self->{state}| (and, for
      ## some transitions, in |$self->{prev_state}|).  The numbers 1 and 12
      ## are left unused: the two states that had them are commented out
      ## and, per the NOTE further down, are implemented jointly by the
      ## |ENTITY_STATE| .. |ENTITY_NAME_STATE| group.  Constants annotated
      ## with '"... state" in the spec' are implementation sub-states of
      ## the single spec state named in the annotation.
74    
75     sub DATA_STATE () { 0 }
76     #sub ENTITY_DATA_STATE () { 1 }
77     sub TAG_OPEN_STATE () { 2 }
78     sub CLOSE_TAG_OPEN_STATE () { 3 }
79     sub TAG_NAME_STATE () { 4 }
80     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
81     sub ATTRIBUTE_NAME_STATE () { 6 }
82     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
83     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
84     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
85     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
86     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
87     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
88     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
89     sub COMMENT_START_STATE () { 14 }
90     sub COMMENT_START_DASH_STATE () { 15 }
91     sub COMMENT_STATE () { 16 }
92     sub COMMENT_END_STATE () { 17 }
93     sub COMMENT_END_DASH_STATE () { 18 }
94     sub BOGUS_COMMENT_STATE () { 19 }
95     sub DOCTYPE_STATE () { 20 }
96     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
97     sub DOCTYPE_NAME_STATE () { 22 }
98     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
99     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
100     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
101     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
102     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
103     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
104     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
105     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
106     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
107     sub BOGUS_DOCTYPE_STATE () { 32 }
108     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
109     sub SELF_CLOSING_START_TAG_STATE () { 34 }
110     sub CDATA_SECTION_STATE () { 35 }
111     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
112     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
113     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
114     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
115     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
116     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
117     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
118     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
119     ## NOTE: "Entity data state", "entity in attribute value state", and
120     ## "consume a character reference" algorithm are jointly implemented
121     ## using the following six states:
122     sub ENTITY_STATE () { 44 }
123     sub ENTITY_HASH_STATE () { 45 }
124     sub NCR_NUM_STATE () { 46 }
125     sub HEXREF_X_STATE () { 47 }
126     sub HEXREF_HEX_STATE () { 48 }
127     sub ENTITY_NAME_STATE () { 49 }
128     sub PCDATA_STATE () { 50 } # "data state" in the spec
129    
130 wakaba 1.12 ## XML-only states
131 wakaba 1.8 sub PI_STATE () { 51 }
132     sub PI_TARGET_STATE () { 52 }
133     sub PI_TARGET_AFTER_STATE () { 53 }
134     sub PI_DATA_STATE () { 54 }
135     sub PI_AFTER_STATE () { 55 }
136     sub PI_DATA_AFTER_STATE () { 56 }
137 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139 wakaba 1.13 sub DOCTYPE_TAG_STATE () { 59 }
140     sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141 wakaba 1.8
142 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
143     ## list and descriptions)
      ##
      ## NOTE(review): The two binary literals below evaluate to the same
      ## number (1 << 11); the "_" in the second one is only a digit
      ## separator.  Presumably they are compared against different kinds
      ## of values (an insertion mode vs. an element category) - confirm
      ## against the definitions in Whatpm::HTML.
144    
145     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
146     sub FOREIGN_EL () { 0b1_00000000000 }
147    
148     ## Character reference mappings
      ##
      ## |$charref_map| maps a code point number to the code point number
      ## that replaces it.  Presumably it is consulted when a numeric
      ## character reference is resolved (the lookup site is not in this
      ## part of the file).
      ##
      ## The literal part maps U+000D to U+000A and maps the range
      ## 0x80..0x9F to the characters that Windows-1252 assigns to those
      ## bytes, with U+FFFD for the bytes that have no assignment there.
149    
150     my $charref_map = {
151     0x0D => 0x000A,
152     0x80 => 0x20AC,
153     0x81 => 0xFFFD,
154     0x82 => 0x201A,
155     0x83 => 0x0192,
156     0x84 => 0x201E,
157     0x85 => 0x2026,
158     0x86 => 0x2020,
159     0x87 => 0x2021,
160     0x88 => 0x02C6,
161     0x89 => 0x2030,
162     0x8A => 0x0160,
163     0x8B => 0x2039,
164     0x8C => 0x0152,
165     0x8D => 0xFFFD,
166     0x8E => 0x017D,
167     0x8F => 0xFFFD,
168     0x90 => 0xFFFD,
169     0x91 => 0x2018,
170     0x92 => 0x2019,
171     0x93 => 0x201C,
172     0x94 => 0x201D,
173     0x95 => 0x2022,
174     0x96 => 0x2013,
175     0x97 => 0x2014,
176     0x98 => 0x02DC,
177     0x99 => 0x2122,
178     0x9A => 0x0161,
179     0x9B => 0x203A,
180     0x9C => 0x0153,
181     0x9D => 0xFFFD,
182     0x9E => 0x017E,
183     0x9F => 0x0178,
184     }; # $charref_map
      ## Everything in the following list is mapped to U+FFFD: the C0
      ## control characters other than U+0009, U+000A, U+000C and U+000D,
      ## U+007F, the surrogate range, U+FDD0..U+FDDF, and the last two code
      ## points of every plane.  None of these keys appears in the literal
      ## above, so no entry is overwritten.  The existing ISSUE marker
      ## records that the upper bound 0xFDDF (rather than 0xFDEF) is
      ## questionable.
185     $charref_map->{$_} = 0xFFFD
186     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
187     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
188     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
189     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
190     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
191     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
192     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
193    
194     ## Implementations MUST act as if they used the state machine in the spec.
195    
196     sub _initialize_tokenizer ($) {
      ## Resets the tokenizer fields of the object before tokenization
      ## starts: the state becomes |DATA_STATE|, the content model becomes
      ## PCDATA, the current token, current attribute and last start tag
      ## name are cleared, the |self_closing| flag is removed, and the
      ## character buffer and the queue of pending tokens
      ## (|$self->{token}|) are emptied.
      ##
      ## Takes the object as its only argument.  No return value is
      ## documented; callers should not rely on one.
197     my $self = shift;
198    
199     ## NOTE: Fields set by |new| constructor:
200     #$self->{level}
201     #$self->{set_nc}
202     #$self->{parse_error}
203 wakaba 1.3 #$self->{is_xml} (if XML)
204 wakaba 1.1
205     $self->{state} = DATA_STATE; # MUST
206 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
207     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208 wakaba 1.1 #$self->{entity__value}; # initialized when used
209     #$self->{entity__match}; # initialized when used
210     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
211     undef $self->{ct}; # current token
212     undef $self->{ca}; # current attribute
213     undef $self->{last_stag_name}; # last emitted start tag name
214     #$self->{prev_state}; # initialized when used
215     delete $self->{self_closing};
216     $self->{char_buffer} = '';
217     $self->{char_buffer_pos} = 0;
218     $self->{nc} = -1; # next input character
219     #$self->{next_nc}
      ## |!!!next-input-character| is a macro of the ".src" format, not
      ## Perl; presumably the build step expands it into code that loads
      ## the first input character into |$self->{nc}| - confirm against
      ## the preprocessor.
220     !!!next-input-character;
221     $self->{token} = [];
      ## NOTE(review): |$self->{escape}| is only mentioned below and is not
      ## reset here - confirm that a stale value cannot survive when an
      ## object is initialized more than once.
222     # $self->{escape}
223     } # _initialize_tokenizer
224    
225     ## A token has:
226     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
227 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, ABORT_TOKEN, or END_OF_DOCTYPE_TOKEN
228 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
229     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
230 wakaba 1.11 ## ->{target} (PI_TOKEN)
231 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
232     ## ->{sysid} (DOCTYPE_TOKEN)
233     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
234     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
235     ## ->{name}
236     ## ->{value}
237     ## ->{has_reference} == 1 or 0
238 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
239     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
240 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
241 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
242 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
243    
244 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
245     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
246     ## while the token is pushed back to the stack.
247    
248     ## Emitted token MUST immediately be handled by the tree construction state.
249    
250     ## Before each step, UA MAY check to see if either one of the scripts in
251     ## "list of scripts that will execute as soon as possible" or the first
252     ## script in the "list of scripts that will execute asynchronously",
253     ## has completed loading. If one has, then it MUST be executed
254     ## and removed from the list.
255    
256     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
257     ## (This requirement was dropped from HTML5 spec, unfortunately.)
258    
259     my $is_space = {
      ## Lookup table of the code points that the tokenizer treats as
      ## space characters.  Keys are code point numbers, so the table can
      ## be indexed directly with |$self->{nc}|; a code point that is not
      ## listed yields undef (false).  U+000B and U+000D are deliberately
      ## left out (their entries are commented out); presumably U+000D
      ## never reaches the tokenizer because newlines are normalized
      ## earlier - confirm against the input handling code.
260     0x0009 => 1, # CHARACTER TABULATION (HT)
261     0x000A => 1, # LINE FEED (LF)
262     #0x000B => 0, # LINE TABULATION (VT)
263 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
264 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
265     0x0020 => 1, # SPACE (SP)
266     };
267    
268     sub _get_next_token ($) {
269     my $self = shift;
270    
271     if ($self->{self_closing}) {
272     !!!parse-error (type => 'nestc', token => $self->{ct});
273     ## NOTE: The |self_closing| flag is only set by start tag token.
274     ## In addition, when a start tag token is emitted, it is always set to
275     ## |ct|.
276     delete $self->{self_closing};
277     }
278    
279     if (@{$self->{token}}) {
280     $self->{self_closing} = $self->{token}->[0]->{self_closing};
281     return shift @{$self->{token}};
282     }
283    
284     A: {
285     if ($self->{state} == PCDATA_STATE) {
286     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
287    
288     if ($self->{nc} == 0x0026) { # &
289     !!!cp (0.1);
290     ## NOTE: In the spec, the tokenizer is switched to the
291     ## "entity data state". In this implementation, the tokenizer
292     ## is switched to the |ENTITY_STATE|, which is an implementation
293     ## of the "consume a character reference" algorithm.
294     $self->{entity_add} = -1;
295     $self->{prev_state} = DATA_STATE;
296     $self->{state} = ENTITY_STATE;
297     !!!next-input-character;
298     redo A;
299     } elsif ($self->{nc} == 0x003C) { # <
300     !!!cp (0.2);
301     $self->{state} = TAG_OPEN_STATE;
302     !!!next-input-character;
303     redo A;
304     } elsif ($self->{nc} == -1) {
305     !!!cp (0.3);
306     !!!emit ({type => END_OF_FILE_TOKEN,
307     line => $self->{line}, column => $self->{column}});
308     last A; ## TODO: ok?
309     } else {
310     !!!cp (0.4);
311     #
312     }
313    
314     # Anything else
315     my $token = {type => CHARACTER_TOKEN,
316     data => chr $self->{nc},
317     line => $self->{line}, column => $self->{column},
318     };
319     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
320    
321     ## Stay in the state.
322     !!!next-input-character;
323     !!!emit ($token);
324     redo A;
325     } elsif ($self->{state} == DATA_STATE) {
326     $self->{s_kwd} = '' unless defined $self->{s_kwd};
327     if ($self->{nc} == 0x0026) { # &
328     $self->{s_kwd} = '';
329     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
330     not $self->{escape}) {
331     !!!cp (1);
332     ## NOTE: In the spec, the tokenizer is switched to the
333     ## "entity data state". In this implementation, the tokenizer
334     ## is switched to the |ENTITY_STATE|, which is an implementation
335     ## of the "consume a character reference" algorithm.
336     $self->{entity_add} = -1;
337     $self->{prev_state} = DATA_STATE;
338     $self->{state} = ENTITY_STATE;
339     !!!next-input-character;
340     redo A;
341     } else {
342     !!!cp (2);
343     #
344     }
345     } elsif ($self->{nc} == 0x002D) { # -
346     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
347 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
348 wakaba 1.1 !!!cp (3);
349     $self->{escape} = 1; # unless $self->{escape};
350     $self->{s_kwd} = '--';
351     #
352 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
353 wakaba 1.1 !!!cp (4);
354     $self->{s_kwd} = '--';
355     #
356 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
357     !!!cp (4.1);
358     $self->{s_kwd} .= '-';
359     #
360 wakaba 1.1 } else {
361     !!!cp (5);
362 wakaba 1.5 $self->{s_kwd} = '-';
363 wakaba 1.1 #
364     }
365     }
366    
367     #
368     } elsif ($self->{nc} == 0x0021) { # !
369     if (length $self->{s_kwd}) {
370     !!!cp (5.1);
371     $self->{s_kwd} .= '!';
372     #
373     } else {
374     !!!cp (5.2);
375     #$self->{s_kwd} = '';
376     #
377     }
378     #
379     } elsif ($self->{nc} == 0x003C) { # <
380     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
381     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
382     not $self->{escape})) {
383     !!!cp (6);
384     $self->{state} = TAG_OPEN_STATE;
385     !!!next-input-character;
386     redo A;
387     } else {
388     !!!cp (7);
389     $self->{s_kwd} = '';
390     #
391     }
392     } elsif ($self->{nc} == 0x003E) { # >
393     if ($self->{escape} and
394     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
395     if ($self->{s_kwd} eq '--') {
396     !!!cp (8);
397     delete $self->{escape};
398 wakaba 1.5 #
399 wakaba 1.1 } else {
400     !!!cp (9);
401 wakaba 1.5 #
402 wakaba 1.1 }
403 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
404     !!!cp (9.1);
405     !!!parse-error (type => 'unmatched mse', ## TODO: type
406     line => $self->{line_prev},
407     column => $self->{column_prev} - 1);
408     #
409 wakaba 1.1 } else {
410     !!!cp (10);
411 wakaba 1.5 #
412 wakaba 1.1 }
413    
414     $self->{s_kwd} = '';
415     #
416 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
417     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
418     !!!cp (10.1);
419     $self->{s_kwd} .= ']';
420     } elsif ($self->{s_kwd} eq ']]') {
421     !!!cp (10.2);
422     #
423     } else {
424     !!!cp (10.3);
425     $self->{s_kwd} = '';
426     }
427     #
428 wakaba 1.1 } elsif ($self->{nc} == -1) {
429     !!!cp (11);
430     $self->{s_kwd} = '';
431     !!!emit ({type => END_OF_FILE_TOKEN,
432     line => $self->{line}, column => $self->{column}});
433     last A; ## TODO: ok?
434     } else {
435     !!!cp (12);
436     $self->{s_kwd} = '';
437     #
438     }
439    
440     # Anything else
441     my $token = {type => CHARACTER_TOKEN,
442     data => chr $self->{nc},
443     line => $self->{line}, column => $self->{column},
444     };
445 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
446 wakaba 1.1 length $token->{data})) {
447     $self->{s_kwd} = '';
448     }
449    
450     ## Stay in the data state.
451 wakaba 1.5 if (not $self->{is_xml} and
452     $self->{content_model} == PCDATA_CONTENT_MODEL) {
453 wakaba 1.1 !!!cp (13);
454     $self->{state} = PCDATA_STATE;
455     } else {
456     !!!cp (14);
457     ## Stay in the state.
458     }
459     !!!next-input-character;
460     !!!emit ($token);
461     redo A;
462     } elsif ($self->{state} == TAG_OPEN_STATE) {
463 wakaba 1.10 ## XML5: "tag state".
464    
465 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
466     if ($self->{nc} == 0x002F) { # /
467     !!!cp (15);
468     !!!next-input-character;
469     $self->{state} = CLOSE_TAG_OPEN_STATE;
470     redo A;
471     } elsif ($self->{nc} == 0x0021) { # !
472     !!!cp (15.1);
473 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
474 wakaba 1.1 #
475     } else {
476     !!!cp (16);
477 wakaba 1.12 $self->{s_kwd} = '';
478 wakaba 1.1 #
479     }
480    
481     ## reconsume
482     $self->{state} = DATA_STATE;
483     !!!emit ({type => CHARACTER_TOKEN, data => '<',
484     line => $self->{line_prev},
485     column => $self->{column_prev},
486     });
487     redo A;
488     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
489     if ($self->{nc} == 0x0021) { # !
490     !!!cp (17);
491     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
492     !!!next-input-character;
493     redo A;
494     } elsif ($self->{nc} == 0x002F) { # /
495     !!!cp (18);
496     $self->{state} = CLOSE_TAG_OPEN_STATE;
497     !!!next-input-character;
498     redo A;
499     } elsif (0x0041 <= $self->{nc} and
500     $self->{nc} <= 0x005A) { # A..Z
501     !!!cp (19);
502     $self->{ct}
503     = {type => START_TAG_TOKEN,
504 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
505 wakaba 1.1 line => $self->{line_prev},
506     column => $self->{column_prev}};
507     $self->{state} = TAG_NAME_STATE;
508     !!!next-input-character;
509     redo A;
510     } elsif (0x0061 <= $self->{nc} and
511     $self->{nc} <= 0x007A) { # a..z
512     !!!cp (20);
513     $self->{ct} = {type => START_TAG_TOKEN,
514     tag_name => chr ($self->{nc}),
515     line => $self->{line_prev},
516     column => $self->{column_prev}};
517     $self->{state} = TAG_NAME_STATE;
518     !!!next-input-character;
519     redo A;
520     } elsif ($self->{nc} == 0x003E) { # >
521     !!!cp (21);
522     !!!parse-error (type => 'empty start tag',
523     line => $self->{line_prev},
524     column => $self->{column_prev});
525     $self->{state} = DATA_STATE;
526 wakaba 1.5 $self->{s_kwd} = '';
527 wakaba 1.1 !!!next-input-character;
528    
529     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
530     line => $self->{line_prev},
531     column => $self->{column_prev},
532     });
533    
534     redo A;
535     } elsif ($self->{nc} == 0x003F) { # ?
536 wakaba 1.8 if ($self->{is_xml}) {
537     !!!cp (22.1);
538     $self->{state} = PI_STATE;
539     !!!next-input-character;
540     redo A;
541     } else {
542     !!!cp (22);
543     !!!parse-error (type => 'pio',
544     line => $self->{line_prev},
545     column => $self->{column_prev});
546     $self->{state} = BOGUS_COMMENT_STATE;
547     $self->{ct} = {type => COMMENT_TOKEN, data => '',
548     line => $self->{line_prev},
549     column => $self->{column_prev},
550     };
551     ## $self->{nc} is intentionally left as is
552     redo A;
553     }
554 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
555 wakaba 1.1 !!!cp (23);
556     !!!parse-error (type => 'bare stago',
557     line => $self->{line_prev},
558     column => $self->{column_prev});
559     $self->{state} = DATA_STATE;
560 wakaba 1.5 $self->{s_kwd} = '';
561 wakaba 1.1 ## reconsume
562    
563     !!!emit ({type => CHARACTER_TOKEN, data => '<',
564     line => $self->{line_prev},
565     column => $self->{column_prev},
566     });
567    
568     redo A;
569 wakaba 1.9 } else {
570     ## XML5: "<:" is a parse error.
571     !!!cp (23.1);
572     $self->{ct} = {type => START_TAG_TOKEN,
573     tag_name => chr ($self->{nc}),
574     line => $self->{line_prev},
575     column => $self->{column_prev}};
576     $self->{state} = TAG_NAME_STATE;
577     !!!next-input-character;
578     redo A;
579 wakaba 1.1 }
580     } else {
581     die "$0: $self->{content_model} in tag open";
582     }
583     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
584     ## NOTE: The "close tag open state" in the spec is implemented as
585     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
586    
587 wakaba 1.10 ## XML5: "end tag state".
588    
589 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
590     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
591     if (defined $self->{last_stag_name}) {
592     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
593 wakaba 1.12 $self->{kwd} = '';
594 wakaba 1.1 ## Reconsume.
595     redo A;
596     } else {
597     ## No start tag token has ever been emitted
598     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
599     !!!cp (28);
600     $self->{state} = DATA_STATE;
601 wakaba 1.5 $self->{s_kwd} = '';
602 wakaba 1.1 ## Reconsume.
603     !!!emit ({type => CHARACTER_TOKEN, data => '</',
604     line => $l, column => $c,
605     });
606     redo A;
607     }
608     }
609    
610     if (0x0041 <= $self->{nc} and
611     $self->{nc} <= 0x005A) { # A..Z
612     !!!cp (29);
613     $self->{ct}
614     = {type => END_TAG_TOKEN,
615 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
616 wakaba 1.1 line => $l, column => $c};
617     $self->{state} = TAG_NAME_STATE;
618     !!!next-input-character;
619     redo A;
620     } elsif (0x0061 <= $self->{nc} and
621     $self->{nc} <= 0x007A) { # a..z
622     !!!cp (30);
623     $self->{ct} = {type => END_TAG_TOKEN,
624     tag_name => chr ($self->{nc}),
625     line => $l, column => $c};
626     $self->{state} = TAG_NAME_STATE;
627     !!!next-input-character;
628     redo A;
629     } elsif ($self->{nc} == 0x003E) { # >
630     !!!parse-error (type => 'empty end tag',
631     line => $self->{line_prev}, ## "<" in "</>"
632     column => $self->{column_prev} - 1);
633     $self->{state} = DATA_STATE;
634 wakaba 1.5 $self->{s_kwd} = '';
635 wakaba 1.10 if ($self->{is_xml}) {
636     !!!cp (31);
637     ## XML5: No parse error.
638    
639     ## NOTE: This parser raises a parse error, since it supports
640     ## XML1, not XML5.
641    
642     ## NOTE: A short end tag token.
643     my $ct = {type => END_TAG_TOKEN,
644     tag_name => '',
645     line => $self->{line_prev},
646     column => $self->{column_prev} - 1,
647     };
648     !!!next-input-character;
649     !!!emit ($ct);
650     } else {
651     !!!cp (31.1);
652     !!!next-input-character;
653     }
654 wakaba 1.1 redo A;
655     } elsif ($self->{nc} == -1) {
656     !!!cp (32);
657     !!!parse-error (type => 'bare etago');
658 wakaba 1.5 $self->{s_kwd} = '';
659 wakaba 1.1 $self->{state} = DATA_STATE;
660     # reconsume
661    
662     !!!emit ({type => CHARACTER_TOKEN, data => '</',
663     line => $l, column => $c,
664     });
665    
666     redo A;
667 wakaba 1.10 } elsif (not $self->{is_xml} or
668     $is_space->{$self->{nc}}) {
669 wakaba 1.1 !!!cp (33);
670 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
671     line => $self->{line_prev}, # "<" of "</"
672     column => $self->{column_prev} - 1);
673 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
674     $self->{ct} = {type => COMMENT_TOKEN, data => '',
675     line => $self->{line_prev}, # "<" of "</"
676     column => $self->{column_prev} - 1,
677     };
678     ## NOTE: $self->{nc} is intentionally left as is.
679     ## Although the "anything else" case of the spec not explicitly
680     ## states that the next input character is to be reconsumed,
681     ## it will be included to the |data| of the comment token
682     ## generated from the bogus end tag, as defined in the
683     ## "bogus comment state" entry.
684     redo A;
685 wakaba 1.10 } else {
686     ## XML5: "</:" is a parse error.
687     !!!cp (30.1);
688     $self->{ct} = {type => END_TAG_TOKEN,
689     tag_name => chr ($self->{nc}),
690     line => $l, column => $c};
691     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
692     !!!next-input-character;
693     redo A;
694 wakaba 1.1 }
695     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
696 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
697 wakaba 1.1 if (length $ch) {
698     my $CH = $ch;
699     $ch =~ tr/a-z/A-Z/;
700     my $nch = chr $self->{nc};
701     if ($nch eq $ch or $nch eq $CH) {
702     !!!cp (24);
703     ## Stay in the state.
704 wakaba 1.12 $self->{kwd} .= $nch;
705 wakaba 1.1 !!!next-input-character;
706     redo A;
707     } else {
708     !!!cp (25);
709     $self->{state} = DATA_STATE;
710 wakaba 1.5 $self->{s_kwd} = '';
711 wakaba 1.1 ## Reconsume.
712     !!!emit ({type => CHARACTER_TOKEN,
713 wakaba 1.12 data => '</' . $self->{kwd},
714 wakaba 1.1 line => $self->{line_prev},
715 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
716 wakaba 1.1 });
717     redo A;
718     }
719     } else { # after "<{tag-name}"
720     unless ($is_space->{$self->{nc}} or
721     {
722     0x003E => 1, # >
723     0x002F => 1, # /
724     -1 => 1, # EOF
725     }->{$self->{nc}}) {
726     !!!cp (26);
727     ## Reconsume.
728     $self->{state} = DATA_STATE;
729 wakaba 1.5 $self->{s_kwd} = '';
730 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
731 wakaba 1.12 data => '</' . $self->{kwd},
732 wakaba 1.1 line => $self->{line_prev},
733 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
734 wakaba 1.1 });
735     redo A;
736     } else {
737     !!!cp (27);
738     $self->{ct}
739     = {type => END_TAG_TOKEN,
740     tag_name => $self->{last_stag_name},
741     line => $self->{line_prev},
742 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
743 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
744     ## Reconsume.
745     redo A;
746     }
747     }
748     } elsif ($self->{state} == TAG_NAME_STATE) {
749     if ($is_space->{$self->{nc}}) {
750     !!!cp (34);
751     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
752     !!!next-input-character;
753     redo A;
754     } elsif ($self->{nc} == 0x003E) { # >
755     if ($self->{ct}->{type} == START_TAG_TOKEN) {
756     !!!cp (35);
757     $self->{last_stag_name} = $self->{ct}->{tag_name};
758     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
759     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
760     #if ($self->{ct}->{attributes}) {
761     # ## NOTE: This should never be reached.
762     # !!! cp (36);
763     # !!! parse-error (type => 'end tag attribute');
764     #} else {
765     !!!cp (37);
766     #}
767     } else {
768     die "$0: $self->{ct}->{type}: Unknown token type";
769     }
770     $self->{state} = DATA_STATE;
771 wakaba 1.5 $self->{s_kwd} = '';
772 wakaba 1.1 !!!next-input-character;
773    
774     !!!emit ($self->{ct}); # start tag or end tag
775    
776     redo A;
777     } elsif (0x0041 <= $self->{nc} and
778     $self->{nc} <= 0x005A) { # A..Z
779     !!!cp (38);
780 wakaba 1.4 $self->{ct}->{tag_name}
781     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
782 wakaba 1.1 # start tag or end tag
783     ## Stay in this state
784     !!!next-input-character;
785     redo A;
786     } elsif ($self->{nc} == -1) {
787     !!!parse-error (type => 'unclosed tag');
788     if ($self->{ct}->{type} == START_TAG_TOKEN) {
789     !!!cp (39);
790     $self->{last_stag_name} = $self->{ct}->{tag_name};
791     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
792     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
793     #if ($self->{ct}->{attributes}) {
794     # ## NOTE: This state should never be reached.
795     # !!! cp (40);
796     # !!! parse-error (type => 'end tag attribute');
797     #} else {
798     !!!cp (41);
799     #}
800     } else {
801     die "$0: $self->{ct}->{type}: Unknown token type";
802     }
803     $self->{state} = DATA_STATE;
804 wakaba 1.5 $self->{s_kwd} = '';
805 wakaba 1.1 # reconsume
806    
807     !!!emit ($self->{ct}); # start tag or end tag
808    
809     redo A;
810     } elsif ($self->{nc} == 0x002F) { # /
811     !!!cp (42);
812     $self->{state} = SELF_CLOSING_START_TAG_STATE;
813     !!!next-input-character;
814     redo A;
815     } else {
816     !!!cp (44);
817     $self->{ct}->{tag_name} .= chr $self->{nc};
818     # start tag or end tag
819     ## Stay in the state
820     !!!next-input-character;
821     redo A;
822     }
823     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
824 wakaba 1.11 ## XML5: "Tag attribute name before state".
825    
826 wakaba 1.1 if ($is_space->{$self->{nc}}) {
827     !!!cp (45);
828     ## Stay in the state
829     !!!next-input-character;
830     redo A;
831     } elsif ($self->{nc} == 0x003E) { # >
832     if ($self->{ct}->{type} == START_TAG_TOKEN) {
833     !!!cp (46);
834     $self->{last_stag_name} = $self->{ct}->{tag_name};
835     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
836     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
837     if ($self->{ct}->{attributes}) {
838     !!!cp (47);
839     !!!parse-error (type => 'end tag attribute');
840     } else {
841     !!!cp (48);
842     }
843     } else {
844     die "$0: $self->{ct}->{type}: Unknown token type";
845     }
846     $self->{state} = DATA_STATE;
847 wakaba 1.5 $self->{s_kwd} = '';
848 wakaba 1.1 !!!next-input-character;
849    
850     !!!emit ($self->{ct}); # start tag or end tag
851    
852     redo A;
853     } elsif (0x0041 <= $self->{nc} and
854     $self->{nc} <= 0x005A) { # A..Z
855     !!!cp (49);
856     $self->{ca}
857 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
858 wakaba 1.1 value => '',
859     line => $self->{line}, column => $self->{column}};
860     $self->{state} = ATTRIBUTE_NAME_STATE;
861     !!!next-input-character;
862     redo A;
863     } elsif ($self->{nc} == 0x002F) { # /
864     !!!cp (50);
865     $self->{state} = SELF_CLOSING_START_TAG_STATE;
866     !!!next-input-character;
867     redo A;
868     } elsif ($self->{nc} == -1) {
869     !!!parse-error (type => 'unclosed tag');
870     if ($self->{ct}->{type} == START_TAG_TOKEN) {
871     !!!cp (52);
872     $self->{last_stag_name} = $self->{ct}->{tag_name};
873     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
874     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
875     if ($self->{ct}->{attributes}) {
876     !!!cp (53);
877     !!!parse-error (type => 'end tag attribute');
878     } else {
879     !!!cp (54);
880     }
881     } else {
882     die "$0: $self->{ct}->{type}: Unknown token type";
883     }
884     $self->{state} = DATA_STATE;
885 wakaba 1.5 $self->{s_kwd} = '';
886 wakaba 1.1 # reconsume
887    
888     !!!emit ($self->{ct}); # start tag or end tag
889    
890     redo A;
891     } else {
892     if ({
893     0x0022 => 1, # "
894     0x0027 => 1, # '
895     0x003D => 1, # =
896     }->{$self->{nc}}) {
897     !!!cp (55);
898 wakaba 1.11 ## XML5: Not a parse error.
899 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
900     } else {
901     !!!cp (56);
902 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
903 wakaba 1.1 }
904     $self->{ca}
905     = {name => chr ($self->{nc}),
906     value => '',
907     line => $self->{line}, column => $self->{column}};
908     $self->{state} = ATTRIBUTE_NAME_STATE;
909     !!!next-input-character;
910     redo A;
911     }
912     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
913 wakaba 1.11 ## XML5: "Tag attribute name state".
914    
## $before_leave: commit the attribute currently being built
## ($self->{ca}) to the current tag token ($self->{ct}).  It is invoked by
## every branch below that leaves the attribute name state (white space,
## "=", ">", "/" and end of input), after the name is complete.
##
## - If the token already has an attribute with the same name, a
##   'duplicate attribute' parse error is reported at the line/column
##   recorded when the attribute was started, and the new attribute is
##   not added to the token.  Note that $self->{ca} itself is not
##   cleared here; it is simply never linked from the token.
## - Otherwise the attribute is stored under its name and is given the
##   next sequence number, taken from the token's |last_index| counter.
##
## NOTE: The nested |exists| test autovivifies $self->{ct}->{attributes}
## as a hash reference if it was not present yet, so that member is
## true after this closure has run, for end tag tokens as well.
915 wakaba 1.1 my $before_leave = sub {
916     if (exists $self->{ct}->{attributes} # start tag or end tag
917     ->{$self->{ca}->{name}}) { # MUST
918     !!!cp (57);
919     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
920     ## Discard $self->{ca} # MUST
921     } else {
922     !!!cp (58);
923     $self->{ct}->{attributes}->{$self->{ca}->{name}}
924     = $self->{ca};
925 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
926 wakaba 1.1 }
927     }; # $before_leave
928    
929     if ($is_space->{$self->{nc}}) {
930     !!!cp (59);
931     $before_leave->();
932     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
933     !!!next-input-character;
934     redo A;
935     } elsif ($self->{nc} == 0x003D) { # =
936     !!!cp (60);
937     $before_leave->();
938     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
939     !!!next-input-character;
940     redo A;
941     } elsif ($self->{nc} == 0x003E) { # >
942 wakaba 1.11 if ($self->{is_xml}) {
943     !!!cp (60.1);
944     ## XML5: Not a parse error.
945     !!!parse-error (type => 'no attr value'); ## TODO: type
946     } else {
947     !!!cp (60.2);
948     }
949    
950 wakaba 1.1 $before_leave->();
951     if ($self->{ct}->{type} == START_TAG_TOKEN) {
952     !!!cp (61);
953     $self->{last_stag_name} = $self->{ct}->{tag_name};
954     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
955     !!!cp (62);
956     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
957     if ($self->{ct}->{attributes}) {
958     !!!parse-error (type => 'end tag attribute');
959     }
960     } else {
961     die "$0: $self->{ct}->{type}: Unknown token type";
962     }
963     $self->{state} = DATA_STATE;
964 wakaba 1.5 $self->{s_kwd} = '';
965 wakaba 1.1 !!!next-input-character;
966    
967     !!!emit ($self->{ct}); # start tag or end tag
968    
969     redo A;
970     } elsif (0x0041 <= $self->{nc} and
971     $self->{nc} <= 0x005A) { # A..Z
972     !!!cp (63);
973 wakaba 1.4 $self->{ca}->{name}
974     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
975 wakaba 1.1 ## Stay in the state
976     !!!next-input-character;
977     redo A;
978     } elsif ($self->{nc} == 0x002F) { # /
979 wakaba 1.11 if ($self->{is_xml}) {
980     !!!cp (64);
981     ## XML5: Not a parse error.
982     !!!parse-error (type => 'no attr value'); ## TODO: type
983     } else {
984     !!!cp (64.1);
985     }
986    
987 wakaba 1.1 $before_leave->();
988     $self->{state} = SELF_CLOSING_START_TAG_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == -1) {
992     !!!parse-error (type => 'unclosed tag');
993     $before_leave->();
994     if ($self->{ct}->{type} == START_TAG_TOKEN) {
995     !!!cp (66);
996     $self->{last_stag_name} = $self->{ct}->{tag_name};
997     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
998     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
999     if ($self->{ct}->{attributes}) {
1000     !!!cp (67);
1001     !!!parse-error (type => 'end tag attribute');
1002     } else {
1003     ## NOTE: This state should never be reached.
1004     !!!cp (68);
1005     }
1006     } else {
1007     die "$0: $self->{ct}->{type}: Unknown token type";
1008     }
1009     $self->{state} = DATA_STATE;
1010 wakaba 1.5 $self->{s_kwd} = '';
1011 wakaba 1.1 # reconsume
1012    
1013     !!!emit ($self->{ct}); # start tag or end tag
1014    
1015     redo A;
1016     } else {
1017     if ($self->{nc} == 0x0022 or # "
1018     $self->{nc} == 0x0027) { # '
1019     !!!cp (69);
1020 wakaba 1.11 ## XML5: Not a parse error.
1021 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1022     } else {
1023     !!!cp (70);
1024     }
1025     $self->{ca}->{name} .= chr ($self->{nc});
1026     ## Stay in the state
1027     !!!next-input-character;
1028     redo A;
1029     }
1030     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1031 wakaba 1.11 ## XML5: "Tag attribute name after state".
1032    
1033 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1034     !!!cp (71);
1035     ## Stay in the state
1036     !!!next-input-character;
1037     redo A;
1038     } elsif ($self->{nc} == 0x003D) { # =
1039     !!!cp (72);
1040     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1041     !!!next-input-character;
1042     redo A;
1043     } elsif ($self->{nc} == 0x003E) { # >
1044 wakaba 1.11 if ($self->{is_xml}) {
1045     !!!cp (72.1);
1046     ## XML5: Not a parse error.
1047     !!!parse-error (type => 'no attr value'); ## TODO: type
1048     } else {
1049     !!!cp (72.2);
1050     }
1051    
1052 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1053     !!!cp (73);
1054     $self->{last_stag_name} = $self->{ct}->{tag_name};
1055     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1056     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1057     if ($self->{ct}->{attributes}) {
1058     !!!cp (74);
1059     !!!parse-error (type => 'end tag attribute');
1060     } else {
1061     ## NOTE: This state should never be reached.
1062     !!!cp (75);
1063     }
1064     } else {
1065     die "$0: $self->{ct}->{type}: Unknown token type";
1066     }
1067     $self->{state} = DATA_STATE;
1068 wakaba 1.5 $self->{s_kwd} = '';
1069 wakaba 1.1 !!!next-input-character;
1070    
1071     !!!emit ($self->{ct}); # start tag or end tag
1072    
1073     redo A;
1074     } elsif (0x0041 <= $self->{nc} and
1075     $self->{nc} <= 0x005A) { # A..Z
1076     !!!cp (76);
1077     $self->{ca}
1078 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1079 wakaba 1.1 value => '',
1080     line => $self->{line}, column => $self->{column}};
1081     $self->{state} = ATTRIBUTE_NAME_STATE;
1082     !!!next-input-character;
1083     redo A;
1084     } elsif ($self->{nc} == 0x002F) { # /
1085 wakaba 1.11 if ($self->{is_xml}) {
1086     !!!cp (77);
1087     ## XML5: Not a parse error.
1088     !!!parse-error (type => 'no attr value'); ## TODO: type
1089     } else {
1090     !!!cp (77.1);
1091     }
1092    
1093 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1094     !!!next-input-character;
1095     redo A;
1096     } elsif ($self->{nc} == -1) {
1097     !!!parse-error (type => 'unclosed tag');
1098     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1099     !!!cp (79);
1100     $self->{last_stag_name} = $self->{ct}->{tag_name};
1101     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1102     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1103     if ($self->{ct}->{attributes}) {
1104     !!!cp (80);
1105     !!!parse-error (type => 'end tag attribute');
1106     } else {
1107     ## NOTE: This state should never be reached.
1108     !!!cp (81);
1109     }
1110     } else {
1111     die "$0: $self->{ct}->{type}: Unknown token type";
1112     }
1113 wakaba 1.5 $self->{s_kwd} = '';
1114 wakaba 1.1 $self->{state} = DATA_STATE;
1115     # reconsume
1116    
1117     !!!emit ($self->{ct}); # start tag or end tag
1118    
1119     redo A;
1120     } else {
1121 wakaba 1.11 if ($self->{is_xml}) {
1122     !!!cp (78.1);
1123     ## XML5: Not a parse error.
1124     !!!parse-error (type => 'no attr value'); ## TODO: type
1125     } else {
1126     !!!cp (78.2);
1127     }
1128    
1129 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1130     $self->{nc} == 0x0027) { # '
1131     !!!cp (78);
1132 wakaba 1.11 ## XML5: Not a parse error.
1133 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1134     } else {
1135     !!!cp (82);
1136     }
1137     $self->{ca}
1138     = {name => chr ($self->{nc}),
1139     value => '',
1140     line => $self->{line}, column => $self->{column}};
1141     $self->{state} = ATTRIBUTE_NAME_STATE;
1142     !!!next-input-character;
1143     redo A;
1144     }
1145     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1146 wakaba 1.11 ## XML5: "Tag attribute value before state".
1147    
1148 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1149     !!!cp (83);
1150     ## Stay in the state
1151     !!!next-input-character;
1152     redo A;
1153     } elsif ($self->{nc} == 0x0022) { # "
1154     !!!cp (84);
1155     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1156     !!!next-input-character;
1157     redo A;
1158     } elsif ($self->{nc} == 0x0026) { # &
1159     !!!cp (85);
1160     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1161     ## reconsume
1162     redo A;
1163     } elsif ($self->{nc} == 0x0027) { # '
1164     !!!cp (86);
1165     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1166     !!!next-input-character;
1167     redo A;
1168     } elsif ($self->{nc} == 0x003E) { # >
1169     !!!parse-error (type => 'empty unquoted attribute value');
1170     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171     !!!cp (87);
1172     $self->{last_stag_name} = $self->{ct}->{tag_name};
1173     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175     if ($self->{ct}->{attributes}) {
1176     !!!cp (88);
1177     !!!parse-error (type => 'end tag attribute');
1178     } else {
1179     ## NOTE: This state should never be reached.
1180     !!!cp (89);
1181     }
1182     } else {
1183     die "$0: $self->{ct}->{type}: Unknown token type";
1184     }
1185     $self->{state} = DATA_STATE;
1186 wakaba 1.5 $self->{s_kwd} = '';
1187 wakaba 1.1 !!!next-input-character;
1188    
1189     !!!emit ($self->{ct}); # start tag or end tag
1190    
1191     redo A;
1192     } elsif ($self->{nc} == -1) {
1193     !!!parse-error (type => 'unclosed tag');
1194     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1195     !!!cp (90);
1196     $self->{last_stag_name} = $self->{ct}->{tag_name};
1197     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1198     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1199     if ($self->{ct}->{attributes}) {
1200     !!!cp (91);
1201     !!!parse-error (type => 'end tag attribute');
1202     } else {
1203     ## NOTE: This state should never be reached.
1204     !!!cp (92);
1205     }
1206     } else {
1207     die "$0: $self->{ct}->{type}: Unknown token type";
1208     }
1209     $self->{state} = DATA_STATE;
1210 wakaba 1.5 $self->{s_kwd} = '';
1211 wakaba 1.1 ## reconsume
1212    
1213     !!!emit ($self->{ct}); # start tag or end tag
1214    
1215     redo A;
1216     } else {
1217     if ($self->{nc} == 0x003D) { # =
1218     !!!cp (93);
1219 wakaba 1.11 ## XML5: Not a parse error.
1220 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1221 wakaba 1.11 } elsif ($self->{is_xml}) {
1222     !!!cp (93.1);
1223     ## XML5: No parse error.
1224     !!!parse-error (type => 'unquoted attr value'); ## TODO
1225 wakaba 1.1 } else {
1226     !!!cp (94);
1227     }
1228     $self->{ca}->{value} .= chr ($self->{nc});
1229     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1230     !!!next-input-character;
1231     redo A;
1232     }
1233     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1234 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1235    
1236 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1237     !!!cp (95);
1238 wakaba 1.11 ## XML5: "Tag attribute name before state".
1239 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1240     !!!next-input-character;
1241     redo A;
1242     } elsif ($self->{nc} == 0x0026) { # &
1243     !!!cp (96);
1244 wakaba 1.11 ## XML5: Not defined yet.
1245    
1246 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1247     ## "entity in attribute value state". In this implementation, the
1248     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1249     ## implementation of the "consume a character reference" algorithm.
1250     $self->{prev_state} = $self->{state};
1251     $self->{entity_add} = 0x0022; # "
1252     $self->{state} = ENTITY_STATE;
1253     !!!next-input-character;
1254     redo A;
1255     } elsif ($self->{nc} == -1) {
1256     !!!parse-error (type => 'unclosed attribute value');
1257     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1258     !!!cp (97);
1259     $self->{last_stag_name} = $self->{ct}->{tag_name};
1260     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1261     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1262     if ($self->{ct}->{attributes}) {
1263     !!!cp (98);
1264     !!!parse-error (type => 'end tag attribute');
1265     } else {
1266     ## NOTE: This state should never be reached.
1267     !!!cp (99);
1268     }
1269     } else {
1270     die "$0: $self->{ct}->{type}: Unknown token type";
1271     }
1272     $self->{state} = DATA_STATE;
1273 wakaba 1.5 $self->{s_kwd} = '';
1274 wakaba 1.1 ## reconsume
1275    
1276     !!!emit ($self->{ct}); # start tag or end tag
1277    
1278     redo A;
1279     } else {
1280 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1281     !!!cp (100);
1282     ## XML5: Not a parse error.
1283     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1284     } else {
1285     !!!cp (100.1);
1286     }
1287 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1288     $self->{read_until}->($self->{ca}->{value},
1289 wakaba 1.11 q["&<],
1290 wakaba 1.1 length $self->{ca}->{value});
1291    
1292     ## Stay in the state
1293     !!!next-input-character;
1294     redo A;
1295     }
1296     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1297 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1298    
1299 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1300     !!!cp (101);
1301 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1302 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     !!!next-input-character;
1304     redo A;
1305     } elsif ($self->{nc} == 0x0026) { # &
1306     !!!cp (102);
1307 wakaba 1.11 ## XML5: Not defined yet.
1308    
1309 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1310     ## "entity in attribute value state". In this implementation, the
1311     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1312     ## implementation of the "consume a character reference" algorithm.
1313     $self->{entity_add} = 0x0027; # '
1314     $self->{prev_state} = $self->{state};
1315     $self->{state} = ENTITY_STATE;
1316     !!!next-input-character;
1317     redo A;
1318     } elsif ($self->{nc} == -1) {
1319     !!!parse-error (type => 'unclosed attribute value');
1320     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321     !!!cp (103);
1322     $self->{last_stag_name} = $self->{ct}->{tag_name};
1323     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325     if ($self->{ct}->{attributes}) {
1326     !!!cp (104);
1327     !!!parse-error (type => 'end tag attribute');
1328     } else {
1329     ## NOTE: This state should never be reached.
1330     !!!cp (105);
1331     }
1332     } else {
1333     die "$0: $self->{ct}->{type}: Unknown token type";
1334     }
1335     $self->{state} = DATA_STATE;
1336 wakaba 1.5 $self->{s_kwd} = '';
1337 wakaba 1.1 ## reconsume
1338    
1339     !!!emit ($self->{ct}); # start tag or end tag
1340    
1341     redo A;
1342     } else {
1343 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1344     !!!cp (106);
1345     ## XML5: Not a parse error.
1346     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1347     } else {
1348     !!!cp (106.1);
1349     }
1350 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1351     $self->{read_until}->($self->{ca}->{value},
1352 wakaba 1.11 q['&<],
1353 wakaba 1.1 length $self->{ca}->{value});
1354    
1355     ## Stay in the state
1356     !!!next-input-character;
1357     redo A;
1358     }
1359     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1360 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1361    
1362 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1363     !!!cp (107);
1364 wakaba 1.11 ## XML5: "Tag attribute name before state".
1365 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1366     !!!next-input-character;
1367     redo A;
1368     } elsif ($self->{nc} == 0x0026) { # &
1369     !!!cp (108);
1370 wakaba 1.11
1371     ## XML5: Not defined yet.
1372    
1373 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1374     ## "entity in attribute value state". In this implementation, the
1375     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1376     ## implementation of the "consume a character reference" algorithm.
1377     $self->{entity_add} = -1;
1378     $self->{prev_state} = $self->{state};
1379     $self->{state} = ENTITY_STATE;
1380     !!!next-input-character;
1381     redo A;
1382     } elsif ($self->{nc} == 0x003E) { # >
1383     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1384     !!!cp (109);
1385     $self->{last_stag_name} = $self->{ct}->{tag_name};
1386     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1387     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1388     if ($self->{ct}->{attributes}) {
1389     !!!cp (110);
1390     !!!parse-error (type => 'end tag attribute');
1391     } else {
1392     ## NOTE: This state should never be reached.
1393     !!!cp (111);
1394     }
1395     } else {
1396     die "$0: $self->{ct}->{type}: Unknown token type";
1397     }
1398     $self->{state} = DATA_STATE;
1399 wakaba 1.5 $self->{s_kwd} = '';
1400 wakaba 1.1 !!!next-input-character;
1401    
1402     !!!emit ($self->{ct}); # start tag or end tag
1403    
1404     redo A;
1405     } elsif ($self->{nc} == -1) {
1406     !!!parse-error (type => 'unclosed tag');
1407     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1408     !!!cp (112);
1409     $self->{last_stag_name} = $self->{ct}->{tag_name};
1410     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1411     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1412     if ($self->{ct}->{attributes}) {
1413     !!!cp (113);
1414     !!!parse-error (type => 'end tag attribute');
1415     } else {
1416     ## NOTE: This state should never be reached.
1417     !!!cp (114);
1418     }
1419     } else {
1420     die "$0: $self->{ct}->{type}: Unknown token type";
1421     }
1422     $self->{state} = DATA_STATE;
1423 wakaba 1.5 $self->{s_kwd} = '';
1424 wakaba 1.1 ## reconsume
1425    
1426     !!!emit ($self->{ct}); # start tag or end tag
1427    
1428     redo A;
1429     } else {
1430     if ({
1431     0x0022 => 1, # "
1432     0x0027 => 1, # '
1433     0x003D => 1, # =
1434     }->{$self->{nc}}) {
1435     !!!cp (115);
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1438     } else {
1439     !!!cp (116);
1440     }
1441     $self->{ca}->{value} .= chr ($self->{nc});
1442     $self->{read_until}->($self->{ca}->{value},
1443     q["'=& >],
1444     length $self->{ca}->{value});
1445    
1446     ## Stay in the state
1447     !!!next-input-character;
1448     redo A;
1449     }
1450     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1451     if ($is_space->{$self->{nc}}) {
1452     !!!cp (118);
1453     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1454     !!!next-input-character;
1455     redo A;
1456     } elsif ($self->{nc} == 0x003E) { # >
1457     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1458     !!!cp (119);
1459     $self->{last_stag_name} = $self->{ct}->{tag_name};
1460     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1461     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1462     if ($self->{ct}->{attributes}) {
1463     !!!cp (120);
1464     !!!parse-error (type => 'end tag attribute');
1465     } else {
1466     ## NOTE: This state should never be reached.
1467     !!!cp (121);
1468     }
1469     } else {
1470     die "$0: $self->{ct}->{type}: Unknown token type";
1471     }
1472     $self->{state} = DATA_STATE;
1473 wakaba 1.5 $self->{s_kwd} = '';
1474 wakaba 1.1 !!!next-input-character;
1475    
1476     !!!emit ($self->{ct}); # start tag or end tag
1477    
1478     redo A;
1479     } elsif ($self->{nc} == 0x002F) { # /
1480     !!!cp (122);
1481     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1482     !!!next-input-character;
1483     redo A;
1484     } elsif ($self->{nc} == -1) {
1485     !!!parse-error (type => 'unclosed tag');
1486     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1487     !!!cp (122.3);
1488     $self->{last_stag_name} = $self->{ct}->{tag_name};
1489     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490     if ($self->{ct}->{attributes}) {
1491     !!!cp (122.1);
1492     !!!parse-error (type => 'end tag attribute');
1493     } else {
1494     ## NOTE: This state should never be reached.
1495     !!!cp (122.2);
1496     }
1497     } else {
1498     die "$0: $self->{ct}->{type}: Unknown token type";
1499     }
1500     $self->{state} = DATA_STATE;
1501 wakaba 1.5 $self->{s_kwd} = '';
1502 wakaba 1.1 ## Reconsume.
1503     !!!emit ($self->{ct}); # start tag or end tag
1504     redo A;
1505     } else {
1506     !!!cp ('124.1');
1507     !!!parse-error (type => 'no space between attributes');
1508     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1509     ## reconsume
1510     redo A;
1511     }
1512     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1513 wakaba 1.11 ## XML5: "Empty tag state".
1514    
1515 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1516     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1517     !!!cp ('124.2');
1518     !!!parse-error (type => 'nestc', token => $self->{ct});
1519     ## TODO: Different type than slash in start tag
1520     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1521     if ($self->{ct}->{attributes}) {
1522     !!!cp ('124.4');
1523     !!!parse-error (type => 'end tag attribute');
1524     } else {
1525     !!!cp ('124.5');
1526     }
1527     ## TODO: Test |<title></title/>|
1528     } else {
1529     !!!cp ('124.3');
1530     $self->{self_closing} = 1;
1531     }
1532    
1533     $self->{state} = DATA_STATE;
1534 wakaba 1.5 $self->{s_kwd} = '';
1535 wakaba 1.1 !!!next-input-character;
1536    
1537     !!!emit ($self->{ct}); # start tag or end tag
1538    
1539     redo A;
1540     } elsif ($self->{nc} == -1) {
1541     !!!parse-error (type => 'unclosed tag');
1542     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1543     !!!cp (124.7);
1544     $self->{last_stag_name} = $self->{ct}->{tag_name};
1545     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546     if ($self->{ct}->{attributes}) {
1547     !!!cp (124.5);
1548     !!!parse-error (type => 'end tag attribute');
1549     } else {
1550     ## NOTE: This state should never be reached.
1551     !!!cp (124.6);
1552     }
1553     } else {
1554     die "$0: $self->{ct}->{type}: Unknown token type";
1555     }
1556 wakaba 1.11 ## XML5: "Tag attribute name before state".
1557 wakaba 1.1 $self->{state} = DATA_STATE;
1558 wakaba 1.5 $self->{s_kwd} = '';
1559 wakaba 1.1 ## Reconsume.
1560     !!!emit ($self->{ct}); # start tag or end tag
1561     redo A;
1562     } else {
1563     !!!cp ('124.4');
1564     !!!parse-error (type => 'nestc');
1565     ## TODO: This error type is wrong.
1566     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1567     ## Reconsume.
1568     redo A;
1569     }
1570     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1571     ## NOTE: Unlike spec's "bogus comment state", this implementation
1572     ## consumes characters one-by-one basis.
1573    
1574     if ($self->{nc} == 0x003E) { # >
1575 wakaba 1.13 if ($self->{in_subset}) {
1576     !!!cp (123);
1577     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1578     } else {
1579     !!!cp (124);
1580     $self->{state} = DATA_STATE;
1581     $self->{s_kwd} = '';
1582     }
1583 wakaba 1.1 !!!next-input-character;
1584    
1585     !!!emit ($self->{ct}); # comment
1586     redo A;
1587     } elsif ($self->{nc} == -1) {
1588 wakaba 1.13 if ($self->{in_subset}) {
1589     !!!cp (125.1);
1590     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1591     } else {
1592     !!!cp (125);
1593     $self->{state} = DATA_STATE;
1594     $self->{s_kwd} = '';
1595     }
1596 wakaba 1.1 ## reconsume
1597    
1598     !!!emit ($self->{ct}); # comment
1599     redo A;
1600     } else {
1601     !!!cp (126);
1602     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1603     $self->{read_until}->($self->{ct}->{data},
1604     q[>],
1605     length $self->{ct}->{data});
1606    
1607     ## Stay in the state.
1608     !!!next-input-character;
1609     redo A;
1610     }
1611     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1612 wakaba 1.13 ## XML5: "Markup declaration state" and "DOCTYPE markup
1613     ## declaration state".
1614 wakaba 1.1
1615     if ($self->{nc} == 0x002D) { # -
1616     !!!cp (133);
1617     $self->{state} = MD_HYPHEN_STATE;
1618     !!!next-input-character;
1619     redo A;
1620     } elsif ($self->{nc} == 0x0044 or # D
1621     $self->{nc} == 0x0064) { # d
1622     ## ASCII case-insensitive.
1623     !!!cp (130);
1624     $self->{state} = MD_DOCTYPE_STATE;
1625 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1626 wakaba 1.1 !!!next-input-character;
1627     redo A;
1628 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1629     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1630     $self->{is_xml}) and
1631 wakaba 1.1 $self->{nc} == 0x005B) { # [
1632     !!!cp (135.4);
1633     $self->{state} = MD_CDATA_STATE;
1634 wakaba 1.12 $self->{kwd} = '[';
1635 wakaba 1.1 !!!next-input-character;
1636     redo A;
1637     } else {
1638     !!!cp (136);
1639     }
1640    
1641     !!!parse-error (type => 'bogus comment',
1642     line => $self->{line_prev},
1643     column => $self->{column_prev} - 1);
1644     ## Reconsume.
1645     $self->{state} = BOGUS_COMMENT_STATE;
1646     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1647     line => $self->{line_prev},
1648     column => $self->{column_prev} - 1,
1649     };
1650     redo A;
1651     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1652     if ($self->{nc} == 0x002D) { # -
1653     !!!cp (127);
1654     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1655     line => $self->{line_prev},
1656     column => $self->{column_prev} - 2,
1657     };
1658 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1659 wakaba 1.1 !!!next-input-character;
1660     redo A;
1661     } else {
1662     !!!cp (128);
1663     !!!parse-error (type => 'bogus comment',
1664     line => $self->{line_prev},
1665     column => $self->{column_prev} - 2);
1666     $self->{state} = BOGUS_COMMENT_STATE;
1667     ## Reconsume.
1668     $self->{ct} = {type => COMMENT_TOKEN,
1669     data => '-',
1670     line => $self->{line_prev},
1671     column => $self->{column_prev} - 2,
1672     };
1673     redo A;
1674     }
1675     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1676     ## ASCII case-insensitive.
1677     if ($self->{nc} == [
1678     undef,
1679     0x004F, # O
1680     0x0043, # C
1681     0x0054, # T
1682     0x0059, # Y
1683     0x0050, # P
1684 wakaba 1.12 ]->[length $self->{kwd}] or
1685 wakaba 1.1 $self->{nc} == [
1686     undef,
1687     0x006F, # o
1688     0x0063, # c
1689     0x0074, # t
1690     0x0079, # y
1691     0x0070, # p
1692 wakaba 1.12 ]->[length $self->{kwd}]) {
1693 wakaba 1.1 !!!cp (131);
1694     ## Stay in the state.
1695 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1696 wakaba 1.1 !!!next-input-character;
1697     redo A;
1698 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1699 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1700     $self->{nc} == 0x0065)) { # e
1701 wakaba 1.12 if ($self->{is_xml} and
1702     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1703 wakaba 1.10 !!!cp (129);
1704     ## XML5: case-sensitive.
1705     !!!parse-error (type => 'lowercase keyword', ## TODO
1706     text => 'DOCTYPE',
1707     line => $self->{line_prev},
1708     column => $self->{column_prev} - 5);
1709     } else {
1710     !!!cp (129.1);
1711     }
1712 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1713     $self->{ct} = {type => DOCTYPE_TOKEN,
1714     quirks => 1,
1715     line => $self->{line_prev},
1716     column => $self->{column_prev} - 7,
1717     };
1718     !!!next-input-character;
1719     redo A;
1720     } else {
1721     !!!cp (132);
1722     !!!parse-error (type => 'bogus comment',
1723     line => $self->{line_prev},
1724 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1725 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1726     ## Reconsume.
1727     $self->{ct} = {type => COMMENT_TOKEN,
1728 wakaba 1.12 data => $self->{kwd},
1729 wakaba 1.1 line => $self->{line_prev},
1730 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1731 wakaba 1.1 };
1732     redo A;
1733     }
1734     } elsif ($self->{state} == MD_CDATA_STATE) {
1735     if ($self->{nc} == {
1736     '[' => 0x0043, # C
1737     '[C' => 0x0044, # D
1738     '[CD' => 0x0041, # A
1739     '[CDA' => 0x0054, # T
1740     '[CDAT' => 0x0041, # A
1741 wakaba 1.12 }->{$self->{kwd}}) {
1742 wakaba 1.1 !!!cp (135.1);
1743     ## Stay in the state.
1744 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1745 wakaba 1.1 !!!next-input-character;
1746     redo A;
1747 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1748 wakaba 1.1 $self->{nc} == 0x005B) { # [
1749 wakaba 1.6 if ($self->{is_xml} and
1750     not $self->{tainted} and
1751     @{$self->{open_elements} or []} == 0) {
1752 wakaba 1.8 !!!cp (135.2);
1753 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1754     line => $self->{line_prev},
1755     column => $self->{column_prev} - 7);
1756     $self->{tainted} = 1;
1757 wakaba 1.8 } else {
1758     !!!cp (135.21);
1759 wakaba 1.6 }
1760    
1761 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1762     data => '',
1763     line => $self->{line_prev},
1764     column => $self->{column_prev} - 7};
1765     $self->{state} = CDATA_SECTION_STATE;
1766     !!!next-input-character;
1767     redo A;
1768     } else {
1769     !!!cp (135.3);
1770     !!!parse-error (type => 'bogus comment',
1771     line => $self->{line_prev},
1772 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1773 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1774     ## Reconsume.
1775     $self->{ct} = {type => COMMENT_TOKEN,
1776 wakaba 1.12 data => $self->{kwd},
1777 wakaba 1.1 line => $self->{line_prev},
1778 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1779 wakaba 1.1 };
1780     redo A;
1781     }
1782     } elsif ($self->{state} == COMMENT_START_STATE) {
1783     if ($self->{nc} == 0x002D) { # -
1784     !!!cp (137);
1785     $self->{state} = COMMENT_START_DASH_STATE;
1786     !!!next-input-character;
1787     redo A;
1788     } elsif ($self->{nc} == 0x003E) { # >
1789     !!!parse-error (type => 'bogus comment');
1790 wakaba 1.13 if ($self->{in_subset}) {
1791     !!!cp (138.1);
1792     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1793     } else {
1794     !!!cp (138);
1795     $self->{state} = DATA_STATE;
1796     $self->{s_kwd} = '';
1797     }
1798 wakaba 1.1 !!!next-input-character;
1799    
1800     !!!emit ($self->{ct}); # comment
1801    
1802     redo A;
1803     } elsif ($self->{nc} == -1) {
1804     !!!parse-error (type => 'unclosed comment');
1805 wakaba 1.13 if ($self->{in_subset}) {
1806     !!!cp (139.1);
1807     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1808     } else {
1809     !!!cp (139);
1810     $self->{state} = DATA_STATE;
1811     $self->{s_kwd} = '';
1812     }
1813 wakaba 1.1 ## reconsume
1814    
1815     !!!emit ($self->{ct}); # comment
1816    
1817     redo A;
1818     } else {
1819     !!!cp (140);
1820     $self->{ct}->{data} # comment
1821     .= chr ($self->{nc});
1822     $self->{state} = COMMENT_STATE;
1823     !!!next-input-character;
1824     redo A;
1825     }
1826     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1827     if ($self->{nc} == 0x002D) { # -
1828     !!!cp (141);
1829     $self->{state} = COMMENT_END_STATE;
1830     !!!next-input-character;
1831     redo A;
1832     } elsif ($self->{nc} == 0x003E) { # >
1833     !!!parse-error (type => 'bogus comment');
1834 wakaba 1.13 if ($self->{in_subset}) {
1835     !!!cp (142.1);
1836     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1837     } else {
1838     !!!cp (142);
1839     $self->{state} = DATA_STATE;
1840     $self->{s_kwd} = '';
1841     }
1842 wakaba 1.1 !!!next-input-character;
1843    
1844     !!!emit ($self->{ct}); # comment
1845    
1846     redo A;
1847     } elsif ($self->{nc} == -1) {
1848     !!!parse-error (type => 'unclosed comment');
1849 wakaba 1.13 if ($self->{in_subset}) {
1850     !!!cp (143.1);
1851     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1852     } else {
1853     !!!cp (143);
1854     $self->{state} = DATA_STATE;
1855     $self->{s_kwd} = '';
1856     }
1857 wakaba 1.1 ## reconsume
1858    
1859     !!!emit ($self->{ct}); # comment
1860    
1861     redo A;
1862     } else {
1863     !!!cp (144);
1864     $self->{ct}->{data} # comment
1865     .= '-' . chr ($self->{nc});
1866     $self->{state} = COMMENT_STATE;
1867     !!!next-input-character;
1868     redo A;
1869     }
1870     } elsif ($self->{state} == COMMENT_STATE) {
1871     if ($self->{nc} == 0x002D) { # -
1872     !!!cp (145);
1873     $self->{state} = COMMENT_END_DASH_STATE;
1874     !!!next-input-character;
1875     redo A;
1876     } elsif ($self->{nc} == -1) {
1877     !!!parse-error (type => 'unclosed comment');
1878 wakaba 1.13 if ($self->{in_subset}) {
1879     !!!cp (146.1);
1880     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1881     } else {
1882     !!!cp (146);
1883     $self->{state} = DATA_STATE;
1884     $self->{s_kwd} = '';
1885     }
1886 wakaba 1.1 ## reconsume
1887    
1888     !!!emit ($self->{ct}); # comment
1889    
1890     redo A;
1891     } else {
1892     !!!cp (147);
1893     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1894     $self->{read_until}->($self->{ct}->{data},
1895     q[-],
1896     length $self->{ct}->{data});
1897    
1898     ## Stay in the state
1899     !!!next-input-character;
1900     redo A;
1901     }
1902     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1903 wakaba 1.10 ## XML5: "comment dash state".
1904    
1905 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1906     !!!cp (148);
1907     $self->{state} = COMMENT_END_STATE;
1908     !!!next-input-character;
1909     redo A;
1910     } elsif ($self->{nc} == -1) {
1911     !!!parse-error (type => 'unclosed comment');
1912 wakaba 1.13 if ($self->{in_subset}) {
1913     !!!cp (149.1);
1914     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915     } else {
1916     !!!cp (149);
1917     $self->{state} = DATA_STATE;
1918     $self->{s_kwd} = '';
1919     }
1920 wakaba 1.1 ## reconsume
1921    
1922     !!!emit ($self->{ct}); # comment
1923    
1924     redo A;
1925     } else {
1926     !!!cp (150);
1927     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1928     $self->{state} = COMMENT_STATE;
1929     !!!next-input-character;
1930     redo A;
1931     }
1932     } elsif ($self->{state} == COMMENT_END_STATE) {
1933     if ($self->{nc} == 0x003E) { # >
1934 wakaba 1.13 if ($self->{in_subset}) {
1935     !!!cp (151.1);
1936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937     } else {
1938     !!!cp (151);
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     }
1942 wakaba 1.1 !!!next-input-character;
1943    
1944     !!!emit ($self->{ct}); # comment
1945    
1946     redo A;
1947     } elsif ($self->{nc} == 0x002D) { # -
1948     !!!cp (152);
1949 wakaba 1.10 ## XML5: Not a parse error.
1950 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1951     line => $self->{line_prev},
1952     column => $self->{column_prev});
1953     $self->{ct}->{data} .= '-'; # comment
1954     ## Stay in the state
1955     !!!next-input-character;
1956     redo A;
1957     } elsif ($self->{nc} == -1) {
1958     !!!parse-error (type => 'unclosed comment');
1959 wakaba 1.13 if ($self->{in_subset}) {
1960     !!!cp (153.1);
1961     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1962     } else {
1963     !!!cp (153);
1964     $self->{state} = DATA_STATE;
1965     $self->{s_kwd} = '';
1966     }
1967 wakaba 1.1 ## reconsume
1968    
1969     !!!emit ($self->{ct}); # comment
1970    
1971     redo A;
1972     } else {
1973     !!!cp (154);
1974 wakaba 1.10 ## XML5: Not a parse error.
1975 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1976     line => $self->{line_prev},
1977     column => $self->{column_prev});
1978     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1979     $self->{state} = COMMENT_STATE;
1980     !!!next-input-character;
1981     redo A;
1982     }
1983     } elsif ($self->{state} == DOCTYPE_STATE) {
1984     if ($is_space->{$self->{nc}}) {
1985     !!!cp (155);
1986     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1987     !!!next-input-character;
1988     redo A;
1989     } else {
1990     !!!cp (156);
1991 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
1992 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
1993     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1994     ## reconsume
1995     redo A;
1996     }
1997     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1998 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
1999    
2000 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2001     !!!cp (157);
2002     ## Stay in the state
2003     !!!next-input-character;
2004     redo A;
2005     } elsif ($self->{nc} == 0x003E) { # >
2006     !!!cp (158);
2007 wakaba 1.12 ## XML5: No parse error.
2008 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2009     $self->{state} = DATA_STATE;
2010 wakaba 1.5 $self->{s_kwd} = '';
2011 wakaba 1.1 !!!next-input-character;
2012    
2013     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2014    
2015     redo A;
2016     } elsif ($self->{nc} == -1) {
2017     !!!cp (159);
2018     !!!parse-error (type => 'no DOCTYPE name');
2019     $self->{state} = DATA_STATE;
2020 wakaba 1.5 $self->{s_kwd} = '';
2021 wakaba 1.1 ## reconsume
2022    
2023     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2024    
2025     redo A;
2026 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2027     !!!cp (159.1);
2028     !!!parse-error (type => 'no DOCTYPE name');
2029     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2030 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2031     $self->{in_subset} = 1;
2032 wakaba 1.12 !!!next-input-character;
2033 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2034 wakaba 1.12 redo A;
2035 wakaba 1.1 } else {
2036     !!!cp (160);
2037     $self->{ct}->{name} = chr $self->{nc};
2038     delete $self->{ct}->{quirks};
2039     $self->{state} = DOCTYPE_NAME_STATE;
2040     !!!next-input-character;
2041     redo A;
2042     }
2043     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2044 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2045    
2046     ## ISSUE: Redundant "First," in the spec.
2047    
2048 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2049     !!!cp (161);
2050     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2051     !!!next-input-character;
2052     redo A;
2053     } elsif ($self->{nc} == 0x003E) { # >
2054     !!!cp (162);
2055     $self->{state} = DATA_STATE;
2056 wakaba 1.5 $self->{s_kwd} = '';
2057 wakaba 1.1 !!!next-input-character;
2058    
2059     !!!emit ($self->{ct}); # DOCTYPE
2060    
2061     redo A;
2062     } elsif ($self->{nc} == -1) {
2063     !!!cp (163);
2064     !!!parse-error (type => 'unclosed DOCTYPE');
2065     $self->{state} = DATA_STATE;
2066 wakaba 1.5 $self->{s_kwd} = '';
2067 wakaba 1.1 ## reconsume
2068    
2069     $self->{ct}->{quirks} = 1;
2070     !!!emit ($self->{ct}); # DOCTYPE
2071    
2072     redo A;
2073 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2074     !!!cp (163.1);
2075     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2076 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2077     $self->{in_subset} = 1;
2078 wakaba 1.12 !!!next-input-character;
2079 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2080 wakaba 1.12 redo A;
2081 wakaba 1.1 } else {
2082     !!!cp (164);
2083     $self->{ct}->{name}
2084     .= chr ($self->{nc}); # DOCTYPE
2085     ## Stay in the state
2086     !!!next-input-character;
2087     redo A;
2088     }
2089     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2090 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2091     ## state", but implemented differently.
2092    
2093 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2094     !!!cp (165);
2095     ## Stay in the state
2096     !!!next-input-character;
2097     redo A;
2098     } elsif ($self->{nc} == 0x003E) { # >
2099     !!!cp (166);
2100     $self->{state} = DATA_STATE;
2101 wakaba 1.5 $self->{s_kwd} = '';
2102 wakaba 1.1 !!!next-input-character;
2103    
2104     !!!emit ($self->{ct}); # DOCTYPE
2105    
2106     redo A;
2107     } elsif ($self->{nc} == -1) {
2108     !!!cp (167);
2109     !!!parse-error (type => 'unclosed DOCTYPE');
2110     $self->{state} = DATA_STATE;
2111 wakaba 1.5 $self->{s_kwd} = '';
2112 wakaba 1.1 ## reconsume
2113    
2114     $self->{ct}->{quirks} = 1;
2115     !!!emit ($self->{ct}); # DOCTYPE
2116    
2117     redo A;
2118     } elsif ($self->{nc} == 0x0050 or # P
2119     $self->{nc} == 0x0070) { # p
2120 wakaba 1.12 !!!cp (167.1);
2121 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2122 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2123 wakaba 1.1 !!!next-input-character;
2124     redo A;
2125     } elsif ($self->{nc} == 0x0053 or # S
2126     $self->{nc} == 0x0073) { # s
2127 wakaba 1.12 !!!cp (167.2);
2128 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2129 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2130     !!!next-input-character;
2131     redo A;
2132     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2133     !!!cp (167.3);
2134     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2135     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2136 wakaba 1.13 $self->{in_subset} = 1;
2137 wakaba 1.1 !!!next-input-character;
2138 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2139 wakaba 1.1 redo A;
2140     } else {
2141     !!!cp (180);
2142     !!!parse-error (type => 'string after DOCTYPE name');
2143     $self->{ct}->{quirks} = 1;
2144    
2145     $self->{state} = BOGUS_DOCTYPE_STATE;
2146     !!!next-input-character;
2147     redo A;
2148     }
2149     } elsif ($self->{state} == PUBLIC_STATE) {
2150     ## ASCII case-insensitive
2151     if ($self->{nc} == [
2152     undef,
2153     0x0055, # U
2154     0x0042, # B
2155     0x004C, # L
2156     0x0049, # I
2157 wakaba 1.12 ]->[length $self->{kwd}] or
2158 wakaba 1.1 $self->{nc} == [
2159     undef,
2160     0x0075, # u
2161     0x0062, # b
2162     0x006C, # l
2163     0x0069, # i
2164 wakaba 1.12 ]->[length $self->{kwd}]) {
2165 wakaba 1.1 !!!cp (175);
2166     ## Stay in the state.
2167 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2168 wakaba 1.1 !!!next-input-character;
2169     redo A;
2170 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2171 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2172     $self->{nc} == 0x0063)) { # c
2173 wakaba 1.12 if ($self->{is_xml} and
2174     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2175     !!!cp (168.1);
2176     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2177     text => 'PUBLIC',
2178     line => $self->{line_prev},
2179     column => $self->{column_prev} - 4);
2180     } else {
2181     !!!cp (168);
2182     }
2183 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2184     !!!next-input-character;
2185     redo A;
2186     } else {
2187     !!!cp (169);
2188     !!!parse-error (type => 'string after DOCTYPE name',
2189     line => $self->{line_prev},
2190 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2191 wakaba 1.1 $self->{ct}->{quirks} = 1;
2192    
2193     $self->{state} = BOGUS_DOCTYPE_STATE;
2194     ## Reconsume.
2195     redo A;
2196     }
2197     } elsif ($self->{state} == SYSTEM_STATE) {
2198     ## ASCII case-insensitive
2199     if ($self->{nc} == [
2200     undef,
2201     0x0059, # Y
2202     0x0053, # S
2203     0x0054, # T
2204     0x0045, # E
2205 wakaba 1.12 ]->[length $self->{kwd}] or
2206 wakaba 1.1 $self->{nc} == [
2207     undef,
2208     0x0079, # y
2209     0x0073, # s
2210     0x0074, # t
2211     0x0065, # e
2212 wakaba 1.12 ]->[length $self->{kwd}]) {
2213 wakaba 1.1 !!!cp (170);
2214     ## Stay in the state.
2215 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2216 wakaba 1.1 !!!next-input-character;
2217     redo A;
2218 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2219 wakaba 1.1 ($self->{nc} == 0x004D or # M
2220     $self->{nc} == 0x006D)) { # m
2221 wakaba 1.12 if ($self->{is_xml} and
2222     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2223     !!!cp (171.1);
2224     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2225     text => 'SYSTEM',
2226     line => $self->{line_prev},
2227     column => $self->{column_prev} - 4);
2228     } else {
2229     !!!cp (171);
2230     }
2231 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2232     !!!next-input-character;
2233     redo A;
2234     } else {
2235     !!!cp (172);
2236     !!!parse-error (type => 'string after DOCTYPE name',
2237     line => $self->{line_prev},
2238 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2239 wakaba 1.1 $self->{ct}->{quirks} = 1;
2240    
2241     $self->{state} = BOGUS_DOCTYPE_STATE;
2242     ## Reconsume.
2243     redo A;
2244     }
2245     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2246     if ($is_space->{$self->{nc}}) {
2247     !!!cp (181);
2248     ## Stay in the state
2249     !!!next-input-character;
2250     redo A;
2251     } elsif ($self->{nc} eq 0x0022) { # "
2252     !!!cp (182);
2253     $self->{ct}->{pubid} = ''; # DOCTYPE
2254     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2255     !!!next-input-character;
2256     redo A;
2257     } elsif ($self->{nc} eq 0x0027) { # '
2258     !!!cp (183);
2259     $self->{ct}->{pubid} = ''; # DOCTYPE
2260     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2261     !!!next-input-character;
2262     redo A;
2263     } elsif ($self->{nc} eq 0x003E) { # >
2264     !!!cp (184);
2265     !!!parse-error (type => 'no PUBLIC literal');
2266    
2267     $self->{state} = DATA_STATE;
2268 wakaba 1.5 $self->{s_kwd} = '';
2269 wakaba 1.1 !!!next-input-character;
2270    
2271     $self->{ct}->{quirks} = 1;
2272     !!!emit ($self->{ct}); # DOCTYPE
2273    
2274     redo A;
2275     } elsif ($self->{nc} == -1) {
2276     !!!cp (185);
2277     !!!parse-error (type => 'unclosed DOCTYPE');
2278    
2279     $self->{state} = DATA_STATE;
2280 wakaba 1.5 $self->{s_kwd} = '';
2281 wakaba 1.1 ## reconsume
2282    
2283     $self->{ct}->{quirks} = 1;
2284     !!!emit ($self->{ct}); # DOCTYPE
2285    
2286     redo A;
2287 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2288     !!!cp (186.1);
2289     !!!parse-error (type => 'no PUBLIC literal');
2290     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2291     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2292 wakaba 1.13 $self->{in_subset} = 1;
2293 wakaba 1.12 !!!next-input-character;
2294 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2295 wakaba 1.12 redo A;
2296 wakaba 1.1 } else {
2297     !!!cp (186);
2298     !!!parse-error (type => 'string after PUBLIC');
2299     $self->{ct}->{quirks} = 1;
2300    
2301     $self->{state} = BOGUS_DOCTYPE_STATE;
2302     !!!next-input-character;
2303     redo A;
2304     }
2305     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2306     if ($self->{nc} == 0x0022) { # "
2307     !!!cp (187);
2308     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2309     !!!next-input-character;
2310     redo A;
2311     } elsif ($self->{nc} == 0x003E) { # >
2312     !!!cp (188);
2313     !!!parse-error (type => 'unclosed PUBLIC literal');
2314    
2315     $self->{state} = DATA_STATE;
2316 wakaba 1.5 $self->{s_kwd} = '';
2317 wakaba 1.1 !!!next-input-character;
2318    
2319     $self->{ct}->{quirks} = 1;
2320     !!!emit ($self->{ct}); # DOCTYPE
2321    
2322     redo A;
2323     } elsif ($self->{nc} == -1) {
2324     !!!cp (189);
2325     !!!parse-error (type => 'unclosed PUBLIC literal');
2326    
2327     $self->{state} = DATA_STATE;
2328 wakaba 1.5 $self->{s_kwd} = '';
2329 wakaba 1.1 ## reconsume
2330    
2331     $self->{ct}->{quirks} = 1;
2332     !!!emit ($self->{ct}); # DOCTYPE
2333    
2334     redo A;
2335     } else {
2336     !!!cp (190);
2337     $self->{ct}->{pubid} # DOCTYPE
2338     .= chr $self->{nc};
2339     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2340     length $self->{ct}->{pubid});
2341    
2342     ## Stay in the state
2343     !!!next-input-character;
2344     redo A;
2345     }
2346     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2347     if ($self->{nc} == 0x0027) { # '
2348     !!!cp (191);
2349     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2350     !!!next-input-character;
2351     redo A;
2352     } elsif ($self->{nc} == 0x003E) { # >
2353     !!!cp (192);
2354     !!!parse-error (type => 'unclosed PUBLIC literal');
2355    
2356     $self->{state} = DATA_STATE;
2357 wakaba 1.5 $self->{s_kwd} = '';
2358 wakaba 1.1 !!!next-input-character;
2359    
2360     $self->{ct}->{quirks} = 1;
2361     !!!emit ($self->{ct}); # DOCTYPE
2362    
2363     redo A;
2364     } elsif ($self->{nc} == -1) {
2365     !!!cp (193);
2366     !!!parse-error (type => 'unclosed PUBLIC literal');
2367    
2368     $self->{state} = DATA_STATE;
2369 wakaba 1.5 $self->{s_kwd} = '';
2370 wakaba 1.1 ## reconsume
2371    
2372     $self->{ct}->{quirks} = 1;
2373     !!!emit ($self->{ct}); # DOCTYPE
2374    
2375     redo A;
2376     } else {
2377     !!!cp (194);
2378     $self->{ct}->{pubid} # DOCTYPE
2379     .= chr $self->{nc};
2380     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2381     length $self->{ct}->{pubid});
2382    
2383     ## Stay in the state
2384     !!!next-input-character;
2385     redo A;
2386     }
2387     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2388     if ($is_space->{$self->{nc}}) {
2389     !!!cp (195);
2390     ## Stay in the state
2391     !!!next-input-character;
2392     redo A;
2393     } elsif ($self->{nc} == 0x0022) { # "
2394     !!!cp (196);
2395     $self->{ct}->{sysid} = ''; # DOCTYPE
2396     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2397     !!!next-input-character;
2398     redo A;
2399     } elsif ($self->{nc} == 0x0027) { # '
2400     !!!cp (197);
2401     $self->{ct}->{sysid} = ''; # DOCTYPE
2402     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2403     !!!next-input-character;
2404     redo A;
2405     } elsif ($self->{nc} == 0x003E) { # >
2406 wakaba 1.12 if ($self->{is_xml}) {
2407     !!!cp (198.1);
2408     !!!parse-error (type => 'no SYSTEM literal');
2409     } else {
2410     !!!cp (198);
2411     }
2412 wakaba 1.1 $self->{state} = DATA_STATE;
2413 wakaba 1.5 $self->{s_kwd} = '';
2414 wakaba 1.1 !!!next-input-character;
2415    
2416     !!!emit ($self->{ct}); # DOCTYPE
2417    
2418     redo A;
2419     } elsif ($self->{nc} == -1) {
2420     !!!cp (199);
2421     !!!parse-error (type => 'unclosed DOCTYPE');
2422    
2423     $self->{state} = DATA_STATE;
2424 wakaba 1.5 $self->{s_kwd} = '';
2425 wakaba 1.1 ## reconsume
2426    
2427     $self->{ct}->{quirks} = 1;
2428     !!!emit ($self->{ct}); # DOCTYPE
2429    
2430     redo A;
2431 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2432     !!!cp (200.1);
2433     !!!parse-error (type => 'no SYSTEM literal');
2434     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2435     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2436 wakaba 1.13 $self->{in_subset} = 1;
2437 wakaba 1.12 !!!next-input-character;
2438 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2439 wakaba 1.12 redo A;
2440 wakaba 1.1 } else {
2441     !!!cp (200);
2442     !!!parse-error (type => 'string after PUBLIC literal');
2443     $self->{ct}->{quirks} = 1;
2444    
2445     $self->{state} = BOGUS_DOCTYPE_STATE;
2446     !!!next-input-character;
2447     redo A;
2448     }
2449     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2450     if ($is_space->{$self->{nc}}) {
2451     !!!cp (201);
2452     ## Stay in the state
2453     !!!next-input-character;
2454     redo A;
2455     } elsif ($self->{nc} == 0x0022) { # "
2456     !!!cp (202);
2457     $self->{ct}->{sysid} = ''; # DOCTYPE
2458     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2459     !!!next-input-character;
2460     redo A;
2461     } elsif ($self->{nc} == 0x0027) { # '
2462     !!!cp (203);
2463     $self->{ct}->{sysid} = ''; # DOCTYPE
2464     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2465     !!!next-input-character;
2466     redo A;
2467     } elsif ($self->{nc} == 0x003E) { # >
2468     !!!cp (204);
2469     !!!parse-error (type => 'no SYSTEM literal');
2470     $self->{state} = DATA_STATE;
2471 wakaba 1.5 $self->{s_kwd} = '';
2472 wakaba 1.1 !!!next-input-character;
2473    
2474     $self->{ct}->{quirks} = 1;
2475     !!!emit ($self->{ct}); # DOCTYPE
2476    
2477     redo A;
2478     } elsif ($self->{nc} == -1) {
2479     !!!cp (205);
2480     !!!parse-error (type => 'unclosed DOCTYPE');
2481    
2482     $self->{state} = DATA_STATE;
2483 wakaba 1.5 $self->{s_kwd} = '';
2484 wakaba 1.1 ## reconsume
2485    
2486     $self->{ct}->{quirks} = 1;
2487     !!!emit ($self->{ct}); # DOCTYPE
2488    
2489     redo A;
2490 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2491     !!!cp (206.1);
2492     !!!parse-error (type => 'no SYSTEM literal');
2493    
2494     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2495     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2496 wakaba 1.13 $self->{in_subset} = 1;
2497 wakaba 1.12 !!!next-input-character;
2498 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2499 wakaba 1.12 redo A;
2500 wakaba 1.1 } else {
2501     !!!cp (206);
2502     !!!parse-error (type => 'string after SYSTEM');
2503     $self->{ct}->{quirks} = 1;
2504    
2505     $self->{state} = BOGUS_DOCTYPE_STATE;
2506     !!!next-input-character;
2507     redo A;
2508     }
2509     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2510     if ($self->{nc} == 0x0022) { # "
2511     !!!cp (207);
2512     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2513     !!!next-input-character;
2514     redo A;
2515 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2516 wakaba 1.1 !!!cp (208);
2517     !!!parse-error (type => 'unclosed SYSTEM literal');
2518    
2519     $self->{state} = DATA_STATE;
2520 wakaba 1.5 $self->{s_kwd} = '';
2521 wakaba 1.1 !!!next-input-character;
2522    
2523     $self->{ct}->{quirks} = 1;
2524     !!!emit ($self->{ct}); # DOCTYPE
2525    
2526     redo A;
2527     } elsif ($self->{nc} == -1) {
2528     !!!cp (209);
2529     !!!parse-error (type => 'unclosed SYSTEM literal');
2530    
2531     $self->{state} = DATA_STATE;
2532 wakaba 1.5 $self->{s_kwd} = '';
2533 wakaba 1.1 ## reconsume
2534    
2535     $self->{ct}->{quirks} = 1;
2536     !!!emit ($self->{ct}); # DOCTYPE
2537    
2538     redo A;
2539     } else {
2540     !!!cp (210);
2541     $self->{ct}->{sysid} # DOCTYPE
2542     .= chr $self->{nc};
2543     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2544     length $self->{ct}->{sysid});
2545    
2546     ## Stay in the state
2547     !!!next-input-character;
2548     redo A;
2549     }
2550     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2551     if ($self->{nc} == 0x0027) { # '
2552     !!!cp (211);
2553     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2554     !!!next-input-character;
2555     redo A;
2556 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2557 wakaba 1.1 !!!cp (212);
2558     !!!parse-error (type => 'unclosed SYSTEM literal');
2559    
2560     $self->{state} = DATA_STATE;
2561 wakaba 1.5 $self->{s_kwd} = '';
2562 wakaba 1.1 !!!next-input-character;
2563    
2564     $self->{ct}->{quirks} = 1;
2565     !!!emit ($self->{ct}); # DOCTYPE
2566    
2567     redo A;
2568     } elsif ($self->{nc} == -1) {
2569     !!!cp (213);
2570     !!!parse-error (type => 'unclosed SYSTEM literal');
2571    
2572     $self->{state} = DATA_STATE;
2573 wakaba 1.5 $self->{s_kwd} = '';
2574 wakaba 1.1 ## reconsume
2575    
2576     $self->{ct}->{quirks} = 1;
2577     !!!emit ($self->{ct}); # DOCTYPE
2578    
2579     redo A;
2580     } else {
2581     !!!cp (214);
2582     $self->{ct}->{sysid} # DOCTYPE
2583     .= chr $self->{nc};
2584     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2585     length $self->{ct}->{sysid});
2586    
2587     ## Stay in the state
2588     !!!next-input-character;
2589     redo A;
2590     }
2591     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2592     if ($is_space->{$self->{nc}}) {
2593     !!!cp (215);
2594     ## Stay in the state
2595     !!!next-input-character;
2596     redo A;
2597     } elsif ($self->{nc} == 0x003E) { # >
2598     !!!cp (216);
2599     $self->{state} = DATA_STATE;
2600 wakaba 1.5 $self->{s_kwd} = '';
2601 wakaba 1.1 !!!next-input-character;
2602    
2603     !!!emit ($self->{ct}); # DOCTYPE
2604    
2605     redo A;
2606     } elsif ($self->{nc} == -1) {
2607     !!!cp (217);
2608     !!!parse-error (type => 'unclosed DOCTYPE');
2609     $self->{state} = DATA_STATE;
2610 wakaba 1.5 $self->{s_kwd} = '';
2611 wakaba 1.1 ## reconsume
2612    
2613     $self->{ct}->{quirks} = 1;
2614     !!!emit ($self->{ct}); # DOCTYPE
2615    
2616     redo A;
2617 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2618     !!!cp (218.1);
2619     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2620     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2621 wakaba 1.13 $self->{in_subset} = 1;
2622 wakaba 1.12 !!!next-input-character;
2623 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2624 wakaba 1.12 redo A;
2625 wakaba 1.1 } else {
2626     !!!cp (218);
2627     !!!parse-error (type => 'string after SYSTEM literal');
2628     #$self->{ct}->{quirks} = 1;
2629    
2630     $self->{state} = BOGUS_DOCTYPE_STATE;
2631     !!!next-input-character;
2632     redo A;
2633     }
2634     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2635     if ($self->{nc} == 0x003E) { # >
2636     !!!cp (219);
2637     $self->{state} = DATA_STATE;
2638 wakaba 1.5 $self->{s_kwd} = '';
2639 wakaba 1.1 !!!next-input-character;
2640    
2641     !!!emit ($self->{ct}); # DOCTYPE
2642    
2643     redo A;
2644 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2645 wakaba 1.13 !!!cp (220.1);
2646     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2647     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2648     $self->{in_subset} = 1;
2649     !!!next-input-character;
2650     !!!emit ($self->{ct}); # DOCTYPE
2651     redo A;
2652 wakaba 1.1 } elsif ($self->{nc} == -1) {
2653     !!!cp (220);
2654     $self->{state} = DATA_STATE;
2655 wakaba 1.5 $self->{s_kwd} = '';
2656 wakaba 1.1 ## reconsume
2657    
2658     !!!emit ($self->{ct}); # DOCTYPE
2659    
2660     redo A;
2661     } else {
2662     !!!cp (221);
2663     my $s = '';
2664 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2665 wakaba 1.1
2666     ## Stay in the state
2667     !!!next-input-character;
2668     redo A;
2669     }
2670     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2671     ## NOTE: "CDATA section state" in the spec is jointly implemented
2672     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2673     ## and |CDATA_SECTION_MSE2_STATE|.
2674 wakaba 1.10
2675     ## XML5: "CDATA state".
2676 wakaba 1.1
2677     if ($self->{nc} == 0x005D) { # ]
2678     !!!cp (221.1);
2679     $self->{state} = CDATA_SECTION_MSE1_STATE;
2680     !!!next-input-character;
2681     redo A;
2682     } elsif ($self->{nc} == -1) {
2683 wakaba 1.6 if ($self->{is_xml}) {
2684 wakaba 1.8 !!!cp (221.11);
2685 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2686 wakaba 1.8 } else {
2687     !!!cp (221.12);
2688 wakaba 1.6 }
2689    
2690 wakaba 1.1 $self->{state} = DATA_STATE;
2691 wakaba 1.5 $self->{s_kwd} = '';
2692 wakaba 1.10 ## Reconsume.
2693 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2694     !!!cp (221.2);
2695     !!!emit ($self->{ct}); # character
2696     } else {
2697     !!!cp (221.3);
2698     ## No token to emit. $self->{ct} is discarded.
2699     }
2700     redo A;
2701     } else {
2702     !!!cp (221.4);
2703     $self->{ct}->{data} .= chr $self->{nc};
2704     $self->{read_until}->($self->{ct}->{data},
2705     q<]>,
2706     length $self->{ct}->{data});
2707    
2708     ## Stay in the state.
2709     !!!next-input-character;
2710     redo A;
2711     }
2712    
2713     ## ISSUE: "text tokens" in spec.
2714     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2715 wakaba 1.10 ## XML5: "CDATA bracket state".
2716    
2717 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2718     !!!cp (221.5);
2719     $self->{state} = CDATA_SECTION_MSE2_STATE;
2720     !!!next-input-character;
2721     redo A;
2722     } else {
2723     !!!cp (221.6);
2724 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
2725 wakaba 1.1 $self->{ct}->{data} .= ']';
2726 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2727 wakaba 1.1 ## Reconsume.
2728     redo A;
2729     }
2730     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2731 wakaba 1.10 ## XML5: "CDATA end state".
2732    
2733 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2734     $self->{state} = DATA_STATE;
2735 wakaba 1.5 $self->{s_kwd} = '';
2736 wakaba 1.1 !!!next-input-character;
2737     if (length $self->{ct}->{data}) { # character
2738     !!!cp (221.7);
2739     !!!emit ($self->{ct}); # character
2740     } else {
2741     !!!cp (221.8);
2742     ## No token to emit. $self->{ct} is discarded.
2743     }
2744     redo A;
2745     } elsif ($self->{nc} == 0x005D) { # ]
2746     !!!cp (221.9); # character
2747     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2748     ## Stay in the state.
2749     !!!next-input-character;
2750     redo A;
2751     } else {
2752     !!!cp (221.11);
2753     $self->{ct}->{data} .= ']]'; # character
2754     $self->{state} = CDATA_SECTION_STATE;
2755 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2756 wakaba 1.1 redo A;
2757     }
2758     } elsif ($self->{state} == ENTITY_STATE) {
2759     if ($is_space->{$self->{nc}} or
2760     {
2761     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2762     $self->{entity_add} => 1,
2763     }->{$self->{nc}}) {
2764     !!!cp (1001);
2765     ## Don't consume
2766     ## No error
2767     ## Return nothing.
2768     #
2769     } elsif ($self->{nc} == 0x0023) { # #
2770     !!!cp (999);
2771     $self->{state} = ENTITY_HASH_STATE;
2772 wakaba 1.12 $self->{kwd} = '#';
2773 wakaba 1.1 !!!next-input-character;
2774     redo A;
2775     } elsif ((0x0041 <= $self->{nc} and
2776     $self->{nc} <= 0x005A) or # A..Z
2777     (0x0061 <= $self->{nc} and
2778     $self->{nc} <= 0x007A)) { # a..z
2779     !!!cp (998);
2780     require Whatpm::_NamedEntityList;
2781     $self->{state} = ENTITY_NAME_STATE;
2782 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2783     $self->{entity__value} = $self->{kwd};
2784 wakaba 1.1 $self->{entity__match} = 0;
2785     !!!next-input-character;
2786     redo A;
2787     } else {
2788     !!!cp (1027);
2789     !!!parse-error (type => 'bare ero');
2790     ## Return nothing.
2791     #
2792     }
2793    
2794     ## NOTE: No character is consumed by the "consume a character
2795     ## reference" algorithm. In other words, there is an "&" character
2796     ## that does not introduce a character reference, which would be
2797     ## appended to the parent element or the attribute value in later
2798     ## process of the tokenizer.
2799    
2800     if ($self->{prev_state} == DATA_STATE) {
2801     !!!cp (997);
2802     $self->{state} = $self->{prev_state};
2803 wakaba 1.5 $self->{s_kwd} = '';
2804 wakaba 1.1 ## Reconsume.
2805     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2806     line => $self->{line_prev},
2807     column => $self->{column_prev},
2808     });
2809     redo A;
2810     } else {
2811     !!!cp (996);
2812     $self->{ca}->{value} .= '&';
2813     $self->{state} = $self->{prev_state};
2814 wakaba 1.5 $self->{s_kwd} = '';
2815 wakaba 1.1 ## Reconsume.
2816     redo A;
2817     }
2818     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2819     if ($self->{nc} == 0x0078 or # x
2820     $self->{nc} == 0x0058) { # X
2821     !!!cp (995);
2822     $self->{state} = HEXREF_X_STATE;
2823 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2824 wakaba 1.1 !!!next-input-character;
2825     redo A;
2826     } elsif (0x0030 <= $self->{nc} and
2827     $self->{nc} <= 0x0039) { # 0..9
2828     !!!cp (994);
2829     $self->{state} = NCR_NUM_STATE;
2830 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
2831 wakaba 1.1 !!!next-input-character;
2832     redo A;
2833     } else {
2834     !!!parse-error (type => 'bare nero',
2835     line => $self->{line_prev},
2836     column => $self->{column_prev} - 1);
2837    
2838     ## NOTE: According to the spec algorithm, nothing is returned,
2839     ## and then "&#" is appended to the parent element or the attribute
2840     ## value in the later processing.
2841    
2842     if ($self->{prev_state} == DATA_STATE) {
2843     !!!cp (1019);
2844     $self->{state} = $self->{prev_state};
2845 wakaba 1.5 $self->{s_kwd} = '';
2846 wakaba 1.1 ## Reconsume.
2847     !!!emit ({type => CHARACTER_TOKEN,
2848     data => '&#',
2849     line => $self->{line_prev},
2850     column => $self->{column_prev} - 1,
2851     });
2852     redo A;
2853     } else {
2854     !!!cp (993);
2855     $self->{ca}->{value} .= '&#';
2856     $self->{state} = $self->{prev_state};
2857 wakaba 1.5 $self->{s_kwd} = '';
2858 wakaba 1.1 ## Reconsume.
2859     redo A;
2860     }
2861     }
2862     } elsif ($self->{state} == NCR_NUM_STATE) {
2863     if (0x0030 <= $self->{nc} and
2864     $self->{nc} <= 0x0039) { # 0..9
2865     !!!cp (1012);
2866 wakaba 1.12 $self->{kwd} *= 10;
2867     $self->{kwd} += $self->{nc} - 0x0030;
2868 wakaba 1.1
2869     ## Stay in the state.
2870     !!!next-input-character;
2871     redo A;
2872     } elsif ($self->{nc} == 0x003B) { # ;
2873     !!!cp (1013);
2874     !!!next-input-character;
2875     #
2876     } else {
2877     !!!cp (1014);
2878     !!!parse-error (type => 'no refc');
2879     ## Reconsume.
2880     #
2881     }
2882    
2883 wakaba 1.12 my $code = $self->{kwd};
2884 wakaba 1.1 my $l = $self->{line_prev};
2885     my $c = $self->{column_prev};
2886     if ($charref_map->{$code}) {
2887     !!!cp (1015);
2888     !!!parse-error (type => 'invalid character reference',
2889     text => (sprintf 'U+%04X', $code),
2890     line => $l, column => $c);
2891     $code = $charref_map->{$code};
2892     } elsif ($code > 0x10FFFF) {
2893     !!!cp (1016);
2894     !!!parse-error (type => 'invalid character reference',
2895     text => (sprintf 'U-%08X', $code),
2896     line => $l, column => $c);
2897     $code = 0xFFFD;
2898     }
2899    
2900     if ($self->{prev_state} == DATA_STATE) {
2901     !!!cp (992);
2902     $self->{state} = $self->{prev_state};
2903 wakaba 1.5 $self->{s_kwd} = '';
2904 wakaba 1.1 ## Reconsume.
2905     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2906 wakaba 1.7 has_reference => 1,
2907 wakaba 1.1 line => $l, column => $c,
2908     });
2909     redo A;
2910     } else {
2911     !!!cp (991);
2912     $self->{ca}->{value} .= chr $code;
2913     $self->{ca}->{has_reference} = 1;
2914     $self->{state} = $self->{prev_state};
2915 wakaba 1.5 $self->{s_kwd} = '';
2916 wakaba 1.1 ## Reconsume.
2917     redo A;
2918     }
2919     } elsif ($self->{state} == HEXREF_X_STATE) {
2920     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2921     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2922     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2923     # 0..9, A..F, a..f
2924     !!!cp (990);
2925     $self->{state} = HEXREF_HEX_STATE;
2926 wakaba 1.12 $self->{kwd} = 0;
2927 wakaba 1.1 ## Reconsume.
2928     redo A;
2929     } else {
2930     !!!parse-error (type => 'bare hcro',
2931     line => $self->{line_prev},
2932     column => $self->{column_prev} - 2);
2933    
2934     ## NOTE: According to the spec algorithm, nothing is returned,
2935     ## and then "&#" followed by "X" or "x" is appended to the parent
2936     ## element or the attribute value in the later processing.
2937    
2938     if ($self->{prev_state} == DATA_STATE) {
2939     !!!cp (1005);
2940     $self->{state} = $self->{prev_state};
2941 wakaba 1.5 $self->{s_kwd} = '';
2942 wakaba 1.1 ## Reconsume.
2943     !!!emit ({type => CHARACTER_TOKEN,
2944 wakaba 1.12 data => '&' . $self->{kwd},
2945 wakaba 1.1 line => $self->{line_prev},
2946 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
2947 wakaba 1.1 });
2948     redo A;
2949     } else {
2950     !!!cp (989);
2951 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
2952 wakaba 1.1 $self->{state} = $self->{prev_state};
2953 wakaba 1.5 $self->{s_kwd} = '';
2954 wakaba 1.1 ## Reconsume.
2955     redo A;
2956     }
2957     }
2958     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2959     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2960     # 0..9
2961     !!!cp (1002);
2962 wakaba 1.12 $self->{kwd} *= 0x10;
2963     $self->{kwd} += $self->{nc} - 0x0030;
2964 wakaba 1.1 ## Stay in the state.
2965     !!!next-input-character;
2966     redo A;
2967     } elsif (0x0061 <= $self->{nc} and
2968     $self->{nc} <= 0x0066) { # a..f
2969     !!!cp (1003);
2970 wakaba 1.12 $self->{kwd} *= 0x10;
2971     $self->{kwd} += $self->{nc} - 0x0060 + 9;
2972 wakaba 1.1 ## Stay in the state.
2973     !!!next-input-character;
2974     redo A;
2975     } elsif (0x0041 <= $self->{nc} and
2976     $self->{nc} <= 0x0046) { # A..F
2977     !!!cp (1004);
2978 wakaba 1.12 $self->{kwd} *= 0x10;
2979     $self->{kwd} += $self->{nc} - 0x0040 + 9;
2980 wakaba 1.1 ## Stay in the state.
2981     !!!next-input-character;
2982     redo A;
2983     } elsif ($self->{nc} == 0x003B) { # ;
2984     !!!cp (1006);
2985     !!!next-input-character;
2986     #
2987     } else {
2988     !!!cp (1007);
2989     !!!parse-error (type => 'no refc',
2990     line => $self->{line},
2991     column => $self->{column});
2992     ## Reconsume.
2993     #
2994     }
2995    
2996 wakaba 1.12 my $code = $self->{kwd};
2997 wakaba 1.1 my $l = $self->{line_prev};
2998     my $c = $self->{column_prev};
2999     if ($charref_map->{$code}) {
3000     !!!cp (1008);
3001     !!!parse-error (type => 'invalid character reference',
3002     text => (sprintf 'U+%04X', $code),
3003     line => $l, column => $c);
3004     $code = $charref_map->{$code};
3005     } elsif ($code > 0x10FFFF) {
3006     !!!cp (1009);
3007     !!!parse-error (type => 'invalid character reference',
3008     text => (sprintf 'U-%08X', $code),
3009     line => $l, column => $c);
3010     $code = 0xFFFD;
3011     }
3012    
3013     if ($self->{prev_state} == DATA_STATE) {
3014     !!!cp (988);
3015     $self->{state} = $self->{prev_state};
3016 wakaba 1.5 $self->{s_kwd} = '';
3017 wakaba 1.1 ## Reconsume.
3018     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3019 wakaba 1.7 has_reference => 1,
3020 wakaba 1.1 line => $l, column => $c,
3021     });
3022     redo A;
3023     } else {
3024     !!!cp (987);
3025     $self->{ca}->{value} .= chr $code;
3026     $self->{ca}->{has_reference} = 1;
3027     $self->{state} = $self->{prev_state};
3028 wakaba 1.5 $self->{s_kwd} = '';
3029 wakaba 1.1 ## Reconsume.
3030     redo A;
3031     }
3032     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3033 wakaba 1.12 if (length $self->{kwd} < 30 and
3034 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
3035     ((0x0041 <= $self->{nc} and # a
3036     $self->{nc} <= 0x005A) or # x
3037     (0x0061 <= $self->{nc} and # a
3038     $self->{nc} <= 0x007A) or # z
3039     (0x0030 <= $self->{nc} and # 0
3040     $self->{nc} <= 0x0039) or # 9
3041     $self->{nc} == 0x003B)) { # ;
3042     our $EntityChar;
3043 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3044     if (defined $EntityChar->{$self->{kwd}}) {
3045 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3046     !!!cp (1020);
3047 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3048 wakaba 1.1 $self->{entity__match} = 1;
3049     !!!next-input-character;
3050     #
3051     } else {
3052     !!!cp (1021);
3053 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3054 wakaba 1.1 $self->{entity__match} = -1;
3055     ## Stay in the state.
3056     !!!next-input-character;
3057     redo A;
3058     }
3059     } else {
3060     !!!cp (1022);
3061     $self->{entity__value} .= chr $self->{nc};
3062     $self->{entity__match} *= 2;
3063     ## Stay in the state.
3064     !!!next-input-character;
3065     redo A;
3066     }
3067     }
3068    
3069     my $data;
3070     my $has_ref;
3071     if ($self->{entity__match} > 0) {
3072     !!!cp (1023);
3073     $data = $self->{entity__value};
3074     $has_ref = 1;
3075     #
3076     } elsif ($self->{entity__match} < 0) {
3077     !!!parse-error (type => 'no refc');
3078     if ($self->{prev_state} != DATA_STATE and # in attribute
3079     $self->{entity__match} < -1) {
3080     !!!cp (1024);
3081 wakaba 1.12 $data = '&' . $self->{kwd};
3082 wakaba 1.1 #
3083     } else {
3084     !!!cp (1025);
3085     $data = $self->{entity__value};
3086     $has_ref = 1;
3087     #
3088     }
3089     } else {
3090     !!!cp (1026);
3091     !!!parse-error (type => 'bare ero',
3092     line => $self->{line_prev},
3093 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3094     $data = '&' . $self->{kwd};
3095 wakaba 1.1 #
3096     }
3097    
3098     ## NOTE: In these cases, when a character reference is found,
3099     ## it is consumed and a character token is returned, or, otherwise,
3100     ## nothing is consumed and returned, according to the spec algorithm.
3101     ## In this implementation, anything that has been examined by the
3102     ## tokenizer is appended to the parent element or the attribute value
3103     ## as string, either literal string when no character reference or
3104     ## entity-replaced string otherwise, in this stage, since any characters
3105     ## that would not be consumed are appended in the data state or in an
3106     ## appropriate attribute value state anyway.
3107    
3108     if ($self->{prev_state} == DATA_STATE) {
3109     !!!cp (986);
3110     $self->{state} = $self->{prev_state};
3111 wakaba 1.5 $self->{s_kwd} = '';
3112 wakaba 1.1 ## Reconsume.
3113     !!!emit ({type => CHARACTER_TOKEN,
3114     data => $data,
3115 wakaba 1.7 has_reference => $has_ref,
3116 wakaba 1.1 line => $self->{line_prev},
3117 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3118 wakaba 1.1 });
3119     redo A;
3120     } else {
3121     !!!cp (985);
3122     $self->{ca}->{value} .= $data;
3123     $self->{ca}->{has_reference} = 1 if $has_ref;
3124     $self->{state} = $self->{prev_state};
3125 wakaba 1.5 $self->{s_kwd} = '';
3126 wakaba 1.1 ## Reconsume.
3127     redo A;
3128     }
3129 wakaba 1.8
3130     ## XML-only states
3131    
3132     } elsif ($self->{state} == PI_STATE) {
3133     if ($is_space->{$self->{nc}} or
3134     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
3135     $self->{nc} == -1) {
3136     !!!parse-error (type => 'bare pio', ## TODO: type
3137     line => $self->{line_prev},
3138     column => $self->{column_prev}
3139     - 1 * ($self->{nc} != -1));
3140     $self->{state} = BOGUS_COMMENT_STATE;
3141     ## Reconsume.
3142     $self->{ct} = {type => COMMENT_TOKEN,
3143     data => '?',
3144     line => $self->{line_prev},
3145     column => $self->{column_prev}
3146     - 1 * ($self->{nc} != -1),
3147     };
3148     redo A;
3149     } else {
3150     $self->{ct} = {type => PI_TOKEN,
3151     target => chr $self->{nc},
3152     data => '',
3153     line => $self->{line_prev},
3154     column => $self->{column_prev} - 1,
3155     };
3156     $self->{state} = PI_TARGET_STATE;
3157     !!!next-input-character;
3158     redo A;
3159     }
3160     } elsif ($self->{state} == PI_TARGET_STATE) {
3161     if ($is_space->{$self->{nc}}) {
3162     $self->{state} = PI_TARGET_AFTER_STATE;
3163     !!!next-input-character;
3164     redo A;
3165     } elsif ($self->{nc} == -1) {
3166     !!!parse-error (type => 'no pic'); ## TODO: type
3167 wakaba 1.13 if ($self->{in_subset}) {
3168     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3169     } else {
3170     $self->{state} = DATA_STATE;
3171     $self->{s_kwd} = '';
3172     }
3173 wakaba 1.8 ## Reconsume.
3174     !!!emit ($self->{ct}); # pi
3175     redo A;
3176     } elsif ($self->{nc} == 0x003F) { # ?
3177     $self->{state} = PI_AFTER_STATE;
3178     !!!next-input-character;
3179     redo A;
3180     } else {
3181     ## XML5: typo ("tag name" -> "target")
3182     $self->{ct}->{target} .= chr $self->{nc}; # pi
3183     !!!next-input-character;
3184     redo A;
3185     }
3186     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3187     if ($is_space->{$self->{nc}}) {
3188     ## Stay in the state.
3189     !!!next-input-character;
3190     redo A;
3191     } else {
3192     $self->{state} = PI_DATA_STATE;
3193     ## Reprocess.
3194     redo A;
3195     }
3196     } elsif ($self->{state} == PI_DATA_STATE) {
3197     if ($self->{nc} == 0x003F) { # ?
3198     $self->{state} = PI_DATA_AFTER_STATE;
3199     !!!next-input-character;
3200     redo A;
3201     } elsif ($self->{nc} == -1) {
3202     !!!parse-error (type => 'no pic'); ## TODO: type
3203 wakaba 1.13 if ($self->{in_subset}) {
3204     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3205     } else {
3206     $self->{state} = DATA_STATE;
3207     $self->{s_kwd} = '';
3208     }
3209 wakaba 1.8 ## Reprocess.
3210     !!!emit ($self->{ct}); # pi
3211     redo A;
3212     } else {
3213     $self->{ct}->{data} .= chr $self->{nc}; # pi
3214     $self->{read_until}->($self->{ct}->{data}, q[?],
3215     length $self->{ct}->{data});
3216     ## Stay in the state.
3217     !!!next-input-character;
3218     ## Reprocess.
3219     redo A;
3220     }
3221     } elsif ($self->{state} == PI_AFTER_STATE) {
3222     if ($self->{nc} == 0x003E) { # >
3223 wakaba 1.13 if ($self->{in_subset}) {
3224     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3225     } else {
3226     $self->{state} = DATA_STATE;
3227     $self->{s_kwd} = '';
3228     }
3229 wakaba 1.8 !!!next-input-character;
3230     !!!emit ($self->{ct}); # pi
3231     redo A;
3232     } elsif ($self->{nc} == 0x003F) { # ?
3233     !!!parse-error (type => 'no s after target', ## TODO: type
3234     line => $self->{line_prev},
3235     column => $self->{column_prev}); ## XML5: no error
3236     $self->{ct}->{data} .= '?';
3237     $self->{state} = PI_DATA_AFTER_STATE;
3238     !!!next-input-character;
3239     redo A;
3240     } else {
3241     !!!parse-error (type => 'no s after target', ## TODO: type
3242     line => $self->{line_prev},
3243     column => $self->{column_prev}
3244     + 1 * ($self->{nc} == -1)); ## XML5: no error
3245     $self->{ct}->{data} .= '?'; ## XML5: not appended
3246     $self->{state} = PI_DATA_STATE;
3247     ## Reprocess.
3248     redo A;
3249     }
3250     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3251     ## XML5: Same as "pi after state" in XML5
3252     if ($self->{nc} == 0x003E) { # >
3253 wakaba 1.13 if ($self->{in_subset}) {
3254     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3255     } else {
3256     $self->{state} = DATA_STATE;
3257     $self->{s_kwd} = '';
3258     }
3259 wakaba 1.8 !!!next-input-character;
3260     !!!emit ($self->{ct}); # pi
3261     redo A;
3262     } elsif ($self->{nc} == 0x003F) { # ?
3263     $self->{ct}->{data} .= '?';
3264     ## Stay in the state.
3265     !!!next-input-character;
3266     redo A;
3267     } else {
3268     $self->{ct}->{data} .= '?'; ## XML5: not appended
3269     $self->{state} = PI_DATA_STATE;
3270     ## Reprocess.
3271     redo A;
3272     }
3273 wakaba 1.12
3274     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3275     if ($self->{nc} == 0x003C) { # <
3276 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3277 wakaba 1.12 !!!next-input-character;
3278     redo A;
3279     } elsif ($self->{nc} == 0x0025) { # %
3280     ## XML5: Not defined yet.
3281    
3282     ## TODO:
3283     !!!next-input-character;
3284     redo A;
3285     } elsif ($self->{nc} == 0x005D) { # ]
3286 wakaba 1.13 delete $self->{in_subset};
3287 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3288     !!!next-input-character;
3289     redo A;
3290     } elsif ($is_space->{$self->{nc}}) {
3291     ## Stay in the state.
3292     !!!next-input-character;
3293     redo A;
3294     } elsif ($self->{nc} == -1) {
3295     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3296 wakaba 1.13 delete $self->{in_subset};
3297 wakaba 1.12 $self->{state} = DATA_STATE;
3298     $self->{s_kwd} = '';
3299     ## Reconsume.
3300 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3301 wakaba 1.12 redo A;
3302     } else {
3303     unless ($self->{internal_subset_tainted}) {
3304     ## XML5: No parse error.
3305     !!!parse-error (type => 'string in internal subset');
3306     $self->{internal_subset_tainted} = 1;
3307     }
3308     ## Stay in the state.
3309     !!!next-input-character;
3310     redo A;
3311     }
3312     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3313     if ($self->{nc} == 0x003E) { # >
3314     $self->{state} = DATA_STATE;
3315     $self->{s_kwd} = '';
3316     !!!next-input-character;
3317 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3318 wakaba 1.12 redo A;
3319     } elsif ($self->{nc} == -1) {
3320     !!!parse-error (type => 'unclosed DOCTYPE');
3321     $self->{state} = DATA_STATE;
3322     $self->{s_kwd} = '';
3323     ## Reconsume.
3324 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3325 wakaba 1.12 redo A;
3326     } else {
3327     ## XML5: No parse error and stay in the state.
3328     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3329    
3330 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3331     !!!next-input-character;
3332     redo A;
3333     }
3334     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3335     if ($self->{nc} == 0x003E) { # >
3336     $self->{state} = DATA_STATE;
3337     $self->{s_kwd} = '';
3338     !!!next-input-character;
3339     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3340     redo A;
3341     } elsif ($self->{nc} == -1) {
3342     $self->{state} = DATA_STATE;
3343     $self->{s_kwd} = '';
3344     ## Reconsume.
3345     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3346     redo A;
3347     } else {
3348     ## Stay in the state.
3349     !!!next-input-character;
3350     redo A;
3351     }
3352     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3353     if ($self->{nc} == 0x0021) { # !
3354     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
3355     !!!next-input-character;
3356     redo A;
3357     } elsif ($self->{nc} == 0x003F) { # ?
3358     $self->{state} = PI_STATE;
3359     !!!next-input-character;
3360     redo A;
3361     } elsif ($self->{nc} == -1) {
3362     !!!parse-error (type => 'bare stago');
3363     $self->{state} = DATA_STATE;
3364     $self->{s_kwd} = '';
3365     ## Reconsume.
3366     redo A;
3367     } else {
3368     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3369     line => $self->{line_prev},
3370     column => $self->{column_prev});
3371     $self->{state} = BOGUS_COMMENT_STATE;
3372     $self->{ct} = {type => COMMENT_TOKEN,
3373     data => '',
3374     }; ## NOTE: Will be discarded.
3375 wakaba 1.12 !!!next-input-character;
3376     redo A;
3377     }
3378 wakaba 1.8
3379 wakaba 1.1 } else {
3380     die "$0: $self->{state}: Unknown state";
3381     }
3382     } # A
3383    
3384     die "$0: _get_next_token: unexpected case";
3385     } # _get_next_token
3386    
3387     1;
3388 wakaba 1.13 ## $Date: 2008/10/15 12:49:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24