/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.34 - (hide annotations) (download) (as text)
Sat Sep 5 11:31:58 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.33: +11 -10 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 11:31:07 -0000
	* tokenizer-test-1.test: Changed to keep non-normal character
	references (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 11:31:46 -0000
	* Tokenizer.pm.src: Changed to keep non-normal character
	references as is (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.34 our $VERSION=do{my @r=(q$Revision: 1.33 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
        ## Exporter setup, performed at compile time so that the
        ## |Whatpm::HTML::Tokenizer->import (':token')| call made from
        ## another BEGIN block later in this file can already see the
        ## export lists.
        ##
        ## The 14 token type constants can be imported one by one
        ## (|@EXPORT_OK|) or all together through the |:token| tag
        ## (|%EXPORT_TAGS|).  The two lists below contain the same names
        ## and have to be kept in sync by hand when a token type is added.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
        ## Same names as |@EXPORT_OK|, grouped under the |:token| tag.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
        ##
        ## Each constant is a sub with an empty prototype that returns a
        ## small integer.  A token hash carries one of these values in its
        ## |type| field (e.g. |{type => CHARACTER_TOKEN, ...}|), and the
        ## names are exported through the |:token| tag declared above.
        ## The values 1..14 are all distinct; pick the next free integer
        ## when adding a token type and add its name to both export lists.
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
        ## From here on the definitions live in the |Whatpm::HTML| package,
        ## not in |Whatpm::HTML::Tokenizer|.  The BEGIN block below pulls
        ## the token type constants into this package at compile time so
        ## that the code that follows can use them as bare names.
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
        ##
        ## The three |CM_*| constants are single-bit flags; the four
        ## |*_CONTENT_MODEL| constants are the combinations stored in
        ## |$self->{content_model}|.  The tokenizer tests the individual
        ## bits with |&| (e.g. |$self->{content_model} & CM_ENTITY|).
        ##
        ##   PLAINTEXT = no bits set
        ##   CDATA     = CM_LIMITED_MARKUP
        ##   RCDATA    = CM_ENTITY | CM_LIMITED_MARKUP
        ##   PCDATA    = CM_ENTITY | CM_FULL_MARKUP
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
        ##
        ## Numeric identifiers for the value of |$self->{state}|; the
        ## tokenizer compares the field against them with |==|.  The
        ## numbers only need to be distinct.
        ##
        ## Numbering notes:
        ##   - 1 and 12 are not in use; their definitions are commented out.
        ##   - |COMMENT_END_BANG_STATE| (102) and |COMMENT_END_SPACE_STATE|
        ##     (103) are numbered out of sequence, after the XML-only
        ##     states (51..101) defined further below.
        ##   - NOTE(review): the |## LAST| marker appears to flag the
        ##     highest number currently in use; confirm, and move it when
        ##     a new state is added.
        ##   - Trailing comments of the form |"... state" in the spec| name
        ##     the specification state that the constant is a part of.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108 wakaba 1.32 sub COMMENT_END_BANG_STATE () { 102 }
109     sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110 wakaba 1.1 sub COMMENT_END_DASH_STATE () { 18 }
111     sub BOGUS_COMMENT_STATE () { 19 }
112     sub DOCTYPE_STATE () { 20 }
113     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114     sub DOCTYPE_NAME_STATE () { 22 }
115     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124     sub BOGUS_DOCTYPE_STATE () { 32 }
125     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126     sub SELF_CLOSING_START_TAG_STATE () { 34 }
127     sub CDATA_SECTION_STATE () { 35 }
128     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136     ## NOTE: "Entity data state", "entity in attribute value state", and
137     ## "consume a character reference" algorithm are jointly implemented
138     ## using the following six states:
139     sub ENTITY_STATE () { 44 }
140     sub ENTITY_HASH_STATE () { 45 }
141     sub NCR_NUM_STATE () { 46 }
142     sub HEXREF_X_STATE () { 47 }
143     sub HEXREF_HEX_STATE () { 48 }
144     sub ENTITY_NAME_STATE () { 49 }
145     sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147 wakaba 1.12 ## XML-only states
        ##
        ## Further values for |$self->{state}|, numbered 51..101 in
        ## sequence.  They cover processing instructions (|PI_*|), the
        ## DOCTYPE internal subset and the markup declarations inside it
        ## (|MD_*|, |DOCTYPE_*|, ATTLIST / ELEMENT / ENTITY / NOTATION
        ## related states).  Numbers 102 and 103 are already taken by two
        ## comment states defined with the non-XML states, so a new state
        ## has to use a number above those.
148 wakaba 1.8 sub PI_STATE () { 51 }
149     sub PI_TARGET_STATE () { 52 }
150     sub PI_TARGET_AFTER_STATE () { 53 }
151     sub PI_DATA_STATE () { 54 }
152     sub PI_AFTER_STATE () { 55 }
153     sub PI_DATA_AFTER_STATE () { 56 }
154 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157     sub DOCTYPE_TAG_STATE () { 60 }
158     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159     sub MD_ATTLIST_STATE () { 62 }
160     sub MD_E_STATE () { 63 }
161     sub MD_ELEMENT_STATE () { 64 }
162     sub MD_ENTITY_STATE () { 65 }
163     sub MD_NOTATION_STATE () { 66 }
164     sub DOCTYPE_MD_STATE () { 67 }
165     sub BEFORE_MD_NAME_STATE () { 68 }
166     sub MD_NAME_STATE () { 69 }
167     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174     sub ALLOWED_TOKEN_STATE () { 77 }
175     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
183     sub NDATA_STATE () { 86 }
184     sub AFTER_NDATA_STATE () { 87 }
185     sub BEFORE_NOTATION_NAME_STATE () { 88 }
186     sub NOTATION_NAME_STATE () { 89 }
187 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190     sub AFTER_ELEMENT_NAME_STATE () { 93 }
191     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192     sub CONTENT_KEYWORD_STATE () { 95 }
193     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194     sub CM_ELEMENT_NAME_STATE () { 97 }
195     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197     sub AFTER_MD_DEF_STATE () { 100 }
198     sub BOGUS_MD_STATE () { 101 }
199 wakaba 1.8
200 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
201     ## list and descriptions)
        ##
        ## NOTE(review): the two binary literals below denote the same
        ## number (1 followed by eleven 0 bits, i.e. 2048); the underscore
        ## in the second literal is only a digit separator.  Presumably the
        ## constants belong to two different flag sets (one for insertion
        ## modes, one for element categories) and are never compared with
        ## each other -- confirm against the definitions in Whatpm::HTML.
202    
203     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
204     sub FOREIGN_EL () { 0b1_00000000000 }
205    
206     ## Character reference mappings
        ##
        ## |$charref_map| is keyed by a code point number and yields the
        ## code point number to use in its place.  NOTE(review): the lookup
        ## site is outside this part of the file; presumably the table is
        ## consulted when a numeric character reference is resolved, and
        ## the mere presence of a key marks the reference as one that needs
        ## special treatment -- confirm at the point of use.
        ##
        ## Contents:
        ##   - 0x00 maps to U+FFFD and 0x0D maps to U+000A.
        ##   - 0x80..0x9F map to the characters that Windows-1252 assigns
        ##     to those bytes; the positions Windows-1252 leaves unassigned
        ##     (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to themselves.
        ##   - The statement after the hash literal adds identity mappings
        ##     for further control characters, the surrogate range, and the
        ##     noncharacters, so that those references are kept as is
        ##     (this is the revision 1.34 change described in the ChangeLog
        ##     entry: "keep non-normal character references as is").
207    
208     my $charref_map = {
209 wakaba 1.34 0x00 => 0xFFFD, # REPLACEMENT CHARACTER
210 wakaba 1.1 0x0D => 0x000A,
211     0x80 => 0x20AC,
212 wakaba 1.34 0x81 => 0x0081,
213 wakaba 1.1 0x82 => 0x201A,
214     0x83 => 0x0192,
215     0x84 => 0x201E,
216     0x85 => 0x2026,
217     0x86 => 0x2020,
218     0x87 => 0x2021,
219     0x88 => 0x02C6,
220     0x89 => 0x2030,
221     0x8A => 0x0160,
222     0x8B => 0x2039,
223     0x8C => 0x0152,
224 wakaba 1.34 0x8D => 0x008D,
225 wakaba 1.1 0x8E => 0x017D,
226 wakaba 1.34 0x8F => 0x008F,
227     0x90 => 0x0090,
228 wakaba 1.1 0x91 => 0x2018,
229     0x92 => 0x2019,
230     0x93 => 0x201C,
231     0x94 => 0x201D,
232     0x95 => 0x2022,
233     0x96 => 0x2013,
234     0x97 => 0x2014,
235     0x98 => 0x02DC,
236     0x99 => 0x2122,
237     0x9A => 0x0161,
238     0x9B => 0x203A,
239     0x9C => 0x0153,
240 wakaba 1.34 0x9D => 0x009D,
241 wakaba 1.1 0x9E => 0x017E,
242     0x9F => 0x0178,
243     }; # $charref_map
        ## Identity entries: each listed code point maps to itself.
244 wakaba 1.34 $charref_map->{$_} = $_
245     for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
246     0xD800..0xDFFF, 0xFDD0..0xFDEF,
247 wakaba 1.1 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
248     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
249     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
250     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
251     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
252    
253     ## Implementations MUST act as if they used the state machine in the spec.
254    
255     sub _initialize_tokenizer ($) {
        ## Puts the tokenizer-related fields of the parser object into
        ## their start-of-input values and reads the first input character.
        ##
        ## Argument: the parser object (a hash reference), taken from |@_|.
        ## The statements below reset: |state| (to the data state),
        ## |s_kwd|, |content_model| (to PCDATA), |ct|, |ca|,
        ## |last_stag_name|, |self_closing|, |char_buffer|,
        ## |char_buffer_pos|, |nc| and |token|.  Fields that appear only in
        ## commented-out lines are listed for reference and are not touched
        ## here.
256     my $self = shift;
257    
258     ## NOTE: Fields set by |new| constructor:
259     #$self->{level}
260     #$self->{set_nc}
261     #$self->{parse_error}
262 wakaba 1.3 #$self->{is_xml} (if XML)
263 wakaba 1.1
264     $self->{state} = DATA_STATE; # MUST
265 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
266     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
267 wakaba 1.1 #$self->{entity__value}; # initialized when used
268     #$self->{entity__match}; # initialized when used
269     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
270     undef $self->{ct}; # current token
271     undef $self->{ca}; # current attribute
272     undef $self->{last_stag_name}; # last emitted start tag name
273     #$self->{prev_state}; # initialized when used
274     delete $self->{self_closing};
275     $self->{char_buffer} = '';
276     $self->{char_buffer_pos} = 0;
        ## -1 is also the value that |_get_next_token| tests for when it
        ## emits the end-of-file token.
277     $self->{nc} = -1; # next input character
278     #$self->{next_nc}
        ## NOTE(review): lines starting with |!!!| are not plain Perl;
        ## presumably they are macros expanded when this |.pm.src| file is
        ## converted into the |.pm| module -- confirm against the build
        ## script.  This one is expected to load the first character into
        ## |$self->{nc}|.
279     !!!next-input-character;
        ## Queue of tokens waiting to be returned; |_get_next_token| shifts
        ## from this array before it runs the state machine.
280     $self->{token} = [];
281     # $self->{escape}
282     } # _initialize_tokenizer
283    
284     ## A token has:
285     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
286 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
287 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
288     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
289 wakaba 1.11 ## ->{target} (PI_TOKEN)
290 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
291     ## ->{sysid} (DOCTYPE_TOKEN)
292     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
293     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
294     ## ->{name}
295     ## ->{value}
296     ## ->{has_reference} == 1 or 0
297 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
298     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
299 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
300 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
301 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
302    
303 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
304     ## A token's |->{self_closing}| is used to save the value of
305     ## |$self->{self_closing}| while the token is pushed back to the stack.
306    
307     ## Emitted token MUST immediately be handled by the tree construction state.
308    
309     ## Before each step, UA MAY check to see if either one of the scripts in
310     ## "list of scripts that will execute as soon as possible" or the first
311     ## script in the "list of scripts that will execute asynchronously",
312     ## has completed loading. If one has, then it MUST be executed
313     ## and removed from the list.
314    
315     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
316     ## (This requirement was dropped from HTML5 spec, unfortunately.)
317    
318     my $is_space = {
        ## Lookup table of the code points treated as space characters by
        ## the tokenizer; it is queried as |$is_space->{$self->{nc}}|.
        ## Only HT, LF, FF and SP are present.  The VT and CR entries are
        ## commented out, so a lookup for either of them yields |undef|
        ## (false).
319     0x0009 => 1, # CHARACTER TABULATION (HT)
320     0x000A => 1, # LINE FEED (LF)
321     #0x000B => 0, # LINE TABULATION (VT)
322 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
323 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
324     0x0020 => 1, # SPACE (SP)
325     };
326    
327     sub _get_next_token ($) {
328     my $self = shift;
329    
330     if ($self->{self_closing}) {
331     !!!parse-error (type => 'nestc', token => $self->{ct});
332     ## NOTE: The |self_closing| flag is only set by start tag token.
333     ## In addition, when a start tag token is emitted, it is always set to
334     ## |ct|.
335     delete $self->{self_closing};
336     }
337    
338     if (@{$self->{token}}) {
339     $self->{self_closing} = $self->{token}->[0]->{self_closing};
340     return shift @{$self->{token}};
341     }
342    
343     A: {
344     if ($self->{state} == PCDATA_STATE) {
345     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
346    
347     if ($self->{nc} == 0x0026) { # &
348     !!!cp (0.1);
349     ## NOTE: In the spec, the tokenizer is switched to the
350     ## "entity data state". In this implementation, the tokenizer
351     ## is switched to the |ENTITY_STATE|, which is an implementation
352     ## of the "consume a character reference" algorithm.
353     $self->{entity_add} = -1;
354     $self->{prev_state} = DATA_STATE;
355     $self->{state} = ENTITY_STATE;
356     !!!next-input-character;
357     redo A;
358     } elsif ($self->{nc} == 0x003C) { # <
359     !!!cp (0.2);
360     $self->{state} = TAG_OPEN_STATE;
361     !!!next-input-character;
362     redo A;
363     } elsif ($self->{nc} == -1) {
364     !!!cp (0.3);
365     !!!emit ({type => END_OF_FILE_TOKEN,
366     line => $self->{line}, column => $self->{column}});
367     last A; ## TODO: ok?
368     } else {
369     !!!cp (0.4);
370     #
371     }
372    
373     # Anything else
374     my $token = {type => CHARACTER_TOKEN,
375     data => chr $self->{nc},
376     line => $self->{line}, column => $self->{column},
377     };
378     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
379    
380     ## Stay in the state.
381     !!!next-input-character;
382     !!!emit ($token);
383     redo A;
384     } elsif ($self->{state} == DATA_STATE) {
385     $self->{s_kwd} = '' unless defined $self->{s_kwd};
386     if ($self->{nc} == 0x0026) { # &
387     $self->{s_kwd} = '';
388     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
389     not $self->{escape}) {
390     !!!cp (1);
391     ## NOTE: In the spec, the tokenizer is switched to the
392     ## "entity data state". In this implementation, the tokenizer
393     ## is switched to the |ENTITY_STATE|, which is an implementation
394     ## of the "consume a character reference" algorithm.
395     $self->{entity_add} = -1;
396     $self->{prev_state} = DATA_STATE;
397     $self->{state} = ENTITY_STATE;
398     !!!next-input-character;
399     redo A;
400     } else {
401     !!!cp (2);
402     #
403     }
404     } elsif ($self->{nc} == 0x002D) { # -
405     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
406 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
407 wakaba 1.1 !!!cp (3);
408     $self->{escape} = 1; # unless $self->{escape};
409     $self->{s_kwd} = '--';
410     #
411 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
412 wakaba 1.1 !!!cp (4);
413     $self->{s_kwd} = '--';
414     #
415 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
416     !!!cp (4.1);
417     $self->{s_kwd} .= '-';
418     #
419 wakaba 1.1 } else {
420     !!!cp (5);
421 wakaba 1.5 $self->{s_kwd} = '-';
422 wakaba 1.1 #
423     }
424     }
425    
426     #
427     } elsif ($self->{nc} == 0x0021) { # !
428     if (length $self->{s_kwd}) {
429     !!!cp (5.1);
430     $self->{s_kwd} .= '!';
431     #
432     } else {
433     !!!cp (5.2);
434     #$self->{s_kwd} = '';
435     #
436     }
437     #
438     } elsif ($self->{nc} == 0x003C) { # <
439     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
440     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
441     not $self->{escape})) {
442     !!!cp (6);
443     $self->{state} = TAG_OPEN_STATE;
444     !!!next-input-character;
445     redo A;
446     } else {
447     !!!cp (7);
448     $self->{s_kwd} = '';
449     #
450     }
451     } elsif ($self->{nc} == 0x003E) { # >
452     if ($self->{escape} and
453     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
454     if ($self->{s_kwd} eq '--') {
455     !!!cp (8);
456     delete $self->{escape};
457 wakaba 1.5 #
458 wakaba 1.1 } else {
459     !!!cp (9);
460 wakaba 1.5 #
461 wakaba 1.1 }
462 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
463     !!!cp (9.1);
464     !!!parse-error (type => 'unmatched mse', ## TODO: type
465     line => $self->{line_prev},
466     column => $self->{column_prev} - 1);
467     #
468 wakaba 1.1 } else {
469     !!!cp (10);
470 wakaba 1.5 #
471 wakaba 1.1 }
472    
473     $self->{s_kwd} = '';
474     #
475 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
476     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
477     !!!cp (10.1);
478     $self->{s_kwd} .= ']';
479     } elsif ($self->{s_kwd} eq ']]') {
480     !!!cp (10.2);
481     #
482     } else {
483     !!!cp (10.3);
484     $self->{s_kwd} = '';
485     }
486     #
487 wakaba 1.1 } elsif ($self->{nc} == -1) {
488     !!!cp (11);
489     $self->{s_kwd} = '';
490     !!!emit ({type => END_OF_FILE_TOKEN,
491     line => $self->{line}, column => $self->{column}});
492     last A; ## TODO: ok?
493     } else {
494     !!!cp (12);
495     $self->{s_kwd} = '';
496     #
497     }
498    
499     # Anything else
500     my $token = {type => CHARACTER_TOKEN,
501     data => chr $self->{nc},
502     line => $self->{line}, column => $self->{column},
503     };
504 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
505 wakaba 1.1 length $token->{data})) {
506     $self->{s_kwd} = '';
507     }
508    
509     ## Stay in the data state.
510 wakaba 1.5 if (not $self->{is_xml} and
511     $self->{content_model} == PCDATA_CONTENT_MODEL) {
512 wakaba 1.1 !!!cp (13);
513     $self->{state} = PCDATA_STATE;
514     } else {
515     !!!cp (14);
516     ## Stay in the state.
517     }
518     !!!next-input-character;
519     !!!emit ($token);
520     redo A;
521     } elsif ($self->{state} == TAG_OPEN_STATE) {
522 wakaba 1.10 ## XML5: "tag state".
523    
524 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
525     if ($self->{nc} == 0x002F) { # /
526     !!!cp (15);
527     !!!next-input-character;
528     $self->{state} = CLOSE_TAG_OPEN_STATE;
529     redo A;
530     } elsif ($self->{nc} == 0x0021) { # !
531     !!!cp (15.1);
532 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
533 wakaba 1.1 #
534     } else {
535     !!!cp (16);
536 wakaba 1.12 $self->{s_kwd} = '';
537 wakaba 1.1 #
538     }
539    
540     ## reconsume
541     $self->{state} = DATA_STATE;
542     !!!emit ({type => CHARACTER_TOKEN, data => '<',
543     line => $self->{line_prev},
544     column => $self->{column_prev},
545     });
546     redo A;
547     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
548     if ($self->{nc} == 0x0021) { # !
549     !!!cp (17);
550     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
551     !!!next-input-character;
552     redo A;
553     } elsif ($self->{nc} == 0x002F) { # /
554     !!!cp (18);
555     $self->{state} = CLOSE_TAG_OPEN_STATE;
556     !!!next-input-character;
557     redo A;
558     } elsif (0x0041 <= $self->{nc} and
559     $self->{nc} <= 0x005A) { # A..Z
560     !!!cp (19);
561     $self->{ct}
562     = {type => START_TAG_TOKEN,
563 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
564 wakaba 1.1 line => $self->{line_prev},
565     column => $self->{column_prev}};
566     $self->{state} = TAG_NAME_STATE;
567     !!!next-input-character;
568     redo A;
569     } elsif (0x0061 <= $self->{nc} and
570     $self->{nc} <= 0x007A) { # a..z
571     !!!cp (20);
572     $self->{ct} = {type => START_TAG_TOKEN,
573     tag_name => chr ($self->{nc}),
574     line => $self->{line_prev},
575     column => $self->{column_prev}};
576     $self->{state} = TAG_NAME_STATE;
577     !!!next-input-character;
578     redo A;
579     } elsif ($self->{nc} == 0x003E) { # >
580     !!!cp (21);
581     !!!parse-error (type => 'empty start tag',
582     line => $self->{line_prev},
583     column => $self->{column_prev});
584     $self->{state} = DATA_STATE;
585 wakaba 1.5 $self->{s_kwd} = '';
586 wakaba 1.1 !!!next-input-character;
587    
588     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
589     line => $self->{line_prev},
590     column => $self->{column_prev},
591     });
592    
593     redo A;
594     } elsif ($self->{nc} == 0x003F) { # ?
595 wakaba 1.8 if ($self->{is_xml}) {
596     !!!cp (22.1);
597     $self->{state} = PI_STATE;
598     !!!next-input-character;
599     redo A;
600     } else {
601     !!!cp (22);
602     !!!parse-error (type => 'pio',
603     line => $self->{line_prev},
604     column => $self->{column_prev});
605     $self->{state} = BOGUS_COMMENT_STATE;
606     $self->{ct} = {type => COMMENT_TOKEN, data => '',
607     line => $self->{line_prev},
608     column => $self->{column_prev},
609     };
610     ## $self->{nc} is intentionally left as is
611     redo A;
612     }
613 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
614 wakaba 1.1 !!!cp (23);
615     !!!parse-error (type => 'bare stago',
616     line => $self->{line_prev},
617     column => $self->{column_prev});
618     $self->{state} = DATA_STATE;
619 wakaba 1.5 $self->{s_kwd} = '';
620 wakaba 1.1 ## reconsume
621    
622     !!!emit ({type => CHARACTER_TOKEN, data => '<',
623     line => $self->{line_prev},
624     column => $self->{column_prev},
625     });
626    
627     redo A;
628 wakaba 1.9 } else {
629     ## XML5: "<:" is a parse error.
630     !!!cp (23.1);
631     $self->{ct} = {type => START_TAG_TOKEN,
632     tag_name => chr ($self->{nc}),
633     line => $self->{line_prev},
634     column => $self->{column_prev}};
635     $self->{state} = TAG_NAME_STATE;
636     !!!next-input-character;
637     redo A;
638 wakaba 1.1 }
639     } else {
640     die "$0: $self->{content_model} in tag open";
641     }
642     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
643     ## NOTE: The "close tag open state" in the spec is implemented as
644     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
645    
646 wakaba 1.10 ## XML5: "end tag state".
647    
648 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
649     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
650     if (defined $self->{last_stag_name}) {
651     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
652 wakaba 1.12 $self->{kwd} = '';
653 wakaba 1.1 ## Reconsume.
654     redo A;
655     } else {
656     ## No start tag token has ever been emitted
657     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
658     !!!cp (28);
659     $self->{state} = DATA_STATE;
660 wakaba 1.5 $self->{s_kwd} = '';
661 wakaba 1.1 ## Reconsume.
662     !!!emit ({type => CHARACTER_TOKEN, data => '</',
663     line => $l, column => $c,
664     });
665     redo A;
666     }
667     }
668    
669     if (0x0041 <= $self->{nc} and
670     $self->{nc} <= 0x005A) { # A..Z
671     !!!cp (29);
672     $self->{ct}
673     = {type => END_TAG_TOKEN,
674 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
675 wakaba 1.1 line => $l, column => $c};
676     $self->{state} = TAG_NAME_STATE;
677     !!!next-input-character;
678     redo A;
679     } elsif (0x0061 <= $self->{nc} and
680     $self->{nc} <= 0x007A) { # a..z
681     !!!cp (30);
682     $self->{ct} = {type => END_TAG_TOKEN,
683     tag_name => chr ($self->{nc}),
684     line => $l, column => $c};
685     $self->{state} = TAG_NAME_STATE;
686     !!!next-input-character;
687     redo A;
688     } elsif ($self->{nc} == 0x003E) { # >
689     !!!parse-error (type => 'empty end tag',
690     line => $self->{line_prev}, ## "<" in "</>"
691     column => $self->{column_prev} - 1);
692     $self->{state} = DATA_STATE;
693 wakaba 1.5 $self->{s_kwd} = '';
694 wakaba 1.10 if ($self->{is_xml}) {
695     !!!cp (31);
696     ## XML5: No parse error.
697    
698     ## NOTE: This parser raises a parse error, since it supports
699     ## XML1, not XML5.
700    
701     ## NOTE: A short end tag token.
702     my $ct = {type => END_TAG_TOKEN,
703     tag_name => '',
704     line => $self->{line_prev},
705     column => $self->{column_prev} - 1,
706     };
707     !!!next-input-character;
708     !!!emit ($ct);
709     } else {
710     !!!cp (31.1);
711     !!!next-input-character;
712     }
713 wakaba 1.1 redo A;
714     } elsif ($self->{nc} == -1) {
715     !!!cp (32);
716     !!!parse-error (type => 'bare etago');
717 wakaba 1.5 $self->{s_kwd} = '';
718 wakaba 1.1 $self->{state} = DATA_STATE;
719     # reconsume
720    
721     !!!emit ({type => CHARACTER_TOKEN, data => '</',
722     line => $l, column => $c,
723     });
724    
725     redo A;
726 wakaba 1.10 } elsif (not $self->{is_xml} or
727     $is_space->{$self->{nc}}) {
728 wakaba 1.1 !!!cp (33);
729 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
730     line => $self->{line_prev}, # "<" of "</"
731     column => $self->{column_prev} - 1);
732 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
733     $self->{ct} = {type => COMMENT_TOKEN, data => '',
734     line => $self->{line_prev}, # "<" of "</"
735     column => $self->{column_prev} - 1,
736     };
737     ## NOTE: $self->{nc} is intentionally left as is.
738     ## Although the "anything else" case of the spec not explicitly
739     ## states that the next input character is to be reconsumed,
740     ## it will be included to the |data| of the comment token
741     ## generated from the bogus end tag, as defined in the
742     ## "bogus comment state" entry.
743     redo A;
744 wakaba 1.10 } else {
745     ## XML5: "</:" is a parse error.
746     !!!cp (30.1);
747     $self->{ct} = {type => END_TAG_TOKEN,
748     tag_name => chr ($self->{nc}),
749     line => $l, column => $c};
750     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
751     !!!next-input-character;
752     redo A;
753 wakaba 1.1 }
754     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
755 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
756 wakaba 1.1 if (length $ch) {
757     my $CH = $ch;
758     $ch =~ tr/a-z/A-Z/;
759     my $nch = chr $self->{nc};
760     if ($nch eq $ch or $nch eq $CH) {
761     !!!cp (24);
762     ## Stay in the state.
763 wakaba 1.12 $self->{kwd} .= $nch;
764 wakaba 1.1 !!!next-input-character;
765     redo A;
766     } else {
767     !!!cp (25);
768     $self->{state} = DATA_STATE;
769 wakaba 1.5 $self->{s_kwd} = '';
770 wakaba 1.1 ## Reconsume.
771     !!!emit ({type => CHARACTER_TOKEN,
772 wakaba 1.12 data => '</' . $self->{kwd},
773 wakaba 1.1 line => $self->{line_prev},
774 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
775 wakaba 1.1 });
776     redo A;
777     }
778     } else { # after "<{tag-name}"
779     unless ($is_space->{$self->{nc}} or
780     {
781     0x003E => 1, # >
782     0x002F => 1, # /
783     -1 => 1, # EOF
784     }->{$self->{nc}}) {
785     !!!cp (26);
786     ## Reconsume.
787     $self->{state} = DATA_STATE;
788 wakaba 1.5 $self->{s_kwd} = '';
789 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
790 wakaba 1.12 data => '</' . $self->{kwd},
791 wakaba 1.1 line => $self->{line_prev},
792 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
793 wakaba 1.1 });
794     redo A;
795     } else {
796     !!!cp (27);
797     $self->{ct}
798     = {type => END_TAG_TOKEN,
799     tag_name => $self->{last_stag_name},
800     line => $self->{line_prev},
801 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
802 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
803     ## Reconsume.
804     redo A;
805     }
806     }
807     } elsif ($self->{state} == TAG_NAME_STATE) {
808     if ($is_space->{$self->{nc}}) {
809     !!!cp (34);
810     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
811     !!!next-input-character;
812     redo A;
813     } elsif ($self->{nc} == 0x003E) { # >
814     if ($self->{ct}->{type} == START_TAG_TOKEN) {
815     !!!cp (35);
816     $self->{last_stag_name} = $self->{ct}->{tag_name};
817     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
818     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
819     #if ($self->{ct}->{attributes}) {
820     # ## NOTE: This should never be reached.
821     # !!! cp (36);
822     # !!! parse-error (type => 'end tag attribute');
823     #} else {
824     !!!cp (37);
825     #}
826     } else {
827     die "$0: $self->{ct}->{type}: Unknown token type";
828     }
829     $self->{state} = DATA_STATE;
830 wakaba 1.5 $self->{s_kwd} = '';
831 wakaba 1.1 !!!next-input-character;
832    
833     !!!emit ($self->{ct}); # start tag or end tag
834    
835     redo A;
836     } elsif (0x0041 <= $self->{nc} and
837     $self->{nc} <= 0x005A) { # A..Z
838     !!!cp (38);
839 wakaba 1.4 $self->{ct}->{tag_name}
840     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
841 wakaba 1.1 # start tag or end tag
842     ## Stay in this state
843     !!!next-input-character;
844     redo A;
845     } elsif ($self->{nc} == -1) {
846     !!!parse-error (type => 'unclosed tag');
847     if ($self->{ct}->{type} == START_TAG_TOKEN) {
848     !!!cp (39);
849     $self->{last_stag_name} = $self->{ct}->{tag_name};
850     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
851     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
852     #if ($self->{ct}->{attributes}) {
853     # ## NOTE: This state should never be reached.
854     # !!! cp (40);
855     # !!! parse-error (type => 'end tag attribute');
856     #} else {
857     !!!cp (41);
858     #}
859     } else {
860     die "$0: $self->{ct}->{type}: Unknown token type";
861     }
862     $self->{state} = DATA_STATE;
863 wakaba 1.5 $self->{s_kwd} = '';
864 wakaba 1.1 # reconsume
865    
866 wakaba 1.33 ## Discard the token.
867     #!!!emit ($self->{ct}); # start tag or end tag
868 wakaba 1.1
869     redo A;
870     } elsif ($self->{nc} == 0x002F) { # /
871     !!!cp (42);
872     $self->{state} = SELF_CLOSING_START_TAG_STATE;
873     !!!next-input-character;
874     redo A;
875     } else {
876     !!!cp (44);
877     $self->{ct}->{tag_name} .= chr $self->{nc};
878     # start tag or end tag
879     ## Stay in the state
880     !!!next-input-character;
881     redo A;
882     }
883     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
884 wakaba 1.11 ## XML5: "Tag attribute name before state".
885    
886 wakaba 1.1 if ($is_space->{$self->{nc}}) {
887     !!!cp (45);
888     ## Stay in the state
889     !!!next-input-character;
890     redo A;
891     } elsif ($self->{nc} == 0x003E) { # >
892     if ($self->{ct}->{type} == START_TAG_TOKEN) {
893     !!!cp (46);
894     $self->{last_stag_name} = $self->{ct}->{tag_name};
895     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
896     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
897     if ($self->{ct}->{attributes}) {
898     !!!cp (47);
899     !!!parse-error (type => 'end tag attribute');
900     } else {
901     !!!cp (48);
902     }
903     } else {
904     die "$0: $self->{ct}->{type}: Unknown token type";
905     }
906     $self->{state} = DATA_STATE;
907 wakaba 1.5 $self->{s_kwd} = '';
908 wakaba 1.1 !!!next-input-character;
909    
910     !!!emit ($self->{ct}); # start tag or end tag
911    
912     redo A;
913     } elsif (0x0041 <= $self->{nc} and
914     $self->{nc} <= 0x005A) { # A..Z
915     !!!cp (49);
916     $self->{ca}
917 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
918 wakaba 1.1 value => '',
919     line => $self->{line}, column => $self->{column}};
920     $self->{state} = ATTRIBUTE_NAME_STATE;
921     !!!next-input-character;
922     redo A;
923     } elsif ($self->{nc} == 0x002F) { # /
924     !!!cp (50);
925     $self->{state} = SELF_CLOSING_START_TAG_STATE;
926     !!!next-input-character;
927     redo A;
928     } elsif ($self->{nc} == -1) {
929     !!!parse-error (type => 'unclosed tag');
930     if ($self->{ct}->{type} == START_TAG_TOKEN) {
931     !!!cp (52);
932     $self->{last_stag_name} = $self->{ct}->{tag_name};
933     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
934     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
935     if ($self->{ct}->{attributes}) {
936     !!!cp (53);
937     !!!parse-error (type => 'end tag attribute');
938     } else {
939     !!!cp (54);
940     }
941     } else {
942     die "$0: $self->{ct}->{type}: Unknown token type";
943     }
944     $self->{state} = DATA_STATE;
945 wakaba 1.5 $self->{s_kwd} = '';
946 wakaba 1.1 # reconsume
947    
948 wakaba 1.33 ## Discard the token.
949     #!!!emit ($self->{ct}); # start tag or end tag
950 wakaba 1.1
951     redo A;
952     } else {
953     if ({
954     0x0022 => 1, # "
955     0x0027 => 1, # '
956 wakaba 1.30 0x003C => 1, # <
957 wakaba 1.1 0x003D => 1, # =
958     }->{$self->{nc}}) {
959     !!!cp (55);
960 wakaba 1.11 ## XML5: Not a parse error.
961 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
962     } else {
963     !!!cp (56);
964 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
965 wakaba 1.1 }
966     $self->{ca}
967     = {name => chr ($self->{nc}),
968     value => '',
969     line => $self->{line}, column => $self->{column}};
970     $self->{state} = ATTRIBUTE_NAME_STATE;
971     !!!next-input-character;
972     redo A;
973     }
974     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
975 wakaba 1.11 ## XML5: "Tag attribute name state".
976    
## $before_leave: closure called by the branches below (white space, "=",
## ">", "/", and EOF) just before the tokenizer leaves the attribute name
## state.  It looks up the finished name $self->{ca}->{name} in the current
## token's {attributes} hash:
##   - if an entry of that name already exists, a 'duplicate attribute'
##     parse error is reported at the line/column recorded in $self->{ca}
##     and $self->{ca} is not added to the token;
##   - otherwise $self->{ca} is stored under its name and its {index} is
##     set to the token's incremented {last_index} counter.
## It takes no arguments and its return value is not used by the callers
## visible here.
977 wakaba 1.1 my $before_leave = sub {
978     if (exists $self->{ct}->{attributes} # start tag or end tag
979     ->{$self->{ca}->{name}}) { # MUST
980     !!!cp (57);
981     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
982     ## Discard $self->{ca} # MUST
983     } else {
984     !!!cp (58);
985     $self->{ct}->{attributes}->{$self->{ca}->{name}}
986     = $self->{ca};
987 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
988 wakaba 1.1 }
989     }; # $before_leave
990    
991     if ($is_space->{$self->{nc}}) {
992     !!!cp (59);
993     $before_leave->();
994     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003D) { # =
998     !!!cp (60);
999     $before_leave->();
1000     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1001     !!!next-input-character;
1002     redo A;
1003     } elsif ($self->{nc} == 0x003E) { # >
1004 wakaba 1.11 if ($self->{is_xml}) {
1005     !!!cp (60.1);
1006     ## XML5: Not a parse error.
1007     !!!parse-error (type => 'no attr value'); ## TODO: type
1008     } else {
1009     !!!cp (60.2);
1010     }
1011    
1012 wakaba 1.1 $before_leave->();
1013     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1014     !!!cp (61);
1015     $self->{last_stag_name} = $self->{ct}->{tag_name};
1016     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1017     !!!cp (62);
1018     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1019     if ($self->{ct}->{attributes}) {
1020     !!!parse-error (type => 'end tag attribute');
1021     }
1022     } else {
1023     die "$0: $self->{ct}->{type}: Unknown token type";
1024     }
1025     $self->{state} = DATA_STATE;
1026 wakaba 1.5 $self->{s_kwd} = '';
1027 wakaba 1.1 !!!next-input-character;
1028    
1029     !!!emit ($self->{ct}); # start tag or end tag
1030    
1031     redo A;
1032     } elsif (0x0041 <= $self->{nc} and
1033     $self->{nc} <= 0x005A) { # A..Z
1034     !!!cp (63);
1035 wakaba 1.4 $self->{ca}->{name}
1036     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1037 wakaba 1.1 ## Stay in the state
1038     !!!next-input-character;
1039     redo A;
1040     } elsif ($self->{nc} == 0x002F) { # /
1041 wakaba 1.11 if ($self->{is_xml}) {
1042     !!!cp (64);
1043     ## XML5: Not a parse error.
1044     !!!parse-error (type => 'no attr value'); ## TODO: type
1045     } else {
1046     !!!cp (64.1);
1047     }
1048    
1049 wakaba 1.1 $before_leave->();
1050     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1051     !!!next-input-character;
1052     redo A;
1053     } elsif ($self->{nc} == -1) {
1054     !!!parse-error (type => 'unclosed tag');
1055     $before_leave->();
1056     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1057     !!!cp (66);
1058     $self->{last_stag_name} = $self->{ct}->{tag_name};
1059     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1060     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1061     if ($self->{ct}->{attributes}) {
1062     !!!cp (67);
1063     !!!parse-error (type => 'end tag attribute');
1064     } else {
1065     ## NOTE: This state should never be reached.
1066     !!!cp (68);
1067     }
1068     } else {
1069     die "$0: $self->{ct}->{type}: Unknown token type";
1070     }
1071     $self->{state} = DATA_STATE;
1072 wakaba 1.5 $self->{s_kwd} = '';
1073 wakaba 1.1 # reconsume
1074    
1075 wakaba 1.33 ## Discard the token.
1076     #!!!emit ($self->{ct}); # start tag or end tag
1077 wakaba 1.1
1078     redo A;
1079     } else {
1080 wakaba 1.30 if ({
1081     0x0022 => 1, # "
1082     0x0027 => 1, # '
1083     0x003C => 1, # <
1084     }->{$self->{nc}}) {
1085 wakaba 1.1 !!!cp (69);
1086 wakaba 1.11 ## XML5: Not a parse error.
1087 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1088     } else {
1089     !!!cp (70);
1090     }
1091     $self->{ca}->{name} .= chr ($self->{nc});
1092     ## Stay in the state
1093     !!!next-input-character;
1094     redo A;
1095     }
1096     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1097 wakaba 1.11 ## XML5: "Tag attribute name after state".
1098    
1099 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1100     !!!cp (71);
1101     ## Stay in the state
1102     !!!next-input-character;
1103     redo A;
1104     } elsif ($self->{nc} == 0x003D) { # =
1105     !!!cp (72);
1106     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1107     !!!next-input-character;
1108     redo A;
1109     } elsif ($self->{nc} == 0x003E) { # >
1110 wakaba 1.11 if ($self->{is_xml}) {
1111     !!!cp (72.1);
1112     ## XML5: Not a parse error.
1113     !!!parse-error (type => 'no attr value'); ## TODO: type
1114     } else {
1115     !!!cp (72.2);
1116     }
1117    
1118 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1119     !!!cp (73);
1120     $self->{last_stag_name} = $self->{ct}->{tag_name};
1121     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1122     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1123     if ($self->{ct}->{attributes}) {
1124     !!!cp (74);
1125     !!!parse-error (type => 'end tag attribute');
1126     } else {
1127     ## NOTE: This state should never be reached.
1128     !!!cp (75);
1129     }
1130     } else {
1131     die "$0: $self->{ct}->{type}: Unknown token type";
1132     }
1133     $self->{state} = DATA_STATE;
1134 wakaba 1.5 $self->{s_kwd} = '';
1135 wakaba 1.1 !!!next-input-character;
1136    
1137     !!!emit ($self->{ct}); # start tag or end tag
1138    
1139     redo A;
1140     } elsif (0x0041 <= $self->{nc} and
1141     $self->{nc} <= 0x005A) { # A..Z
1142     !!!cp (76);
1143     $self->{ca}
1144 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1145 wakaba 1.1 value => '',
1146     line => $self->{line}, column => $self->{column}};
1147     $self->{state} = ATTRIBUTE_NAME_STATE;
1148     !!!next-input-character;
1149     redo A;
1150     } elsif ($self->{nc} == 0x002F) { # /
1151 wakaba 1.11 if ($self->{is_xml}) {
1152     !!!cp (77);
1153     ## XML5: Not a parse error.
1154     !!!parse-error (type => 'no attr value'); ## TODO: type
1155     } else {
1156     !!!cp (77.1);
1157     }
1158    
1159 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1160     !!!next-input-character;
1161     redo A;
1162     } elsif ($self->{nc} == -1) {
1163     !!!parse-error (type => 'unclosed tag');
1164     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1165     !!!cp (79);
1166     $self->{last_stag_name} = $self->{ct}->{tag_name};
1167     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1168     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1169     if ($self->{ct}->{attributes}) {
1170     !!!cp (80);
1171     !!!parse-error (type => 'end tag attribute');
1172     } else {
1173     ## NOTE: This state should never be reached.
1174     !!!cp (81);
1175     }
1176     } else {
1177     die "$0: $self->{ct}->{type}: Unknown token type";
1178     }
1179 wakaba 1.5 $self->{s_kwd} = '';
1180 wakaba 1.1 $self->{state} = DATA_STATE;
1181     # reconsume
1182    
1183 wakaba 1.33 ## Discard the token.
1184     #!!!emit ($self->{ct}); # start tag or end tag
1185 wakaba 1.1
1186     redo A;
1187     } else {
1188 wakaba 1.11 if ($self->{is_xml}) {
1189     !!!cp (78.1);
1190     ## XML5: Not a parse error.
1191     !!!parse-error (type => 'no attr value'); ## TODO: type
1192     } else {
1193     !!!cp (78.2);
1194     }
1195    
1196 wakaba 1.30 if ({
1197     0x0022 => 1, # "
1198     0x0027 => 1, # '
1199     0x003C => 1, # <
1200     }->{$self->{nc}}) {
1201 wakaba 1.1 !!!cp (78);
1202 wakaba 1.11 ## XML5: Not a parse error.
1203 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1204     } else {
1205     !!!cp (82);
1206     }
1207     $self->{ca}
1208     = {name => chr ($self->{nc}),
1209     value => '',
1210     line => $self->{line}, column => $self->{column}};
1211     $self->{state} = ATTRIBUTE_NAME_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     }
1215     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1216 wakaba 1.11 ## XML5: "Tag attribute value before state".
1217    
1218 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1219     !!!cp (83);
1220     ## Stay in the state
1221     !!!next-input-character;
1222     redo A;
1223     } elsif ($self->{nc} == 0x0022) { # "
1224     !!!cp (84);
1225     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1226     !!!next-input-character;
1227     redo A;
1228     } elsif ($self->{nc} == 0x0026) { # &
1229     !!!cp (85);
1230     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1231     ## reconsume
1232     redo A;
1233     } elsif ($self->{nc} == 0x0027) { # '
1234     !!!cp (86);
1235     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1236     !!!next-input-character;
1237     redo A;
1238     } elsif ($self->{nc} == 0x003E) { # >
1239     !!!parse-error (type => 'empty unquoted attribute value');
1240     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1241     !!!cp (87);
1242     $self->{last_stag_name} = $self->{ct}->{tag_name};
1243     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1244     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1245     if ($self->{ct}->{attributes}) {
1246     !!!cp (88);
1247     !!!parse-error (type => 'end tag attribute');
1248     } else {
1249     ## NOTE: This state should never be reached.
1250     !!!cp (89);
1251     }
1252     } else {
1253     die "$0: $self->{ct}->{type}: Unknown token type";
1254     }
1255     $self->{state} = DATA_STATE;
1256 wakaba 1.5 $self->{s_kwd} = '';
1257 wakaba 1.1 !!!next-input-character;
1258    
1259     !!!emit ($self->{ct}); # start tag or end tag
1260    
1261     redo A;
1262     } elsif ($self->{nc} == -1) {
1263     !!!parse-error (type => 'unclosed tag');
1264     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1265     !!!cp (90);
1266     $self->{last_stag_name} = $self->{ct}->{tag_name};
1267     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1268     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1269     if ($self->{ct}->{attributes}) {
1270     !!!cp (91);
1271     !!!parse-error (type => 'end tag attribute');
1272     } else {
1273     ## NOTE: This state should never be reached.
1274     !!!cp (92);
1275     }
1276     } else {
1277     die "$0: $self->{ct}->{type}: Unknown token type";
1278     }
1279     $self->{state} = DATA_STATE;
1280 wakaba 1.5 $self->{s_kwd} = '';
1281 wakaba 1.1 ## reconsume
1282    
1283 wakaba 1.33 ## Discard the token.
1284     #!!!emit ($self->{ct}); # start tag or end tag
1285 wakaba 1.1
1286     redo A;
1287     } else {
1288 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1289 wakaba 1.1 !!!cp (93);
1290 wakaba 1.11 ## XML5: Not a parse error.
1291 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1292 wakaba 1.11 } elsif ($self->{is_xml}) {
1293     !!!cp (93.1);
1294     ## XML5: No parse error.
1295     !!!parse-error (type => 'unquoted attr value'); ## TODO
1296 wakaba 1.1 } else {
1297     !!!cp (94);
1298     }
1299     $self->{ca}->{value} .= chr ($self->{nc});
1300     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1301     !!!next-input-character;
1302     redo A;
1303     }
1304     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1305 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1306     ## ATTLIST attribute value double quoted state".
1307 wakaba 1.11
1308 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1309 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1310     !!!cp (95.1);
1311     ## XML5: "DOCTYPE ATTLIST name after state".
1312     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1313     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1314     } else {
1315     !!!cp (95);
1316     ## XML5: "Tag attribute name before state".
1317     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1318     }
1319 wakaba 1.1 !!!next-input-character;
1320     redo A;
1321     } elsif ($self->{nc} == 0x0026) { # &
1322     !!!cp (96);
1323 wakaba 1.11 ## XML5: Not defined yet.
1324    
1325 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1326     ## "entity in attribute value state". In this implementation, the
1327     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1328     ## implementation of the "consume a character reference" algorithm.
1329     $self->{prev_state} = $self->{state};
1330     $self->{entity_add} = 0x0022; # "
1331     $self->{state} = ENTITY_STATE;
1332     !!!next-input-character;
1333     redo A;
1334 wakaba 1.25 } elsif ($self->{is_xml} and
1335     $is_space->{$self->{nc}}) {
1336     !!!cp (97.1);
1337     $self->{ca}->{value} .= ' ';
1338     ## Stay in the state.
1339     !!!next-input-character;
1340     redo A;
1341 wakaba 1.1 } elsif ($self->{nc} == -1) {
1342     !!!parse-error (type => 'unclosed attribute value');
1343     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1344     !!!cp (97);
1345     $self->{last_stag_name} = $self->{ct}->{tag_name};
1346 wakaba 1.15
1347     $self->{state} = DATA_STATE;
1348     $self->{s_kwd} = '';
1349     ## reconsume
## EOF inside a double-quoted attribute value of a start tag: the
## 'unclosed attribute value' parse error has already been reported
## above.  Discard the unfinished start tag instead of emitting it, as
## the single-quoted and unquoted attribute value states do at EOF; the
## EOF itself is reconsumed in the data state.
1350     #!!!emit ($self->{ct}); # start tag
1351     redo A;
1352 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1353     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1354     if ($self->{ct}->{attributes}) {
1355     !!!cp (98);
1356     !!!parse-error (type => 'end tag attribute');
1357     } else {
1358     ## NOTE: This state should never be reached.
1359     !!!cp (99);
1360     }
1361 wakaba 1.15
1362     $self->{state} = DATA_STATE;
1363     $self->{s_kwd} = '';
1364     ## reconsume
1365 wakaba 1.33
1366     ## Discard the token.
1367     #!!!emit ($self->{ct}); # end tag
1368    
1369 wakaba 1.15 redo A;
1370     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1371     ## XML5: No parse error above; not defined yet.
1372     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1373     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1374     ## Reconsume.
1375 wakaba 1.33
1376     ## Discard the token.
1377     #!!!emit ($self->{ct}); # ATTLIST
1378    
1379 wakaba 1.15 redo A;
1380 wakaba 1.1 } else {
1381     die "$0: $self->{ct}->{type}: Unknown token type";
1382     }
1383     } else {
1384 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1385 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1386     !!!cp (100);
1387     ## XML5: Not a parse error.
1388     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1389     } else {
1390     !!!cp (100.1);
1391     }
1392 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1393     $self->{read_until}->($self->{ca}->{value},
1394 wakaba 1.25 qq["&<\x09\x0C\x20],
1395 wakaba 1.1 length $self->{ca}->{value});
1396    
1397     ## Stay in the state
1398     !!!next-input-character;
1399     redo A;
1400     }
1401     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1402 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1403     ## ATTLIST attribute value single quoted state".
1404 wakaba 1.11
1405 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1406 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1407     !!!cp (101.1);
1408     ## XML5: "DOCTYPE ATTLIST name after state".
1409     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1410     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1411     } else {
1412     !!!cp (101);
1413     ## XML5: "Before attribute name state" (sic).
1414     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1415     }
1416 wakaba 1.1 !!!next-input-character;
1417     redo A;
1418     } elsif ($self->{nc} == 0x0026) { # &
1419     !!!cp (102);
1420 wakaba 1.11 ## XML5: Not defined yet.
1421    
1422 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1423     ## "entity in attribute value state". In this implementation, the
1424     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1425     ## implementation of the "consume a character reference" algorithm.
1426     $self->{entity_add} = 0x0027; # '
1427     $self->{prev_state} = $self->{state};
1428     $self->{state} = ENTITY_STATE;
1429     !!!next-input-character;
1430     redo A;
1431 wakaba 1.25 } elsif ($self->{is_xml} and
1432     $is_space->{$self->{nc}}) {
1433     !!!cp (103.1);
1434     $self->{ca}->{value} .= ' ';
1435     ## Stay in the state.
1436     !!!next-input-character;
1437     redo A;
1438 wakaba 1.1 } elsif ($self->{nc} == -1) {
1439     !!!parse-error (type => 'unclosed attribute value');
1440     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1441     !!!cp (103);
1442     $self->{last_stag_name} = $self->{ct}->{tag_name};
1443 wakaba 1.15
1444     $self->{state} = DATA_STATE;
1445     $self->{s_kwd} = '';
1446     ## reconsume
1447 wakaba 1.33
1448     ## Discard the token.
1449     #!!!emit ($self->{ct}); # start tag
1450    
1451 wakaba 1.15 redo A;
1452 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1453     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1454     if ($self->{ct}->{attributes}) {
1455     !!!cp (104);
1456     !!!parse-error (type => 'end tag attribute');
1457     } else {
1458     ## NOTE: This state should never be reached.
1459     !!!cp (105);
1460     }
1461 wakaba 1.15
1462     $self->{state} = DATA_STATE;
1463     $self->{s_kwd} = '';
1464     ## reconsume
1465 wakaba 1.33
1466     ## Discard the token.
1467     #!!!emit ($self->{ct}); # end tag
1468    
1469 wakaba 1.15 redo A;
1470     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1471     ## XML5: No parse error above; not defined yet.
1472     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1473     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1474     ## Reconsume.
1475 wakaba 1.33
1476     ## Discard the token.
1477     #!!!emit ($self->{ct}); # ATTLIST
1478    
1479 wakaba 1.15 redo A;
1480 wakaba 1.1 } else {
1481     die "$0: $self->{ct}->{type}: Unknown token type";
1482     }
1483     } else {
1484 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1485 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1486     !!!cp (106);
1487     ## XML5: Not a parse error.
1488     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1489     } else {
1490     !!!cp (106.1);
1491     }
1492 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1493     $self->{read_until}->($self->{ca}->{value},
1494 wakaba 1.25 qq['&<\x09\x0C\x20],
1495 wakaba 1.1 length $self->{ca}->{value});
1496    
1497     ## Stay in the state
1498     !!!next-input-character;
1499     redo A;
1500     }
1501     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1502 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1503    
1504 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1505 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1506     !!!cp (107.1);
1507     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1508     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1509     } else {
1510     !!!cp (107);
1511     ## XML5: "Tag attribute name before state".
1512     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1513     }
1514 wakaba 1.1 !!!next-input-character;
1515     redo A;
1516     } elsif ($self->{nc} == 0x0026) { # &
1517     !!!cp (108);
1518 wakaba 1.11
1519     ## XML5: Not defined yet.
1520    
1521 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1522     ## "entity in attribute value state". In this implementation, the
1523     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1524     ## implementation of the "consume a character reference" algorithm.
1525     $self->{entity_add} = -1;
1526     $self->{prev_state} = $self->{state};
1527     $self->{state} = ENTITY_STATE;
1528     !!!next-input-character;
1529     redo A;
1530     } elsif ($self->{nc} == 0x003E) { # >
1531     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1532     !!!cp (109);
1533     $self->{last_stag_name} = $self->{ct}->{tag_name};
1534 wakaba 1.15
1535     $self->{state} = DATA_STATE;
1536     $self->{s_kwd} = '';
1537     !!!next-input-character;
1538     !!!emit ($self->{ct}); # start tag
1539     redo A;
1540 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1541     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1542     if ($self->{ct}->{attributes}) {
1543     !!!cp (110);
1544     !!!parse-error (type => 'end tag attribute');
1545     } else {
1546     ## NOTE: This state should never be reached.
1547     !!!cp (111);
1548     }
1549 wakaba 1.15
1550     $self->{state} = DATA_STATE;
1551     $self->{s_kwd} = '';
1552     !!!next-input-character;
1553     !!!emit ($self->{ct}); # end tag
1554     redo A;
1555     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1556     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1557     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1558     !!!next-input-character;
1559     !!!emit ($self->{ct}); # ATTLIST
1560     redo A;
1561 wakaba 1.1 } else {
1562     die "$0: $self->{ct}->{type}: Unknown token type";
1563     }
1564     } elsif ($self->{nc} == -1) {
1565     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1566     !!!cp (112);
1567 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1568 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1569 wakaba 1.15
1570     $self->{state} = DATA_STATE;
1571     $self->{s_kwd} = '';
1572     ## reconsume
1573 wakaba 1.33
1574     ## Discard the token.
1575     #!!!emit ($self->{ct}); # start tag
1576    
1577 wakaba 1.15 redo A;
1578 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1580 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1581     if ($self->{ct}->{attributes}) {
1582     !!!cp (113);
1583     !!!parse-error (type => 'end tag attribute');
1584     } else {
1585     ## NOTE: This state should never be reached.
1586     !!!cp (114);
1587     }
1588 wakaba 1.15
1589     $self->{state} = DATA_STATE;
1590     $self->{s_kwd} = '';
1591     ## reconsume
1592 wakaba 1.33
1593     ## Discard the token.
1594     #!!!emit ($self->{ct}); # end tag
1595    
1596 wakaba 1.15 redo A;
1597     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1598     !!!parse-error (type => 'unclosed md'); ## TODO: type
1599     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1600     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1601     ## Reconsume.
1602 wakaba 1.33
1603     ## Discard the token.
1604     #!!!emit ($self->{ct}); # ATTLIST
1605    
1606 wakaba 1.15 redo A;
1607 wakaba 1.1 } else {
1608     die "$0: $self->{ct}->{type}: Unknown token type";
1609     }
1610     } else {
## Default arm of the unquoted attribute value state: any character other
## than white space, "&", ">" and EOF is appended to the attribute value.
## The characters ", ', = and < are appended too, but first raise a
## 'bad attribute value' parse error.
1611     if ({
1612     0x0022 => 1, # "
1613     0x0027 => 1, # '
1614     0x003D => 1, # =
1615 wakaba 1.26 0x003C => 1, # <
1616 wakaba 1.1 }->{$self->{nc}}) {
1617     !!!cp (115);
1618 wakaba 1.11 ## XML5: Not a parse error.
1619 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1620     } else {
1621     !!!cp (116);
1622     }
1623     $self->{ca}->{value} .= chr ($self->{nc});
## Bulk-read the following run of ordinary characters into the value.
## NOTE(review): read_until is defined outside this view; it presumably
## stops before any character listed in its second argument -- confirm.
## The stop-set must list every character that needs per-character
## handling in this state, so "<" is included here: it has to come back
## through the check above to raise 'bad attribute value', rather than
## being swallowed silently in the middle of a run.
1624     $self->{read_until}->($self->{ca}->{value},
1625 wakaba 1.25 qq["'=<& \x09\x0C>],
1626 wakaba 1.1 length $self->{ca}->{value});
1627    
1628     ## Stay in the state
1629     !!!next-input-character;
1630     redo A;
1631     }
1632     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1633     if ($is_space->{$self->{nc}}) {
1634     !!!cp (118);
1635     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1636     !!!next-input-character;
1637     redo A;
1638     } elsif ($self->{nc} == 0x003E) { # >
1639     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1640     !!!cp (119);
1641     $self->{last_stag_name} = $self->{ct}->{tag_name};
1642     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1643     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1644     if ($self->{ct}->{attributes}) {
1645     !!!cp (120);
1646     !!!parse-error (type => 'end tag attribute');
1647     } else {
1648     ## NOTE: This state should never be reached.
1649     !!!cp (121);
1650     }
1651     } else {
1652     die "$0: $self->{ct}->{type}: Unknown token type";
1653     }
1654     $self->{state} = DATA_STATE;
1655 wakaba 1.5 $self->{s_kwd} = '';
1656 wakaba 1.1 !!!next-input-character;
1657    
1658     !!!emit ($self->{ct}); # start tag or end tag
1659    
1660     redo A;
1661     } elsif ($self->{nc} == 0x002F) { # /
1662     !!!cp (122);
1663     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1664     !!!next-input-character;
1665     redo A;
1666     } elsif ($self->{nc} == -1) {
1667     !!!parse-error (type => 'unclosed tag');
1668     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1669     !!!cp (122.3);
1670     $self->{last_stag_name} = $self->{ct}->{tag_name};
1671     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1672     if ($self->{ct}->{attributes}) {
1673     !!!cp (122.1);
1674     !!!parse-error (type => 'end tag attribute');
1675     } else {
1676     ## NOTE: This state should never be reached.
1677     !!!cp (122.2);
1678     }
1679     } else {
1680     die "$0: $self->{ct}->{type}: Unknown token type";
1681     }
1682     $self->{state} = DATA_STATE;
1683 wakaba 1.5 $self->{s_kwd} = '';
1684 wakaba 1.1 ## Reconsume.
1685 wakaba 1.33
1686     ## Discard the token.
1687     #!!!emit ($self->{ct}); # start tag or end tag
1688    
1689 wakaba 1.1 redo A;
1690     } else {
1691     !!!cp ('124.1');
1692     !!!parse-error (type => 'no space between attributes');
1693     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1694     ## reconsume
1695     redo A;
1696     }
1697     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1698 wakaba 1.11 ## XML5: "Empty tag state".
1699    
1700 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1701     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1702     !!!cp ('124.2');
1703     !!!parse-error (type => 'nestc', token => $self->{ct});
1704     ## TODO: Different type than slash in start tag
1705     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1706     if ($self->{ct}->{attributes}) {
1707     !!!cp ('124.4');
1708     !!!parse-error (type => 'end tag attribute');
1709     } else {
1710     !!!cp ('124.5');
1711     }
1712     ## TODO: Test |<title></title/>|
1713     } else {
1714     !!!cp ('124.3');
1715     $self->{self_closing} = 1;
1716     }
1717    
1718     $self->{state} = DATA_STATE;
1719 wakaba 1.5 $self->{s_kwd} = '';
1720 wakaba 1.1 !!!next-input-character;
1721    
1722     !!!emit ($self->{ct}); # start tag or end tag
1723    
1724     redo A;
1725     } elsif ($self->{nc} == -1) {
1726     !!!parse-error (type => 'unclosed tag');
1727     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1728     !!!cp (124.7);
1729     $self->{last_stag_name} = $self->{ct}->{tag_name};
1730     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1731     if ($self->{ct}->{attributes}) {
1732     !!!cp (124.5);
1733     !!!parse-error (type => 'end tag attribute');
1734     } else {
1735     ## NOTE: This state should never be reached.
1736     !!!cp (124.6);
1737     }
1738     } else {
1739     die "$0: $self->{ct}->{type}: Unknown token type";
1740     }
1741 wakaba 1.11 ## XML5: "Tag attribute name before state".
1742 wakaba 1.1 $self->{state} = DATA_STATE;
1743 wakaba 1.5 $self->{s_kwd} = '';
1744 wakaba 1.1 ## Reconsume.
1745 wakaba 1.33
1746     ## Discard the token.
1747     #!!!emit ($self->{ct}); # start tag or end tag
1748    
1749 wakaba 1.1 redo A;
1750     } else {
1751     !!!cp ('124.4');
1752     !!!parse-error (type => 'nestc');
1753     ## TODO: This error type is wrong.
1754     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1755     ## Reconsume.
1756     redo A;
1757     }
1758     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1759 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1760    
1761 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1762     ## consumes characters on a one-by-one basis.
1763    
1764     if ($self->{nc} == 0x003E) { # >
1765 wakaba 1.13 if ($self->{in_subset}) {
1766     !!!cp (123);
1767     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1768     } else {
1769     !!!cp (124);
1770     $self->{state} = DATA_STATE;
1771     $self->{s_kwd} = '';
1772     }
1773 wakaba 1.1 !!!next-input-character;
1774    
1775     !!!emit ($self->{ct}); # comment
1776     redo A;
1777     } elsif ($self->{nc} == -1) {
1778 wakaba 1.13 if ($self->{in_subset}) {
1779     !!!cp (125.1);
1780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1781     } else {
1782     !!!cp (125);
1783     $self->{state} = DATA_STATE;
1784     $self->{s_kwd} = '';
1785     }
1786 wakaba 1.1 ## reconsume
1787    
1788     !!!emit ($self->{ct}); # comment
1789     redo A;
1790     } else {
1791     !!!cp (126);
1792     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1793     $self->{read_until}->($self->{ct}->{data},
1794     q[>],
1795     length $self->{ct}->{data});
1796    
1797     ## Stay in the state.
1798     !!!next-input-character;
1799     redo A;
1800     }
1801     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1802 wakaba 1.14 ## XML5: "Markup declaration state".
1803 wakaba 1.1
1804     if ($self->{nc} == 0x002D) { # -
1805     !!!cp (133);
1806     $self->{state} = MD_HYPHEN_STATE;
1807     !!!next-input-character;
1808     redo A;
1809     } elsif ($self->{nc} == 0x0044 or # D
1810     $self->{nc} == 0x0064) { # d
1811     ## ASCII case-insensitive.
1812     !!!cp (130);
1813     $self->{state} = MD_DOCTYPE_STATE;
1814 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1815 wakaba 1.1 !!!next-input-character;
1816     redo A;
1817 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1818     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1819     $self->{is_xml}) and
1820 wakaba 1.1 $self->{nc} == 0x005B) { # [
1821     !!!cp (135.4);
1822     $self->{state} = MD_CDATA_STATE;
1823 wakaba 1.12 $self->{kwd} = '[';
1824 wakaba 1.1 !!!next-input-character;
1825     redo A;
1826     } else {
1827     !!!cp (136);
1828     }
1829    
1830     !!!parse-error (type => 'bogus comment',
1831     line => $self->{line_prev},
1832     column => $self->{column_prev} - 1);
1833     ## Reconsume.
1834     $self->{state} = BOGUS_COMMENT_STATE;
1835     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1836     line => $self->{line_prev},
1837     column => $self->{column_prev} - 1,
1838     };
1839     redo A;
1840     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1841     if ($self->{nc} == 0x002D) { # -
1842     !!!cp (127);
1843     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1844     line => $self->{line_prev},
1845     column => $self->{column_prev} - 2,
1846     };
1847 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1848 wakaba 1.1 !!!next-input-character;
1849     redo A;
1850     } else {
1851     !!!cp (128);
1852     !!!parse-error (type => 'bogus comment',
1853     line => $self->{line_prev},
1854     column => $self->{column_prev} - 2);
1855     $self->{state} = BOGUS_COMMENT_STATE;
1856     ## Reconsume.
1857     $self->{ct} = {type => COMMENT_TOKEN,
1858     data => '-',
1859     line => $self->{line_prev},
1860     column => $self->{column_prev} - 2,
1861     };
1862     redo A;
1863     }
1864     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1865     ## ASCII case-insensitive.
1866     if ($self->{nc} == [
1867     undef,
1868     0x004F, # O
1869     0x0043, # C
1870     0x0054, # T
1871     0x0059, # Y
1872     0x0050, # P
1873 wakaba 1.12 ]->[length $self->{kwd}] or
1874 wakaba 1.1 $self->{nc} == [
1875     undef,
1876     0x006F, # o
1877     0x0063, # c
1878     0x0074, # t
1879     0x0079, # y
1880     0x0070, # p
1881 wakaba 1.12 ]->[length $self->{kwd}]) {
1882 wakaba 1.1 !!!cp (131);
1883     ## Stay in the state.
1884 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1885 wakaba 1.1 !!!next-input-character;
1886     redo A;
1887 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1888 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1889     $self->{nc} == 0x0065)) { # e
1890 wakaba 1.12 if ($self->{is_xml} and
1891     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1892 wakaba 1.10 !!!cp (129);
1893     ## XML5: case-sensitive.
1894     !!!parse-error (type => 'lowercase keyword', ## TODO
1895     text => 'DOCTYPE',
1896     line => $self->{line_prev},
1897     column => $self->{column_prev} - 5);
1898     } else {
1899     !!!cp (129.1);
1900     }
1901 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1902     $self->{ct} = {type => DOCTYPE_TOKEN,
1903     quirks => 1,
1904     line => $self->{line_prev},
1905     column => $self->{column_prev} - 7,
1906     };
1907     !!!next-input-character;
1908     redo A;
1909     } else {
1910     !!!cp (132);
1911     !!!parse-error (type => 'bogus comment',
1912     line => $self->{line_prev},
1913 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1914 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1915     ## Reconsume.
1916     $self->{ct} = {type => COMMENT_TOKEN,
1917 wakaba 1.12 data => $self->{kwd},
1918 wakaba 1.1 line => $self->{line_prev},
1919 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1920 wakaba 1.1 };
1921     redo A;
1922     }
1923     } elsif ($self->{state} == MD_CDATA_STATE) {
1924     if ($self->{nc} == {
1925     '[' => 0x0043, # C
1926     '[C' => 0x0044, # D
1927     '[CD' => 0x0041, # A
1928     '[CDA' => 0x0054, # T
1929     '[CDAT' => 0x0041, # A
1930 wakaba 1.12 }->{$self->{kwd}}) {
1931 wakaba 1.1 !!!cp (135.1);
1932     ## Stay in the state.
1933 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1934 wakaba 1.1 !!!next-input-character;
1935     redo A;
1936 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1937 wakaba 1.1 $self->{nc} == 0x005B) { # [
1938 wakaba 1.6 if ($self->{is_xml} and
1939     not $self->{tainted} and
1940     @{$self->{open_elements} or []} == 0) {
1941 wakaba 1.8 !!!cp (135.2);
1942 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1943     line => $self->{line_prev},
1944     column => $self->{column_prev} - 7);
1945     $self->{tainted} = 1;
1946 wakaba 1.8 } else {
1947     !!!cp (135.21);
1948 wakaba 1.6 }
1949    
1950 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1951     data => '',
1952     line => $self->{line_prev},
1953     column => $self->{column_prev} - 7};
1954     $self->{state} = CDATA_SECTION_STATE;
1955     !!!next-input-character;
1956     redo A;
1957     } else {
1958     !!!cp (135.3);
1959     !!!parse-error (type => 'bogus comment',
1960     line => $self->{line_prev},
1961 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1962 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1963     ## Reconsume.
1964     $self->{ct} = {type => COMMENT_TOKEN,
1965 wakaba 1.12 data => $self->{kwd},
1966 wakaba 1.1 line => $self->{line_prev},
1967 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1968 wakaba 1.1 };
1969     redo A;
1970     }
1971     } elsif ($self->{state} == COMMENT_START_STATE) {
1972     if ($self->{nc} == 0x002D) { # -
1973     !!!cp (137);
1974     $self->{state} = COMMENT_START_DASH_STATE;
1975     !!!next-input-character;
1976     redo A;
1977     } elsif ($self->{nc} == 0x003E) { # >
1978     !!!parse-error (type => 'bogus comment');
1979 wakaba 1.13 if ($self->{in_subset}) {
1980     !!!cp (138.1);
1981     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1982     } else {
1983     !!!cp (138);
1984     $self->{state} = DATA_STATE;
1985     $self->{s_kwd} = '';
1986     }
1987 wakaba 1.1 !!!next-input-character;
1988    
1989     !!!emit ($self->{ct}); # comment
1990    
1991     redo A;
1992     } elsif ($self->{nc} == -1) {
1993     !!!parse-error (type => 'unclosed comment');
1994 wakaba 1.13 if ($self->{in_subset}) {
1995     !!!cp (139.1);
1996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997     } else {
1998     !!!cp (139);
1999     $self->{state} = DATA_STATE;
2000     $self->{s_kwd} = '';
2001     }
2002 wakaba 1.1 ## reconsume
2003    
2004     !!!emit ($self->{ct}); # comment
2005    
2006     redo A;
2007     } else {
2008     !!!cp (140);
2009     $self->{ct}->{data} # comment
2010     .= chr ($self->{nc});
2011     $self->{state} = COMMENT_STATE;
2012     !!!next-input-character;
2013     redo A;
2014     }
2015     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2016     if ($self->{nc} == 0x002D) { # -
2017     !!!cp (141);
2018     $self->{state} = COMMENT_END_STATE;
2019     !!!next-input-character;
2020     redo A;
2021     } elsif ($self->{nc} == 0x003E) { # >
2022     !!!parse-error (type => 'bogus comment');
2023 wakaba 1.13 if ($self->{in_subset}) {
2024     !!!cp (142.1);
2025     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2026     } else {
2027     !!!cp (142);
2028     $self->{state} = DATA_STATE;
2029     $self->{s_kwd} = '';
2030     }
2031 wakaba 1.1 !!!next-input-character;
2032    
2033     !!!emit ($self->{ct}); # comment
2034    
2035     redo A;
2036     } elsif ($self->{nc} == -1) {
2037     !!!parse-error (type => 'unclosed comment');
2038 wakaba 1.13 if ($self->{in_subset}) {
2039     !!!cp (143.1);
2040     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2041     } else {
2042     !!!cp (143);
2043     $self->{state} = DATA_STATE;
2044     $self->{s_kwd} = '';
2045     }
2046 wakaba 1.1 ## reconsume
2047    
2048     !!!emit ($self->{ct}); # comment
2049    
2050     redo A;
2051     } else {
2052     !!!cp (144);
2053     $self->{ct}->{data} # comment
2054     .= '-' . chr ($self->{nc});
2055     $self->{state} = COMMENT_STATE;
2056     !!!next-input-character;
2057     redo A;
2058     }
2059     } elsif ($self->{state} == COMMENT_STATE) {
2060 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2061    
2062 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2063     !!!cp (145);
2064     $self->{state} = COMMENT_END_DASH_STATE;
2065     !!!next-input-character;
2066     redo A;
2067     } elsif ($self->{nc} == -1) {
2068     !!!parse-error (type => 'unclosed comment');
2069 wakaba 1.13 if ($self->{in_subset}) {
2070     !!!cp (146.1);
2071     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2072     } else {
2073     !!!cp (146);
2074     $self->{state} = DATA_STATE;
2075     $self->{s_kwd} = '';
2076     }
2077 wakaba 1.1 ## reconsume
2078    
2079     !!!emit ($self->{ct}); # comment
2080    
2081     redo A;
2082     } else {
2083     !!!cp (147);
2084     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2085     $self->{read_until}->($self->{ct}->{data},
2086     q[-],
2087     length $self->{ct}->{data});
2088    
2089     ## Stay in the state
2090     !!!next-input-character;
2091     redo A;
2092     }
2093     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2094 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2095 wakaba 1.10
2096 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2097     !!!cp (148);
2098     $self->{state} = COMMENT_END_STATE;
2099     !!!next-input-character;
2100     redo A;
2101     } elsif ($self->{nc} == -1) {
2102     !!!parse-error (type => 'unclosed comment');
2103 wakaba 1.13 if ($self->{in_subset}) {
2104     !!!cp (149.1);
2105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2106     } else {
2107     !!!cp (149);
2108     $self->{state} = DATA_STATE;
2109     $self->{s_kwd} = '';
2110     }
2111 wakaba 1.1 ## reconsume
2112    
2113     !!!emit ($self->{ct}); # comment
2114    
2115     redo A;
2116     } else {
2117     !!!cp (150);
2118     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2119     $self->{state} = COMMENT_STATE;
2120     !!!next-input-character;
2121     redo A;
2122     }
2123 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2124     $self->{state} == COMMENT_END_BANG_STATE) {
2125 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2126 wakaba 1.31 ## (No comment end bang state.)
2127 wakaba 1.14
2128 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2129 wakaba 1.13 if ($self->{in_subset}) {
2130     !!!cp (151.1);
2131     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2132     } else {
2133     !!!cp (151);
2134     $self->{state} = DATA_STATE;
2135     $self->{s_kwd} = '';
2136     }
2137 wakaba 1.1 !!!next-input-character;
2138    
2139     !!!emit ($self->{ct}); # comment
2140    
2141     redo A;
2142     } elsif ($self->{nc} == 0x002D) { # -
2143 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2144     !!!cp (154.3);
2145     $self->{ct}->{data} .= '--!'; # comment
2146     $self->{state} = COMMENT_END_DASH_STATE;
2147     } else {
2148     !!!cp (152);
2149     ## XML5: Not a parse error.
2150     !!!parse-error (type => 'dash in comment',
2151     line => $self->{line_prev},
2152     column => $self->{column_prev});
2153     $self->{ct}->{data} .= '-'; # comment
2154     ## Stay in the state
2155     }
2156     !!!next-input-character;
2157     redo A;
2158 wakaba 1.32 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2159     $is_space->{$self->{nc}}) {
2160     !!!cp (152.1);
2161     !!!parse-error (type => 'comment end space'); # XXX error type
2162     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2163     $self->{state} = COMMENT_END_SPACE_STATE;
2164     !!!next-input-character;
2165     redo A;
2166     } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2167     $self->{nc} == 0x0021) { # !
2168     !!!cp (152.2);
2169 wakaba 1.31 !!!parse-error (type => 'comment end bang'); # XXX error type
2170     $self->{state} = COMMENT_END_BANG_STATE;
2171 wakaba 1.1 !!!next-input-character;
2172     redo A;
2173     } elsif ($self->{nc} == -1) {
2174     !!!parse-error (type => 'unclosed comment');
2175 wakaba 1.13 if ($self->{in_subset}) {
2176     !!!cp (153.1);
2177     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2178     } else {
2179     !!!cp (153);
2180     $self->{state} = DATA_STATE;
2181     $self->{s_kwd} = '';
2182     }
2183 wakaba 1.31 ## Reconsume.
2184 wakaba 1.1
2185     !!!emit ($self->{ct}); # comment
2186    
2187     redo A;
2188     } else {
2189     !!!cp (154);
2190 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2191     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2192     } else {
2193     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2194     }
2195 wakaba 1.1 $self->{state} = COMMENT_STATE;
2196     !!!next-input-character;
2197     redo A;
2198     }
2199 wakaba 1.32 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2200     ## XML5: This state does not exist.
2201    
2202     if ($self->{nc} == 0x003E) { # >
2203     if ($self->{in_subset}) {
2204     !!!cp (154.4);
2205     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2206     } else {
2207     !!!cp (154.5);
2208     $self->{state} = DATA_STATE;
2209     $self->{s_kwd} = '';
2210     }
2211     !!!next-input-character;
2212    
2213     !!!emit ($self->{ct}); # comment
2214    
2215     redo A;
2216     } elsif ($is_space->{$self->{nc}}) {
2217     !!!cp (154.6);
2218     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2219     ## Stay in the state.
2220     !!!next-input-character;
2221     redo A;
2222     } elsif ($self->{nc} == -1) {
2223     !!!parse-error (type => 'unclosed comment');
2224     if ($self->{in_subset}) {
2225     !!!cp (154.7);
2226     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2227     } else {
2228     !!!cp (154.8);
2229     $self->{state} = DATA_STATE;
2230     $self->{s_kwd} = '';
2231     }
2232     ## Reconsume.
2233    
2234     !!!emit ($self->{ct}); # comment
2235    
2236     redo A;
2237     } else {
2238     !!!cp (154.9);
2239     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2240     $self->{state} = COMMENT_STATE;
2241     !!!next-input-character;
2242     redo A;
2243     }
2244 wakaba 1.1 } elsif ($self->{state} == DOCTYPE_STATE) {
2245     if ($is_space->{$self->{nc}}) {
2246     !!!cp (155);
2247     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2248     !!!next-input-character;
2249     redo A;
2250 wakaba 1.28 } elsif ($self->{nc} == -1) {
2251     !!!cp (155.1);
2252     !!!parse-error (type => 'unclosed DOCTYPE');
2253     $self->{ct}->{quirks} = 1;
2254    
2255     $self->{state} = DATA_STATE;
2256     ## Reconsume.
2257     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2258    
2259     redo A;
2260 wakaba 1.1 } else {
2261     !!!cp (156);
2262 wakaba 1.28 ## XML5: Switch to the bogus comment state.
2263 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2264     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2265     ## reconsume
2266     redo A;
2267     }
2268     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2269 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2270    
2271 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2272     !!!cp (157);
2273     ## Stay in the state
2274     !!!next-input-character;
2275     redo A;
2276     } elsif ($self->{nc} == 0x003E) { # >
2277     !!!cp (158);
2278 wakaba 1.12 ## XML5: No parse error.
2279 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2280     $self->{state} = DATA_STATE;
2281 wakaba 1.5 $self->{s_kwd} = '';
2282 wakaba 1.1 !!!next-input-character;
2283    
2284     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2285    
2286     redo A;
2287 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2288     !!!cp (158.1);
2289     $self->{ct}->{name} # DOCTYPE
2290     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2291     delete $self->{ct}->{quirks};
2292     $self->{state} = DOCTYPE_NAME_STATE;
2293     !!!next-input-character;
2294     redo A;
2295 wakaba 1.1 } elsif ($self->{nc} == -1) {
2296     !!!cp (159);
2297     !!!parse-error (type => 'no DOCTYPE name');
2298     $self->{state} = DATA_STATE;
2299 wakaba 1.5 $self->{s_kwd} = '';
2300 wakaba 1.1 ## reconsume
2301    
2302     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2303    
2304     redo A;
2305 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2306     !!!cp (159.1);
2307     !!!parse-error (type => 'no DOCTYPE name');
2308     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2309 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2310     $self->{in_subset} = 1;
2311 wakaba 1.12 !!!next-input-character;
2312 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2313 wakaba 1.12 redo A;
2314 wakaba 1.1 } else {
2315     !!!cp (160);
2316     $self->{ct}->{name} = chr $self->{nc};
2317     delete $self->{ct}->{quirks};
2318     $self->{state} = DOCTYPE_NAME_STATE;
2319     !!!next-input-character;
2320     redo A;
2321     }
2322     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2323 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2324    
2325     ## ISSUE: Redundant "First," in the spec.
2326    
2327 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2328     !!!cp (161);
2329     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2330     !!!next-input-character;
2331     redo A;
2332     } elsif ($self->{nc} == 0x003E) { # >
2333     !!!cp (162);
2334     $self->{state} = DATA_STATE;
2335 wakaba 1.5 $self->{s_kwd} = '';
2336 wakaba 1.1 !!!next-input-character;
2337    
2338     !!!emit ($self->{ct}); # DOCTYPE
2339    
2340     redo A;
2341 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2342     !!!cp (162.1);
2343     $self->{ct}->{name} # DOCTYPE
2344     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2345     delete $self->{ct}->{quirks};
2346     ## Stay in the state.
2347     !!!next-input-character;
2348     redo A;
2349 wakaba 1.1 } elsif ($self->{nc} == -1) {
2350     !!!cp (163);
2351     !!!parse-error (type => 'unclosed DOCTYPE');
2352     $self->{state} = DATA_STATE;
2353 wakaba 1.5 $self->{s_kwd} = '';
2354 wakaba 1.1 ## reconsume
2355    
2356     $self->{ct}->{quirks} = 1;
2357     !!!emit ($self->{ct}); # DOCTYPE
2358    
2359     redo A;
2360 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2361     !!!cp (163.1);
2362     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2363 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2364     $self->{in_subset} = 1;
2365 wakaba 1.12 !!!next-input-character;
2366 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2367 wakaba 1.12 redo A;
2368 wakaba 1.1 } else {
2369     !!!cp (164);
2370 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2371     ## Stay in the state.
2372 wakaba 1.1 !!!next-input-character;
2373     redo A;
2374     }
2375     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2376 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2377     ## state", but implemented differently.
2378    
2379 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2380     !!!cp (165);
2381     ## Stay in the state
2382     !!!next-input-character;
2383     redo A;
2384     } elsif ($self->{nc} == 0x003E) { # >
2385 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2386     !!!cp (166);
2387     $self->{state} = DATA_STATE;
2388     $self->{s_kwd} = '';
2389     } else {
2390     !!!cp (166.1);
2391     !!!parse-error (type => 'no md def'); ## TODO: type
2392     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2393     }
2394    
2395 wakaba 1.1 !!!next-input-character;
2396 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2397 wakaba 1.1 redo A;
2398     } elsif ($self->{nc} == -1) {
2399 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2400     !!!cp (167);
2401     !!!parse-error (type => 'unclosed DOCTYPE');
2402     $self->{state} = DATA_STATE;
2403     $self->{s_kwd} = '';
2404     $self->{ct}->{quirks} = 1;
2405     } else {
2406     !!!cp (167.12);
2407     !!!parse-error (type => 'unclosed md'); ## TODO: type
2408     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2409     }
2410    
2411     ## Reconsume.
2412     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2413 wakaba 1.1 redo A;
2414     } elsif ($self->{nc} == 0x0050 or # P
2415     $self->{nc} == 0x0070) { # p
2416 wakaba 1.12 !!!cp (167.1);
2417 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2418 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2419 wakaba 1.1 !!!next-input-character;
2420     redo A;
2421     } elsif ($self->{nc} == 0x0053 or # S
2422     $self->{nc} == 0x0073) { # s
2423 wakaba 1.12 !!!cp (167.2);
2424 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2425 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2426     !!!next-input-character;
2427     redo A;
2428 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2429     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2430     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2431     !!!cp (167.21);
2432     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2433     $self->{ct}->{value} = ''; # ENTITY
2434     !!!next-input-character;
2435     redo A;
2436     } elsif ($self->{nc} == 0x0027 and # '
2437     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2438     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2439     !!!cp (167.22);
2440     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2441     $self->{ct}->{value} = ''; # ENTITY
2442     !!!next-input-character;
2443     redo A;
2444 wakaba 1.16 } elsif ($self->{is_xml} and
2445     $self->{ct}->{type} == DOCTYPE_TOKEN and
2446     $self->{nc} == 0x005B) { # [
2447 wakaba 1.12 !!!cp (167.3);
2448     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2449     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2450 wakaba 1.13 $self->{in_subset} = 1;
2451 wakaba 1.1 !!!next-input-character;
2452 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2453 wakaba 1.1 redo A;
2454     } else {
2455 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2456    
2457     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2458     !!!cp (180);
2459     $self->{ct}->{quirks} = 1;
2460     $self->{state} = BOGUS_DOCTYPE_STATE;
2461     } else {
2462     !!!cp (180.1);
2463     $self->{state} = BOGUS_MD_STATE;
2464     }
2465 wakaba 1.1
2466     !!!next-input-character;
2467     redo A;
2468     }
2469     } elsif ($self->{state} == PUBLIC_STATE) {
2470     ## ASCII case-insensitive
2471     if ($self->{nc} == [
2472     undef,
2473     0x0055, # U
2474     0x0042, # B
2475     0x004C, # L
2476     0x0049, # I
2477 wakaba 1.12 ]->[length $self->{kwd}] or
2478 wakaba 1.1 $self->{nc} == [
2479     undef,
2480     0x0075, # u
2481     0x0062, # b
2482     0x006C, # l
2483     0x0069, # i
2484 wakaba 1.12 ]->[length $self->{kwd}]) {
2485 wakaba 1.1 !!!cp (175);
2486     ## Stay in the state.
2487 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2488 wakaba 1.1 !!!next-input-character;
2489     redo A;
2490 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2491 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2492     $self->{nc} == 0x0063)) { # c
2493 wakaba 1.12 if ($self->{is_xml} and
2494     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2495     !!!cp (168.1);
2496     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2497     text => 'PUBLIC',
2498     line => $self->{line_prev},
2499     column => $self->{column_prev} - 4);
2500     } else {
2501     !!!cp (168);
2502     }
2503 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2504     !!!next-input-character;
2505     redo A;
2506     } else {
2507 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2508 wakaba 1.1 line => $self->{line_prev},
2509 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2510 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2511     !!!cp (169);
2512     $self->{ct}->{quirks} = 1;
2513     $self->{state} = BOGUS_DOCTYPE_STATE;
2514     } else {
2515     !!!cp (169.1);
2516     $self->{state} = BOGUS_MD_STATE;
2517     }
2518 wakaba 1.1 ## Reconsume.
2519     redo A;
2520     }
2521     } elsif ($self->{state} == SYSTEM_STATE) {
2522     ## ASCII case-insensitive
2523     if ($self->{nc} == [
2524     undef,
2525     0x0059, # Y
2526     0x0053, # S
2527     0x0054, # T
2528     0x0045, # E
2529 wakaba 1.12 ]->[length $self->{kwd}] or
2530 wakaba 1.1 $self->{nc} == [
2531     undef,
2532     0x0079, # y
2533     0x0073, # s
2534     0x0074, # t
2535     0x0065, # e
2536 wakaba 1.12 ]->[length $self->{kwd}]) {
2537 wakaba 1.1 !!!cp (170);
2538     ## Stay in the state.
2539 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2540 wakaba 1.1 !!!next-input-character;
2541     redo A;
2542 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2543 wakaba 1.1 ($self->{nc} == 0x004D or # M
2544     $self->{nc} == 0x006D)) { # m
2545 wakaba 1.12 if ($self->{is_xml} and
2546     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2547     !!!cp (171.1);
2548     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2549     text => 'SYSTEM',
2550     line => $self->{line_prev},
2551     column => $self->{column_prev} - 4);
2552     } else {
2553     !!!cp (171);
2554     }
2555 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2556     !!!next-input-character;
2557     redo A;
2558     } else {
2559 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2560 wakaba 1.1 line => $self->{line_prev},
2561 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2562 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563     !!!cp (172);
2564     $self->{ct}->{quirks} = 1;
2565     $self->{state} = BOGUS_DOCTYPE_STATE;
2566     } else {
2567     !!!cp (172.1);
2568     $self->{state} = BOGUS_MD_STATE;
2569     }
2570 wakaba 1.1 ## Reconsume.
2571     redo A;
2572     }
2573     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2574     if ($is_space->{$self->{nc}}) {
2575     !!!cp (181);
2576     ## Stay in the state
2577     !!!next-input-character;
2578     redo A;
2579     } elsif ($self->{nc} eq 0x0022) { # "
2580     !!!cp (182);
2581     $self->{ct}->{pubid} = ''; # DOCTYPE
2582     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2583     !!!next-input-character;
2584     redo A;
2585     } elsif ($self->{nc} eq 0x0027) { # '
2586     !!!cp (183);
2587     $self->{ct}->{pubid} = ''; # DOCTYPE
2588     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2589     !!!next-input-character;
2590     redo A;
2591     } elsif ($self->{nc} eq 0x003E) { # >
2592     !!!parse-error (type => 'no PUBLIC literal');
2593 wakaba 1.16
2594     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2595     !!!cp (184);
2596     $self->{state} = DATA_STATE;
2597     $self->{s_kwd} = '';
2598     $self->{ct}->{quirks} = 1;
2599     } else {
2600     !!!cp (184.1);
2601     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2602     }
2603    
2604 wakaba 1.1 !!!next-input-character;
2605 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2606 wakaba 1.1 redo A;
2607     } elsif ($self->{nc} == -1) {
2608 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2609     !!!cp (185);
2610     !!!parse-error (type => 'unclosed DOCTYPE');
2611     $self->{state} = DATA_STATE;
2612     $self->{s_kwd} = '';
2613     $self->{ct}->{quirks} = 1;
2614     } else {
2615     !!!cp (185.1);
2616     !!!parse-error (type => 'unclosed md'); ## TODO: type
2617     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2618     }
2619    
2620 wakaba 1.1 ## reconsume
2621     !!!emit ($self->{ct}); # DOCTYPE
2622     redo A;
2623 wakaba 1.16 } elsif ($self->{is_xml} and
2624     $self->{ct}->{type} == DOCTYPE_TOKEN and
2625     $self->{nc} == 0x005B) { # [
2626 wakaba 1.12 !!!cp (186.1);
2627     !!!parse-error (type => 'no PUBLIC literal');
2628     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2629     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2630 wakaba 1.13 $self->{in_subset} = 1;
2631 wakaba 1.12 !!!next-input-character;
2632 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2633 wakaba 1.12 redo A;
2634 wakaba 1.1 } else {
2635     !!!parse-error (type => 'string after PUBLIC');
2636    
2637 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2638     !!!cp (186);
2639     $self->{ct}->{quirks} = 1;
2640     $self->{state} = BOGUS_DOCTYPE_STATE;
2641     } else {
2642     !!!cp (186.2);
2643     $self->{state} = BOGUS_MD_STATE;
2644     }
2645    
2646 wakaba 1.1 !!!next-input-character;
2647     redo A;
2648     }
2649     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2650     if ($self->{nc} == 0x0022) { # "
2651     !!!cp (187);
2652     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2653     !!!next-input-character;
2654     redo A;
2655     } elsif ($self->{nc} == 0x003E) { # >
2656     !!!parse-error (type => 'unclosed PUBLIC literal');
2657    
2658 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2659     !!!cp (188);
2660     $self->{state} = DATA_STATE;
2661     $self->{s_kwd} = '';
2662     $self->{ct}->{quirks} = 1;
2663     } else {
2664     !!!cp (188.1);
2665     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2666     }
2667    
2668 wakaba 1.1 !!!next-input-character;
2669 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2670 wakaba 1.1 redo A;
2671     } elsif ($self->{nc} == -1) {
2672     !!!parse-error (type => 'unclosed PUBLIC literal');
2673    
2674 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2675     !!!cp (189);
2676     $self->{state} = DATA_STATE;
2677     $self->{s_kwd} = '';
2678     $self->{ct}->{quirks} = 1;
2679     } else {
2680     !!!cp (189.1);
2681     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2682     }
2683    
2684     ## Reconsume.
2685 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2686     redo A;
2687     } else {
2688     !!!cp (190);
2689 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2690 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2691     length $self->{ct}->{pubid});
2692    
2693     ## Stay in the state
2694     !!!next-input-character;
2695     redo A;
2696     }
2697     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2698     if ($self->{nc} == 0x0027) { # '
2699     !!!cp (191);
2700     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2701     !!!next-input-character;
2702     redo A;
2703     } elsif ($self->{nc} == 0x003E) { # >
2704     !!!parse-error (type => 'unclosed PUBLIC literal');
2705    
2706 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2707     !!!cp (192);
2708     $self->{state} = DATA_STATE;
2709     $self->{s_kwd} = '';
2710     $self->{ct}->{quirks} = 1;
2711     } else {
2712     !!!cp (192.1);
2713     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2714     }
2715    
2716 wakaba 1.1 !!!next-input-character;
2717 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2718 wakaba 1.1 redo A;
2719     } elsif ($self->{nc} == -1) {
2720     !!!parse-error (type => 'unclosed PUBLIC literal');
2721    
2722 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2723     !!!cp (193);
2724     $self->{state} = DATA_STATE;
2725     $self->{s_kwd} = '';
2726     $self->{ct}->{quirks} = 1;
2727     } else {
2728     !!!cp (193.1);
2729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2730     }
2731    
2732 wakaba 1.1 ## reconsume
2733 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2734 wakaba 1.1 redo A;
2735     } else {
2736     !!!cp (194);
2737 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2738 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2739     length $self->{ct}->{pubid});
2740    
2741     ## Stay in the state
2742     !!!next-input-character;
2743     redo A;
2744     }
2745     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2746     if ($is_space->{$self->{nc}}) {
2747     !!!cp (195);
2748     ## Stay in the state
2749     !!!next-input-character;
2750     redo A;
2751     } elsif ($self->{nc} == 0x0022) { # "
2752     !!!cp (196);
2753 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2754 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2755     !!!next-input-character;
2756     redo A;
2757     } elsif ($self->{nc} == 0x0027) { # '
2758     !!!cp (197);
2759 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2760 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2761     !!!next-input-character;
2762     redo A;
2763     } elsif ($self->{nc} == 0x003E) { # >
2764 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2765     if ($self->{is_xml}) {
2766     !!!cp (198.1);
2767     !!!parse-error (type => 'no SYSTEM literal');
2768     } else {
2769     !!!cp (198);
2770     }
2771     $self->{state} = DATA_STATE;
2772     $self->{s_kwd} = '';
2773 wakaba 1.12 } else {
2774 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2775     !!!cp (198.2);
2776     } else {
2777     !!!cp (198.3);
2778     !!!parse-error (type => 'no SYSTEM literal');
2779     }
2780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781 wakaba 1.12 }
2782 wakaba 1.16
2783 wakaba 1.1 !!!next-input-character;
2784 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2785 wakaba 1.1 redo A;
2786     } elsif ($self->{nc} == -1) {
2787 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2788     !!!cp (199);
2789     !!!parse-error (type => 'unclosed DOCTYPE');
2790    
2791     $self->{state} = DATA_STATE;
2792     $self->{s_kwd} = '';
2793     $self->{ct}->{quirks} = 1;
2794     } else {
2795     !!!parse-error (type => 'unclosed md'); ## TODO: type
2796     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2797     }
2798    
2799 wakaba 1.1 ## reconsume
2800 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2801 wakaba 1.1 redo A;
2802 wakaba 1.16 } elsif ($self->{is_xml} and
2803     $self->{ct}->{type} == DOCTYPE_TOKEN and
2804     $self->{nc} == 0x005B) { # [
2805 wakaba 1.12 !!!cp (200.1);
2806     !!!parse-error (type => 'no SYSTEM literal');
2807     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2808     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2809 wakaba 1.13 $self->{in_subset} = 1;
2810 wakaba 1.12 !!!next-input-character;
2811 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2812 wakaba 1.12 redo A;
2813 wakaba 1.1 } else {
2814     !!!parse-error (type => 'string after PUBLIC literal');
2815    
2816 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2817     !!!cp (200);
2818     $self->{ct}->{quirks} = 1;
2819     $self->{state} = BOGUS_DOCTYPE_STATE;
2820     } else {
2821     !!!cp (200.2);
2822     $self->{state} = BOGUS_MD_STATE;
2823     }
2824    
2825 wakaba 1.1 !!!next-input-character;
2826     redo A;
2827     }
2828     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2829     if ($is_space->{$self->{nc}}) {
2830     !!!cp (201);
2831     ## Stay in the state
2832     !!!next-input-character;
2833     redo A;
2834     } elsif ($self->{nc} == 0x0022) { # "
2835     !!!cp (202);
2836     $self->{ct}->{sysid} = ''; # DOCTYPE
2837     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2838     !!!next-input-character;
2839     redo A;
2840     } elsif ($self->{nc} == 0x0027) { # '
2841     !!!cp (203);
2842     $self->{ct}->{sysid} = ''; # DOCTYPE
2843     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2844     !!!next-input-character;
2845     redo A;
2846     } elsif ($self->{nc} == 0x003E) { # >
2847     !!!parse-error (type => 'no SYSTEM literal');
2848     !!!next-input-character;
2849    
2850 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2851     !!!cp (204);
2852     $self->{state} = DATA_STATE;
2853     $self->{s_kwd} = '';
2854     $self->{ct}->{quirks} = 1;
2855     } else {
2856     !!!cp (204.1);
2857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2858     }
2859 wakaba 1.1
2860 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2861 wakaba 1.1 redo A;
2862     } elsif ($self->{nc} == -1) {
2863 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2864     !!!cp (205);
2865     !!!parse-error (type => 'unclosed DOCTYPE');
2866     $self->{state} = DATA_STATE;
2867     $self->{s_kwd} = '';
2868     $self->{ct}->{quirks} = 1;
2869     } else {
2870     !!!cp (205.1);
2871     !!!parse-error (type => 'unclosed md'); ## TODO: type
2872     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2873     }
2874    
2875 wakaba 1.1 ## reconsume
2876 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2877 wakaba 1.1 redo A;
2878 wakaba 1.16 } elsif ($self->{is_xml} and
2879     $self->{ct}->{type} == DOCTYPE_TOKEN and
2880     $self->{nc} == 0x005B) { # [
2881 wakaba 1.12 !!!cp (206.1);
2882     !!!parse-error (type => 'no SYSTEM literal');
2883    
2884     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2885     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2886 wakaba 1.13 $self->{in_subset} = 1;
2887 wakaba 1.12 !!!next-input-character;
2888 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2889 wakaba 1.12 redo A;
2890 wakaba 1.1 } else {
2891     !!!parse-error (type => 'string after SYSTEM');
2892    
2893 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2894     !!!cp (206);
2895     $self->{ct}->{quirks} = 1;
2896     $self->{state} = BOGUS_DOCTYPE_STATE;
2897     } else {
2898     !!!cp (206.2);
2899     $self->{state} = BOGUS_MD_STATE;
2900     }
2901    
2902 wakaba 1.1 !!!next-input-character;
2903     redo A;
2904     }
2905     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2906     if ($self->{nc} == 0x0022) { # "
2907     !!!cp (207);
2908     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2909     !!!next-input-character;
2910     redo A;
2911 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2912 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2913    
2914 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2915     !!!cp (208);
2916     $self->{state} = DATA_STATE;
2917     $self->{s_kwd} = '';
2918     $self->{ct}->{quirks} = 1;
2919     } else {
2920     !!!cp (208.1);
2921     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2922     }
2923    
2924 wakaba 1.1 !!!next-input-character;
2925 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2926 wakaba 1.1 redo A;
2927     } elsif ($self->{nc} == -1) {
2928     !!!parse-error (type => 'unclosed SYSTEM literal');
2929    
2930 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2931     !!!cp (209);
2932     $self->{state} = DATA_STATE;
2933     $self->{s_kwd} = '';
2934     $self->{ct}->{quirks} = 1;
2935     } else {
2936     !!!cp (209.1);
2937     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2938     }
2939    
2940 wakaba 1.1 ## reconsume
2941 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2942 wakaba 1.1 redo A;
2943     } else {
2944     !!!cp (210);
2945 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2946 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2947     length $self->{ct}->{sysid});
2948    
2949     ## Stay in the state
2950     !!!next-input-character;
2951     redo A;
2952     }
2953     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2954     if ($self->{nc} == 0x0027) { # '
2955     !!!cp (211);
2956     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2957     !!!next-input-character;
2958     redo A;
2959 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2960 wakaba 1.1 !!!cp (212);
2961     !!!parse-error (type => 'unclosed SYSTEM literal');
2962    
2963     $self->{state} = DATA_STATE;
2964 wakaba 1.5 $self->{s_kwd} = '';
2965 wakaba 1.1 !!!next-input-character;
2966    
2967     $self->{ct}->{quirks} = 1;
2968     !!!emit ($self->{ct}); # DOCTYPE
2969    
2970     redo A;
2971     } elsif ($self->{nc} == -1) {
2972     !!!parse-error (type => 'unclosed SYSTEM literal');
2973    
2974 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2975     !!!cp (213);
2976     $self->{state} = DATA_STATE;
2977     $self->{s_kwd} = '';
2978     $self->{ct}->{quirks} = 1;
2979     } else {
2980     !!!cp (213.1);
2981     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2982     }
2983    
2984 wakaba 1.1 ## reconsume
2985 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2986 wakaba 1.1 redo A;
2987     } else {
2988     !!!cp (214);
2989 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2990 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2991     length $self->{ct}->{sysid});
2992    
2993     ## Stay in the state
2994     !!!next-input-character;
2995     redo A;
2996     }
2997     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2998     if ($is_space->{$self->{nc}}) {
2999 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
3000     !!!cp (215.1);
3001     $self->{state} = BEFORE_NDATA_STATE;
3002     } else {
3003     !!!cp (215);
3004     ## Stay in the state
3005     }
3006 wakaba 1.1 !!!next-input-character;
3007     redo A;
3008     } elsif ($self->{nc} == 0x003E) { # >
3009 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3010     !!!cp (216);
3011     $self->{state} = DATA_STATE;
3012     $self->{s_kwd} = '';
3013     } else {
3014     !!!cp (216.1);
3015     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3016     }
3017    
3018 wakaba 1.1 !!!next-input-character;
3019 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3020 wakaba 1.1 redo A;
3021 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3022     ($self->{nc} == 0x004E or # N
3023     $self->{nc} == 0x006E)) { # n
3024     !!!cp (216.2);
3025     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
3026     $self->{state} = NDATA_STATE;
3027     $self->{kwd} = chr $self->{nc};
3028     !!!next-input-character;
3029     redo A;
3030 wakaba 1.1 } elsif ($self->{nc} == -1) {
3031 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3032     !!!cp (217);
3033     !!!parse-error (type => 'unclosed DOCTYPE');
3034     $self->{state} = DATA_STATE;
3035     $self->{s_kwd} = '';
3036     $self->{ct}->{quirks} = 1;
3037     } else {
3038     !!!cp (217.1);
3039     !!!parse-error (type => 'unclosed md'); ## TODO: type
3040     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3041     }
3042    
3043 wakaba 1.1 ## reconsume
3044 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3045 wakaba 1.1 redo A;
3046 wakaba 1.16 } elsif ($self->{is_xml} and
3047     $self->{ct}->{type} == DOCTYPE_TOKEN and
3048     $self->{nc} == 0x005B) { # [
3049 wakaba 1.12 !!!cp (218.1);
3050     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3051     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3052 wakaba 1.13 $self->{in_subset} = 1;
3053 wakaba 1.12 !!!next-input-character;
3054 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
3055 wakaba 1.12 redo A;
3056 wakaba 1.1 } else {
3057     !!!parse-error (type => 'string after SYSTEM literal');
3058    
3059 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3060     !!!cp (218);
3061     #$self->{ct}->{quirks} = 1;
3062     $self->{state} = BOGUS_DOCTYPE_STATE;
3063     } else {
3064     !!!cp (218.2);
3065     $self->{state} = BOGUS_MD_STATE;
3066     }
3067    
3068 wakaba 1.1 !!!next-input-character;
3069     redo A;
3070     }
3071 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
3072     if ($is_space->{$self->{nc}}) {
3073     !!!cp (218.3);
3074     ## Stay in the state.
3075     !!!next-input-character;
3076     redo A;
3077     } elsif ($self->{nc} == 0x003E) { # >
3078     !!!cp (218.4);
3079     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3080     !!!next-input-character;
3081     !!!emit ($self->{ct}); # ENTITY
3082     redo A;
3083     } elsif ($self->{nc} == 0x004E or # N
3084     $self->{nc} == 0x006E) { # n
3085     !!!cp (218.5);
3086     $self->{state} = NDATA_STATE;
3087     $self->{kwd} = chr $self->{nc};
3088     !!!next-input-character;
3089     redo A;
3090     } elsif ($self->{nc} == -1) {
3091     !!!cp (218.6);
3092     !!!parse-error (type => 'unclosed md'); ## TODO: type
3093     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3094     ## reconsume
3095     !!!emit ($self->{ct}); # ENTITY
3096     redo A;
3097     } else {
3098     !!!cp (218.7);
3099     !!!parse-error (type => 'string after SYSTEM literal');
3100     $self->{state} = BOGUS_MD_STATE;
3101     !!!next-input-character;
3102     redo A;
3103     }
3104 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3105     if ($self->{nc} == 0x003E) { # >
3106     !!!cp (219);
3107     $self->{state} = DATA_STATE;
3108 wakaba 1.5 $self->{s_kwd} = '';
3109 wakaba 1.1 !!!next-input-character;
3110    
3111     !!!emit ($self->{ct}); # DOCTYPE
3112    
3113     redo A;
3114 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3115 wakaba 1.13 !!!cp (220.1);
3116     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3117     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3118     $self->{in_subset} = 1;
3119     !!!next-input-character;
3120     !!!emit ($self->{ct}); # DOCTYPE
3121     redo A;
3122 wakaba 1.1 } elsif ($self->{nc} == -1) {
3123     !!!cp (220);
3124     $self->{state} = DATA_STATE;
3125 wakaba 1.5 $self->{s_kwd} = '';
3126 wakaba 1.1 ## reconsume
3127    
3128     !!!emit ($self->{ct}); # DOCTYPE
3129    
3130     redo A;
3131     } else {
3132     !!!cp (221);
3133     my $s = '';
3134 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3135 wakaba 1.1
3136     ## Stay in the state
3137     !!!next-input-character;
3138     redo A;
3139     }
3140     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3141     ## NOTE: "CDATA section state" in the spec is jointly implemented
3142     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3143     ## and |CDATA_SECTION_MSE2_STATE|.
3144 wakaba 1.10
3145     ## XML5: "CDATA state".
3146 wakaba 1.1
3147     if ($self->{nc} == 0x005D) { # ]
3148     !!!cp (221.1);
3149     $self->{state} = CDATA_SECTION_MSE1_STATE;
3150     !!!next-input-character;
3151     redo A;
3152     } elsif ($self->{nc} == -1) {
3153 wakaba 1.6 if ($self->{is_xml}) {
3154 wakaba 1.8 !!!cp (221.11);
3155 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3156 wakaba 1.8 } else {
3157     !!!cp (221.12);
3158 wakaba 1.6 }
3159    
3160 wakaba 1.1 $self->{state} = DATA_STATE;
3161 wakaba 1.5 $self->{s_kwd} = '';
3162 wakaba 1.10 ## Reconsume.
3163 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3164     !!!cp (221.2);
3165     !!!emit ($self->{ct}); # character
3166     } else {
3167     !!!cp (221.3);
3168     ## No token to emit. $self->{ct} is discarded.
3169     }
3170     redo A;
3171     } else {
3172     !!!cp (221.4);
3173     $self->{ct}->{data} .= chr $self->{nc};
3174     $self->{read_until}->($self->{ct}->{data},
3175     q<]>,
3176     length $self->{ct}->{data});
3177    
3178     ## Stay in the state.
3179     !!!next-input-character;
3180     redo A;
3181     }
3182    
3183     ## ISSUE: "text tokens" in spec.
3184     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3185 wakaba 1.10 ## XML5: "CDATA bracket state".
3186    
3187 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3188     !!!cp (221.5);
3189     $self->{state} = CDATA_SECTION_MSE2_STATE;
3190     !!!next-input-character;
3191     redo A;
3192     } else {
3193     !!!cp (221.6);
3194 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3195 wakaba 1.1 $self->{ct}->{data} .= ']';
3196 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3197 wakaba 1.1 ## Reconsume.
3198     redo A;
3199     }
3200     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3201 wakaba 1.10 ## XML5: "CDATA end state".
3202    
3203 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3204     $self->{state} = DATA_STATE;
3205 wakaba 1.5 $self->{s_kwd} = '';
3206 wakaba 1.1 !!!next-input-character;
3207     if (length $self->{ct}->{data}) { # character
3208     !!!cp (221.7);
3209     !!!emit ($self->{ct}); # character
3210     } else {
3211     !!!cp (221.8);
3212     ## No token to emit. $self->{ct} is discarded.
3213     }
3214     redo A;
3215     } elsif ($self->{nc} == 0x005D) { # ]
3216     !!!cp (221.9); # character
3217     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3218     ## Stay in the state.
3219     !!!next-input-character;
3220     redo A;
3221     } else {
3222     !!!cp (221.11);
3223     $self->{ct}->{data} .= ']]'; # character
3224     $self->{state} = CDATA_SECTION_STATE;
3225 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3226 wakaba 1.1 redo A;
3227     }
3228     } elsif ($self->{state} == ENTITY_STATE) {
3229     if ($is_space->{$self->{nc}} or
3230     {
3231     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3232     $self->{entity_add} => 1,
3233     }->{$self->{nc}}) {
3234 wakaba 1.22 if ($self->{is_xml}) {
3235     !!!cp (1001.1);
3236     !!!parse-error (type => 'bare ero',
3237     line => $self->{line_prev},
3238     column => $self->{column_prev}
3239     + ($self->{nc} == -1 ? 1 : 0));
3240     } else {
3241     !!!cp (1001);
3242     ## No error
3243     }
3244 wakaba 1.1 ## Don't consume
3245     ## Return nothing.
3246     #
3247     } elsif ($self->{nc} == 0x0023) { # #
3248     !!!cp (999);
3249     $self->{state} = ENTITY_HASH_STATE;
3250 wakaba 1.12 $self->{kwd} = '#';
3251 wakaba 1.1 !!!next-input-character;
3252     redo A;
3253 wakaba 1.22 } elsif ($self->{is_xml} or
3254     (0x0041 <= $self->{nc} and
3255 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3256     (0x0061 <= $self->{nc} and
3257     $self->{nc} <= 0x007A)) { # a..z
3258     !!!cp (998);
3259     require Whatpm::_NamedEntityList;
3260     $self->{state} = ENTITY_NAME_STATE;
3261 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3262     $self->{entity__value} = $self->{kwd};
3263 wakaba 1.1 $self->{entity__match} = 0;
3264     !!!next-input-character;
3265     redo A;
3266     } else {
3267     !!!cp (1027);
3268     !!!parse-error (type => 'bare ero');
3269     ## Return nothing.
3270     #
3271     }
3272    
3273     ## NOTE: No character is consumed by the "consume a character
3274     ## reference" algorithm. In other words, there is an "&" character
3275     ## that does not introduce a character reference, which would be
3276     ## appended to the parent element or the attribute value in later
3277     ## process of the tokenizer.
3278    
3279     if ($self->{prev_state} == DATA_STATE) {
3280     !!!cp (997);
3281     $self->{state} = $self->{prev_state};
3282 wakaba 1.5 $self->{s_kwd} = '';
3283 wakaba 1.1 ## Reconsume.
3284     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3285     line => $self->{line_prev},
3286     column => $self->{column_prev},
3287     });
3288     redo A;
3289     } else {
3290     !!!cp (996);
3291     $self->{ca}->{value} .= '&';
3292     $self->{state} = $self->{prev_state};
3293 wakaba 1.5 $self->{s_kwd} = '';
3294 wakaba 1.1 ## Reconsume.
3295     redo A;
3296     }
3297     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3298 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3299 wakaba 1.1 !!!cp (995);
3300     $self->{state} = HEXREF_X_STATE;
3301 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3302 wakaba 1.1 !!!next-input-character;
3303     redo A;
3304 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3305     !!!cp (995.1);
3306     if ($self->{is_xml}) {
3307     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3308     }
3309     $self->{state} = HEXREF_X_STATE;
3310     $self->{kwd} .= chr $self->{nc};
3311     !!!next-input-character;
3312     redo A;
3313 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3314     $self->{nc} <= 0x0039) { # 0..9
3315     !!!cp (994);
3316     $self->{state} = NCR_NUM_STATE;
3317 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3318 wakaba 1.1 !!!next-input-character;
3319     redo A;
3320     } else {
3321     !!!parse-error (type => 'bare nero',
3322     line => $self->{line_prev},
3323     column => $self->{column_prev} - 1);
3324    
3325     ## NOTE: According to the spec algorithm, nothing is returned,
3326     ## and then "&#" is appended to the parent element or the attribute
3327     ## value in the later processing.
3328    
3329     if ($self->{prev_state} == DATA_STATE) {
3330     !!!cp (1019);
3331     $self->{state} = $self->{prev_state};
3332 wakaba 1.5 $self->{s_kwd} = '';
3333 wakaba 1.1 ## Reconsume.
3334     !!!emit ({type => CHARACTER_TOKEN,
3335     data => '&#',
3336     line => $self->{line_prev},
3337     column => $self->{column_prev} - 1,
3338     });
3339     redo A;
3340     } else {
3341     !!!cp (993);
3342     $self->{ca}->{value} .= '&#';
3343     $self->{state} = $self->{prev_state};
3344 wakaba 1.5 $self->{s_kwd} = '';
3345 wakaba 1.1 ## Reconsume.
3346     redo A;
3347     }
3348     }
3349     } elsif ($self->{state} == NCR_NUM_STATE) {
3350     if (0x0030 <= $self->{nc} and
3351     $self->{nc} <= 0x0039) { # 0..9
3352     !!!cp (1012);
3353 wakaba 1.12 $self->{kwd} *= 10;
3354     $self->{kwd} += $self->{nc} - 0x0030;
3355 wakaba 1.1
3356     ## Stay in the state.
3357     !!!next-input-character;
3358     redo A;
3359     } elsif ($self->{nc} == 0x003B) { # ;
3360     !!!cp (1013);
3361     !!!next-input-character;
3362     #
3363     } else {
3364     !!!cp (1014);
3365     !!!parse-error (type => 'no refc');
3366     ## Reconsume.
3367     #
3368     }
3369    
3370 wakaba 1.12 my $code = $self->{kwd};
3371 wakaba 1.1 my $l = $self->{line_prev};
3372     my $c = $self->{column_prev};
3373 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3374     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3375     ($self->{is_xml} and $code == 0x0000)) {
3376 wakaba 1.1 !!!cp (1015);
3377     !!!parse-error (type => 'invalid character reference',
3378     text => (sprintf 'U+%04X', $code),
3379     line => $l, column => $c);
3380     $code = $charref_map->{$code};
3381     } elsif ($code > 0x10FFFF) {
3382     !!!cp (1016);
3383     !!!parse-error (type => 'invalid character reference',
3384     text => (sprintf 'U-%08X', $code),
3385     line => $l, column => $c);
3386     $code = 0xFFFD;
3387     }
3388    
3389     if ($self->{prev_state} == DATA_STATE) {
3390     !!!cp (992);
3391     $self->{state} = $self->{prev_state};
3392 wakaba 1.5 $self->{s_kwd} = '';
3393 wakaba 1.1 ## Reconsume.
3394     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3395 wakaba 1.7 has_reference => 1,
3396 wakaba 1.1 line => $l, column => $c,
3397     });
3398     redo A;
3399     } else {
3400     !!!cp (991);
3401     $self->{ca}->{value} .= chr $code;
3402     $self->{ca}->{has_reference} = 1;
3403     $self->{state} = $self->{prev_state};
3404 wakaba 1.5 $self->{s_kwd} = '';
3405 wakaba 1.1 ## Reconsume.
3406     redo A;
3407     }
3408     } elsif ($self->{state} == HEXREF_X_STATE) {
3409     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3410     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3411     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3412     # 0..9, A..F, a..f
3413     !!!cp (990);
3414     $self->{state} = HEXREF_HEX_STATE;
3415 wakaba 1.12 $self->{kwd} = 0;
3416 wakaba 1.1 ## Reconsume.
3417     redo A;
3418     } else {
3419     !!!parse-error (type => 'bare hcro',
3420     line => $self->{line_prev},
3421     column => $self->{column_prev} - 2);
3422    
3423     ## NOTE: According to the spec algorithm, nothing is returned,
3424     ## and then "&#" followed by "X" or "x" is appended to the parent
3425     ## element or the attribute value in the later processing.
3426    
3427     if ($self->{prev_state} == DATA_STATE) {
3428     !!!cp (1005);
3429     $self->{state} = $self->{prev_state};
3430 wakaba 1.5 $self->{s_kwd} = '';
3431 wakaba 1.1 ## Reconsume.
3432     !!!emit ({type => CHARACTER_TOKEN,
3433 wakaba 1.12 data => '&' . $self->{kwd},
3434 wakaba 1.1 line => $self->{line_prev},
3435 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3436 wakaba 1.1 });
3437     redo A;
3438     } else {
3439     !!!cp (989);
3440 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3441 wakaba 1.1 $self->{state} = $self->{prev_state};
3442 wakaba 1.5 $self->{s_kwd} = '';
3443 wakaba 1.1 ## Reconsume.
3444     redo A;
3445     }
3446     }
3447     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3448     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3449     # 0..9
3450     !!!cp (1002);
3451 wakaba 1.12 $self->{kwd} *= 0x10;
3452     $self->{kwd} += $self->{nc} - 0x0030;
3453 wakaba 1.1 ## Stay in the state.
3454     !!!next-input-character;
3455     redo A;
3456     } elsif (0x0061 <= $self->{nc} and
3457     $self->{nc} <= 0x0066) { # a..f
3458     !!!cp (1003);
3459 wakaba 1.12 $self->{kwd} *= 0x10;
3460     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3461 wakaba 1.1 ## Stay in the state.
3462     !!!next-input-character;
3463     redo A;
3464     } elsif (0x0041 <= $self->{nc} and
3465     $self->{nc} <= 0x0046) { # A..F
3466     !!!cp (1004);
3467 wakaba 1.12 $self->{kwd} *= 0x10;
3468     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3469 wakaba 1.1 ## Stay in the state.
3470     !!!next-input-character;
3471     redo A;
3472     } elsif ($self->{nc} == 0x003B) { # ;
3473     !!!cp (1006);
3474     !!!next-input-character;
3475     #
3476     } else {
3477     !!!cp (1007);
3478     !!!parse-error (type => 'no refc',
3479     line => $self->{line},
3480     column => $self->{column});
3481     ## Reconsume.
3482     #
3483     }
3484    
3485 wakaba 1.12 my $code = $self->{kwd};
3486 wakaba 1.1 my $l = $self->{line_prev};
3487     my $c = $self->{column_prev};
3488 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3489     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3490     ($self->{is_xml} and $code == 0x0000)) {
3491 wakaba 1.1 !!!cp (1008);
3492     !!!parse-error (type => 'invalid character reference',
3493     text => (sprintf 'U+%04X', $code),
3494     line => $l, column => $c);
3495     $code = $charref_map->{$code};
3496     } elsif ($code > 0x10FFFF) {
3497     !!!cp (1009);
3498     !!!parse-error (type => 'invalid character reference',
3499     text => (sprintf 'U-%08X', $code),
3500     line => $l, column => $c);
3501     $code = 0xFFFD;
3502     }
3503    
3504     if ($self->{prev_state} == DATA_STATE) {
3505     !!!cp (988);
3506     $self->{state} = $self->{prev_state};
3507 wakaba 1.5 $self->{s_kwd} = '';
3508 wakaba 1.1 ## Reconsume.
3509     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3510 wakaba 1.7 has_reference => 1,
3511 wakaba 1.1 line => $l, column => $c,
3512     });
3513     redo A;
3514     } else {
3515     !!!cp (987);
3516     $self->{ca}->{value} .= chr $code;
3517     $self->{ca}->{has_reference} = 1;
3518     $self->{state} = $self->{prev_state};
3519 wakaba 1.5 $self->{s_kwd} = '';
3520 wakaba 1.1 ## Reconsume.
3521     redo A;
3522     }
3523     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3524 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3525     $self->{nc} <= 0x005A) or # Z
3526     (0x0061 <= $self->{nc} and # a
3527     $self->{nc} <= 0x007A) or # z
3528     (0x0030 <= $self->{nc} and # 0
3529     $self->{nc} <= 0x0039) or # 9
3530 wakaba 1.22 $self->{nc} == 0x003B or # ;
3531     ($self->{is_xml} and
3532     not ($is_space->{$self->{nc}} or
3533     {
3534     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3535     $self->{entity_add} => 1,
3536     }->{$self->{nc}}))) {
3537 wakaba 1.1 our $EntityChar;
3538 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3539 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3540     $self->{ge}->{$self->{kwd}}) {
3541 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3542 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3543     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3544     !!!cp (1020.1);
3545     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3546     } else {
3547     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3548     !!!cp (1020.2);
3549     !!!parse-error (type => 'unparsed entity', ## TODO: type
3550     value => $self->{kwd});
3551     } else {
3552     !!!cp (1020.3);
3553     }
3554     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3555     }
3556     } else {
3557     if ($self->{is_xml}) {
3558     !!!cp (1020.4);
3559     !!!parse-error (type => 'entity not declared', ## TODO: type
3560     value => $self->{kwd},
3561     level => {
3562     'amp;' => $self->{level}->{warn},
3563     'quot;' => $self->{level}->{warn},
3564     'lt;' => $self->{level}->{warn},
3565     'gt;' => $self->{level}->{warn},
3566     'apos;' => $self->{level}->{warn},
3567     }->{$self->{kwd}} ||
3568     $self->{level}->{must});
3569     } else {
3570     !!!cp (1020);
3571     }
3572     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3573     }
3574 wakaba 1.1 $self->{entity__match} = 1;
3575     !!!next-input-character;
3576     #
3577     } else {
3578     !!!cp (1021);
3579 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3580 wakaba 1.1 $self->{entity__match} = -1;
3581     ## Stay in the state.
3582     !!!next-input-character;
3583     redo A;
3584     }
3585     } else {
3586     !!!cp (1022);
3587     $self->{entity__value} .= chr $self->{nc};
3588     $self->{entity__match} *= 2;
3589     ## Stay in the state.
3590     !!!next-input-character;
3591     redo A;
3592     }
3593     }
3594    
3595     my $data;
3596     my $has_ref;
3597     if ($self->{entity__match} > 0) {
3598     !!!cp (1023);
3599     $data = $self->{entity__value};
3600     $has_ref = 1;
3601     #
3602     } elsif ($self->{entity__match} < 0) {
3603     !!!parse-error (type => 'no refc');
3604     if ($self->{prev_state} != DATA_STATE and # in attribute
3605     $self->{entity__match} < -1) {
3606     !!!cp (1024);
3607 wakaba 1.12 $data = '&' . $self->{kwd};
3608 wakaba 1.1 #
3609     } else {
3610     !!!cp (1025);
3611     $data = $self->{entity__value};
3612     $has_ref = 1;
3613     #
3614     }
3615     } else {
3616     !!!cp (1026);
3617     !!!parse-error (type => 'bare ero',
3618     line => $self->{line_prev},
3619 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3620     $data = '&' . $self->{kwd};
3621 wakaba 1.1 #
3622     }
3623    
3624     ## NOTE: In these cases, when a character reference is found,
3625     ## it is consumed and a character token is returned, or, otherwise,
3626     ## nothing is consumed and returned, according to the spec algorithm.
3627     ## In this implementation, anything that has been examined by the
3628     ## tokenizer is appended to the parent element or the attribute value
3629     ## as string, either literal string when no character reference or
3630     ## entity-replaced string otherwise, in this stage, since any characters
3631     ## that would not be consumed are appended in the data state or in an
3632     ## appropriate attribute value state anyway.
3633    
3634     if ($self->{prev_state} == DATA_STATE) {
3635     !!!cp (986);
3636     $self->{state} = $self->{prev_state};
3637 wakaba 1.5 $self->{s_kwd} = '';
3638 wakaba 1.1 ## Reconsume.
3639     !!!emit ({type => CHARACTER_TOKEN,
3640     data => $data,
3641 wakaba 1.7 has_reference => $has_ref,
3642 wakaba 1.1 line => $self->{line_prev},
3643 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3644 wakaba 1.1 });
3645     redo A;
3646     } else {
3647     !!!cp (985);
3648     $self->{ca}->{value} .= $data;
3649     $self->{ca}->{has_reference} = 1 if $has_ref;
3650     $self->{state} = $self->{prev_state};
3651 wakaba 1.5 $self->{s_kwd} = '';
3652 wakaba 1.1 ## Reconsume.
3653     redo A;
3654     }
3655 wakaba 1.8
3656     ## XML-only states
3657    
3658     } elsif ($self->{state} == PI_STATE) {
3659 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3660    
3661 wakaba 1.8 if ($is_space->{$self->{nc}} or
3662 wakaba 1.14 $self->{nc} == 0x003F or # ?
3663 wakaba 1.8 $self->{nc} == -1) {
3664 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3665     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3666     ## "DOCTYPE pi state": Parse error, switch to the "data
3667     ## state".
3668 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3669     line => $self->{line_prev},
3670     column => $self->{column_prev}
3671     - 1 * ($self->{nc} != -1));
3672     $self->{state} = BOGUS_COMMENT_STATE;
3673     ## Reconsume.
3674     $self->{ct} = {type => COMMENT_TOKEN,
3675     data => '?',
3676     line => $self->{line_prev},
3677     column => $self->{column_prev}
3678     - 1 * ($self->{nc} != -1),
3679     };
3680     redo A;
3681     } else {
3682 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3683 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3684     target => chr $self->{nc},
3685     data => '',
3686     line => $self->{line_prev},
3687     column => $self->{column_prev} - 1,
3688     };
3689     $self->{state} = PI_TARGET_STATE;
3690     !!!next-input-character;
3691     redo A;
3692     }
3693     } elsif ($self->{state} == PI_TARGET_STATE) {
3694     if ($is_space->{$self->{nc}}) {
3695     $self->{state} = PI_TARGET_AFTER_STATE;
3696     !!!next-input-character;
3697     redo A;
3698     } elsif ($self->{nc} == -1) {
3699     !!!parse-error (type => 'no pic'); ## TODO: type
3700 wakaba 1.13 if ($self->{in_subset}) {
3701     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3702     } else {
3703     $self->{state} = DATA_STATE;
3704     $self->{s_kwd} = '';
3705     }
3706 wakaba 1.8 ## Reconsume.
3707     !!!emit ($self->{ct}); # pi
3708     redo A;
3709     } elsif ($self->{nc} == 0x003F) { # ?
3710     $self->{state} = PI_AFTER_STATE;
3711     !!!next-input-character;
3712     redo A;
3713     } else {
3714     ## XML5: typo ("tag name" -> "target")
3715     $self->{ct}->{target} .= chr $self->{nc}; # pi
3716     !!!next-input-character;
3717     redo A;
3718     }
3719     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3720     if ($is_space->{$self->{nc}}) {
3721     ## Stay in the state.
3722     !!!next-input-character;
3723     redo A;
3724     } else {
3725     $self->{state} = PI_DATA_STATE;
3726     ## Reprocess.
3727     redo A;
3728     }
3729     } elsif ($self->{state} == PI_DATA_STATE) {
3730     if ($self->{nc} == 0x003F) { # ?
3731     $self->{state} = PI_DATA_AFTER_STATE;
3732     !!!next-input-character;
3733     redo A;
3734     } elsif ($self->{nc} == -1) {
3735     !!!parse-error (type => 'no pic'); ## TODO: type
3736 wakaba 1.13 if ($self->{in_subset}) {
3737 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3738 wakaba 1.13 } else {
3739     $self->{state} = DATA_STATE;
3740     $self->{s_kwd} = '';
3741     }
3742 wakaba 1.8 ## Reprocess.
3743     !!!emit ($self->{ct}); # pi
3744     redo A;
3745     } else {
3746     $self->{ct}->{data} .= chr $self->{nc}; # pi
3747     $self->{read_until}->($self->{ct}->{data}, q[?],
3748     length $self->{ct}->{data});
3749     ## Stay in the state.
3750     !!!next-input-character;
3751     ## Continue with the newly fetched character (nothing is reprocessed).
3752     redo A;
3753     }
3754     } elsif ($self->{state} == PI_AFTER_STATE) {
3755 wakaba 1.14 ## XML5: Part of "Pi after state".
3756    
3757 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3758 wakaba 1.13 if ($self->{in_subset}) {
3759     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3760     } else {
3761     $self->{state} = DATA_STATE;
3762     $self->{s_kwd} = '';
3763     }
3764 wakaba 1.8 !!!next-input-character;
3765     !!!emit ($self->{ct}); # pi
3766     redo A;
3767     } elsif ($self->{nc} == 0x003F) { # ?
3768     !!!parse-error (type => 'no s after target', ## TODO: type
3769     line => $self->{line_prev},
3770     column => $self->{column_prev}); ## XML5: no error
3771     $self->{ct}->{data} .= '?';
3772     $self->{state} = PI_DATA_AFTER_STATE;
3773     !!!next-input-character;
3774     redo A;
3775     } else {
3776     !!!parse-error (type => 'no s after target', ## TODO: type
3777     line => $self->{line_prev},
3778     column => $self->{column_prev}
3779     + 1 * ($self->{nc} == -1)); ## XML5: no error
3780     $self->{ct}->{data} .= '?'; ## XML5: not appended
3781     $self->{state} = PI_DATA_STATE;
3782     ## Reprocess.
3783     redo A;
3784     }
3785     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3786 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3787    
3788 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3789 wakaba 1.13 if ($self->{in_subset}) {
3790     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3791     } else {
3792     $self->{state} = DATA_STATE;
3793     $self->{s_kwd} = '';
3794     }
3795 wakaba 1.8 !!!next-input-character;
3796     !!!emit ($self->{ct}); # pi
3797     redo A;
3798     } elsif ($self->{nc} == 0x003F) { # ?
3799     $self->{ct}->{data} .= '?';
3800     ## Stay in the state.
3801     !!!next-input-character;
3802     redo A;
3803     } else {
3804     $self->{ct}->{data} .= '?'; ## XML5: not appended
3805     $self->{state} = PI_DATA_STATE;
3806     ## Reprocess.
3807     redo A;
3808     }
3809 wakaba 1.12
3810     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3811     if ($self->{nc} == 0x003C) { # <
3812 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3813 wakaba 1.12 !!!next-input-character;
3814     redo A;
3815     } elsif ($self->{nc} == 0x0025) { # %
3816     ## XML5: Not defined yet.
3817    
3818     ## TODO:
3819 wakaba 1.24
3820     if (not $self->{stop_processing} and
3821     not $self->{document}->xml_standalone) {
3822     !!!parse-error (type => 'stop processing', ## TODO: type
3823     level => $self->{level}->{info});
3824     $self->{stop_processing} = 1;
3825     }
3826    
3827 wakaba 1.12 !!!next-input-character;
3828     redo A;
3829     } elsif ($self->{nc} == 0x005D) { # ]
3830 wakaba 1.13 delete $self->{in_subset};
3831 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3832     !!!next-input-character;
3833     redo A;
3834     } elsif ($is_space->{$self->{nc}}) {
3835     ## Stay in the state.
3836     !!!next-input-character;
3837     redo A;
3838     } elsif ($self->{nc} == -1) {
3839     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3840 wakaba 1.13 delete $self->{in_subset};
3841 wakaba 1.12 $self->{state} = DATA_STATE;
3842     $self->{s_kwd} = '';
3843     ## Reconsume.
3844 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3845 wakaba 1.12 redo A;
3846     } else {
3847     unless ($self->{internal_subset_tainted}) {
3848     ## XML5: No parse error.
3849     !!!parse-error (type => 'string in internal subset');
3850     $self->{internal_subset_tainted} = 1;
3851     }
3852     ## Stay in the state.
3853     !!!next-input-character;
3854     redo A;
3855     }
3856     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3857     if ($self->{nc} == 0x003E) { # >
3858     $self->{state} = DATA_STATE;
3859     $self->{s_kwd} = '';
3860     !!!next-input-character;
3861 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3862 wakaba 1.12 redo A;
3863     } elsif ($self->{nc} == -1) {
3864     !!!parse-error (type => 'unclosed DOCTYPE');
3865     $self->{state} = DATA_STATE;
3866     $self->{s_kwd} = '';
3867     ## Reconsume.
3868 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3869 wakaba 1.12 redo A;
3870     } else {
3871     ## XML5: No parse error and stay in the state.
3872     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3873    
3874 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3875     !!!next-input-character;
3876     redo A;
3877     }
3878     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3879     if ($self->{nc} == 0x003E) { # >
3880     $self->{state} = DATA_STATE;
3881     $self->{s_kwd} = '';
3882     !!!next-input-character;
3883     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3884     redo A;
3885     } elsif ($self->{nc} == -1) {
3886     $self->{state} = DATA_STATE;
3887     $self->{s_kwd} = '';
3888     ## Reconsume.
3889     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3890     redo A;
3891     } else {
3892     ## Stay in the state.
3893     !!!next-input-character;
3894     redo A;
3895     }
3896     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3897     if ($self->{nc} == 0x0021) { # !
3898 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3899 wakaba 1.13 !!!next-input-character;
3900     redo A;
3901     } elsif ($self->{nc} == 0x003F) { # ?
3902     $self->{state} = PI_STATE;
3903     !!!next-input-character;
3904     redo A;
3905     } elsif ($self->{nc} == -1) {
3906     !!!parse-error (type => 'bare stago');
3907     $self->{state} = DATA_STATE;
3908     $self->{s_kwd} = '';
3909     ## Reconsume.
3910     redo A;
3911     } else {
3912     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3913     line => $self->{line_prev},
3914     column => $self->{column_prev});
3915     $self->{state} = BOGUS_COMMENT_STATE;
3916     $self->{ct} = {type => COMMENT_TOKEN,
3917     data => '',
3918     }; ## NOTE: Will be discarded.
3919 wakaba 1.12 !!!next-input-character;
3920     redo A;
3921     }
3922 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3923     ## XML5: "DOCTYPE markup declaration state".
3924    
3925     if ($self->{nc} == 0x002D) { # -
3926     $self->{state} = MD_HYPHEN_STATE;
3927     !!!next-input-character;
3928     redo A;
3929 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3930     $self->{nc} == 0x0065) { # e
3931 wakaba 1.14 $self->{state} = MD_E_STATE;
3932     $self->{kwd} = chr $self->{nc};
3933     !!!next-input-character;
3934     redo A;
3935 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3936     $self->{nc} == 0x0061) { # a
3937 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3938     $self->{kwd} = chr $self->{nc};
3939     !!!next-input-character;
3940     redo A;
3941 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3942     $self->{nc} == 0x006E) { # n
3943 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3944     $self->{kwd} = chr $self->{nc};
3945     !!!next-input-character;
3946     redo A;
3947     } else {
3948     #
3949     }
3950    
3951     ## XML5: No parse error.
3952     !!!parse-error (type => 'bogus comment',
3953     line => $self->{line_prev},
3954     column => $self->{column_prev} - 1);
3955     ## Reconsume.
3956     $self->{state} = BOGUS_COMMENT_STATE;
3957     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3958     redo A;
3959     } elsif ($self->{state} == MD_E_STATE) {
3960 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3961     $self->{nc} == 0x006E) { # n
3962 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3963     $self->{kwd} .= chr $self->{nc};
3964     !!!next-input-character;
3965     redo A;
3966 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3967     $self->{nc} == 0x006C) { # l
3968 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3969     $self->{state} = MD_ELEMENT_STATE;
3970     $self->{kwd} .= chr $self->{nc};
3971     !!!next-input-character;
3972     redo A;
3973     } else {
3974     ## XML5: No parse error.
3975     !!!parse-error (type => 'bogus comment',
3976     line => $self->{line_prev},
3977     column => $self->{column_prev} - 2
3978     + 1 * ($self->{nc} == -1));
3979     ## Reconsume.
3980     $self->{state} = BOGUS_COMMENT_STATE;
3981     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3982     redo A;
3983     }
3984     } elsif ($self->{state} == MD_ENTITY_STATE) {
3985 wakaba 1.17 if ($self->{nc} == [
3986     undef,
3987     undef,
3988     0x0054, # T
3989     0x0049, # I
3990     0x0054, # T
3991     ]->[length $self->{kwd}] or
3992     $self->{nc} == [
3993     undef,
3994     undef,
3995     0x0074, # t
3996     0x0069, # i
3997     0x0074, # t
3998     ]->[length $self->{kwd}]) {
3999 wakaba 1.14 ## Stay in the state.
4000     $self->{kwd} .= chr $self->{nc};
4001     !!!next-input-character;
4002     redo A;
4003 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
4004     ($self->{nc} == 0x0059 or # Y
4005     $self->{nc} == 0x0079)) { # y
4006     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
4007     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4008     text => 'ENTITY',
4009     line => $self->{line_prev},
4010     column => $self->{column_prev} - 4);
4011     }
4012     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
4013 wakaba 1.14 line => $self->{line_prev},
4014     column => $self->{column_prev} - 6};
4015     $self->{state} = DOCTYPE_MD_STATE;
4016     !!!next-input-character;
4017     redo A;
4018     } else {
4019     !!!parse-error (type => 'bogus comment',
4020     line => $self->{line_prev},
4021     column => $self->{column_prev} - 1
4022     - (length $self->{kwd})
4023     + 1 * ($self->{nc} == -1));
4024     $self->{state} = BOGUS_COMMENT_STATE;
4025     ## Reconsume.
4026     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4027     redo A;
4028     }
4029     } elsif ($self->{state} == MD_ELEMENT_STATE) {
4030 wakaba 1.17 if ($self->{nc} == [
4031     undef,
4032     undef,
4033     0x0045, # E
4034     0x004D, # M
4035     0x0045, # E
4036     0x004E, # N
4037     ]->[length $self->{kwd}] or
4038     $self->{nc} == [
4039     undef,
4040     undef,
4041     0x0065, # e
4042     0x006D, # m
4043     0x0065, # e
4044     0x006E, # n
4045     ]->[length $self->{kwd}]) {
4046 wakaba 1.14 ## Stay in the state.
4047     $self->{kwd} .= chr $self->{nc};
4048     !!!next-input-character;
4049     redo A;
4050 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4051     ($self->{nc} == 0x0054 or # T
4052     $self->{nc} == 0x0074)) { # t
4053     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
4054     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4055     text => 'ELEMENT',
4056     line => $self->{line_prev},
4057     column => $self->{column_prev} - 5);
4058     }
4059 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
4060     line => $self->{line_prev},
4061 wakaba 1.23 column => $self->{column_prev} - 7};
4062 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4063     !!!next-input-character;
4064     redo A;
4065     } else {
4066     !!!parse-error (type => 'bogus comment',
4067     line => $self->{line_prev},
4068     column => $self->{column_prev} - 1
4069     - (length $self->{kwd})
4070     + 1 * ($self->{nc} == -1));
4071     $self->{state} = BOGUS_COMMENT_STATE;
4072     ## Reconsume.
4073     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4074     redo A;
4075     }
4076     } elsif ($self->{state} == MD_ATTLIST_STATE) {
4077 wakaba 1.17 if ($self->{nc} == [
4078     undef,
4079     0x0054, # T
4080     0x0054, # T
4081     0x004C, # L
4082     0x0049, # I
4083     0x0053, # S
4084     ]->[length $self->{kwd}] or
4085     $self->{nc} == [
4086     undef,
4087     0x0074, # t
4088     0x0074, # t
4089     0x006C, # l
4090     0x0069, # i
4091     0x0073, # s
4092     ]->[length $self->{kwd}]) {
4093 wakaba 1.14 ## Stay in the state.
4094     $self->{kwd} .= chr $self->{nc};
4095     !!!next-input-character;
4096     redo A;
4097 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4098     ($self->{nc} == 0x0054 or # T
4099     $self->{nc} == 0x0074)) { # t
4100     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4101     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4102     text => 'ATTLIST',
4103     line => $self->{line_prev},
4104     column => $self->{column_prev} - 5);
4105     }
4106 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4107 wakaba 1.15 attrdefs => [],
4108 wakaba 1.14 line => $self->{line_prev},
4109 wakaba 1.23 column => $self->{column_prev} - 7};
4110 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4111     !!!next-input-character;
4112     redo A;
4113     } else {
4114     !!!parse-error (type => 'bogus comment',
4115     line => $self->{line_prev},
4116     column => $self->{column_prev} - 1
4117     - (length $self->{kwd})
4118     + 1 * ($self->{nc} == -1));
4119     $self->{state} = BOGUS_COMMENT_STATE;
4120     ## Reconsume.
4121     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4122     redo A;
4123     }
4124     } elsif ($self->{state} == MD_NOTATION_STATE) {
4125 wakaba 1.17 if ($self->{nc} == [
4126     undef,
4127     0x004F, # O
4128     0x0054, # T
4129     0x0041, # A
4130     0x0054, # T
4131     0x0049, # I
4132     0x004F, # O
4133     ]->[length $self->{kwd}] or
4134     $self->{nc} == [
4135     undef,
4136     0x006F, # o
4137     0x0074, # t
4138     0x0061, # a
4139     0x0074, # t
4140     0x0069, # i
4141     0x006F, # o
4142     ]->[length $self->{kwd}]) {
4143 wakaba 1.14 ## Stay in the state.
4144     $self->{kwd} .= chr $self->{nc};
4145     !!!next-input-character;
4146     redo A;
4147 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4148     ($self->{nc} == 0x004E or # N
4149     $self->{nc} == 0x006E)) { # n
4150     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4151     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4152     text => 'NOTATION',
4153     line => $self->{line_prev},
4154     column => $self->{column_prev} - 6);
4155     }
4156 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4157     line => $self->{line_prev},
4158 wakaba 1.23 column => $self->{column_prev} - 8};
4159 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4160     !!!next-input-character;
4161     redo A;
4162     } else {
4163     !!!parse-error (type => 'bogus comment',
4164     line => $self->{line_prev},
4165     column => $self->{column_prev} - 1
4166     - (length $self->{kwd})
4167     + 1 * ($self->{nc} == -1));
4168     $self->{state} = BOGUS_COMMENT_STATE;
4169     ## Reconsume.
4170     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4171     redo A;
4172     }
4173     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4174     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4175     ## "DOCTYPE NOTATION state".
4176    
4177     if ($is_space->{$self->{nc}}) {
4178     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4179     $self->{state} = BEFORE_MD_NAME_STATE;
4180     !!!next-input-character;
4181     redo A;
4182     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4183     $self->{nc} == 0x0025) { # %
4184     ## XML5: Switch to the "DOCTYPE bogus comment state".
4185     !!!parse-error (type => 'no space before md name'); ## TODO: type
4186     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4187     !!!next-input-character;
4188     redo A;
4189     } elsif ($self->{nc} == -1) {
4190     !!!parse-error (type => 'unclosed md'); ## TODO: type
4191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4192     ## Reconsume.
4193     redo A;
4194     } elsif ($self->{nc} == 0x003E) { # >
4195     ## XML5: Switch to the "DOCTYPE bogus comment state".
4196     !!!parse-error (type => 'no md name'); ## TODO: type
4197     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4198     !!!next-input-character;
4199     redo A;
4200     } else {
4201     ## XML5: Switch to the "DOCTYPE bogus comment state".
4202     !!!parse-error (type => 'no space before md name'); ## TODO: type
4203     $self->{state} = BEFORE_MD_NAME_STATE;
4204     redo A;
4205     }
4206     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4207     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4208     ## before state", "DOCTYPE ATTLIST name before state".
4209    
4210     if ($is_space->{$self->{nc}}) {
4211     ## Stay in the state.
4212     !!!next-input-character;
4213     redo A;
4214     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4215     $self->{nc} == 0x0025) { # %
4216     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4217     !!!next-input-character;
4218     redo A;
4219     } elsif ($self->{nc} == 0x003E) { # >
4220     ## XML5: Same as "Anything else".
4221     !!!parse-error (type => 'no md name'); ## TODO: type
4222     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4223     !!!next-input-character;
4224     redo A;
4225     } elsif ($self->{nc} == -1) {
4226     !!!parse-error (type => 'unclosed md'); ## TODO: type
4227     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4228     ## Reconsume.
4229     redo A;
4230     } else {
4231     ## XML5: [ATTLIST] Not defined yet.
4232     $self->{ct}->{name} .= chr $self->{nc};
4233     $self->{state} = MD_NAME_STATE;
4234     !!!next-input-character;
4235     redo A;
4236     }
4237     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4238     if ($is_space->{$self->{nc}}) {
4239     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4240     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4241     $self->{state} = BEFORE_MD_NAME_STATE;
4242     !!!next-input-character;
4243     redo A;
4244     } elsif ($self->{nc} == 0x003E) { # >
4245     ## XML5: Same as "Anything else".
4246     !!!parse-error (type => 'no md name'); ## TODO: type
4247     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4248     !!!next-input-character;
4249     redo A;
4250     } elsif ($self->{nc} == -1) {
4251     !!!parse-error (type => 'unclosed md');
4252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4253     ## Reconsume.
4254     redo A;
4255     } else {
4256     ## XML5: No parse error.
4257     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4258     $self->{state} = BOGUS_COMMENT_STATE;
4259     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4260     ## Reconsume.
4261     redo A;
4262     }
4263     } elsif ($self->{state} == MD_NAME_STATE) {
4264     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4265    
4266     if ($is_space->{$self->{nc}}) {
4267 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4268     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4269     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4270 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4271 wakaba 1.16 } else { # ENTITY/NOTATION
4272     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4273     }
4274 wakaba 1.14 !!!next-input-character;
4275     redo A;
4276     } elsif ($self->{nc} == 0x003E) { # >
4277     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4278     #
4279     } else {
4280 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4281 wakaba 1.14 }
4282     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283     !!!next-input-character;
4284     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4285     redo A;
4286     } elsif ($self->{nc} == -1) {
4287     ## XML5: [ATTLIST] No parse error.
4288     !!!parse-error (type => 'unclosed md');
4289     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290     ## Reconsume.
4291     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4292     redo A;
4293     } else {
4294     ## XML5: [ATTLIST] Not defined yet.
4295     $self->{ct}->{name} .= chr $self->{nc};
4296     ## Stay in the state.
4297     !!!next-input-character;
4298     redo A;
4299     }
4300     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4301     if ($is_space->{$self->{nc}}) {
4302     ## Stay in the state.
4303     !!!next-input-character;
4304     redo A;
4305     } elsif ($self->{nc} == 0x003E) { # >
4306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4307     !!!next-input-character;
4308     !!!emit ($self->{ct}); # ATTLIST
4309     redo A;
4310     } elsif ($self->{nc} == -1) {
4311     ## XML5: No parse error.
4312     !!!parse-error (type => 'unclosed md'); ## TODO: type
4313     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4314 wakaba 1.15 !!!emit ($self->{ct});
4315     redo A;
4316     } else {
4317     ## XML5: Not defined yet.
4318     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4319     tokens => [],
4320     line => $self->{line}, column => $self->{column}};
4321     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4322     !!!next-input-character;
4323     redo A;
4324     }
4325     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4326     if ($is_space->{$self->{nc}}) {
4327     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4328     !!!next-input-character;
4329     redo A;
4330     } elsif ($self->{nc} == 0x003E) { # >
4331     ## XML5: Same as "anything else".
4332     !!!parse-error (type => 'no attr type'); ## TODO: type
4333     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4334     !!!next-input-character;
4335     !!!emit ($self->{ct}); # ATTLIST
4336     redo A;
4337     } elsif ($self->{nc} == 0x0028) { # (
4338     ## XML5: Same as "anything else".
4339     !!!parse-error (type => 'no space before paren'); ## TODO: type
4340     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4341     !!!next-input-character;
4342     redo A;
4343     } elsif ($self->{nc} == -1) {
4344     ## XML5: No parse error.
4345     !!!parse-error (type => 'unclosed md'); ## TODO: type
4346     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4347     !!!next-input-character;
4348     !!!emit ($self->{ct}); # ATTLIST
4349     redo A;
4350     } else {
4351     ## XML5: Not defined yet.
4352     $self->{ca}->{name} .= chr $self->{nc};
4353     ## Stay in the state.
4354     !!!next-input-character;
4355     redo A;
4356     }
4357     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4358     if ($is_space->{$self->{nc}}) {
4359     ## Stay in the state.
4360     !!!next-input-character;
4361     redo A;
4362     } elsif ($self->{nc} == 0x003E) { # >
4363     ## XML5: Same as "anything else".
4364     !!!parse-error (type => 'no attr type'); ## TODO: type
4365     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4366     !!!next-input-character;
4367     !!!emit ($self->{ct}); # ATTLIST
4368     redo A;
4369     } elsif ($self->{nc} == 0x0028) { # (
4370     ## XML5: Same as "anything else".
4371     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4372     !!!next-input-character;
4373     redo A;
4374     } elsif ($self->{nc} == -1) {
4375     ## XML5: No parse error.
4376     !!!parse-error (type => 'unclosed md'); ## TODO: type
4377     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4378     !!!next-input-character;
4379     !!!emit ($self->{ct});
4380 wakaba 1.14 redo A;
4381     } else {
4382     ## XML5: Not defined yet.
4383 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4384     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4385     !!!next-input-character;
4386     redo A;
4387     }
4388     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4389     if ($is_space->{$self->{nc}}) {
4390     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4391     !!!next-input-character;
4392     redo A;
4393     } elsif ($self->{nc} == 0x0023) { # #
4394     ## XML5: Same as "anything else".
4395     !!!parse-error (type => 'no space before default value'); ## TODO: type
4396     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4397     !!!next-input-character;
4398     redo A;
4399     } elsif ($self->{nc} == 0x0022) { # "
4400     ## XML5: Same as "anything else".
4401     !!!parse-error (type => 'no space before default value'); ## TODO: type
4402     $self->{ca}->{value} = '';
4403     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4404     !!!next-input-character;
4405     redo A;
4406     } elsif ($self->{nc} == 0x0027) { # '
4407     ## XML5: Same as "anything else".
4408     !!!parse-error (type => 'no space before default value'); ## TODO: type
4409     $self->{ca}->{value} = '';
4410     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4411     !!!next-input-character;
4412     redo A;
4413     } elsif ($self->{nc} == 0x003E) { # >
4414     ## XML5: Same as "anything else".
4415     !!!parse-error (type => 'no attr default'); ## TODO: type
4416     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4417     !!!next-input-character;
4418     !!!emit ($self->{ct}); # ATTLIST
4419     redo A;
4420     } elsif ($self->{nc} == 0x0028) { # (
4421     ## XML5: Same as "anything else".
4422     !!!parse-error (type => 'no space before paren'); ## TODO: type
4423     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4424     !!!next-input-character;
4425     redo A;
4426     } elsif ($self->{nc} == -1) {
4427     ## XML5: No parse error.
4428     !!!parse-error (type => 'unclosed md'); ## TODO: type
4429     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4430     !!!next-input-character;
4431     !!!emit ($self->{ct});
4432     redo A;
4433     } else {
4434     ## XML5: Not defined yet.
4435     $self->{ca}->{type} .= chr $self->{nc};
4436     ## Stay in the state.
4437     !!!next-input-character;
4438     redo A;
4439     }
4440     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4441     if ($is_space->{$self->{nc}}) {
4442     ## Stay in the state.
4443     !!!next-input-character;
4444     redo A;
4445     } elsif ($self->{nc} == 0x0028) { # (
4446     ## XML5: Same as "anything else".
4447     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4448     !!!next-input-character;
4449     redo A;
4450     } elsif ($self->{nc} == 0x0023) { # #
4451     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4452     !!!next-input-character;
4453     redo A;
4454     } elsif ($self->{nc} == 0x0022) { # "
4455     ## XML5: Same as "anything else".
4456     $self->{ca}->{value} = '';
4457     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4458     !!!next-input-character;
4459     redo A;
4460     } elsif ($self->{nc} == 0x0027) { # '
4461     ## XML5: Same as "anything else".
4462     $self->{ca}->{value} = '';
4463     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4464     !!!next-input-character;
4465     redo A;
4466     } elsif ($self->{nc} == 0x003E) { # >
4467     ## XML5: Same as "anything else".
4468     !!!parse-error (type => 'no attr default'); ## TODO: type
4469     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4470     !!!next-input-character;
4471     !!!emit ($self->{ct}); # ATTLIST
4472     redo A;
4473     } elsif ($self->{nc} == -1) {
4474     ## XML5: No parse error.
4475     !!!parse-error (type => 'unclosed md'); ## TODO: type
4476     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4477     !!!next-input-character;
4478     !!!emit ($self->{ct});
4479     redo A;
4480     } else {
4481     ## XML5: Switch to the "DOCTYPE bogus comment state".
4482     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4483     $self->{ca}->{value} = '';
4484     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4485     ## Reconsume.
4486     redo A;
4487     }
4488     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4489     if ($is_space->{$self->{nc}}) {
4490     ## Stay in the state.
4491     !!!next-input-character;
4492     redo A;
4493     } elsif ($self->{nc} == 0x007C) { # |
4494     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4495     ## Stay in the state.
4496     !!!next-input-character;
4497     redo A;
4498     } elsif ($self->{nc} == 0x0029) { # )
4499     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4500     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4501     !!!next-input-character;
4502     redo A;
4503     } elsif ($self->{nc} == 0x003E) { # >
4504     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4505     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4506     !!!next-input-character;
4507     !!!emit ($self->{ct}); # ATTLIST
4508     redo A;
4509     } elsif ($self->{nc} == -1) {
4510     ## XML5: No parse error.
4511     !!!parse-error (type => 'unclosed md'); ## TODO: type
4512     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513     !!!next-input-character;
4514     !!!emit ($self->{ct});
4515     redo A;
4516     } else {
4517     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4518     $self->{state} = ALLOWED_TOKEN_STATE;
4519     !!!next-input-character;
4520     redo A;
4521     }
4522     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4523     if ($is_space->{$self->{nc}}) {
4524     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4525     !!!next-input-character;
4526     redo A;
4527     } elsif ($self->{nc} == 0x007C) { # |
4528     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4529     !!!next-input-character;
4530     redo A;
4531     } elsif ($self->{nc} == 0x0029) { # )
4532     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4533     !!!next-input-character;
4534     redo A;
4535     } elsif ($self->{nc} == 0x003E) { # >
4536     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4537     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4538     !!!next-input-character;
4539     !!!emit ($self->{ct}); # ATTLIST
4540     redo A;
4541     } elsif ($self->{nc} == -1) {
4542     ## XML5: No parse error.
4543     !!!parse-error (type => 'unclosed md'); ## TODO: type
4544     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4545     !!!next-input-character;
4546     !!!emit ($self->{ct});
4547     redo A;
4548     } else {
4549     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4550     ## Stay in the state.
4551     !!!next-input-character;
4552     redo A;
4553     }
4554     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4555     if ($is_space->{$self->{nc}}) {
4556     ## Stay in the state.
4557     !!!next-input-character;
4558     redo A;
4559     } elsif ($self->{nc} == 0x007C) { # |
4560     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4561     !!!next-input-character;
4562     redo A;
4563     } elsif ($self->{nc} == 0x0029) { # )
4564     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4565     !!!next-input-character;
4566     redo A;
4567     } elsif ($self->{nc} == 0x003E) { # >
4568     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4569     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4570     !!!next-input-character;
4571     !!!emit ($self->{ct}); # ATTLIST
4572     redo A;
4573     } elsif ($self->{nc} == -1) {
4574     ## XML5: No parse error.
4575     !!!parse-error (type => 'unclosed md'); ## TODO: type
4576     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4577     !!!next-input-character;
4578     !!!emit ($self->{ct});
4579     redo A;
4580     } else {
4581     !!!parse-error (type => 'space in allowed token', ## TODO: type
4582     line => $self->{line_prev},
4583     column => $self->{column_prev});
4584     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4585     $self->{state} = ALLOWED_TOKEN_STATE;
4586     !!!next-input-character;
4587     redo A;
4588     }
4589     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4590     if ($is_space->{$self->{nc}}) {
4591     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4592     !!!next-input-character;
4593     redo A;
4594     } elsif ($self->{nc} == 0x0023) { # #
4595     !!!parse-error (type => 'no space before default value'); ## TODO: type
4596     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4597     !!!next-input-character;
4598     redo A;
4599     } elsif ($self->{nc} == 0x0022) { # "
4600     !!!parse-error (type => 'no space before default value'); ## TODO: type
4601     $self->{ca}->{value} = '';
4602     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4603     !!!next-input-character;
4604     redo A;
4605     } elsif ($self->{nc} == 0x0027) { # '
4606     !!!parse-error (type => 'no space before default value'); ## TODO: type
4607     $self->{ca}->{value} = '';
4608     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4609     !!!next-input-character;
4610     redo A;
4611     } elsif ($self->{nc} == 0x003E) { # >
4612     !!!parse-error (type => 'no attr default'); ## TODO: type
4613     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4614     !!!next-input-character;
4615     !!!emit ($self->{ct}); # ATTLIST
4616     redo A;
4617     } elsif ($self->{nc} == -1) {
4618     !!!parse-error (type => 'unclosed md'); ## TODO: type
4619     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4620     !!!next-input-character;
4621     !!!emit ($self->{ct});
4622     redo A;
4623     } else {
4624     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4625     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4626     ## Reconsume.
4627     redo A;
4628     }
4629     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4630     if ($is_space->{$self->{nc}}) {
4631     ## Stay in the state.
4632     !!!next-input-character;
4633     redo A;
4634     } elsif ($self->{nc} == 0x0023) { # #
4635     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4636     !!!next-input-character;
4637     redo A;
4638     } elsif ($self->{nc} == 0x0022) { # "
4639     $self->{ca}->{value} = '';
4640     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4641     !!!next-input-character;
4642     redo A;
4643     } elsif ($self->{nc} == 0x0027) { # '
4644     $self->{ca}->{value} = '';
4645     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4646     !!!next-input-character;
4647     redo A;
4648     } elsif ($self->{nc} == 0x003E) { # >
4649     !!!parse-error (type => 'no attr default'); ## TODO: type
4650     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4651     !!!next-input-character;
4652     !!!emit ($self->{ct}); # ATTLIST
4653     redo A;
4654     } elsif ($self->{nc} == -1) {
4655     !!!parse-error (type => 'unclosed md'); ## TODO: type
4656     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4657     !!!next-input-character;
4658     !!!emit ($self->{ct});
4659     redo A;
4660     } else {
4661     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4662     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4663     ## Reconsume.
4664     redo A;
4665     }
4666     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4667     if ($is_space->{$self->{nc}}) {
4668     ## XML5: No parse error.
4669     !!!parse-error (type => 'no default type'); ## TODO: type
4670 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4671 wakaba 1.14 ## Reconsume.
4672     redo A;
4673 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4674     ## XML5: Same as "anything else".
4675     $self->{ca}->{value} = '';
4676     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4677     !!!next-input-character;
4678     redo A;
4679     } elsif ($self->{nc} == 0x0027) { # '
4680     ## XML5: Same as "anything else".
4681     $self->{ca}->{value} = '';
4682     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4683     !!!next-input-character;
4684     redo A;
4685     } elsif ($self->{nc} == 0x003E) { # >
4686     ## XML5: Same as "anything else".
4687     !!!parse-error (type => 'no attr default'); ## TODO: type
4688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4689     !!!next-input-character;
4690     !!!emit ($self->{ct}); # ATTLIST
4691     redo A;
4692     } elsif ($self->{nc} == -1) {
4693     ## XML5: No parse error.
4694     !!!parse-error (type => 'unclosed md'); ## TODO: type
4695     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4696     !!!next-input-character;
4697     !!!emit ($self->{ct});
4698     redo A;
4699     } else {
4700     $self->{ca}->{default} = chr $self->{nc};
4701     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4702     !!!next-input-character;
4703     redo A;
4704 wakaba 1.14 }
4705 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4706     if ($is_space->{$self->{nc}}) {
4707     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4708     !!!next-input-character;
4709     redo A;
4710     } elsif ($self->{nc} == 0x0022) { # "
4711     ## XML5: Same as "anything else".
4712     !!!parse-error (type => 'no space before default value'); ## TODO: type
4713     $self->{ca}->{value} = '';
4714     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4715     !!!next-input-character;
4716     redo A;
4717     } elsif ($self->{nc} == 0x0027) { # '
4718     ## XML5: Same as "anything else".
4719     !!!parse-error (type => 'no space before default value'); ## TODO: type
4720     $self->{ca}->{value} = '';
4721     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4722     !!!next-input-character;
4723     redo A;
4724     } elsif ($self->{nc} == 0x003E) { # >
4725     ## XML5: Same as "anything else".
4726     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4727     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728     !!!next-input-character;
4729     !!!emit ($self->{ct}); # ATTLIST
4730     redo A;
4731     } elsif ($self->{nc} == -1) {
4732     ## XML5: No parse error.
4733     !!!parse-error (type => 'unclosed md'); ## TODO: type
4734     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4736     !!!next-input-character;
4737     !!!emit ($self->{ct});
4738     redo A;
4739     } else {
4740     $self->{ca}->{default} .= chr $self->{nc};
4741     ## Stay in the state.
4742     !!!next-input-character;
4743     redo A;
4744     }
4745     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4746     if ($is_space->{$self->{nc}}) {
4747     ## Stay in the state.
4748     !!!next-input-character;
4749     redo A;
4750     } elsif ($self->{nc} == 0x0022) { # "
4751     $self->{ca}->{value} = '';
4752     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4753     !!!next-input-character;
4754     redo A;
4755     } elsif ($self->{nc} == 0x0027) { # '
4756     $self->{ca}->{value} = '';
4757     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4758     !!!next-input-character;
4759     redo A;
4760     } elsif ($self->{nc} == 0x003E) { # >
4761     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4762     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4763     !!!next-input-character;
4764     !!!emit ($self->{ct}); # ATTLIST
4765     redo A;
4766     } elsif ($self->{nc} == -1) {
4767     ## XML5: No parse error.
4768     !!!parse-error (type => 'unclosed md'); ## TODO: type
4769     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4771     !!!next-input-character;
4772     !!!emit ($self->{ct});
4773     redo A;
4774     } else {
4775     ## XML5: Not defined yet.
4776     if ($self->{ca}->{default} eq 'FIXED') {
4777     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4778     } else {
4779     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4780     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4781     }
4782     ## Reconsume.
4783     redo A;
4784     }
4785     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4786     if ($is_space->{$self->{nc}} or
4787     $self->{nc} == -1 or
4788     $self->{nc} == 0x003E) { # >
4789     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4790     ## Reconsume.
4791     redo A;
4792     } else {
4793     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4794     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4795     ## Reconsume.
4796     redo A;
4797 wakaba 1.16 }
4798 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4799     ## ASCII case-insensitive
4800     if ($self->{nc} == [
4801     undef,
4802     0x0044, # D
4803     0x0041, # A
4804     0x0054, # T
4805     ]->[length $self->{kwd}] or
4806     $self->{nc} == [
4807     undef,
4808     0x0064, # d
4809     0x0061, # a
4810     0x0074, # t
4811     ]->[length $self->{kwd}]) {
4812     !!!cp (172.2);
4813     ## Stay in the state.
4814     $self->{kwd} .= chr $self->{nc};
4815     !!!next-input-character;
4816     redo A;
4817     } elsif ((length $self->{kwd}) == 4 and
4818     ($self->{nc} == 0x0041 or # A
4819     $self->{nc} == 0x0061)) { # a
4820     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4821     !!!cp (172.3);
4822     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4823     text => 'NDATA',
4824     line => $self->{line_prev},
4825     column => $self->{column_prev} - 4);
4826     } else {
4827     !!!cp (172.4);
4828     }
4829     $self->{state} = AFTER_NDATA_STATE;
4830     !!!next-input-character;
4831     redo A;
4832     } else {
4833     !!!parse-error (type => 'string after literal', ## TODO: type
4834     line => $self->{line_prev},
4835     column => $self->{column_prev} + 1
4836     - length $self->{kwd});
4837     !!!cp (172.5);
4838     $self->{state} = BOGUS_MD_STATE;
4839     ## Reconsume.
4840     redo A;
4841     }
4842     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4843     if ($is_space->{$self->{nc}}) {
4844     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4845     !!!next-input-character;
4846     redo A;
4847     } elsif ($self->{nc} == 0x003E) { # >
4848     !!!parse-error (type => 'no notation name'); ## TODO: type
4849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4850     !!!next-input-character;
4851     !!!emit ($self->{ct}); # ENTITY
4852     redo A;
4853     } elsif ($self->{nc} == -1) {
4854     !!!parse-error (type => 'unclosed md'); ## TODO: type
4855     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4856     !!!next-input-character;
4857     !!!emit ($self->{ct}); # ENTITY
4858     redo A;
4859     } else {
4860     !!!parse-error (type => 'string after literal', ## TODO: type
4861     line => $self->{line_prev},
4862     column => $self->{column_prev} + 1
4863     - length $self->{kwd});
4864     $self->{state} = BOGUS_MD_STATE;
4865     ## Reconsume.
4866     redo A;
4867     }
4868     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4869     if ($is_space->{$self->{nc}}) {
4870     ## Stay in the state.
4871     !!!next-input-character;
4872     redo A;
4873     } elsif ($self->{nc} == 0x003E) { # >
4874     !!!parse-error (type => 'no notation name'); ## TODO: type
4875     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4876     !!!next-input-character;
4877     !!!emit ($self->{ct}); # ENTITY
4878     redo A;
4879     } elsif ($self->{nc} == -1) {
4880     !!!parse-error (type => 'unclosed md'); ## TODO: type
4881     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4882     !!!next-input-character;
4883     !!!emit ($self->{ct}); # ENTITY
4884     redo A;
4885     } else {
4886     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4887     $self->{state} = NOTATION_NAME_STATE;
4888     !!!next-input-character;
4889     redo A;
4890     }
4891     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4892     if ($is_space->{$self->{nc}}) {
4893 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4894 wakaba 1.18 !!!next-input-character;
4895     redo A;
4896     } elsif ($self->{nc} == 0x003E) { # >
4897     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4898     !!!next-input-character;
4899     !!!emit ($self->{ct}); # ENTITY
4900     redo A;
4901     } elsif ($self->{nc} == -1) {
4902     !!!parse-error (type => 'unclosed md'); ## TODO: type
4903     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4904     !!!next-input-character;
4905     !!!emit ($self->{ct}); # ENTITY
4906     redo A;
4907     } else {
4908     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4909     ## Stay in the state.
4910     !!!next-input-character;
4911     redo A;
4912     }
4913 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4914     if ($self->{nc} == 0x0022) { # "
4915 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4916 wakaba 1.19 !!!next-input-character;
4917     redo A;
4918     } elsif ($self->{nc} == 0x0026) { # &
4919     $self->{prev_state} = $self->{state};
4920     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4921     $self->{entity_add} = 0x0022; # "
4922     !!!next-input-character;
4923     redo A;
4924     ## TODO: %
4925     } elsif ($self->{nc} == -1) {
4926     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4927     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4928     ## Reconsume.
4929     !!!emit ($self->{ct}); # ENTITY
4930     redo A;
4931     } else {
4932     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4933     !!!next-input-character;
4934     redo A;
4935     }
4936     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4937     if ($self->{nc} == 0x0027) { # '
4938 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4939 wakaba 1.19 !!!next-input-character;
4940     redo A;
4941     } elsif ($self->{nc} == 0x0026) { # &
4942     $self->{prev_state} = $self->{state};
4943     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4944     $self->{entity_add} = 0x0027; # '
4945     !!!next-input-character;
4946     redo A;
4947     ## TODO: %
4948     } elsif ($self->{nc} == -1) {
4949     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4950     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4951     ## Reconsume.
4952     !!!emit ($self->{ct}); # ENTITY
4953     redo A;
4954     } else {
4955     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4956     !!!next-input-character;
4957     redo A;
4958     }
4959     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4960     if ($is_space->{$self->{nc}} or
4961     {
4962     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4963     $self->{entity_add} => 1,
4964     }->{$self->{nc}}) {
4965 wakaba 1.22 !!!parse-error (type => 'bare ero',
4966     line => $self->{line_prev},
4967     column => $self->{column_prev}
4968     + ($self->{nc} == -1 ? 1 : 0));
4969 wakaba 1.19 ## Don't consume
4970     ## Return nothing.
4971     #
4972     } elsif ($self->{nc} == 0x0023) { # #
4973     $self->{ca} = $self->{ct};
4974     $self->{state} = ENTITY_HASH_STATE;
4975     $self->{kwd} = '#';
4976     !!!next-input-character;
4977     redo A;
4978     } else {
4979     #
4980     }
4981    
4982     $self->{ct}->{value} .= '&';
4983     $self->{state} = $self->{prev_state};
4984     ## Reconsume.
4985     redo A;
4986 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4987     if ($is_space->{$self->{nc}}) {
4988     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4989     !!!next-input-character;
4990     redo A;
4991     } elsif ($self->{nc} == 0x0028) { # (
4992     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4993     $self->{ct}->{content} = ['('];
4994     $self->{group_depth} = 1;
4995     !!!next-input-character;
4996     redo A;
4997     } elsif ($self->{nc} == 0x003E) { # >
4998     !!!parse-error (type => 'no md def'); ## TODO: type
4999     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5000     !!!next-input-character;
5001     !!!emit ($self->{ct}); # ELEMENT
5002     redo A;
5003     } elsif ($self->{nc} == -1) {
5004     !!!parse-error (type => 'unclosed md'); ## TODO: type
5005     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5006     !!!next-input-character;
5007     !!!emit ($self->{ct}); # ELEMENT
5008     redo A;
5009     } else {
5010     $self->{ct}->{content} = [chr $self->{nc}];
5011     $self->{state} = CONTENT_KEYWORD_STATE;
5012     !!!next-input-character;
5013     redo A;
5014     }
5015     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
5016     if ($is_space->{$self->{nc}}) {
5017     $self->{state} = AFTER_MD_DEF_STATE;
5018     !!!next-input-character;
5019     redo A;
5020     } elsif ($self->{nc} == 0x003E) { # >
5021     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5022     !!!next-input-character;
5023     !!!emit ($self->{ct}); # ELEMENT
5024     redo A;
5025     } elsif ($self->{nc} == -1) {
5026     !!!parse-error (type => 'unclosed md'); ## TODO: type
5027     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5028     !!!next-input-character;
5029     !!!emit ($self->{ct}); # ELEMENT
5030     redo A;
5031     } else {
5032     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
5033     ## Stay in the state.
5034     !!!next-input-character;
5035     redo A;
5036     }
5037     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
5038     if ($is_space->{$self->{nc}}) {
5039     ## Stay in the state.
5040     !!!next-input-character;
5041     redo A;
5042     } elsif ($self->{nc} == 0x0028) { # (
5043     $self->{group_depth}++;
5044     push @{$self->{ct}->{content}}, chr $self->{nc};
5045     ## Stay in the state.
5046     !!!next-input-character;
5047     redo A;
5048     } elsif ($self->{nc} == 0x007C or # |
5049     $self->{nc} == 0x002C) { # ,
5050     !!!parse-error (type => 'empty element name'); ## TODO: type
5051     ## Stay in the state.
5052     !!!next-input-character;
5053     redo A;
5054     } elsif ($self->{nc} == 0x0029) { # )
5055     !!!parse-error (type => 'empty element name'); ## TODO: type
5056     push @{$self->{ct}->{content}}, chr $self->{nc};
5057     $self->{group_depth}--;
5058     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5059     !!!next-input-character;
5060     redo A;
5061     } elsif ($self->{nc} == 0x003E) { # >
5062     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5063     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5064     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5065     !!!next-input-character;
5066     !!!emit ($self->{ct}); # ELEMENT
5067     redo A;
5068     } elsif ($self->{nc} == -1) {
5069     !!!parse-error (type => 'unclosed md'); ## TODO: type
5070     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5071     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5072     !!!next-input-character;
5073     !!!emit ($self->{ct}); # ELEMENT
5074     redo A;
5075     } else {
5076     push @{$self->{ct}->{content}}, chr $self->{nc};
5077     $self->{state} = CM_ELEMENT_NAME_STATE;
5078     !!!next-input-character;
5079     redo A;
5080     }
5081     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
5082     if ($is_space->{$self->{nc}}) {
5083     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5084     !!!next-input-character;
5085     redo A;
5086     } elsif ($self->{nc} == 0x002A or # *
5087     $self->{nc} == 0x002B or # +
5088     $self->{nc} == 0x003F) { # ?
5089     push @{$self->{ct}->{content}}, chr $self->{nc};
5090     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5091     !!!next-input-character;
5092     redo A;
5093     } elsif ($self->{nc} == 0x007C or # |
5094     $self->{nc} == 0x002C) { # ,
5095     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5096     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5097     !!!next-input-character;
5098     redo A;
5099     } elsif ($self->{nc} == 0x0029) { # )
5100     $self->{group_depth}--;
5101     push @{$self->{ct}->{content}}, chr $self->{nc};
5102     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5103     !!!next-input-character;
5104     redo A;
5105     } elsif ($self->{nc} == 0x003E) { # >
5106     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5107     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5108     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5109     !!!next-input-character;
5110     !!!emit ($self->{ct}); # ELEMENT
5111     redo A;
5112     } elsif ($self->{nc} == -1) {
5113     !!!parse-error (type => 'unclosed md'); ## TODO: type
5114     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5115     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5116     !!!next-input-character;
5117     !!!emit ($self->{ct}); # ELEMENT
5118     redo A;
5119     } else {
5120     $self->{ct}->{content}->[-1] .= chr $self->{nc};
5121     ## Stay in the state.
5122     !!!next-input-character;
5123     redo A;
5124     }
5125     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5126     if ($is_space->{$self->{nc}}) {
5127     ## Stay in the state.
5128     !!!next-input-character;
5129     redo A;
5130     } elsif ($self->{nc} == 0x007C or # |
5131     $self->{nc} == 0x002C) { # ,
5132     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5133     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5134     !!!next-input-character;
5135     redo A;
5136     } elsif ($self->{nc} == 0x0029) { # )
5137     $self->{group_depth}--;
5138     push @{$self->{ct}->{content}}, chr $self->{nc};
5139     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5140     !!!next-input-character;
5141     redo A;
5142     } elsif ($self->{nc} == 0x003E) { # >
5143     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5144     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5145     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5146     !!!next-input-character;
5147     !!!emit ($self->{ct}); # ELEMENT
5148     redo A;
5149     } elsif ($self->{nc} == -1) {
5150     !!!parse-error (type => 'unclosed md'); ## TODO: type
5151     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5152     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5153     !!!next-input-character;
5154     !!!emit ($self->{ct}); # ELEMENT
5155     redo A;
5156     } else {
5157     !!!parse-error (type => 'after element name'); ## TODO: type
5158     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5159     $self->{state} = BOGUS_MD_STATE;
5160     !!!next-input-character;
5161     redo A;
5162     }
5163     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5164     if ($is_space->{$self->{nc}}) {
5165     if ($self->{group_depth}) {
5166     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5167     } else {
5168     $self->{state} = AFTER_MD_DEF_STATE;
5169     }
5170     !!!next-input-character;
5171     redo A;
5172     } elsif ($self->{nc} == 0x002A or # *
5173     $self->{nc} == 0x002B or # +
5174     $self->{nc} == 0x003F) { # ?
5175     push @{$self->{ct}->{content}}, chr $self->{nc};
5176     if ($self->{group_depth}) {
5177     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5178     } else {
5179     $self->{state} = AFTER_MD_DEF_STATE;
5180     }
5181     !!!next-input-character;
5182     redo A;
5183     } elsif ($self->{nc} == 0x0029) { # )
5184     if ($self->{group_depth}) {
5185     $self->{group_depth}--;
5186     push @{$self->{ct}->{content}}, chr $self->{nc};
5187     ## Stay in the state.
5188     !!!next-input-character;
5189     redo A;
5190     } else {
5191     !!!parse-error (type => 'string after md def'); ## TODO: type
5192     $self->{state} = BOGUS_MD_STATE;
5193     ## Reconsume.
5194     redo A;
5195     }
5196     } elsif ($self->{nc} == 0x003E) { # >
5197     if ($self->{group_depth}) {
5198     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5199     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5200     }
5201     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5202     !!!next-input-character;
5203     !!!emit ($self->{ct}); # ELEMENT
5204     redo A;
5205     } elsif ($self->{nc} == -1) {
5206     !!!parse-error (type => 'unclosed md'); ## TODO: type
5207     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5208     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5209     !!!next-input-character;
5210     !!!emit ($self->{ct}); # ELEMENT
5211     redo A;
5212     } else {
5213     if ($self->{group_depth}) {
5214     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5215     } else {
5216     !!!parse-error (type => 'string after md def'); ## TODO: type
5217     $self->{state} = BOGUS_MD_STATE;
5218     }
5219     ## Reconsume.
5220     redo A;
5221     }
5222     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5223 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5224     ## Stay in the state.
5225     !!!next-input-character;
5226     redo A;
5227     } elsif ($self->{nc} == 0x003E) { # >
5228     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5229     !!!next-input-character;
5230 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5231 wakaba 1.18 redo A;
5232     } elsif ($self->{nc} == -1) {
5233     !!!parse-error (type => 'unclosed md'); ## TODO: type
5234     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5235     !!!next-input-character;
5236 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5237 wakaba 1.18 redo A;
5238     } else {
5239 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5240 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5241     ## Reconsume.
5242     redo A;
5243     }
5244 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5245     if ($self->{nc} == 0x003E) { # >
5246     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5247     !!!next-input-character;
5248     !!!emit ($self->{ct}); # ATTLIST/ENTITY/ELEMENT/NOTATION
5249     redo A;
5250     } elsif ($self->{nc} == -1) {
5251     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5252     ## Reconsume.
5253     !!!emit ($self->{ct}); # ATTLIST/ENTITY/ELEMENT/NOTATION
5254     redo A;
5255     } else {
5256     ## Stay in the state.
5257     !!!next-input-character;
5258     redo A;
5259     }
5260 wakaba 1.1 } else {
5261     die "$0: $self->{state}: Unknown state";
5262     }
5263     } # A
5264    
5265     die "$0: _get_next_token: unexpected case";
5266     } # _get_next_token
5267    
5268     1;
5269 wakaba 1.34 ## $Date: 2009/09/05 10:41:07 $
5270 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24